// jobtrackingapp/JobTrackerApi/Controllers/ProfileCvController.cs

using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using JobTrackerApi.Services;
using JobTrackerApi.Models;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Identity;
using Microsoft.AspNetCore.Mvc;
namespace JobTrackerApi.Controllers;
[ApiController]
[Route("api/profile-cv")]
[Authorize(AuthenticationSchemes = "local")]
public sealed class ProfileCvController : ControllerBase
{
/// <summary>
/// File extensions accepted by the upload endpoint. The set uses an
/// ordinal, case-insensitive comparer, so ".PDF" and ".pdf" are treated alike.
/// </summary>
private static readonly HashSet<string> AllowedExtensions = new(StringComparer.OrdinalIgnoreCase)
{
".txt",
".md",
".pdf",
".docx",
".png",
".jpg",
".jpeg",
".webp",
};
/// <summary>
/// Maps heading text found in a CV (key, compared case-insensitively) to the
/// canonical section name used in parsed output (value). The keys are also
/// used as substrings when estimating whether extracted text was flattened.
/// </summary>
private static readonly Dictionary<string, string> SectionAliases = new(StringComparer.OrdinalIgnoreCase)
{
["professional summary"] = "Professional Summary",
["summary"] = "Professional Summary",
["profile"] = "Professional Summary",
["about me"] = "Professional Summary",
["contact"] = "Contact",
["contact details"] = "Contact",
["core skills"] = "Skills",
["skills"] = "Skills",
["technical skills"] = "Skills",
["experience"] = "Work Experience",
["experience highlights"] = "Work Experience",
["work experience"] = "Work Experience",
["employment history"] = "Work Experience",
["selected achievements"] = "Selected Achievements",
["achievements"] = "Selected Achievements",
["projects"] = "Projects",
["education"] = "Education",
["certifications"] = "Certifications",
["certificates"] = "Certifications",
["languages"] = "Languages",
["interests"] = "Interests",
};
/// <summary>Upload cap of 5 MiB; used both as the request-size limit and for the explicit length check.</summary>
private const long MaxFileSizeBytes = 5 * 1024 * 1024;
/// <summary>Identity user manager used to resolve the signed-in user and persist CV changes.</summary>
private readonly UserManager<ApplicationUser> _users;
/// <summary>AI service used for text extraction from uploads and for prompt-driven rewriting.</summary>
private readonly ISummarizerService _aiService;
/// <summary>Creates the controller with its injected dependencies.</summary>
/// <param name="users">User manager for <see cref="ApplicationUser"/>.</param>
/// <param name="aiService">AI extraction/summarization service.</param>
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService)
{
_users = users;
_aiService = aiService;
}
/// <summary>
/// Body of the rewrite-section endpoint. A blank <c>SectionName</c> falls back to
/// "Professional Summary", a blank <c>Style</c> to "balanced"; <c>TargetRole</c> is optional.
/// </summary>
public sealed record RewriteSectionRequest(string SectionName, string? Style, string? TargetRole);
/// <summary>
/// Body of the parse endpoint. When <c>Text</c> is blank the user's stored CV text is parsed instead.
/// </summary>
public sealed record ParseCvRequest(string? Text);
/// <summary>
/// Imports a CV file for the signed-in user: extracts its text, optionally reconstructs
/// flattened text, builds a structured profile, and persists both on the user.
/// </summary>
/// <param name="file">Uploaded form file; must be non-empty, at most 5 MB, and use an allowed extension.</param>
/// <returns>
/// 401 when no user resolves; 400 when validation, extraction, or persistence fails;
/// otherwise 200 with the character count, the structured CV, and its sections.
/// </returns>
[HttpPost("upload")]
[RequestSizeLimit(MaxFileSizeBytes)]
public async Task<IActionResult> Upload([FromForm] IFormFile file)
{
    var user = await _users.GetUserAsync(User);
    if (user is null) return Unauthorized();
    if (file is null || file.Length == 0) return BadRequest("Select a CV file to upload.");

    // NOTE(review): the request-size limit above equals this file-size limit, so a file close to
    // 5 MB may be rejected by the framework (multipart overhead) before this check runs — confirm.
    if (file.Length > MaxFileSizeBytes) return BadRequest("CV import file is too large. Keep it under 5 MB.");

    var extension = Path.GetExtension(file.FileName ?? string.Empty);
    if (!AllowedExtensions.Contains(extension))
    {
        return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
    }

    // Every allowed extension is eligible for AI extraction, so it is always tried first.
    // (The previous per-extension flag was true for all allowed extensions, leaving its
    // else-branch unreachable.)
    string text;
    await using (var uploadStream = file.OpenReadStream())
    {
        var extracted = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, HttpContext.RequestAborted);
        text = extracted?.Text?.Trim() ?? string.Empty;
    }

    // Fall back to the local extractor when the AI service produced nothing.
    if (string.IsNullOrWhiteSpace(text))
    {
        text = (await ExtractTextAsync(file, extension)).Trim();
    }

    if (string.IsNullOrWhiteSpace(text))
    {
        return BadRequest("The uploaded CV file could not be read or was empty.");
    }

    text = (await MaybeReconstructStructuredCvAsync(text, HttpContext.RequestAborted)).Trim();
    var structuredCv = await BuildStructuredCvAsync(text, HttpContext.RequestAborted);

    user.ProfileCvText = text;
    user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);

    var result = await _users.UpdateAsync(user);
    if (!result.Succeeded)
    {
        return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
    }

    return Ok(new { imported = true, characters = text.Length, structuredCv, sections = structuredCv.Sections });
}
/// <summary>
/// Asks the AI service to rewrite the stored CV text into a sectioned master CV,
/// then stores the rewritten text together with a freshly built structured profile.
/// </summary>
/// <returns>
/// 401 when no user resolves; 400 when there is no CV text, the AI returns nothing,
/// or the update fails; otherwise 200 with the rebuilt text and structure.
/// </returns>
[HttpPost("rebuild")]
public async Task<IActionResult> Rebuild()
{
    const string instructions = "Rewrite this CV into a stronger master CV with clear sections such as Professional Summary, Core Skills, Experience Highlights, and Selected Achievements. Preserve only factual claims, avoid inventing employers or metrics, and make the output clean and ready for tailoring to job applications. Return only the rebuilt CV text.";

    var account = await _users.GetUserAsync(User);
    if (account is null)
    {
        return Unauthorized();
    }

    if (string.IsNullOrWhiteSpace(account.ProfileCvText))
    {
        return BadRequest("Add or import CV text before rebuilding it.");
    }

    var aiOutput = await _aiService.SummarizeSectionAsync(instructions, account.ProfileCvText, 2200, 700);
    if (string.IsNullOrWhiteSpace(aiOutput))
    {
        return BadRequest("The AI service could not rebuild your CV text right now.");
    }

    // Store the new text first; the structure is derived from exactly what was stored.
    account.ProfileCvText = aiOutput.Trim();
    var profile = await BuildStructuredCvAsync(account.ProfileCvText, HttpContext.RequestAborted);
    account.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(profile);

    var saveResult = await _users.UpdateAsync(account);
    if (!saveResult.Succeeded)
    {
        var messages = saveResult.Errors.Select(error => error.Description);
        return BadRequest(string.Join("; ", messages));
    }

    return Ok(new
    {
        rebuilt = true,
        characters = account.ProfileCvText.Length,
        text = account.ProfileCvText,
        structuredCv = profile,
        sections = profile.Sections
    });
}
/// <summary>
/// Asks the AI service to rewrite a single named section of the stored CV text.
/// The result is returned to the caller only; nothing is persisted here.
/// </summary>
/// <param name="request">Section name, optional style, and optional target role.</param>
/// <returns>
/// 401 when no user resolves; 400 when there is no CV text or the AI returns nothing;
/// otherwise 200 with the effective section name, style, target role, and rewritten text.
/// </returns>
[HttpPost("rewrite-section")]
public async Task<IActionResult> RewriteSection([FromBody] RewriteSectionRequest request)
{
    var account = await _users.GetUserAsync(User);
    if (account is null)
    {
        return Unauthorized();
    }

    if (string.IsNullOrWhiteSpace(account.ProfileCvText))
    {
        return BadRequest("Add or import CV text before rewriting a section.");
    }

    // Blank inputs fall back to defaults; supplied values are trimmed.
    var sectionName = "Professional Summary";
    if (!string.IsNullOrWhiteSpace(request.SectionName))
    {
        sectionName = request.SectionName.Trim();
    }

    var style = "balanced";
    if (!string.IsNullOrWhiteSpace(request.Style))
    {
        style = request.Style.Trim();
    }

    string? targetRole = null;
    if (!string.IsNullOrWhiteSpace(request.TargetRole))
    {
        targetRole = request.TargetRole.Trim();
    }

    var roleClause = targetRole is null
        ? "Make it broadly reusable for future tailoring."
        : $"Target role: {targetRole}.";
    var prompt = $"Rewrite only the '{sectionName}' section of this CV. Preserve facts, avoid inventing employers or metrics, and output only the rewritten section text. Style: {style}. {roleClause}";

    var aiOutput = await _aiService.SummarizeSectionAsync(prompt, account.ProfileCvText, 900, 180);
    if (string.IsNullOrWhiteSpace(aiOutput))
    {
        return BadRequest("The AI service could not rewrite that CV section right now.");
    }

    return Ok(new { sectionName, style, targetRole, text = aiOutput.Trim() });
}
/// <summary>
/// Builds a structured profile from the supplied text (or, when that is blank, from the
/// user's stored CV text) and saves the structure on the user.
/// </summary>
/// <param name="request">Optional body carrying the text to parse.</param>
/// <returns>
/// 401 when no user resolves; 400 when there is no text to parse or the update fails;
/// otherwise 200 with the structured CV, its sections, and the total word count.
/// </returns>
[HttpPost("parse")]
public async Task<ActionResult<object>> Parse([FromBody] ParseCvRequest? request)
{
    var account = await _users.GetUserAsync(User);
    if (account is null)
    {
        return Unauthorized();
    }

    // Prefer text from the request; otherwise use what is stored on the profile.
    string? cvText = request?.Text;
    if (string.IsNullOrWhiteSpace(cvText))
    {
        cvText = account.ProfileCvText;
    }

    if (string.IsNullOrWhiteSpace(cvText))
    {
        return BadRequest("Add or import CV text before parsing sections.");
    }

    var structuredCv = await BuildStructuredCvAsync(cvText, HttpContext.RequestAborted);
    account.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);

    var saveResult = await _users.UpdateAsync(account);
    if (!saveResult.Succeeded)
    {
        var messages = saveResult.Errors.Select(error => error.Description);
        return BadRequest(string.Join("; ", messages));
    }

    return Ok(new { structuredCv, sections = structuredCv.Sections, totalWords = CountWords(cvText) });
}
/// <summary>
/// Asks the AI service for a cleaner, better-structured version of the stored CV text,
/// then stores the improved text together with a freshly built structured profile.
/// </summary>
/// <returns>
/// 401 when no user resolves; 400 when there is no CV text, the AI returns nothing,
/// or the update fails; otherwise 200 with the improved text and structure.
/// </returns>
[HttpPost("improve")]
public async Task<IActionResult> Improve()
{
    const string instructions = "Rewrite this CV into a cleaner, better-structured master CV profile. Preserve factual claims, employers, skills, and measurable results. Improve clarity, tighten wording, use strong bullet-style phrasing, and keep it ready for further tailoring to specific roles. Return only the improved CV text.";

    var account = await _users.GetUserAsync(User);
    if (account is null)
    {
        return Unauthorized();
    }

    if (string.IsNullOrWhiteSpace(account.ProfileCvText))
    {
        return BadRequest("Add or import CV text before improving it.");
    }

    var aiOutput = await _aiService.SummarizeSectionAsync(instructions, account.ProfileCvText, 1800, 500);
    if (string.IsNullOrWhiteSpace(aiOutput))
    {
        return BadRequest("The AI service could not improve your CV text right now.");
    }

    // Store the new text first; the structure is derived from exactly what was stored.
    account.ProfileCvText = aiOutput.Trim();
    var profile = await BuildStructuredCvAsync(account.ProfileCvText, HttpContext.RequestAborted);
    account.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(profile);

    var saveResult = await _users.UpdateAsync(account);
    if (!saveResult.Succeeded)
    {
        var messages = saveResult.Errors.Select(error => error.Description);
        return BadRequest(string.Join("; ", messages));
    }

    return Ok(new
    {
        improved = true,
        characters = account.ProfileCvText.Length,
        text = account.ProfileCvText,
        structuredCv = profile,
        sections = profile.Sections
    });
}
/// <summary>
/// Produces the structured profile for a CV text by merging the AI extraction (when
/// available) with a heading-based fallback, filling in a guessed full name if none is set.
/// </summary>
/// <param name="text">CV text to structure.</param>
/// <param name="cancellationToken">Token forwarded to the AI extraction step.</param>
/// <returns>The merged profile after normalization.</returns>
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    // Heading-based fallback built purely from the text.
    var headingSections = new List<StructuredCvSection>();
    foreach (var (name, content) in ParseSections(text))
    {
        headingSections.Add(new StructuredCvSection
        {
            Name = name,
            Content = content,
            WordCount = CountWords(content),
        });
    }

    var headingProfile = StructuredCvProfileJson.FromSections(headingSections);
    headingProfile.Contact.FullName ??= GuessFullName(text);

    var aiProfile = await TryExtractStructuredCvAsync(text, cancellationToken);

    var combined = StructuredCvProfileJson.Merge(aiProfile, headingProfile);
    combined.Contact.FullName ??= GuessFullName(text);

    return StructuredCvProfileJson.Normalize(combined);
}
/// <summary>
/// Asks the AI service to convert the CV text into structured JSON and deserializes it.
/// This is a best-effort step: any unusable response yields <c>null</c> so the caller can
/// rely on its heading-based fallback.
/// </summary>
/// <param name="text">CV text to structure.</param>
/// <param name="cancellationToken">Checked before the AI call so an aborted request does not start one.</param>
/// <returns>
/// The parsed profile when it contains at least one populated part; <c>null</c> when the AI
/// returned nothing, no JSON object could be located, the JSON was malformed, or the result was empty.
/// </returns>
/// <exception cref="OperationCanceledException">The token was already cancelled on entry.</exception>
private async Task<StructuredCvProfile?> TryExtractStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    // The summarizer call below takes no token, so honor cancellation up front.
    cancellationToken.ThrowIfCancellationRequested();

    var structuredJson = await _aiService.SummarizeSectionAsync(
        "Extract this CV into structured JSON. Return only valid JSON with this exact top-level shape: { \"version\": \"1\", \"contact\": { \"fullName\": string|null, \"headline\": string|null, \"email\": string|null, \"phone\": string|null, \"location\": string|null, \"website\": string|null, \"linkedin\": string|null }, \"summary\": string[], \"jobs\": [{ \"title\": string|null, \"company\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"isCurrent\": boolean, \"bullets\": string[], \"skills\": string[] }], \"education\": [{ \"qualification\": string|null, \"institution\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"details\": string[] }], \"skills\": string[], \"languages\": [{ \"name\": string|null, \"level\": string|null, \"notes\": string|null }], \"interests\": string[], \"otherSections\": [{ \"title\": string|null, \"items\": string[] }] }. Preserve facts only. Do not invent anything. If a field is unknown, use null or an empty array. Keep wording close to the source. Put unmatched content in otherSections.",
        text,
        3200,
        900);
    if (string.IsNullOrWhiteSpace(structuredJson)) return null;

    var extracted = ExtractJsonObject(structuredJson);
    if (string.IsNullOrWhiteSpace(extracted)) return null;

    StructuredCvProfile? parsed;
    try
    {
        parsed = StructuredCvProfileJson.Deserialize(extracted);
    }
    catch (JsonException)
    {
        // Model output is untrusted text; malformed JSON must not fail the whole request.
        return null;
    }

    return parsed is not null && IsMeaningfullyStructured(parsed) ? parsed : null;
}
/// <summary>
/// Reports whether a profile carries any content: a non-blank full name or at least
/// one entry in any of its collections.
/// </summary>
/// <param name="profile">Profile to inspect.</param>
/// <returns><c>true</c> when any part is populated; otherwise <c>false</c>.</returns>
private static bool IsMeaningfullyStructured(StructuredCvProfile profile)
{
    // Checked in the same order as before, stopping at the first populated part.
    if (!string.IsNullOrWhiteSpace(profile.Contact.FullName)) return true;
    if (profile.Summary.Count > 0) return true;
    if (profile.Jobs.Count > 0) return true;
    if (profile.Education.Count > 0) return true;
    if (profile.Skills.Count > 0) return true;
    if (profile.Languages.Count > 0) return true;
    if (profile.Interests.Count > 0) return true;
    return profile.OtherSections.Count > 0;
}
/// <summary>
/// Pulls the outermost <c>{ ... }</c> span out of a raw model response, first removing a
/// surrounding Markdown code fence when the response starts with one.
/// </summary>
/// <param name="raw">Raw response text.</param>
/// <returns>
/// The substring from the first '{' through the last '}', or <c>null</c> when no such
/// span exists.
/// </returns>
private static string? ExtractJsonObject(string raw)
{
    var candidate = raw.Trim();

    var isFenced = candidate.StartsWith("```", StringComparison.Ordinal);
    if (isFenced)
    {
        candidate = Regex.Replace(candidate, "^```(?:json)?\\s*|\\s*```$", string.Empty, RegexOptions.IgnoreCase);
    }

    var open = candidate.IndexOf('{');
    var close = candidate.LastIndexOf('}');

    return open >= 0 && close > open
        ? candidate.Substring(open, close - open + 1)
        : null;
}
/// <summary>
/// Heuristically picks the candidate's full name from the first six non-empty lines of a CV.
/// A line qualifies when it is 4–80 characters long, contains no '@' and no digits, and
/// consists of two to five words that each start with an uppercase letter.
/// </summary>
/// <remarks>
/// Letters are matched by Unicode category, so names such as "José Álvarez-Müller" or
/// "Łukasz O’Brien" are recognized as well as plain ASCII names.
/// NOTE(review): two-word section headings such as "Professional Summary" also satisfy this
/// shape; consider excluding known headings if that proves to be a problem.
/// </remarks>
/// <param name="source">Full CV text.</param>
/// <returns>The first qualifying line (leading '#' markers removed), or <c>null</c> when none qualifies.</returns>
private static string? GuessFullName(string source)
{
    var normalized = source.Replace("\r\n", "\n");
    foreach (var line in normalized.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).Take(6))
    {
        // Strip Markdown heading markers so "# Jane Doe" is considered as "Jane Doe".
        var cleaned = line.Trim().TrimStart('#').Trim();
        if (cleaned.Length < 4 || cleaned.Length > 80) continue;

        // Lines with '@' or digits are contact details or dates, not names.
        if (cleaned.Contains('@') || Regex.IsMatch(cleaned, @"\d")) continue;

        // \p{Lu} = uppercase letter, \p{L} = any letter, \p{M} = combining mark (decomposed accents).
        if (!Regex.IsMatch(cleaned, @"^\p{Lu}[\p{L}\p{M}'’`.-]+(?:\s+\p{Lu}[\p{L}\p{M}'’`.-]+){1,4}$")) continue;

        return cleaned;
    }

    return null;
}
/// <summary>
/// Counts whitespace-separated words in a string.
/// </summary>
/// <param name="text">Text to measure; may be <c>null</c>.</param>
/// <returns>The number of maximal runs of non-whitespace characters; 0 for null or blank input.</returns>
private static int CountWords(string? text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return 0;
    }

    var words = 0;
    var insideWord = false;
    foreach (var ch in text)
    {
        if (char.IsWhiteSpace(ch))
        {
            insideWord = false;
        }
        else if (!insideWord)
        {
            // First character of a new run of non-whitespace.
            insideWord = true;
            words++;
        }
    }

    return words;
}
/// <summary>
/// Splits CV text into named sections using recognized heading lines as boundaries.
/// Text before the first heading is collected under "General"; sections whose content
/// is blank are dropped.
/// </summary>
/// <param name="source">Full CV text.</param>
/// <returns>
/// Sections in document order with trimmed content. When nothing qualifies, a single
/// "General" section holding the trimmed source is returned.
/// </returns>
private static List<(string Name, string Content)> ParseSections(string source)
{
    var result = new List<(string Name, string Content)>();
    var heading = "General";
    var pending = new List<string>();

    // Emits the buffered lines as a section under the current heading, if non-blank.
    void Commit()
    {
        var body = string.Join("\n", pending).Trim();
        pending.Clear();
        if (body.Length > 0)
        {
            result.Add((heading, body));
        }
    }

    foreach (var rawLine in source.Replace("\r\n", "\n").Split('\n'))
    {
        var detected = CanonicalizeSectionHeading(rawLine.Trim());
        if (detected is null)
        {
            pending.Add(rawLine);
            continue;
        }

        // A heading closes the section in progress and names the next one.
        Commit();
        heading = detected;
    }

    Commit();

    if (result.Count == 0)
    {
        result.Add(("General", source.Trim()));
    }

    return result;
}
/// <summary>
/// When the text looks like a flattened extraction, asks the AI service to reconstruct
/// it into a readable, sectioned CV; otherwise returns the trimmed text unchanged.
/// </summary>
/// <param name="text">Extracted CV text.</param>
/// <param name="cancellationToken">Not used by the current implementation.</param>
/// <returns>The reconstructed text, or the trimmed input when reconstruction is skipped or yields nothing.</returns>
private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    const string instructions = "Reconstruct this CV text extracted from a PDF into a clean, readable master CV in markdown. Preserve facts only. Recover clear sections such as Contact, Professional Summary, Work Experience, Education, Skills, Languages, and Interests when present. Split contact details onto their own lines, turn noisy all-caps/spaced headings into normal headings, keep dates with the correct roles and employers, and remove layout/OCR artifacts. Do not invent employers, titles, dates, or metrics. Return only the reconstructed CV text.";

    var trimmedInput = text.Trim();
    if (LooksLikeFlattenedCvExtraction(trimmedInput))
    {
        var aiOutput = await _aiService.SummarizeSectionAsync(instructions, trimmedInput, 2800, 900);
        if (!string.IsNullOrWhiteSpace(aiOutput))
        {
            return aiOutput.Trim();
        }
    }

    return trimmedInput;
}
/// <summary>
/// Heuristic for text whose layout was lost during extraction. Returns <c>true</c> when any
/// of these hold: at most 6 non-blank lines with 500+ characters; 3+ letter-spaced uppercase
/// runs (e.g. "S K I L L S"); 3+ known heading aliases present within at most 12 non-blank
/// lines; or " + " together with a bullet glyph within at most 10 non-blank lines.
/// </summary>
/// <param name="text">Extracted CV text.</param>
/// <returns><c>true</c> when the text looks flattened; <c>false</c> otherwise, including for blank input.</returns>
private static bool LooksLikeFlattenedCvExtraction(string text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return false;
    }

    var body = text.Replace("\r\n", "\n");

    var nonBlankLines = 0;
    foreach (var line in body.Split('\n'))
    {
        if (!string.IsNullOrWhiteSpace(line))
        {
            nonBlankLines++;
        }
    }

    // Long text squeezed into very few lines.
    if (nonBlankLines <= 6 && body.Length >= 500)
    {
        return true;
    }

    // Letter-spaced headings.
    if (Regex.Matches(body, @"\b(?:[A-Z]\s){3,}[A-Z]\b").Count >= 3)
    {
        return true;
    }

    // Several heading words sharing few lines.
    if (nonBlankLines <= 12)
    {
        var aliasHits = SectionAliases.Keys.Count(alias => body.Contains(alias, StringComparison.OrdinalIgnoreCase));
        if (aliasHits >= 3)
        {
            return true;
        }
    }

    // Inline " + " separators combined with bullet glyphs on few lines.
    return nonBlankLines <= 10
        && body.Contains(" + ")
        && Regex.IsMatch(body, @"[•●▪◦]");
}
/// <summary>
/// Maps a line to its canonical section name when the line is a recognized heading.
/// Leading '#' markers and a trailing ':' are ignored, and the lookup is case-insensitive.
/// </summary>
/// <remarks>
/// Headings may contain spaces: many aliases are multi-word ("work experience",
/// "professional summary", "about me"). Rejecting any line containing a space made
/// those aliases unreachable, so only the sentence check ('.') is kept as a pre-filter;
/// the exact alias lookup does the rest.
/// </remarks>
/// <param name="line">A single line of CV text.</param>
/// <returns>The canonical section name, or <c>null</c> when the line is not a known heading.</returns>
private static string? CanonicalizeSectionHeading(string line)
{
    if (string.IsNullOrWhiteSpace(line)) return null;

    var normalized = line.Trim();
    if (normalized.StartsWith("#", StringComparison.Ordinal))
    {
        normalized = normalized.TrimStart('#').Trim();
    }

    normalized = normalized.TrimEnd(':').Trim();
    if (normalized.Length == 0 || normalized.Length > 60) return null;

    // A period suggests a sentence rather than a heading.
    if (normalized.Contains('.')) return null;

    return SectionAliases.TryGetValue(normalized, out var canonical) ? canonical : null;
}
/// <summary>
/// Local, best-effort text extraction for an uploaded file, without calling the AI service.
/// <list type="bullet">
/// <item><description>.txt / .md: the file is read as text (UTF-8 unless a byte-order mark says otherwise).</description></item>
/// <item><description>.pdf: string operands of <c>Tj</c>/<c>TJ</c> operators are pulled from the raw bytes; when none are found the raw decoded bytes are used.</description></item>
/// <item><description>.docx: <c>word/document.xml</c> is read, tags are stripped, and entities are decoded.</description></item>
/// </list>
/// Any other extension yields an empty string.
/// </summary>
/// <param name="file">Uploaded file.</param>
/// <param name="extension">File extension including the leading dot; compared case-insensitively.</param>
/// <returns>Extracted text with whitespace collapsed (PDF/DOCX) or trimmed (text files); empty when nothing could be read.</returns>
private static async Task<string> ExtractTextAsync(IFormFile file, string extension)
{
    bool HasExtension(string candidate) => string.Equals(extension, candidate, StringComparison.OrdinalIgnoreCase);

    // PDF literals use their own escape rules; Regex.Unescape handles the common ones but throws
    // on sequences it does not recognize. Keep the literal as-is rather than failing the upload.
    static string UnescapePdfLiteral(string value)
    {
        try
        {
            return Regex.Unescape(value);
        }
        catch (ArgumentException)
        {
            return value;
        }
    }

    if (HasExtension(".txt") || HasExtension(".md"))
    {
        using var stream = file.OpenReadStream();
        using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true);
        return (await reader.ReadToEndAsync()).Trim();
    }

    await using var memory = new MemoryStream();
    await file.CopyToAsync(memory);
    var bytes = memory.ToArray();

    if (HasExtension(".pdf"))
    {
        var raw = Encoding.UTF8.GetString(bytes);
        var textMatches = Regex.Matches(raw, @"\((.*?)\)Tj", RegexOptions.Singleline)
            .Select(match => match.Groups[1].Value)
            .Concat(Regex.Matches(raw, @"\[(.*?)\]TJ", RegexOptions.Singleline)
                .SelectMany(match => Regex.Matches(match.Groups[1].Value, @"\((.*?)\)", RegexOptions.Singleline).Select(x => x.Groups[1].Value)))
            .Where(value => !string.IsNullOrWhiteSpace(value))
            .Select(UnescapePdfLiteral)
            .ToList();

        var joined = textMatches.Count > 0 ? string.Join(" ", textMatches) : raw;

        // Drop control characters (other than tab/newline/carriage return), then collapse whitespace.
        var scrubbed = Regex.Replace(joined, @"[\x00-\x08\x0B\x0C\x0E-\x1F]", " ");
        return Regex.Replace(scrubbed, @"\s+", " ").Trim();
    }

    if (HasExtension(".docx"))
    {
        try
        {
            using var archive = new System.IO.Compression.ZipArchive(new MemoryStream(bytes), System.IO.Compression.ZipArchiveMode.Read, leaveOpen: false);
            var entry = archive.GetEntry("word/document.xml");
            if (entry is null) return string.Empty;

            using var entryStream = entry.Open();
            using var reader = new StreamReader(entryStream, Encoding.UTF8);
            var xml = await reader.ReadToEndAsync();

            var withoutTags = Regex.Replace(xml, "<[^>]+>", " ");
            var decoded = System.Net.WebUtility.HtmlDecode(withoutTags) ?? string.Empty;
            return Regex.Replace(decoded, @"\s+", " ").Trim();
        }
        catch (System.IO.InvalidDataException)
        {
            // Not a valid ZIP container (corrupt or mislabeled upload): report "nothing readable"
            // so the caller can answer with its normal 400 instead of an unhandled error.
            return string.Empty;
        }
    }

    return string.Empty;
}
}