Files
jobtrackingapp/JobTrackerApi/Controllers/ProfileCvController.cs
T

1052 lines
48 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using JobTrackerApi.Data;
using JobTrackerApi.Services;
using JobTrackerApi.Models;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Identity;
using Microsoft.AspNetCore.Mvc;
using Microsoft.EntityFrameworkCore;
namespace JobTrackerApi.Controllers;
[ApiController]
[Route("api/profile-cv")]
[Authorize(AuthenticationSchemes = "local")]
public sealed class ProfileCvController : ControllerBase
{
/// <summary>
/// File extensions the upload endpoint accepts. The set uses an
/// ordinal, case-insensitive comparer, so ".PDF" and ".pdf" are equivalent.
/// </summary>
private static readonly HashSet<string> AllowedExtensions = new(StringComparer.OrdinalIgnoreCase)
{
".txt",
".md",
".pdf",
".docx",
".png",
".jpg",
".jpeg",
".webp",
};
/// <summary>
/// Maps heading spellings found in CVs (keys, matched case-insensitively) to the
/// canonical section name used by the parser (values). Several aliases may share
/// one canonical name, e.g. "summary" and "profile" both map to "Professional Summary".
/// </summary>
private static readonly Dictionary<string, string> SectionAliases = new(StringComparer.OrdinalIgnoreCase)
{
["professional summary"] = "Professional Summary",
["summary"] = "Professional Summary",
["profile"] = "Professional Summary",
["about me"] = "Professional Summary",
["contact"] = "Contact",
["contact details"] = "Contact",
["core skills"] = "Skills",
["skills"] = "Skills",
["technical skills"] = "Skills",
["experience"] = "Work Experience",
["experience highlights"] = "Work Experience",
["work experience"] = "Work Experience",
["employment history"] = "Work Experience",
["selected achievements"] = "Selected Achievements",
["achievements"] = "Selected Achievements",
["projects"] = "Projects",
["education"] = "Education",
["certifications"] = "Certifications",
["certificates"] = "Certifications",
["languages"] = "Languages",
["interests"] = "Interests",
};
// Upper bound for an uploaded CV file: 5 * 1024 * 1024 bytes. Used both as the
// request size limit on the upload action and for the explicit length check inside it.
private const long MaxFileSizeBytes = 5 * 1024 * 1024;
// Version tags copied onto every CvExtractionRun this controller creates, so a stored
// run records which parser / normalizer / prompt revision produced it.
private const string ParserVersion = "m005-s01";
private const string NormalizerVersion = "m005-s01";
private const string LlmPromptVersion = "m005-s01";
// Collaborators supplied through the constructor.
private readonly UserManager<ApplicationUser> _users;
private readonly ISummarizerService _aiService;
private readonly ICvAiClassifier _cvAiClassifier;
private readonly JobTrackerContext _db;
private readonly AppPaths _paths;
/// <summary>
/// Creates the controller from its injected collaborators.
/// </summary>
/// <param name="users">Identity user manager used to resolve and update the current user.</param>
/// <param name="aiService">AI service used for text extraction and rewriting.</param>
/// <param name="db">Database context holding upload artifacts and extraction runs.</param>
/// <param name="paths">Application path settings (supplies the CV artifact root).</param>
/// <param name="cvAiClassifier">Optional classifier; a no-op instance is used when omitted.</param>
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths, ICvAiClassifier? cvAiClassifier = null)
{
    (_users, _aiService, _db, _paths) = (users, aiService, db, paths);

    if (cvAiClassifier is null)
    {
        // No classifier registered: fall back to the shared no-op implementation.
        _cvAiClassifier = NoOpCvAiClassifier.Instance;
    }
    else
    {
        _cvAiClassifier = cvAiClassifier;
    }
}
/// <summary>Request body for the rewrite-section action: which section to rewrite, plus optional style and target role.</summary>
public sealed record RewriteSectionRequest(string SectionName, string? Style, string? TargetRole);
/// <summary>Request body for the parse action; when <c>Text</c> is blank the user's stored CV text is parsed instead.</summary>
public sealed record ParseCvRequest(string? Text);
/// <summary>Result of the file extraction pipeline: the raw extracted text, its normalized form, and the structured profile built from it.</summary>
private sealed record ExtractionPipelineResult(string RawText, string NormalizedText, StructuredCvProfile StructuredCv);
/// <summary>
/// Row returned by the runs listing: one extraction run projected to its status,
/// timestamps, version tags, error message and the original file name of its
/// artifact (null when the run has no artifact).
/// </summary>
public sealed record CvExtractionRunListItem(
int Id,
string Trigger,
string Status,
string? ArtifactFileName,
DateTimeOffset StartedAtUtc,
DateTimeOffset? CompletedAtUtc,
DateTimeOffset? AppliedAtUtc,
string ParserVersion,
string NormalizerVersion,
string LlmPromptVersion,
string? ErrorMessage);
/// <summary>
/// Imports an uploaded CV file for the current user: stores the file as an artifact,
/// records an extraction run, extracts and structures the text, and applies the
/// result to the user's profile.
/// </summary>
/// <param name="file">The uploaded CV file (multipart form field).</param>
/// <returns>
/// 200 with the structured CV and run identifiers on success; 401 when no user is
/// resolved; 400 for a missing, oversized or unsupported file, or when the user
/// update is rejected. Any other failure is recorded on the run and rethrown.
/// </returns>
[HttpPost("upload")]
[RequestSizeLimit(MaxFileSizeBytes)]
public async Task<IActionResult> Upload([FromForm] IFormFile file)
{
    var user = await _users.GetUserAsync(User);
    if (user is null) return Unauthorized();
    if (file is null || file.Length == 0) return BadRequest("Select a CV file to upload.");
    if (file.Length > MaxFileSizeBytes) return BadRequest("CV import file is too large. Keep it under 5 MB.");
    var extension = Path.GetExtension(file.FileName ?? string.Empty);
    if (!AllowedExtensions.Contains(extension))
    {
        return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
    }

    // Persist the artifact first so the run row can reference its generated id.
    var artifact = await SaveUploadArtifactAsync(user, file, HttpContext.RequestAborted);
    _db.CvUploadArtifacts.Add(artifact);
    await _db.SaveChangesAsync(HttpContext.RequestAborted);

    var run = new CvExtractionRun
    {
        OwnerUserId = user.Id,
        ArtifactId = artifact.Id,
        Trigger = "upload",
        ParserVersion = ParserVersion,
        NormalizerVersion = NormalizerVersion,
        LlmPromptVersion = LlmPromptVersion,
        Status = "running",
        StartedAtUtc = DateTimeOffset.UtcNow,
    };
    _db.CvExtractionRuns.Add(run);
    await _db.SaveChangesAsync(HttpContext.RequestAborted);

    try
    {
        var result = await ExtractStructuredCvFromFileAsync(file, extension, HttpContext.RequestAborted);
        result.StructuredCv.Metadata.ProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
        result.StructuredCv.Metadata.AppliedExtractionRunId = run.Id;
        result.StructuredCv.Metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;
        var structuredJson = StructuredCvProfileJson.Serialize(result.StructuredCv);

        run.RawExtractedText = result.RawText;
        run.NormalizedText = result.NormalizedText;
        run.StructuredProfileJson = structuredJson;
        run.Status = "applied";
        run.CompletedAtUtc = DateTimeOffset.UtcNow;
        run.AppliedAtUtc = run.CompletedAtUtc;

        user.ProfileCvText = result.NormalizedText;
        user.ProfileCvStructureJson = structuredJson;
        user.CurrentCvUploadArtifactId = artifact.Id;
        user.CurrentCvExtractionRunId = run.Id;
        user.CurrentCvProfileVersion = result.StructuredCv.Metadata.ProfileVersion;

        var update = await _users.UpdateAsync(user);
        if (!update.Succeeded)
        {
            run.Status = "failed";
            // The result was never applied to the user, so the run must not carry an applied timestamp.
            run.AppliedAtUtc = null;
            run.ErrorMessage = string.Join("; ", update.Errors.Select(e => e.Description));
            // Failure bookkeeping must not be skipped because the client went away.
            await _db.SaveChangesAsync(CancellationToken.None);
            return BadRequest(run.ErrorMessage);
        }

        await _db.SaveChangesAsync(HttpContext.RequestAborted);
        return Ok(new
        {
            imported = true,
            characters = result.NormalizedText.Length,
            structuredCv = result.StructuredCv,
            sections = result.StructuredCv.Sections,
            artifactId = artifact.Id,
            extractionRunId = run.Id,
            profileVersion = result.StructuredCv.Metadata.ProfileVersion,
        });
    }
    catch (Exception ex)
    {
        run.Status = "failed";
        run.ErrorMessage = ex.Message;
        run.CompletedAtUtc = DateTimeOffset.UtcNow;
        run.AppliedAtUtc = null;
        // Use a non-cancellable save: if the failure *is* the request being aborted,
        // saving with the aborted token would throw again and leave the run "running".
        await _db.SaveChangesAsync(CancellationToken.None);
        throw;
    }
}
/// <summary>
/// Lists the current user's ten most recently started extraction runs, newest first.
/// </summary>
/// <returns>200 with the run list, or 401 when no user is resolved.</returns>
[HttpGet("runs")]
public async Task<ActionResult<IEnumerable<CvExtractionRunListItem>>> GetRuns()
{
    var currentUser = await _users.GetUserAsync(User);
    if (currentUser is null)
    {
        return Unauthorized();
    }

    // Read-only query: no change tracking needed.
    var ownedRunsNewestFirst =
        from run in _db.CvExtractionRuns.AsNoTracking()
        where run.OwnerUserId == currentUser.Id
        orderby run.StartedAtUtc descending
        select run;

    var items = await ownedRunsNewestFirst
        .Take(10)
        .Select(run => new CvExtractionRunListItem(
            run.Id,
            run.Trigger,
            run.Status,
            run.Artifact != null ? run.Artifact.OriginalFileName : null,
            run.StartedAtUtc,
            run.CompletedAtUtc,
            run.AppliedAtUtc,
            run.ParserVersion,
            run.NormalizerVersion,
            run.LlmPromptVersion,
            run.ErrorMessage))
        .ToListAsync(HttpContext.RequestAborted);

    return Ok(items);
}
/// <summary>
/// Re-runs extraction on the current user's most recently uploaded artifact and
/// applies the result as a new "reprocess" run.
/// </summary>
/// <returns>
/// 200 with the new structured CV; 401 when no user is resolved; 400 when there is
/// no artifact or its stored file is missing.
/// </returns>
[HttpPost("reprocess")]
public async Task<IActionResult> Reprocess()
{
    var requestAborted = HttpContext.RequestAborted;

    var user = await _users.GetUserAsync(User);
    if (user is null)
    {
        return Unauthorized();
    }

    var latestArtifact = await _db.CvUploadArtifacts
        .Where(candidate => candidate.OwnerUserId == user.Id)
        .OrderByDescending(candidate => candidate.UploadedAtUtc)
        .FirstOrDefaultAsync(requestAborted);
    if (latestArtifact is null)
    {
        return BadRequest("Upload a CV before reprocessing it.");
    }

    if (string.IsNullOrWhiteSpace(latestArtifact.StoragePath) || !System.IO.File.Exists(latestArtifact.StoragePath))
    {
        return BadRequest("The stored CV artifact could not be found for reprocessing.");
    }

    // Wrap the stored file in a FormFile so it can go through the same pipeline as a fresh upload.
    await using var storedStream = System.IO.File.OpenRead(latestArtifact.StoragePath);
    var storedFile = new FormFile(storedStream, 0, storedStream.Length, "file", latestArtifact.OriginalFileName)
    {
        Headers = new HeaderDictionary(),
        ContentType = latestArtifact.MimeType
    };

    var storedExtension = Path.GetExtension(latestArtifact.OriginalFileName ?? string.Empty);
    var pipelineResult = await ExtractStructuredCvFromFileAsync(storedFile, storedExtension, requestAborted);
    await ApplyTextExtractionRunAsync(
        user,
        "reprocess",
        pipelineResult.RawText,
        pipelineResult.NormalizedText,
        pipelineResult.StructuredCv,
        latestArtifact.Id,
        requestAborted);

    return Ok(new
    {
        reprocessed = true,
        artifactId = latestArtifact.Id,
        extractionRunId = user.CurrentCvExtractionRunId,
        profileVersion = user.CurrentCvProfileVersion,
        structuredCv = pipelineResult.StructuredCv,
        sections = pipelineResult.StructuredCv.Sections,
    });
}
/// <summary>
/// Asks the AI service to rebuild the user's stored CV text into a sectioned master
/// CV, then re-parses it and applies it as a new "rebuild" run.
/// </summary>
/// <returns>
/// 200 with the rebuilt text and structured CV; 401 when no user is resolved; 400
/// when there is no stored CV text or the AI service returns nothing.
/// </returns>
[HttpPost("rebuild")]
public async Task<IActionResult> Rebuild()
{
    var user = await _users.GetUserAsync(User);
    if (user is null)
    {
        return Unauthorized();
    }

    if (string.IsNullOrWhiteSpace(user.ProfileCvText))
    {
        return BadRequest("Add or import CV text before rebuilding it.");
    }

    const string instruction = "Rewrite this CV into a stronger master CV with clear sections such as Professional Summary, Core Skills, Experience Highlights, and Selected Achievements. Preserve only factual claims, avoid inventing employers or metrics, and make the output clean and ready for tailoring to job applications. Return only the rebuilt CV text.";
    var aiOutput = await _aiService.SummarizeSectionAsync(instruction, user.ProfileCvText, 2200, 700);
    if (string.IsNullOrWhiteSpace(aiOutput))
    {
        return BadRequest("The AI service could not rebuild your CV text right now.");
    }

    var rebuiltText = aiOutput.Trim();
    user.ProfileCvText = rebuiltText;

    var structuredCv = await BuildStructuredCvAsync(rebuiltText, HttpContext.RequestAborted);
    // The rebuilt text serves as both the raw and the normalized text of the run.
    await ApplyTextExtractionRunAsync(user, "rebuild", rebuiltText, rebuiltText, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);

    return Ok(new
    {
        rebuilt = true,
        characters = rebuiltText.Length,
        text = rebuiltText,
        structuredCv,
        sections = structuredCv.Sections,
        extractionRunId = user.CurrentCvExtractionRunId,
        profileVersion = user.CurrentCvProfileVersion,
    });
}
/// <summary>
/// Asks the AI service to rewrite a single section of the user's stored CV text.
/// The stored CV is not modified; the rewritten text is only returned.
/// </summary>
/// <param name="request">Section name (defaults to "Professional Summary"), style (defaults to "balanced") and optional target role.</param>
/// <returns>
/// 200 with the rewritten section; 401 when no user is resolved; 400 when there is
/// no stored CV text or the AI service returns nothing.
/// </returns>
[HttpPost("rewrite-section")]
public async Task<IActionResult> RewriteSection([FromBody] RewriteSectionRequest request)
{
    var user = await _users.GetUserAsync(User);
    if (user is null)
    {
        return Unauthorized();
    }

    if (string.IsNullOrWhiteSpace(user.ProfileCvText))
    {
        return BadRequest("Add or import CV text before rewriting a section.");
    }

    // Blank inputs fall back to defaults; everything else is trimmed.
    var sectionName = "Professional Summary";
    if (!string.IsNullOrWhiteSpace(request.SectionName))
    {
        sectionName = request.SectionName.Trim();
    }

    var style = "balanced";
    if (!string.IsNullOrWhiteSpace(request.Style))
    {
        style = request.Style.Trim();
    }

    string? targetRole = null;
    if (!string.IsNullOrWhiteSpace(request.TargetRole))
    {
        targetRole = request.TargetRole.Trim();
    }

    var roleClause = targetRole is not null
        ? $"Target role: {targetRole}."
        : "Make it broadly reusable for future tailoring.";
    var instruction = $"Rewrite only the '{sectionName}' section of this CV. Preserve facts, avoid inventing employers or metrics, and output only the rewritten section text. Style: {style}. {roleClause}";

    var aiOutput = await _aiService.SummarizeSectionAsync(instruction, user.ProfileCvText, 900, 180);
    if (string.IsNullOrWhiteSpace(aiOutput))
    {
        return BadRequest("The AI service could not rewrite that CV section right now.");
    }

    return Ok(new { sectionName, style, targetRole, text = aiOutput.Trim() });
}
/// <summary>
/// Parses CV text into a structured profile and applies it as a new "parse" run.
/// Uses the text from the request when supplied, otherwise the user's stored CV text.
/// </summary>
/// <param name="request">Optional body carrying the text to parse.</param>
/// <returns>
/// 200 with the structured CV and word count; 401 when no user is resolved; 400
/// when neither the request nor the profile provides any text.
/// </returns>
[HttpPost("parse")]
public async Task<ActionResult<object>> Parse([FromBody] ParseCvRequest? request)
{
    var user = await _users.GetUserAsync(User);
    if (user is null)
    {
        return Unauthorized();
    }

    var suppliedText = request?.Text;
    var useStoredText = string.IsNullOrWhiteSpace(suppliedText);
    var source = useStoredText ? user.ProfileCvText : suppliedText;
    if (string.IsNullOrWhiteSpace(source))
    {
        return BadRequest("Add or import CV text before parsing sections.");
    }

    var structuredCv = await BuildStructuredCvAsync(source, HttpContext.RequestAborted);

    if (useStoredText)
    {
        // In this branch source was read from user.ProfileCvText, so this writes the same value back.
        user.ProfileCvText = source;
    }

    await ApplyTextExtractionRunAsync(user, "parse", source, source, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);

    return Ok(new
    {
        structuredCv,
        sections = structuredCv.Sections,
        totalWords = CountWords(source),
        extractionRunId = user.CurrentCvExtractionRunId,
        profileVersion = user.CurrentCvProfileVersion,
    });
}
/// <summary>
/// Asks the AI service to tidy up the user's stored CV text, then re-parses it and
/// applies it as a new "improve" run.
/// </summary>
/// <returns>
/// 200 with the improved text and structured CV; 401 when no user is resolved; 400
/// when there is no stored CV text or the AI service returns nothing.
/// </returns>
[HttpPost("improve")]
public async Task<IActionResult> Improve()
{
    var user = await _users.GetUserAsync(User);
    if (user is null)
    {
        return Unauthorized();
    }

    if (string.IsNullOrWhiteSpace(user.ProfileCvText))
    {
        return BadRequest("Add or import CV text before improving it.");
    }

    const string instruction = "Rewrite this CV into a cleaner, better-structured master CV profile. Preserve factual claims, employers, skills, and measurable results. Improve clarity, tighten wording, use strong bullet-style phrasing, and keep it ready for further tailoring to specific roles. Return only the improved CV text.";
    var aiOutput = await _aiService.SummarizeSectionAsync(instruction, user.ProfileCvText, 1800, 500);
    if (string.IsNullOrWhiteSpace(aiOutput))
    {
        return BadRequest("The AI service could not improve your CV text right now.");
    }

    var improvedText = aiOutput.Trim();
    user.ProfileCvText = improvedText;

    var structuredCv = await BuildStructuredCvAsync(improvedText, HttpContext.RequestAborted);
    // The improved text serves as both the raw and the normalized text of the run.
    await ApplyTextExtractionRunAsync(user, "improve", improvedText, improvedText, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);

    return Ok(new
    {
        improved = true,
        characters = improvedText.Length,
        text = improvedText,
        structuredCv,
        sections = structuredCv.Sections,
        extractionRunId = user.CurrentCvExtractionRunId,
        profileVersion = user.CurrentCvProfileVersion,
    });
}
/// <summary>
/// Builds a structured CV from text by layering three sources: an AI extraction
/// (preferred), a regex-based heuristic profile, and a section-based fallback.
/// Each layer is annotated with its method and a fixed confidence before merging.
/// </summary>
/// <param name="text">CV text to structure.</param>
/// <param name="cancellationToken">Token forwarded to the asynchronous steps.</param>
/// <returns>The merged, normalized structured profile.</returns>
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    var parseSource = NormalizeTextForStructuredParsing(text);

    // Layer 1: sections only.
    var sections = await BuildFallbackSectionsAsync(parseSource, cancellationToken);
    var fromSections = StructuredCvProfileJson.FromSections(sections);
    AnnotateStructuredCv(fromSections, "repair", 0.56);

    // Layer 2: regex heuristics. Its section list is cleared before merging.
    var fromHeuristics = BuildHeuristicStructuredCv(parseSource, text);
    AnnotateStructuredCv(fromHeuristics, "deterministic", 0.68);
    fromHeuristics.Sections = new List<StructuredCvSection>();

    var fallback = StructuredCvProfileJson.Merge(fromHeuristics, fromSections);
    if (fallback.Contact.FullName is null)
    {
        fallback.Contact.FullName = GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email);
    }

    // Layer 3: AI extraction, merged on top of the fallback.
    var fromAi = await TryExtractStructuredCvAsync(parseSource, cancellationToken);
    var combined = StructuredCvProfileJson.Merge(fromAi, fallback);
    if (combined.Contact.FullName is null)
    {
        combined.Contact.FullName = GuessFullName(text) ?? GuessFullNameFromEmail(combined.Contact.Email);
    }

    return StructuredCvProfileJson.Normalize(combined);
}
/// <summary>
/// Copies the uploaded file into the user's artifact directory under a generated
/// name, hashes the stored copy with SHA-256, and returns an unsaved artifact entity
/// describing it.
/// </summary>
/// <param name="user">Owner of the artifact; the user id names the sub-directory.</param>
/// <param name="file">The uploaded file.</param>
/// <param name="cancellationToken">Token for the copy and hash operations.</param>
/// <returns>A new <c>CvUploadArtifact</c>; the caller is responsible for adding it to the database.</returns>
private async Task<CvUploadArtifact> SaveUploadArtifactAsync(ApplicationUser user, IFormFile file, CancellationToken cancellationToken)
{
    var originalName = file.FileName;
    var extension = Path.GetExtension(originalName ?? string.Empty);

    var userDirectory = Path.Combine(_paths.CvArtifactsRoot, user.Id);
    Directory.CreateDirectory(userDirectory);

    // Timestamp plus GUID keeps stored names unique; only the original extension is kept.
    var storedFileName = $"{DateTimeOffset.UtcNow:yyyyMMddHHmmss}-{Guid.NewGuid():N}{extension}";
    var storagePath = Path.Combine(userDirectory, storedFileName);

    await using (var destination = System.IO.File.Create(storagePath))
    {
        await using var upload = file.OpenReadStream();
        await upload.CopyToAsync(destination, cancellationToken);
    }

    // Hash the bytes that were actually written to disk.
    string sha256Hex;
    await using (var persisted = System.IO.File.OpenRead(storagePath))
    {
        var digest = await SHA256.HashDataAsync(persisted, cancellationToken);
        sha256Hex = Convert.ToHexString(digest);
    }

    return new CvUploadArtifact
    {
        OwnerUserId = user.Id,
        OriginalFileName = originalName ?? storedFileName,
        StoredFileName = storedFileName,
        MimeType = file.ContentType ?? "application/octet-stream",
        ByteSize = file.Length,
        Sha256 = sha256Hex,
        StoragePath = storagePath,
        UploadedAtUtc = DateTimeOffset.UtcNow,
    };
}
/// <summary>
/// Runs the file extraction pipeline: AI text extraction for the known extensions,
/// then the local extractor if that produced nothing, followed by reconstruction
/// and structuring of the text.
/// </summary>
/// <param name="file">File to read.</param>
/// <param name="extension">File extension including the leading dot.</param>
/// <param name="cancellationToken">Token forwarded to the asynchronous steps.</param>
/// <returns>Raw text, normalized text and the structured profile.</returns>
/// <exception cref="InvalidOperationException">No text could be extracted from the file.</exception>
private async Task<ExtractionPipelineResult> ExtractStructuredCvFromFileAsync(IFormFile file, string extension, CancellationToken cancellationToken)
{
    string[] aiReadableExtensions = { ".pdf", ".docx", ".txt", ".md", ".png", ".jpg", ".jpeg", ".webp" };

    var rawText = string.Empty;
    if (aiReadableExtensions.Contains(extension, StringComparer.OrdinalIgnoreCase))
    {
        await using var uploadStream = file.OpenReadStream();
        var aiResult = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, cancellationToken);
        rawText = aiResult?.Text?.Trim() ?? string.Empty;
    }

    if (string.IsNullOrWhiteSpace(rawText))
    {
        // AI extraction was skipped or came back empty: try the local extractor.
        var locallyExtracted = await ExtractTextAsync(file, extension);
        rawText = locallyExtracted.Trim();
    }

    if (string.IsNullOrWhiteSpace(rawText))
    {
        throw new InvalidOperationException("The uploaded CV file could not be read or was empty.");
    }

    var reconstructed = await MaybeReconstructStructuredCvAsync(rawText, cancellationToken);
    var normalizedText = reconstructed.Trim();
    var structuredCv = await BuildStructuredCvAsync(normalizedText, cancellationToken);
    return new ExtractionPipelineResult(rawText, normalizedText, structuredCv);
}
/// <summary>
/// Records a completed extraction run and applies its result to the user: bumps the
/// profile version, stores the normalized text and structured JSON on the user, and
/// points the user at the new run (and artifact, when one is given).
/// </summary>
/// <param name="user">User whose profile is updated.</param>
/// <param name="trigger">Label stored on the run describing what initiated it.</param>
/// <param name="rawText">Raw extracted text stored on the run.</param>
/// <param name="normalizedText">Normalized text stored on the run and on the user.</param>
/// <param name="structuredCv">Structured profile; its metadata is updated in place.</param>
/// <param name="artifactId">Artifact the run belongs to, if any.</param>
/// <param name="cancellationToken">Token for the database saves.</param>
/// <exception cref="InvalidOperationException">The user update was rejected; the run is marked "failed" first.</exception>
private async Task ApplyTextExtractionRunAsync(ApplicationUser user, string trigger, string rawText, string normalizedText, StructuredCvProfile structuredCv, int? artifactId, CancellationToken cancellationToken)
{
    // One timestamp for all three columns so a run never shows "applied" before "started".
    var now = DateTimeOffset.UtcNow;
    var run = new CvExtractionRun
    {
        OwnerUserId = user.Id,
        ArtifactId = artifactId,
        Trigger = trigger,
        ParserVersion = ParserVersion,
        NormalizerVersion = NormalizerVersion,
        LlmPromptVersion = LlmPromptVersion,
        Status = "applied",
        RawExtractedText = rawText,
        NormalizedText = normalizedText,
        StartedAtUtc = now,
        CompletedAtUtc = now,
        AppliedAtUtc = now,
    };
    _db.CvExtractionRuns.Add(run);
    // Save first so run.Id is available for the metadata below.
    await _db.SaveChangesAsync(cancellationToken);

    structuredCv.Metadata.ProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
    structuredCv.Metadata.AppliedExtractionRunId = run.Id;
    structuredCv.Metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;
    var structuredJson = StructuredCvProfileJson.Serialize(structuredCv);
    run.StructuredProfileJson = structuredJson;

    user.ProfileCvText = normalizedText;
    user.ProfileCvStructureJson = structuredJson;
    user.CurrentCvExtractionRunId = run.Id;
    user.CurrentCvProfileVersion = structuredCv.Metadata.ProfileVersion;
    if (artifactId.HasValue)
    {
        user.CurrentCvUploadArtifactId = artifactId.Value;
    }

    var update = await _users.UpdateAsync(user);
    if (!update.Succeeded)
    {
        run.Status = "failed";
        // The result never reached the user, so the run must not look applied.
        run.AppliedAtUtc = null;
        run.ErrorMessage = string.Join("; ", update.Errors.Select(e => e.Description));
        // Record the failure even if the caller's token has been cancelled meanwhile.
        await _db.SaveChangesAsync(CancellationToken.None);
        throw new InvalidOperationException(run.ErrorMessage);
    }

    await _db.SaveChangesAsync(cancellationToken);
}
/// <summary>
/// Attaches provenance metadata to the populated fields of a profile. For each
/// non-blank field (or the first item of a list field) an entry is written with the
/// given method and confidence, a source snippet of at most 180 characters, and the
/// review state "suggested".
/// </summary>
/// <param name="profile">Profile to annotate; its metadata containers are created when missing.</param>
/// <param name="method">Label for how the values were obtained.</param>
/// <param name="confidence">Confidence score stored on every entry.</param>
private static void AnnotateStructuredCv(StructuredCvProfile profile, string method, double confidence)
{
    const int snippetLimit = 180;
    var stamp = DateTimeOffset.UtcNow;

    profile.Metadata ??= new StructuredCvMetadata();
    profile.Metadata.Fields ??= new Dictionary<string, StructuredCvFieldMetadata>();

    void Record(string fieldKey, string? fieldValue)
    {
        if (!string.IsNullOrWhiteSpace(fieldValue))
        {
            var snippet = fieldValue.Length > snippetLimit ? fieldValue.Substring(0, snippetLimit) : fieldValue;
            profile.Metadata.Fields[fieldKey] = new StructuredCvFieldMetadata
            {
                Confidence = confidence,
                Method = method,
                SourceSnippet = snippet,
                ReviewState = "suggested",
                LastUpdatedAtUtc = stamp,
            };
        }
    }

    // Contact details.
    var contact = profile.Contact;
    Record("contact.fullName", contact.FullName);
    Record("contact.headline", contact.Headline);
    Record("contact.email", contact.Email);
    Record("contact.phone", contact.Phone);
    Record("contact.location", contact.Location);
    Record("contact.website", contact.Website);
    Record("contact.linkedIn", contact.LinkedIn);

    // List-valued fields are represented by their first element.
    Record("summary", profile.Summary.FirstOrDefault());
    Record("skills", profile.Skills.FirstOrDefault());
    Record("languages", profile.Languages.FirstOrDefault()?.Name);
    Record("interests", profile.Interests.FirstOrDefault());
    Record("jobs", profile.Jobs.FirstOrDefault()?.Title ?? profile.Jobs.FirstOrDefault()?.Company);
    Record("education", profile.Education.FirstOrDefault()?.Qualification ?? profile.Education.FirstOrDefault()?.Institution);
}
/// <summary>
/// Asks the AI service to turn CV text into structured JSON and parses the reply.
/// Returns null whenever the reply is empty, contains no JSON object, cannot be
/// parsed, or yields a profile with no meaningful content, so that callers can fall
/// back to their deterministic results.
/// </summary>
/// <param name="text">CV text to extract from.</param>
/// <param name="cancellationToken">Currently unused; the AI call below takes no token.</param>
/// <returns>The parsed profile annotated as "llm" with confidence 0.82, or null.</returns>
private async Task<StructuredCvProfile?> TryExtractStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    var structuredJson = await _aiService.SummarizeSectionAsync(
        "Extract this CV into structured JSON. Return only valid JSON with this exact top-level shape: { \"version\": \"1\", \"contact\": { \"fullName\": string|null, \"headline\": string|null, \"email\": string|null, \"phone\": string|null, \"location\": string|null, \"website\": string|null, \"linkedin\": string|null }, \"summary\": string[], \"jobs\": [{ \"title\": string|null, \"company\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"isCurrent\": boolean, \"bullets\": string[], \"skills\": string[] }], \"education\": [{ \"qualification\": string|null, \"institution\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"details\": string[] }], \"skills\": string[], \"languages\": [{ \"name\": string|null, \"level\": string|null, \"notes\": string|null }], \"interests\": string[], \"otherSections\": [{ \"title\": string|null, \"items\": string[] }] }. Preserve facts only. Do not invent anything. If a field is unknown, use null or an empty array. Keep wording close to the source. Put unmatched content in otherSections.",
        text,
        3200,
        900);
    if (string.IsNullOrWhiteSpace(structuredJson)) return null;

    var extracted = ExtractJsonObject(structuredJson);
    if (string.IsNullOrWhiteSpace(extracted)) return null;

    StructuredCvProfile? parsed;
    try
    {
        parsed = StructuredCvProfileJson.Deserialize(extracted);
    }
    catch (JsonException)
    {
        // Model output is untrusted and may be truncated or malformed JSON. This is a
        // "Try" method, so a parse failure means "no AI result" rather than an error.
        // NOTE(review): Deserialize's own failure behaviour is not visible here; if it
        // already swallows parse errors this catch is simply never hit.
        return null;
    }

    if (parsed is null || !IsMeaningfullyStructured(parsed)) return null;
    AnnotateStructuredCv(parsed, "llm", 0.82);
    return parsed;
}
/// <summary>
/// Reports whether a profile carries any content at all: a non-blank full name or
/// at least one entry in any of its list fields.
/// </summary>
/// <param name="profile">Profile to inspect.</param>
/// <returns>True when at least one of the checked fields is populated.</returns>
private static bool IsMeaningfullyStructured(StructuredCvProfile profile)
{
    if (!string.IsNullOrWhiteSpace(profile.Contact.FullName)) return true;
    if (profile.Summary.Count > 0) return true;
    if (profile.Jobs.Count > 0) return true;
    if (profile.Education.Count > 0) return true;
    if (profile.Skills.Count > 0) return true;
    if (profile.Languages.Count > 0) return true;
    if (profile.Interests.Count > 0) return true;
    return profile.OtherSections.Count > 0;
}
/// <summary>
/// Pulls the outermost JSON object out of a model reply: strips a surrounding
/// markdown code fence when the reply starts with one, then returns the text from
/// the first '{' through the last '}'.
/// </summary>
/// <param name="raw">Reply text that may wrap JSON in prose or a code fence.</param>
/// <returns>The candidate JSON object text, or null when no such brace pair exists.</returns>
private static string? ExtractJsonObject(string raw)
{
    var candidate = raw.Trim();

    var isFenced = candidate.StartsWith("```", StringComparison.Ordinal);
    if (isFenced)
    {
        candidate = Regex.Replace(candidate, "^```(?:json)?\\s*|\\s*```$", string.Empty, RegexOptions.IgnoreCase);
    }

    var open = candidate.IndexOf('{');
    var close = candidate.LastIndexOf('}');
    var hasObject = open >= 0 && close > open;

    return hasObject ? candidate.Substring(open, close - open + 1) : null;
}
/// <summary>
/// Guesses the candidate's full name from the first six non-empty lines of the CV:
/// the first line of 4–80 characters, without '@' or digits, that consists of two to
/// five capitalised words is returned (leading '#' markers are ignored).
/// </summary>
/// <remarks>
/// Words may contain any Unicode letter, so names such as "José García" or
/// "Élodie Dupont-Martin" are recognised, not only ASCII names.
/// NOTE(review): capitalised headings such as "Professional Summary" also satisfy
/// this shape and would be returned if they precede the real name.
/// </remarks>
/// <param name="source">CV text.</param>
/// <returns>The guessed name, or null when no line qualifies.</returns>
private static string? GuessFullName(string source)
{
    var normalized = source.Replace("\r\n", "\n");
    foreach (var line in normalized.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).Take(6))
    {
        var cleaned = line.Trim().TrimStart('#').Trim();
        if (cleaned.Length < 4 || cleaned.Length > 80) continue;
        if (cleaned.Contains('@') || Regex.IsMatch(cleaned, @"\d")) continue;
        // \p{Lu} = uppercase letter, \p{L} = any letter, \p{M} = combining mark (decomposed accents);
        // \u2019 is the typographic apostrophe (O’Connor).
        if (!Regex.IsMatch(cleaned, @"^\p{Lu}[\p{L}\p{M}'\u2019`.-]+(?:\s+\p{Lu}[\p{L}\p{M}'\u2019`.-]+){1,4}$")) continue;
        return cleaned;
    }
    return null;
}
/// <summary>
/// Derives a display name from the local part of an e-mail address by splitting it
/// on '.', '_' and '-' and capitalising each piece ("john.doe@x.com" becomes "John Doe").
/// </summary>
/// <param name="email">E-mail address, may be null.</param>
/// <returns>The derived name, or null when the address is blank, has no '@', or yields fewer than two pieces.</returns>
private static string? GuessFullNameFromEmail(string? email)
{
    if (string.IsNullOrWhiteSpace(email))
    {
        return null;
    }

    var atIndex = email.IndexOf('@');
    if (atIndex < 0)
    {
        return null;
    }

    var localPart = email.Substring(0, atIndex).Trim();
    if (localPart.Length == 0)
    {
        return null;
    }

    var pieces = localPart.Split(new[] { '.', '_', '-' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    var words = new List<string>(pieces.Length);
    foreach (var piece in pieces)
    {
        words.Add(char.ToUpperInvariant(piece[0]) + piece.Substring(1).ToLowerInvariant());
    }

    // A single token (e.g. "jsmith") is not convincing enough to present as a name.
    return words.Count >= 2 ? string.Join(" ", words) : null;
}
/// <summary>
/// Prepares CV text for section parsing. Text that does not look like a flattened
/// extraction is only newline-normalized and trimmed. Flattened text is re-broken
/// into lines: letter-spaced headings and known heading aliases become "## Heading"
/// lines, "+" and bullet glyphs start new lines, and year ranges get their own line.
/// </summary>
/// <remarks>
/// Aliases are replaced in a single pass (longest alias first) and matches on lines
/// that already start with "##" are left alone. Replacing alias by alias would
/// rescan headings inserted earlier — "experience" matches inside
/// "## Work Experience" — and leave fragments such as "## Work" behind.
/// NOTE(review): every whole-word occurrence of an alias is treated as a heading,
/// including ones inside ordinary sentences.
/// </remarks>
/// <param name="source">Raw CV text.</param>
/// <returns>Text ready for section parsing; empty when <paramref name="source"/> is blank.</returns>
private static string NormalizeTextForStructuredParsing(string source)
{
    if (string.IsNullOrWhiteSpace(source)) return string.Empty;
    var text = source.Replace("\r\n", "\n").Trim();
    if (!LooksLikeFlattenedCvExtraction(text)) return text;

    // Letter-spaced headings ("E X P E R I E N C E"): collapse and map through the alias table.
    text = Regex.Replace(text, @"\b([A-Z](?:\s+[A-Z]){2,})\b", match =>
    {
        var collapsed = Regex.Replace(match.Value, @"\s+", string.Empty);
        foreach (var alias in SectionAliases)
        {
            var aliasLettersOnly = Regex.Replace(alias.Key, @"[^A-Za-z]", string.Empty);
            if (collapsed.Equals(aliasLettersOnly, StringComparison.OrdinalIgnoreCase))
            {
                return $"\n\n## {alias.Value}\n";
            }
        }
        return match.Value;
    });

    // Inline aliases: one alternation, longest first, so "work experience" wins over "experience".
    var aliasAlternation = string.Join("|", SectionAliases.Keys.OrderByDescending(key => key.Length).Select(Regex.Escape));
    var beforeAliases = text;
    text = Regex.Replace(
        beforeAliases,
        $@"(?<!#)\b(?:{aliasAlternation})\b",
        match =>
        {
            // Skip words on a line that is already a "##" heading.
            var lineStart = match.Index == 0 ? 0 : beforeAliases.LastIndexOf('\n', match.Index - 1) + 1;
            if (string.CompareOrdinal(beforeAliases, lineStart, "##", 0, 2) == 0) return match.Value;
            // TryGetValue guards the rare case where regex case-folding matched text the dictionary comparer does not.
            return SectionAliases.TryGetValue(match.Value, out var canonical)
                ? $"\n\n## {canonical}\n"
                : match.Value;
        },
        RegexOptions.IgnoreCase);

    text = Regex.Replace(text, @"\s+\+\s+", "\n+ ");
    text = Regex.Replace(text, @"\s*([•●▪◦])\s*", "\n- ");
    text = Regex.Replace(text, @"\s+(\d{4}\s*[-]\s*(?:\d{4}|Present|Current))\b", "\n$1\n", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"\n{3,}", "\n\n");
    return text.Trim();
}
/// <summary>
/// Builds a structured profile without any AI call, using regular expressions on the
/// raw text for contact details, skills and letter-spaced sections, and the parsed
/// sections of the normalized text for summary, interests, languages, education and jobs.
/// </summary>
/// <remarks>
/// E-mail addresses are masked before website detection. Otherwise the website
/// pattern matches pieces of the address itself ("john.doe", "example.com"), which
/// normally appear before any real website in a CV.
/// NOTE(review): the website pattern still accepts any dotted token (for example
/// "ASP.NET"), so the value remains a best-effort guess.
/// </remarks>
/// <param name="parseSource">Text prepared for section parsing.</param>
/// <param name="rawSource">Original text, used for the regex-based lookups.</param>
/// <returns>The normalized heuristic profile.</returns>
private static StructuredCvProfile BuildHeuristicStructuredCv(string parseSource, string rawSource)
{
    const string emailPattern = @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}";

    var profile = new StructuredCvProfile();
    var normalized = parseSource.Replace("\r\n", "\n").Trim();

    // Contact details.
    profile.Contact.Email = NullIfWhitespace(Regex.Match(rawSource, emailPattern, RegexOptions.IgnoreCase).Value);
    profile.Contact.Phone = NullIfWhitespace(Regex.Match(rawSource, @"(?<!\w)(?:\+?\d[\d\s().-]{6,}\d)", RegexOptions.IgnoreCase).Value);
    var sourceWithoutEmails = Regex.Replace(rawSource, emailPattern, " ", RegexOptions.IgnoreCase);
    profile.Contact.Website = NullIfWhitespace(Regex.Match(sourceWithoutEmails, @"\b(?:https?://)?(?:www\.)?[A-Z0-9.-]+\.[A-Z]{2,}(?:/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]*)?", RegexOptions.IgnoreCase).Value);
    profile.Contact.LinkedIn = NullIfWhitespace(Regex.Match(rawSource, @"(?:linkedin(?:\.com)?/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]+)", RegexOptions.IgnoreCase).Value);
    profile.Contact.FullName = GuessFullName(rawSource) ?? GuessFullNameFromEmail(profile.Contact.Email);
    profile.Contact.Location = NullIfWhitespace(Regex.Match(rawSource, @"\b[A-Z][a-z]+(?:[\s-][A-Z][a-z]+)*,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b").Value);

    var sections = ParseSections(normalized);

    // Summary: a letter-spaced heading in the raw text wins over the parsed section.
    var summarySection = sections.FirstOrDefault(section => section.Name == "Professional Summary");
    var flattenedSummary = Regex.Match(
        rawSource,
        @"(?:A\s+B\s+O\s+U\s+T\s+M\s+E|P\s+R\s+O\s+F\s+I\s+L\s+E|S\s+U\s+M\s+M\s+A\s+R\s+Y)\s*(?<body>.*?)(?=(?:I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S|E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|C\s+O\s+N\s+T\s+A\s+C\s+T|$))",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    if (flattenedSummary.Success)
    {
        profile.Summary = SplitSentences(flattenedSummary.Groups["body"].Value, 5);
    }
    else if (!string.IsNullOrWhiteSpace(summarySection.Content))
    {
        profile.Summary = SplitSentences(summarySection.Content, 5);
    }

    // Interests: parsed section first, letter-spaced heading as fallback.
    var interestsSection = sections.FirstOrDefault(section => section.Name == "Interests");
    if (!string.IsNullOrWhiteSpace(interestsSection.Content))
    {
        profile.Interests = SplitListLike(interestsSection.Content);
    }
    else
    {
        var flattenedInterests = Regex.Match(
            rawSource,
            @"I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S\s*(?<body>.*?)(?=(?:E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|C\s+O\s+N\s+T\s+A\s+C\s+T|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|$))",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
        if (flattenedInterests.Success)
        {
            profile.Interests = SplitSentences(flattenedInterests.Groups["body"].Value, 4);
        }
    }

    // Languages: parsed section when present, otherwise scan the whole raw text.
    var languagesSection = sections.FirstOrDefault(section => section.Name == "Languages");
    if (!string.IsNullOrWhiteSpace(languagesSection.Content))
    {
        profile.Languages = ParseLanguagesHeuristically(languagesSection.Content);
    }
    else
    {
        profile.Languages = ParseLanguagesHeuristically(rawSource);
    }

    // Skills: a fixed list of technology names found anywhere in the raw text.
    var skills = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (Match match in Regex.Matches(rawSource, @"(?<![A-Za-z0-9])(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)(?![A-Za-z0-9])", RegexOptions.IgnoreCase))
    {
        skills.Add(match.Value.Trim());
    }
    profile.Skills = skills.ToList();

    var educationSection = sections.FirstOrDefault(section => section.Name == "Education");
    if (!string.IsNullOrWhiteSpace(educationSection.Content))
    {
        profile.Education = ParseEducationHeuristically(educationSection.Content);
    }

    var experienceSection = sections.FirstOrDefault(section => section.Name == "Work Experience");
    if (!string.IsNullOrWhiteSpace(experienceSection.Content))
    {
        profile.Jobs = ParseJobsHeuristically(experienceSection.Content);
    }

    // Last resort: un-headed leading text becomes the summary when none was found.
    if (profile.OtherSections.Count == 0 && sections.Any(section => section.Name == "General"))
    {
        var general = sections.First(section => section.Name == "General");
        if (!string.IsNullOrWhiteSpace(general.Content) && profile.Summary.Count == 0)
        {
            profile.Summary = SplitSentences(general.Content, 3);
        }
    }

    return StructuredCvProfileJson.Normalize(profile);
}
/// <summary>
/// Splits text into sentences at whitespace following '.', '!' or '?', keeps only
/// sentences longer than 20 characters after trimming, and returns at most
/// <paramref name="limit"/> of them in their original order.
/// </summary>
/// <param name="content">Text to split; CRLF pairs are treated as spaces.</param>
/// <param name="limit">Maximum number of sentences to return.</param>
/// <returns>The selected sentences.</returns>
private static List<string> SplitSentences(string content, int limit)
{
    var sentences = new List<string>();
    var fragments = Regex.Split(content.Replace("\r\n", " "), @"(?<=[.!?])\s+");

    foreach (var fragment in fragments)
    {
        if (sentences.Count >= limit)
        {
            break;
        }

        var sentence = fragment.Trim();
        if (sentence.Length > 20)
        {
            sentences.Add(sentence);
        }
    }

    return sentences;
}
/// <summary>
/// Splits list-style text on newlines, commas and semicolons, strips leading bullet
/// markers ('-', '•', '*') and spaces, drops items of one character or less, and
/// removes case-insensitive duplicates while keeping first-seen order.
/// </summary>
/// <param name="content">Text containing a delimited list.</param>
/// <returns>The distinct list items.</returns>
private static List<string> SplitListLike(string content)
{
    var delimiters = new[] { '\n', ',', ';' };
    var pieces = content
        .Replace("\r\n", "\n")
        .Split(delimiters, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    var alreadySeen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var items = new List<string>();
    foreach (var piece in pieces)
    {
        var item = piece.Trim().TrimStart('-', '•', '*', ' ');
        if (item.Length > 1 && alreadySeen.Add(item))
        {
            items.Add(item);
        }
    }

    return items;
}
/// <summary>
/// Finds spoken languages in text. The text is cut into fragments at newlines,
/// commas, semicolons and sentence ends; a fragment contributes only when a
/// proficiency level is detected in it, in which case every language name found in
/// the fragment is paired with that level. The first entry per language name wins.
/// </summary>
/// <param name="content">Text to scan.</param>
/// <returns>One entry per distinct language name (case-insensitive).</returns>
private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
{
    var found = new List<StructuredCvLanguage>();
    var seenNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    foreach (var rawFragment in Regex.Split(content.Replace("\r\n", "\n"), @"[\n,;]+|(?<=[.!?])\s+"))
    {
        var fragment = rawFragment.Trim();
        if (fragment.Length <= 1)
        {
            continue;
        }

        var level = HumanLanguageCatalog.ExtractLevel(fragment);
        if (level is null)
        {
            continue;
        }

        foreach (var name in HumanLanguageCatalog.ExtractLanguageNames(fragment))
        {
            // Keep only the first occurrence of each language.
            if (seenNames.Add(name))
            {
                found.Add(new StructuredCvLanguage { Name = name, Level = level });
            }
        }
    }

    return found;
}
/// <summary>
/// Parses an education section into entries, one per blank-line-separated block.
/// Within a block: a "+ " line is the institution, the first line that is neither a
/// "+ " line nor starts with a year range is the qualification (falling back to the
/// first line), a "YYYY - YYYY/Present/Current" range supplies the dates, and "- "
/// lines become details.
/// </summary>
/// <param name="content">Content of the education section.</param>
/// <returns>The parsed education entries.</returns>
private static List<StructuredCvEducation> ParseEducationHeuristically(string content)
{
    var entries = new List<StructuredCvEducation>();

    foreach (var rawBlock in Regex.Split(content, @"\n\s*\n"))
    {
        var block = rawBlock.Trim();
        if (block.Length == 0)
        {
            continue;
        }

        var lines = block.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        if (lines.Length == 0)
        {
            continue;
        }

        string? institution = null;
        string? qualification = null;
        var details = new List<string>();
        foreach (var line in lines)
        {
            var isInstitutionLine = line.StartsWith("+ ", StringComparison.Ordinal);
            if (isInstitutionLine && institution is null)
            {
                institution = line.TrimStart('+', ' ');
            }

            if (qualification is null && !isInstitutionLine && !Regex.IsMatch(line, @"^\d{4}\s*[-]"))
            {
                qualification = line;
            }

            if (line.StartsWith("- ", StringComparison.Ordinal))
            {
                details.Add(line.Substring(2).Trim());
            }
        }

        // No suitable line found: use the block's first line as the qualification.
        qualification ??= lines[0];

        var years = Regex.Match(block, @"\b(\d{4})\s*[-]\s*(\d{4}|Present|Current)\b", RegexOptions.IgnoreCase);
        entries.Add(new StructuredCvEducation
        {
            Qualification = TitleCasePreservingAcronyms(qualification),
            Institution = TitleCasePreservingAcronyms(institution),
            Start = years.Success ? years.Groups[1].Value : null,
            End = years.Success ? years.Groups[2].Value : null,
            Details = details,
        });
    }

    return entries;
}
/// <summary>
/// Parses a work-experience section into jobs. A job starts at an upper-case title
/// line immediately followed by a "YYYY - YYYY/Present/Current" line; everything up
/// to the next such pair is its body. The first "+ ..." line in the body is the
/// employer, the remaining body is split into up to six bullet sentences, and known
/// technology names that appear as list items in the bullets become the job's skills.
/// </summary>
/// <param name="content">Content of the work-experience section.</param>
/// <returns>The parsed jobs in document order.</returns>
private static List<StructuredCvJob> ParseJobsHeuristically(string content)
{
    const string knownSkillPattern = @"^(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)$";
    var jobPattern = new Regex(@"(?<title>[A-Z][A-Z\s/&-]{3,})\s*\n(?<dates>\d{4}\s*[-]\s*(?:\d{4}|Present|Current))(?<body>.*?)(?=(?:\n[A-Z][A-Z\s/&-]{3,}\s*\n\d{4}\s*[-]\s*(?:\d{4}|Present|Current))|\z)", RegexOptions.Singleline);

    var jobs = new List<StructuredCvJob>();
    foreach (Match jobMatch in jobPattern.Matches(content.Replace("\r\n", "\n")))
    {
        var body = jobMatch.Groups["body"].Value.Trim();

        var employerMatch = Regex.Match(body, @"\+\s*([^\n]+)");
        var bodyWithoutEmployer = Regex.Replace(body, @"\+\s*[^\n]+", string.Empty);
        var bullets = SplitSentences(bodyWithoutEmployer, 6);

        var dateParts = Regex.Split(jobMatch.Groups["dates"].Value, @"\s*[-]\s*");
        var startText = dateParts.FirstOrDefault();
        var endText = dateParts.Skip(1).FirstOrDefault();
        var endsNow = string.Equals(endText, "present", StringComparison.OrdinalIgnoreCase)
            || string.Equals(endText, "current", StringComparison.OrdinalIgnoreCase);

        var skills = bullets
            .SelectMany(bullet => SplitListLike(bullet))
            .Where(candidate => Regex.IsMatch(candidate, knownSkillPattern, RegexOptions.IgnoreCase))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();

        var job = new StructuredCvJob
        {
            Title = TitleCasePreservingAcronyms(jobMatch.Groups["title"].Value),
            Company = NullIfWhitespace(employerMatch.Groups[1].Value),
            Start = NullIfWhitespace(startText),
            End = NullIfWhitespace(endText),
            IsCurrent = endsNow,
            Bullets = bullets,
            Skills = skills,
        };
        jobs.Add(job);
    }

    return jobs;
}
/// <summary>
/// Converts a phrase to title case word by word: the first character of each word is
/// upper-cased and the rest lower-cased. Words of three characters or fewer that consist
/// only of upper-case letters (for example "IT") are kept unchanged.
/// </summary>
/// <param name="value">Phrase to convert; may be null.</param>
/// <returns>
/// The converted phrase with words joined by single spaces, or <c>null</c> when
/// <paramref name="value"/> is null, empty, or whitespace.
/// </returns>
private static string? TitleCasePreservingAcronyms(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    // Split on any whitespace, not just ' ': with a space-only split, "SENIOR\tENGINEER"
    // was treated as a single word and came back as "Senior\tengineer".
    var words = value
        .Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries)
        .Select(word => word.Length <= 3 && word.All(char.IsUpper)
            ? word
            : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant());

    return string.Join(" ", words);
}
/// <summary>
/// Counts the whitespace-separated tokens in <paramref name="text"/>.
/// </summary>
/// <param name="text">Text to measure; may be null.</param>
/// <returns>The token count, or 0 when the text is null, empty, or whitespace.</returns>
private static int CountWords(string? text) =>
    string.IsNullOrWhiteSpace(text)
        ? 0
        // A null separator array makes Split break on any whitespace character.
        : text.Trim().Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries).Length;
/// <summary>
/// Normalizes blank input to <c>null</c> and trims everything else.
/// </summary>
/// <param name="value">Text to normalize; may be null.</param>
/// <returns><c>null</c> for null, empty, or whitespace input; otherwise the trimmed text.</returns>
private static string? NullIfWhitespace(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value.Trim();
}
/// <summary>
/// Splits CV text into named sections. A line that <c>CanonicalizeSectionHeading</c>
/// recognizes starts a new section and is itself dropped; all other lines are appended,
/// untrimmed, to the section currently being collected. Text before the first heading is
/// collected under "General".
/// </summary>
/// <param name="source">CV text to split.</param>
/// <returns>
/// The non-empty sections in document order, each with trimmed content. When no section
/// has any content, a single "General" entry holding the trimmed source is returned.
/// </returns>
private static List<(string Name, string Content)> ParseSections(string source)
{
    var result = new List<(string Name, string Content)>();
    var activeHeading = "General";
    var pending = new List<string>();

    foreach (var rawLine in source.Replace("\r\n", "\n").Split('\n'))
    {
        var heading = CanonicalizeSectionHeading(rawLine.Trim());
        if (heading is null)
        {
            pending.Add(rawLine);
            continue;
        }

        // A recognized heading closes the section collected so far and opens the next one.
        CommitPending();
        activeHeading = heading;
    }

    CommitPending();

    if (result.Count == 0)
    {
        result.Add(("General", source.Trim()));
    }

    return result;

    // Stores the collected lines under the active heading (only if they hold any text) and resets the buffer.
    void CommitPending()
    {
        var body = string.Join("\n", pending).Trim();
        if (!string.IsNullOrWhiteSpace(body))
        {
            result.Add((activeHeading, body));
        }

        pending.Clear();
    }
}
/// <summary>
/// Builds CV sections from headings found in the text and, when that yields nothing but
/// "General", falls back to classifying the text block by block.
/// </summary>
/// <param name="parseSource">CV text to split into sections.</param>
/// <param name="cancellationToken">Token forwarded to the block classification step.</param>
/// <returns>
/// The heading-based sections when at least one is not named "General"; otherwise the
/// classified sections, or the heading-based ones if classification produced none.
/// </returns>
private async Task<List<StructuredCvSection>> BuildFallbackSectionsAsync(string parseSource, CancellationToken cancellationToken)
{
    var headingBased = new List<StructuredCvSection>();
    var sawNamedSection = false;

    foreach (var (name, content) in ParseSections(parseSource))
    {
        var section = new StructuredCvSection
        {
            Name = name,
            Content = content,
            WordCount = CountWords(content),
        };
        headingBased.Add(section);

        if (!string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase))
        {
            sawNamedSection = true;
        }
    }

    if (sawNamedSection)
    {
        return headingBased;
    }

    // Only "General" content was found, so let the classifier try to assign real section names.
    var classified = await ClassifyBlocksIntoSectionsAsync(parseSource, cancellationToken);
    if (classified.Count > 0)
    {
        return classified;
    }

    return headingBased;
}
/// <summary>
/// Splits the text into blank-line-separated blocks, asks <c>_cvAiClassifier</c> to label
/// each block, and groups the blocks into sections by label.
/// </summary>
/// <remarks>
/// Blocks shorter than 24 characters after trimming are ignored. Labels are mapped through
/// <c>SectionAliases</c>; a missing label or "Other" becomes "General". For "Work Experience"
/// blocks the content is rebuilt from the classifier's fields as a "### title" line, a
/// "company | location | dates" line and "- " bullet lines, when any of those are available.
/// Blocks that share a section name are concatenated with a blank line between them.
/// </remarks>
/// <param name="parseSource">CV text to classify.</param>
/// <param name="cancellationToken">Token passed to every classifier call.</param>
/// <returns>The sections with non-blank content, in order of first appearance.</returns>
private async Task<List<StructuredCvSection>> ClassifyBlocksIntoSectionsAsync(string parseSource, CancellationToken cancellationToken)
{
    var blocks = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n")
        .Select(block => block.Trim())
        .Where(block => block.Length >= 24)
        .ToList();
    if (blocks.Count == 0) return new List<StructuredCvSection>();

    var sectionBuckets = new List<StructuredCvSection>();
    foreach (var block in blocks)
    {
        var classification = await _cvAiClassifier.ClassifyBlockAsync(block, cancellationToken);

        // Trim the label so padded values such as "Skills " still hit the alias table.
        var sectionName = classification?.Section?.Trim();
        if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical))
        {
            sectionName = canonical;
        }
        if (string.IsNullOrWhiteSpace(sectionName) || string.Equals(sectionName, "Other", StringComparison.OrdinalIgnoreCase))
        {
            sectionName = "General";
        }

        var content = block;
        if (string.Equals(sectionName, "Work Experience", StringComparison.OrdinalIgnoreCase) && classification is not null)
        {
            var lines = new List<string>();
            if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}");
            var endIsCurrent = string.Equals(classification.End, "Present", StringComparison.OrdinalIgnoreCase) || string.Equals(classification.End, "Current", StringComparison.OrdinalIgnoreCase);
            var dateRange = FormatDateRangeForSection(classification.Start, classification.End, endIsCurrent);
            var meta = string.Join(" | ", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value)));
            if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta);
            if (classification.Bullets is not null)
            {
                lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}"));
            }
            // Keep the raw block when the classifier supplied no usable structured fields.
            if (lines.Count > 0) content = string.Join("\n", lines);
        }

        // Match buckets case-insensitively, like every other section-name comparison here, so
        // labels that differ only in casing ("Volunteering" / "volunteering") share one section.
        var existing = sectionBuckets.FirstOrDefault(section => string.Equals(section.Name, sectionName, StringComparison.OrdinalIgnoreCase));
        if (existing is null)
        {
            sectionBuckets.Add(new StructuredCvSection { Name = sectionName, Content = content, WordCount = CountWords(content) });
        }
        else
        {
            existing.Content = $"{existing.Content}\n\n{content}".Trim();
            existing.WordCount = CountWords(existing.Content);
        }
    }

    return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
}
/// <summary>
/// Formats a start/end pair as a single date-range label.
/// </summary>
/// <param name="start">Start of the range; may be null or blank.</param>
/// <param name="end">End of the range; may be null or blank.</param>
/// <param name="isCurrent">When true, the end is rendered as "Present" regardless of <paramref name="end"/>.</param>
/// <returns>
/// <c>null</c> when both values are blank; <paramref name="end"/> alone when only the start
/// is blank; otherwise "start - end", with "Present" standing in for a blank end.
/// </returns>
private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent)
{
    var hasStart = !string.IsNullOrWhiteSpace(start);
    var hasEnd = !string.IsNullOrWhiteSpace(end);

    if (!hasStart && !hasEnd) return null;
    if (!hasStart) return end;

    // Treat an empty or whitespace end like a missing one; a plain null-coalesce
    // let "" through and produced a dangling "2020 - ".
    var endLabel = isCurrent || !hasEnd ? "Present" : end;
    return $"{start} - {endLabel}";
}
/// <summary>
/// Returns the trimmed CV text, first passing it through <c>_aiService</c> for
/// reconstruction when it looks like a flattened extraction.
/// </summary>
/// <param name="text">Extracted CV text.</param>
/// <param name="cancellationToken">
/// NOTE(review): accepted but not passed on to the AI call below - confirm whether
/// <c>SummarizeSectionAsync</c> can take a token.
/// </param>
/// <returns>
/// The trimmed reconstruction when one was attempted and came back non-blank; otherwise
/// the trimmed input.
/// </returns>
private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    const string instructions = "Reconstruct this CV text extracted from a PDF into a clean, readable master CV in markdown. Preserve facts only. Recover clear sections such as Contact, Professional Summary, Work Experience, Education, Skills, Languages, and Interests when present. Split contact details onto their own lines, turn noisy all-caps/spaced headings into normal headings, keep dates with the correct roles and employers, and remove layout/OCR artifacts. Do not invent employers, titles, dates, or metrics. Return only the reconstructed CV text.";

    var trimmed = text.Trim();
    if (LooksLikeFlattenedCvExtraction(trimmed))
    {
        var rebuilt = await _aiService.SummarizeSectionAsync(instructions, trimmed, 2800, 900);
        if (!string.IsNullOrWhiteSpace(rebuilt))
        {
            return rebuilt.Trim();
        }
    }

    return trimmed;
}
/// <summary>
/// Heuristically decides whether CV text looks like an extraction that lost its layout.
/// </summary>
/// <param name="text">Extracted CV text.</param>
/// <returns>
/// <c>true</c> when any of these hold: at least 500 characters on six or fewer non-blank
/// lines; three or more letter-spaced upper-case runs; three or more known section aliases
/// on twelve or fewer non-blank lines; or " + " together with a bullet glyph on ten or
/// fewer non-blank lines. <c>false</c> otherwise, including for blank input.
/// </returns>
private static bool LooksLikeFlattenedCvExtraction(string text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return false;
    }

    var unified = text.Replace("\r\n", "\n");
    var nonBlankLines = unified.Split('\n').Count(line => !string.IsNullOrWhiteSpace(line));

    // A lot of text squeezed onto very few lines.
    if (nonBlankLines <= 6 && unified.Length >= 500)
    {
        return true;
    }

    // Letter-spaced headings such as "S K I L L S".
    if (Regex.Matches(unified, @"\b(?:[A-Z]\s){3,}[A-Z]\b").Count >= 3)
    {
        return true;
    }

    // Several known section names present, yet hardly any line breaks.
    if (nonBlankLines <= 12
        && SectionAliases.Keys.Count(alias => unified.Contains(alias, StringComparison.OrdinalIgnoreCase)) >= 3)
    {
        return true;
    }

    // " + " markers plus bullet glyphs on only a few lines.
    return nonBlankLines <= 10
        && unified.Contains(" + ")
        && Regex.IsMatch(unified, @"[•●▪◦]");
}
/// <summary>
/// Maps a line to its canonical section name when the line is a known section heading.
/// </summary>
/// <remarks>
/// Leading '#' characters and trailing ':' characters are stripped before the lookup in
/// <c>SectionAliases</c>.
/// </remarks>
/// <param name="line">A single line of CV text.</param>
/// <returns>
/// The canonical section name, or <c>null</c> when the line is blank, longer than 60
/// characters after stripping, contains a '.', or is not a known alias.
/// </returns>
private static string? CanonicalizeSectionHeading(string line)
{
    if (string.IsNullOrWhiteSpace(line)) return null;

    var normalized = line.Trim();
    if (normalized.StartsWith("#", StringComparison.Ordinal))
    {
        normalized = normalized.TrimStart('#').Trim();
    }
    normalized = normalized.TrimEnd(':').Trim();

    if (normalized.Length == 0 || normalized.Length > 60) return null;

    // No alias contains a '.', so anything with one cannot be a heading.
    if (normalized.Contains('.')) return null;

    // Spaces must not disqualify a candidate: many aliases are multi-word ("work experience",
    // "professional summary", ...) and were unreachable while any space caused a rejection.
    // The lookup is an exact match, so irregularly spaced text is still turned away here.
    return SectionAliases.TryGetValue(normalized, out var canonical) ? canonical : null;
}
/// <summary>
/// Extracts plain text from an uploaded file according to its extension.
/// </summary>
/// <remarks>
/// <list type="bullet">
/// <item><description>.txt / .md: read as text (UTF-8 unless a byte order mark says otherwise) and trimmed.</description></item>
/// <item><description>.pdf: the raw bytes are decoded as UTF-8 and scanned for <c>(...)Tj</c> and <c>[...]TJ</c> operands; if none are found the decoded bytes are used as-is. Control characters become spaces and whitespace is collapsed. No stream decompression is performed here.</description></item>
/// <item><description>.docx: <c>word/document.xml</c> is read, tags are replaced by spaces, entities are decoded and whitespace is collapsed.</description></item>
/// <item><description>Any other extension yields an empty string.</description></item>
/// </list>
/// </remarks>
/// <param name="file">The uploaded file.</param>
/// <param name="extension">The file extension including the leading dot; compared case-insensitively.</param>
/// <returns>The extracted text, or an empty string when nothing could be extracted.</returns>
private static async Task<string> ExtractTextAsync(IFormFile file, string extension)
{
    if (string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase) || string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase))
    {
        using var stream = file.OpenReadStream();
        using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true);
        return (await reader.ReadToEndAsync()).Trim();
    }

    var isPdf = string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase);
    var isDocx = string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase);

    // Nothing is extracted from other formats, so do not buffer the upload for them.
    if (!isPdf && !isDocx) return string.Empty;

    await using var memory = new MemoryStream();
    await file.CopyToAsync(memory);
    var bytes = memory.ToArray();

    if (isPdf)
    {
        var raw = Encoding.UTF8.GetString(bytes);
        var textMatches = Regex.Matches(raw, @"\((.*?)\)Tj", RegexOptions.Singleline)
            .Select(match => match.Groups[1].Value)
            .Concat(Regex.Matches(raw, @"\[(.*?)\]TJ", RegexOptions.Singleline)
                .SelectMany(match => Regex.Matches(match.Groups[1].Value, @"\((.*?)\)", RegexOptions.Singleline).Select(x => x.Groups[1].Value)))
            .Where(value => !string.IsNullOrWhiteSpace(value))
            .Select(UnescapePdfLiteral)
            .ToList();
        var joined = textMatches.Count > 0 ? string.Join(" ", textMatches) : raw;
        var scrubbed = Regex.Replace(joined, @"[\x00-\x08\x0B\x0C\x0E-\x1F]", " ");
        return Regex.Replace(scrubbed, @"\s+", " ").Trim();
    }

    using var archive = new System.IO.Compression.ZipArchive(new MemoryStream(bytes), System.IO.Compression.ZipArchiveMode.Read, leaveOpen: false);
    var entry = archive.GetEntry("word/document.xml");
    if (entry is null) return string.Empty;

    using var entryStream = entry.Open();
    using var xmlReader = new StreamReader(entryStream, Encoding.UTF8);
    var xml = await xmlReader.ReadToEndAsync();
    var withoutTags = Regex.Replace(xml, "<[^>]+>", " ");
    var decoded = System.Net.WebUtility.HtmlDecode(withoutTags) ?? string.Empty;
    return Regex.Replace(decoded, @"\s+", " ").Trim();

    // Regex.Unescape throws ArgumentException on escapes it does not recognize (for example
    // "\q" or a trailing backslash). Uploaded PDF bytes can contain those, so fall back to the
    // raw fragment instead of failing the whole extraction.
    static string UnescapePdfLiteral(string value)
    {
        try
        {
            return Regex.Unescape(value);
        }
        catch (ArgumentException)
        {
            return value;
        }
    }
}
}