// Listing metadata: 2262 lines, 113 KiB, C#
using System.Security.Cryptography;
|
||
using System.Text;
|
||
using System.Text.Json;
|
||
using System.Text.RegularExpressions;
|
||
using JobTrackerApi.Data;
|
||
using JobTrackerApi.Services;
|
||
using JobTrackerApi.Models;
|
||
using Microsoft.AspNetCore.Authorization;
|
||
using Microsoft.AspNetCore.Identity;
|
||
using Microsoft.AspNetCore.Mvc;
|
||
using Microsoft.EntityFrameworkCore;
|
||
|
||
namespace JobTrackerApi.Controllers;
|
||
|
||
[ApiController]
|
||
[Route("api/profile-cv")]
|
||
[Authorize(AuthenticationSchemes = "local")]
|
||
public sealed class ProfileCvController : ControllerBase
|
||
{
|
||
/// <summary>
/// File extensions (including the leading dot) that the upload endpoint accepts.
/// Membership checks ignore case.
/// </summary>
private static readonly HashSet<string> AllowedExtensions = new(
    new[] { ".txt", ".md", ".pdf", ".docx", ".png", ".jpg", ".jpeg", ".webp" },
    StringComparer.OrdinalIgnoreCase);
|
||
|
||
/// <summary>
/// Maps the heading spellings that may appear in a CV to the canonical section name used by this controller.
/// Keys are compared case-insensitively.
/// </summary>
private static readonly Dictionary<string, string> SectionAliases = new(StringComparer.OrdinalIgnoreCase)
{
    // Professional Summary
    { "professional summary", "Professional Summary" },
    { "summary", "Professional Summary" },
    { "profile", "Professional Summary" },
    { "about me", "Professional Summary" },

    // Contact
    { "contact", "Contact" },
    { "contact details", "Contact" },

    // Skills
    { "core skills", "Skills" },
    { "skills", "Skills" },
    { "technical skills", "Skills" },

    // Work Experience
    { "experience", "Work Experience" },
    { "experience highlights", "Work Experience" },
    { "work experience", "Work Experience" },
    { "employment history", "Work Experience" },

    // Selected Achievements
    { "selected achievements", "Selected Achievements" },
    { "achievements", "Selected Achievements" },

    // Single-alias and remaining sections
    { "projects", "Projects" },
    { "education", "Education" },
    { "certifications", "Certifications" },
    { "certificates", "Certifications" },
    { "languages", "Languages" },
    { "interests", "Interests" },
    { "hobbies", "Interests" },
};
|
||
|
||
/// <summary>Largest upload accepted by the upload endpoint: 5 * 1024 * 1024 bytes.</summary>
private const long MaxFileSizeBytes = 5 * 1024 * 1024;

// Version stamps written onto each extraction run created by the upload endpoint.
private const string ParserVersion = "m005-s01";
private const string NormalizerVersion = "m005-s01";
private const string LlmPromptVersion = "m005-s01";

// Collaborators assigned once in the constructor.
private readonly UserManager<ApplicationUser> _users;
private readonly ISummarizerService _aiService;
private readonly ICvAiClassifier _cvAiClassifier;
private readonly ICvAiNormalizer _cvAiNormalizer;
private readonly JobTrackerContext _db;
private readonly AppPaths _paths;
private readonly ILogger<ProfileCvController> _logger;
private readonly ICvTemplateRenderer _cvTemplateRenderer;
private readonly ICvPdfExporter _cvPdfExporter;
private readonly ICvProcessingQueue _cvProcessingQueue;
private readonly IAppEmailSender _emailSender;
|
||
|
||
/// <summary>
/// Creates the controller. The first four dependencies are required; every optional dependency
/// that is omitted (null) is replaced by a built-in default so the controller can be constructed
/// with only the required services.
/// </summary>
public ProfileCvController(
    UserManager<ApplicationUser> users,
    ISummarizerService aiService,
    JobTrackerContext db,
    AppPaths paths,
    ILogger<ProfileCvController>? logger = null,
    ICvAiClassifier? cvAiClassifier = null,
    ICvAiNormalizer? cvAiNormalizer = null,
    ICvTemplateRenderer? cvTemplateRenderer = null,
    ICvPdfExporter? cvPdfExporter = null,
    ICvProcessingQueue? cvProcessingQueue = null,
    IAppEmailSender? emailSender = null)
{
    // Required services.
    _users = users;
    _aiService = aiService;
    _db = db;
    _paths = paths;

    // Optional services, each with a fallback when not supplied.
    _logger = logger ?? Microsoft.Extensions.Logging.Abstractions.NullLogger<ProfileCvController>.Instance;
    _cvAiClassifier = cvAiClassifier ?? NoOpCvAiClassifier.Instance;
    _cvAiNormalizer = cvAiNormalizer ?? NoOpCvAiNormalizer.Instance;
    _cvTemplateRenderer = cvTemplateRenderer ?? new CvTemplateRenderer();
    // The PDF fallback throws on use instead of silently producing nothing.
    _cvPdfExporter = cvPdfExporter ?? new ThrowingCvPdfExporter();
    _cvProcessingQueue = cvProcessingQueue ?? NoOpCvProcessingQueue.Instance;
    _emailSender = emailSender ?? NoOpEmailSender.Instance;
}
|
||
|
||
/// <summary>Email sender used when none is injected; every send completes immediately without doing anything.</summary>
private sealed class NoOpEmailSender : IAppEmailSender
{
    /// <summary>Shared instance.</summary>
    public static readonly NoOpEmailSender Instance = new();

    /// <summary>Ignores all arguments and returns an already-completed task.</summary>
    public Task SendAsync(string toEmail, string subject, string bodyText, CancellationToken cancellationToken = default)
    {
        return Task.CompletedTask;
    }
}
|
||
|
||
/// <summary>PDF exporter used when none is injected; any export attempt fails with an explanatory exception.</summary>
private sealed class ThrowingCvPdfExporter : ICvPdfExporter
{
    /// <summary>Always throws.</summary>
    /// <exception cref="InvalidOperationException">Thrown on every call.</exception>
    public Task<CvPdfArtifact> ExportAsync(TailoredCvRenderResult renderResult, CancellationToken cancellationToken) =>
        throw new InvalidOperationException("CV PDF export is not configured for this controller instance.");
}
|
||
|
||
/// <summary>
/// Request body shared by the rewrite-section, rewrite-preview and export-pdf endpoints.
/// Every property is optional.
/// </summary>
public sealed class RewriteSectionRequest
{
    /// <summary>Section to rewrite; when blank the whole CV is rewritten.</summary>
    public string? SectionName { get; set; }

    /// <summary>Free-form style guidance; the endpoints fall back to "ats-minimal" when blank.</summary>
    public string? Style { get; set; }

    /// <summary>Role the rewrite should be aligned to.</summary>
    public string? TargetRole { get; set; }

    /// <summary>
    /// Id of a saved job application. Kept as a raw JSON element so that both a JSON number
    /// and a numeric JSON string are accepted.
    /// </summary>
    public JsonElement? JobApplicationId { get; set; }

    /// <summary>Template id; when null the endpoints derive it from <see cref="Style"/>.</summary>
    public string? TemplateId { get; set; }

    /// <summary>CV text to rewrite; when blank the user's stored CV text is used.</summary>
    public string? SourceText { get; set; }

    /// <summary>Extra candidate background passed into the rewrite instruction.</summary>
    public string? PromptBackground { get; set; }

    /// <summary>Tone guidance for the rewrite.</summary>
    public string? Tone { get; set; }

    /// <summary>Language the rewritten CV should be written in.</summary>
    public string? Language { get; set; }
}
|
||
/// <summary>Request body for the parse endpoint; a blank <c>Text</c> makes the endpoint use the stored CV text.</summary>
public sealed record ParseCvRequest(string? Text);

/// <summary>Describes one selectable CV template as returned by the templates endpoint.</summary>
public sealed record CvTemplateDescriptor(
    string Id,
    string Title,
    string Tone,
    string AccentColor,
    string PreviewTagline,
    string PreviewSummary,
    List<string> PreviewBullets);

/// <summary>Result of the rewrite-preview endpoint: rendered HTML plus the text and structures it was built from.</summary>
public sealed record ProfileCvPreviewDto(
    string TemplateId,
    string Html,
    string SuggestedFileName,
    string FullText,
    string RewrittenText,
    string? SectionName,
    StructuredCvProfile StructuredCv,
    TailoredCvDocument Document,
    string? TargetRole,
    int? JobApplicationId);

/// <summary>Error payload returned when the AI rewrite produced no usable text.</summary>
public sealed record CvRewriteFailureDto(
    string Code,
    string Message,
    string? Detail = null,
    string? LastAiError = null);

/// <summary>Internal carrier for the raw text, normalized text and structured profile of one extraction.</summary>
private sealed record ExtractionPipelineResult(
    string RawText,
    string NormalizedText,
    StructuredCvProfile StructuredCv);

/// <summary>Internal carrier for one text block together with the section it was assigned to.</summary>
private sealed record ClassifiedCvBlock(
    int Index,
    string OriginalBlock,
    string SectionName,
    string Content,
    CvBlockClassificationResult? Classification);
|
||
/// <summary>One row of the extraction-run history returned by the runs endpoint.</summary>
/// <param name="Id">Run identifier.</param>
/// <param name="Trigger">What started the run.</param>
/// <param name="Status">Current run status.</param>
/// <param name="ArtifactFileName">Original file name of the linked upload artifact, or null when the run has none.</param>
/// <param name="StartedAtUtc">When the run started.</param>
/// <param name="CompletedAtUtc">When the run completed, if it has.</param>
/// <param name="AppliedAtUtc">When the run's result was applied, if it was.</param>
/// <param name="ParserVersion">Parser version recorded on the run.</param>
/// <param name="NormalizerVersion">Normalizer version recorded on the run.</param>
/// <param name="LlmPromptVersion">Prompt version recorded on the run.</param>
/// <param name="ErrorMessage">Error message recorded on the run, if any.</param>
public sealed record CvExtractionRunListItem(
    int Id,
    string Trigger,
    string Status,
    string? ArtifactFileName,
    DateTimeOffset StartedAtUtc,
    DateTimeOffset? CompletedAtUtc,
    DateTimeOffset? AppliedAtUtc,
    string ParserVersion,
    string NormalizerVersion,
    string LlmPromptVersion,
    string? ErrorMessage);
|
||
|
||
/// <summary>
/// Accepts a CV file, stores it as an upload artifact, runs extraction inline and applies the
/// result to the current user's profile.
/// </summary>
/// <param name="file">The uploaded CV file from the multipart form.</param>
/// <returns>
/// 200 with the structured CV and run identifiers on success; 400 when the file is missing, too
/// large, of an unsupported type, or the user update fails; 401 when the user cannot be resolved.
/// Extraction errors are recorded on the run and then rethrown.
/// </returns>
[HttpPost("upload")]
[RequestSizeLimit(MaxFileSizeBytes)]
public async Task<IActionResult> Upload([FromForm] IFormFile file)
{
    var user = await _users.GetUserAsync(User);
    if (user is null) return Unauthorized();
    if (file is null || file.Length == 0) return BadRequest("Select a CV file to upload.");
    if (file.Length > MaxFileSizeBytes) return BadRequest("CV import file is too large. Keep it under 5 MB.");

    var extension = Path.GetExtension(file.FileName ?? string.Empty);
    if (!AllowedExtensions.Contains(extension))
    {
        return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
    }

    var requestAborted = HttpContext.RequestAborted;

    // Persist the artifact first so the run can reference its generated id.
    var artifact = await SaveUploadArtifactAsync(user, file, requestAborted);
    _db.CvUploadArtifacts.Add(artifact);
    await _db.SaveChangesAsync(requestAborted);

    var run = new CvExtractionRun
    {
        OwnerUserId = user.Id,
        ArtifactId = artifact.Id,
        Trigger = "upload",
        ParserVersion = ParserVersion,
        NormalizerVersion = NormalizerVersion,
        LlmPromptVersion = LlmPromptVersion,
        Status = "running",
        StartedAtUtc = DateTimeOffset.UtcNow,
    };
    _db.CvExtractionRuns.Add(run);
    await _db.SaveChangesAsync(requestAborted);

    try
    {
        var result = await ExtractStructuredCvFromFileAsync(file, extension, requestAborted);
        result.StructuredCv.Metadata.ProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
        result.StructuredCv.Metadata.AppliedExtractionRunId = run.Id;
        result.StructuredCv.Metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;
        var structuredJson = StructuredCvProfileJson.Serialize(result.StructuredCv);

        run.RawExtractedText = result.RawText;
        run.NormalizedText = result.NormalizedText;
        run.StructuredProfileJson = structuredJson;
        run.Status = "applied";
        run.CompletedAtUtc = DateTimeOffset.UtcNow;
        run.AppliedAtUtc = run.CompletedAtUtc;

        user.ProfileCvText = result.NormalizedText;
        user.ProfileCvStructureJson = structuredJson;
        user.CurrentCvUploadArtifactId = artifact.Id;
        user.CurrentCvExtractionRunId = run.Id;
        user.CurrentCvProfileVersion = result.StructuredCv.Metadata.ProfileVersion;

        var update = await _users.UpdateAsync(user);
        if (!update.Succeeded)
        {
            run.Status = "failed";
            // The result was never applied to the user, so the run must not carry an applied timestamp.
            run.AppliedAtUtc = null;
            run.ErrorMessage = string.Join("; ", update.Errors.Select(e => e.Description));
            // Failure bookkeeping must survive a client disconnect, so it does not use the request token.
            await _db.SaveChangesAsync(CancellationToken.None);
            return BadRequest(run.ErrorMessage);
        }

        await _db.SaveChangesAsync(requestAborted);

        return Ok(new
        {
            imported = true,
            characters = result.NormalizedText.Length,
            structuredCv = result.StructuredCv,
            sections = result.StructuredCv.Sections,
            artifactId = artifact.Id,
            extractionRunId = run.Id,
            profileVersion = result.StructuredCv.Metadata.ProfileVersion,
        });
    }
    catch (Exception ex)
    {
        run.Status = "failed";
        run.ErrorMessage = ex.Message;
        run.CompletedAtUtc = DateTimeOffset.UtcNow;
        run.AppliedAtUtc = null;

        try
        {
            // When the request was aborted, saving with the request token would throw again and
            // leave the run stuck in "running"; use a token that cannot be cancelled.
            await _db.SaveChangesAsync(CancellationToken.None);
        }
        catch (Exception persistEx)
        {
            // Do not let the bookkeeping failure replace the original exception.
            _logger.LogError(persistEx, "Failed to persist the failed state of CV extraction run {ExtractionRunId}.", run.Id);
        }

        throw;
    }
}
|
||
|
||
/// <summary>Returns the current user's ten most recently started extraction runs, newest first.</summary>
/// <returns>200 with the run list, or 401 when the user cannot be resolved.</returns>
[HttpGet("runs")]
public async Task<ActionResult<IEnumerable<CvExtractionRunListItem>>> GetRuns()
{
    var currentUser = await _users.GetUserAsync(User);
    if (currentUser is null)
    {
        return Unauthorized();
    }

    var latestRuns = _db.CvExtractionRuns
        .AsNoTracking()
        .Where(run => run.OwnerUserId == currentUser.Id)
        .OrderByDescending(run => run.StartedAtUtc)
        .Take(10);

    var items = await latestRuns
        .Select(run => new CvExtractionRunListItem(
            run.Id,
            run.Trigger,
            run.Status,
            run.Artifact != null ? run.Artifact.OriginalFileName : null,
            run.StartedAtUtc,
            run.CompletedAtUtc,
            run.AppliedAtUtc,
            run.ParserVersion,
            run.NormalizerVersion,
            run.LlmPromptVersion,
            run.ErrorMessage))
        .ToListAsync(HttpContext.RequestAborted);

    return Ok(items);
}
|
||
|
||
/// <summary>Queues a new extraction run over the user's most recently uploaded CV artifact.</summary>
/// <returns>
/// 202 with the queued run id; 400 when no artifact exists or its stored file is missing;
/// 401 when the user cannot be resolved.
/// </returns>
[HttpPost("reprocess")]
public async Task<IActionResult> Reprocess()
{
    var currentUser = await _users.GetUserAsync(User);
    if (currentUser is null)
    {
        return Unauthorized();
    }

    var abort = HttpContext.RequestAborted;
    var latestArtifact = await _db.CvUploadArtifacts
        .Where(candidate => candidate.OwnerUserId == currentUser.Id)
        .OrderByDescending(candidate => candidate.UploadedAtUtc)
        .FirstOrDefaultAsync(abort);

    if (latestArtifact is null)
    {
        return BadRequest("Upload a CV before reprocessing it.");
    }

    // The database row alone is not enough; the stored file must still be on disk.
    var storedFileAvailable = !string.IsNullOrWhiteSpace(latestArtifact.StoragePath)
        && System.IO.File.Exists(latestArtifact.StoragePath);
    if (!storedFileAvailable)
    {
        return BadRequest("The stored CV artifact could not be found for reprocessing.");
    }

    var queuedRun = await CreateQueuedRunAsync(currentUser.Id, latestArtifact.Id, "reprocess", abort);
    await _cvProcessingQueue.EnqueueAsync(queuedRun.Id, abort);
    return Accepted(new { queued = true, extractionRunId = queuedRun.Id, status = queuedRun.Status });
}
|
||
|
||
/// <summary>Queues a "rebuild" extraction run for the current user's stored CV text.</summary>
/// <returns>202 with the queued run id; 400 when the user has no CV text; 401 when the user cannot be resolved.</returns>
[HttpPost("rebuild")]
public async Task<IActionResult> Rebuild()
{
    var currentUser = await _users.GetUserAsync(User);
    if (currentUser is null)
    {
        return Unauthorized();
    }

    if (string.IsNullOrWhiteSpace(currentUser.ProfileCvText))
    {
        return BadRequest("Add or import CV text before rebuilding it.");
    }

    var abort = HttpContext.RequestAborted;
    var queuedRun = await CreateQueuedRunAsync(currentUser.Id, currentUser.CurrentCvUploadArtifactId, "rebuild", abort);
    await _cvProcessingQueue.EnqueueAsync(queuedRun.Id, abort);
    return Accepted(new { queued = true, extractionRunId = queuedRun.Id, status = queuedRun.Status });
}
|
||
|
||
/// <summary>
/// Asks the AI service to rewrite either one named section or the whole CV, optionally targeted
/// at a saved job application or a role.
/// </summary>
/// <param name="request">Rewrite options; blank values fall back to the stored CV and default guidance.</param>
/// <returns>
/// 200 with the rewritten text and the resolved options; 400 when there is nothing to rewrite;
/// 401 when the user cannot be resolved; 502 with a <see cref="CvRewriteFailureDto"/> when the AI
/// service returned no usable text.
/// </returns>
[HttpPost("rewrite-section")]
public async Task<IActionResult> RewriteSection([FromBody] RewriteSectionRequest request)
{
    static string? TrimToNull(string? text) => string.IsNullOrWhiteSpace(text) ? null : text.Trim();

    var user = await _users.GetUserAsync(User);
    if (user is null)
    {
        return Unauthorized();
    }

    var structuredCv = StructuredCvProfileJson.Deserialize(user.ProfileCvStructureJson);

    // Text supplied with the request wins over the stored CV text.
    var sourceText = TrimToNull(request.SourceText) ?? TrimToNull(user.ProfileCvText);
    if (string.IsNullOrWhiteSpace(sourceText) && structuredCv.Sections.Count == 0)
    {
        return BadRequest("Add or import CV text before rewriting your CV.");
    }

    var sectionName = NormalizeRewriteSectionName(request.SectionName);
    var style = TrimToNull(request.Style) ?? "ats-minimal";
    var templateId = NormalizeTemplateId(request.TemplateId ?? style);
    var targetRole = TrimToNull(request.TargetRole);
    var tone = TrimToNull(request.Tone);
    var language = TrimToNull(request.Language);
    var promptBackground = TrimToNull(request.PromptBackground);
    var jobApplicationId = ParseFlexibleNullableInt(request.JobApplicationId);

    // Only jobs owned by the current user are considered; an unknown id simply yields no job context.
    var jobContext = jobApplicationId is int requestedJobId
        ? await _db.JobApplications
            .AsNoTracking()
            .Include(job => job.Company)
            .Where(job => job.Id == requestedJobId && job.OwnerUserId == user.Id)
            .Select(job => new
            {
                job.Id,
                job.JobTitle,
                job.Description,
                job.TranslatedDescription,
                job.ShortSummary,
                job.Notes,
                job.JobUrl,
                job.Status,
                CompanyName = job.Company != null ? job.Company.Name : null,
                RecruiterName = job.Company != null ? job.Company.RecruiterName : null,
                RecruiterEmail = job.Company != null ? job.Company.RecruiterEmail : null
            })
            .FirstOrDefaultAsync(HttpContext.RequestAborted)
        : null;

    var effectiveTargetRole = targetRole ?? jobContext?.JobTitle;
    var rewriteSource = BuildRewriteSourceText(sectionName, sourceText, structuredCv);
    var templateGuidance = DescribeRewriteTemplate(templateId);

    string roleGuidance;
    if (jobContext is not null)
    {
        roleGuidance = $"Target this toward the saved job '{jobContext.JobTitle}' at '{jobContext.CompanyName ?? "Unknown company"}'. Use the full job record below to sharpen wording without inventing facts.\nJob status: {jobContext.Status}\nJob summary: {jobContext.ShortSummary ?? "-"}\nJob description: {jobContext.Description ?? "-"}\nTranslated description: {jobContext.TranslatedDescription ?? "-"}\nNotes: {jobContext.Notes ?? "-"}\nJob URL: {jobContext.JobUrl ?? "-"}\nRecruiter name: {jobContext.RecruiterName ?? "-"}\nRecruiter email: {jobContext.RecruiterEmail ?? "-"}";
    }
    else if (effectiveTargetRole is not null)
    {
        roleGuidance = $"Target role: {effectiveTargetRole}. Keep it broadly reusable but clearly aligned to that role family.";
    }
    else
    {
        roleGuidance = "Keep it broadly reusable for future tailoring.";
    }

    var toneGuidance = tone is null
        ? "Tone guidance: confident, professional, concise, and factual."
        : $"Tone guidance: {tone}.";
    var languageGuidance = language is null
        ? "Write the CV in English unless the source clearly requires another language."
        : $"Write the CV in {language}.";
    var backgroundGuidance = promptBackground is null
        ? string.Empty
        : $"Candidate background and emphasis: {promptBackground}";

    var rewritingWholeCv = sectionName is null;
    var subject = rewritingWholeCv ? "this CV" : $"the '{sectionName}' section of this CV";
    var instruction = $"Rewrite only {subject}. Preserve facts, avoid inventing employers, titles, qualifications, dates, locations, salaries, or metrics. Style guidance: {style}. Template direction: {templateGuidance}. {roleGuidance} {toneGuidance} {languageGuidance} {backgroundGuidance} Return only the rewritten CV text with clean headings and strong bullet phrasing when useful.";

    // A whole-CV rewrite is given larger numeric limits than a single-section rewrite.
    var rewritten = await _aiService.SummarizeSectionAsync(
        instruction,
        rewriteSource,
        rewritingWholeCv ? 1800 : 900,
        rewritingWholeCv ? 400 : 180);

    if (!string.IsNullOrWhiteSpace(rewritten))
    {
        return Ok(new
        {
            sectionName,
            style,
            templateId,
            targetRole = effectiveTargetRole,
            jobApplicationId = jobContext?.Id,
            text = rewritten.Trim()
        });
    }

    // Empty output: consult the service metrics to tell "unhealthy service" from "healthy but empty".
    var metrics = await _aiService.GetMetricsAsync(HttpContext.RequestAborted);
    var (failureCode, message, detail) = metrics.Healthy
        ? ("rewrite-empty",
           "The AI service returned an empty CV rewrite.",
           "The rewrite request reached the AI service, but it returned no usable text.")
        : ("ai-service-unavailable",
           "The AI service could not rewrite your CV right now.",
           "The AI rewrite service is unavailable or not ready.");

    _logger.LogWarning("CV rewrite returned empty output. Section={SectionName} Template={TemplateId} TargetRole={TargetRole} JobApplicationId={JobApplicationId} HasSourceText={HasSourceText} StructuredSections={StructuredSectionCount} AiHealthy={AiHealthy} AiLastError={AiLastError}",
        sectionName ?? "<whole-cv>", templateId, effectiveTargetRole ?? "<none>", jobApplicationId, !string.IsNullOrWhiteSpace(sourceText), structuredCv.Sections.Count, metrics.Healthy, metrics.LastError ?? "<none>");

    return StatusCode(
        StatusCodes.Status502BadGateway,
        new CvRewriteFailureDto(failureCode, message, detail, metrics.LastError));
}
|
||
|
||
/// <summary>Lists the CV templates that can be selected for rewriting and rendering.</summary>
/// <returns>200 with the template descriptors.</returns>
[HttpGet("templates")]
public ActionResult<IEnumerable<CvTemplateDescriptor>> GetTemplates()
{
    var descriptors = GetCvTemplateDescriptors();
    return Ok(descriptors);
}
|
||
|
||
/// <summary>
/// Runs a rewrite via <see cref="RewriteSection"/>, merges the rewritten text into the CV,
/// re-parses the result and renders it with the requested template.
/// </summary>
/// <param name="request">Rewrite options; the same request is forwarded to <see cref="RewriteSection"/>.</param>
/// <returns>
/// 200 with a <see cref="ProfileCvPreviewDto"/>; 400 when there is nothing to rewrite; 401 when the
/// user cannot be resolved; otherwise the failure status produced by <see cref="RewriteSection"/>.
/// </returns>
[HttpPost("rewrite-preview")]
public async Task<ActionResult<ProfileCvPreviewDto>> BuildRewritePreview([FromBody] RewriteSectionRequest request)
{
    var user = await _users.GetUserAsync(User);
    if (user is null) return Unauthorized();

    var structuredCv = StructuredCvProfileJson.Deserialize(user.ProfileCvStructureJson);
    var sourceText = string.IsNullOrWhiteSpace(request.SourceText)
        ? (string.IsNullOrWhiteSpace(user.ProfileCvText) ? null : user.ProfileCvText.Trim())
        : request.SourceText.Trim();
    if (string.IsNullOrWhiteSpace(sourceText) && structuredCv.Sections.Count == 0)
    {
        return BadRequest("Add or import CV text before rewriting your CV.");
    }

    var sectionName = NormalizeRewriteSectionName(request.SectionName);
    var style = string.IsNullOrWhiteSpace(request.Style) ? "ats-minimal" : request.Style.Trim();
    var templateId = NormalizeTemplateId(request.TemplateId ?? style);
    var jobApplicationId = ParseFlexibleNullableInt(request.JobApplicationId);
    var job = jobApplicationId.HasValue
        ? await _db.JobApplications.AsNoTracking().Include(job => job.Company)
            .FirstOrDefaultAsync(job => job.Id == jobApplicationId.Value && job.OwnerUserId == user.Id, HttpContext.RequestAborted)
        : null;
    var effectiveTargetRole = string.IsNullOrWhiteSpace(request.TargetRole)
        ? job?.JobTitle
        : request.TargetRole.Trim();

    var rewriteResult = await RewriteSection(request);
    if (rewriteResult is not OkObjectResult ok)
    {
        // Forward the failure with its own status. Results without a body (for example 401)
        // are StatusCodeResult rather than ObjectResult and must not be collapsed into a 500.
        if (rewriteResult is ObjectResult failure)
        {
            return StatusCode(failure.StatusCode ?? StatusCodes.Status500InternalServerError, failure.Value);
        }

        if (rewriteResult is StatusCodeResult statusOnly)
        {
            return StatusCode(statusOnly.StatusCode);
        }

        return StatusCode(StatusCodes.Status500InternalServerError);
    }

    // RewriteSection returns an anonymous object, so its "text" member is read through JSON.
    // JsonDocument rents pooled buffers and must be disposed.
    string rewrittenText;
    using (var rewriteJson = JsonDocument.Parse(JsonSerializer.Serialize(ok.Value)))
    {
        rewrittenText = rewriteJson.RootElement.GetProperty("text").GetString()?.Trim() ?? string.Empty;
    }

    var baseText = string.IsNullOrWhiteSpace(sourceText)
        ? string.Join("\n\n", structuredCv.Sections.Select(section => $"## {section.Name}\n{section.Content}"))
        : sourceText!;

    // A whole-CV rewrite replaces everything; a section rewrite is merged into the base text.
    var fullText = sectionName is null ? rewrittenText : ReplaceOrAppendCvSection(baseText, sectionName, rewrittenText);
    var previewStructured = await BuildStructuredCvAsync(fullText, HttpContext.RequestAborted);
    var document = BuildMasterCvDocument(previewStructured, templateId, effectiveTargetRole, job?.JobTitle, job?.Company?.Name);
    var rendered = RenderProfileCv(document, user, effectiveTargetRole ?? user.DisplayName ?? "General CV", job?.Company?.Name);

    return Ok(new ProfileCvPreviewDto(rendered.TemplateId, rendered.Html, rendered.SuggestedFileName, fullText, rewrittenText, sectionName, previewStructured, document, effectiveTargetRole, job?.Id));
}
|
||
|
||
/// <summary>Builds a rewrite preview and exports its rendered HTML as a PDF file.</summary>
/// <param name="request">Rewrite options forwarded to <see cref="BuildRewritePreview"/>.</param>
/// <param name="cancellationToken">Token passed to the PDF exporter.</param>
/// <returns>
/// The PDF file on success; the preview's own failure status (including body-less results such
/// as 401) when the preview fails; 500 when the preview did not yield a <see cref="ProfileCvPreviewDto"/>.
/// </returns>
[HttpPost("export-pdf")]
public async Task<IActionResult> ExportProfileCvPdf([FromBody] RewriteSectionRequest request, CancellationToken cancellationToken)
{
    var previewResult = await BuildRewritePreview(request);

    switch (previewResult.Result)
    {
        case ObjectResult errorResult when errorResult.StatusCode >= 400:
            return StatusCode(errorResult.StatusCode ?? StatusCodes.Status500InternalServerError, errorResult.Value);

        // Failures without a body (for example Unauthorized) are not ObjectResult; forward their
        // status instead of reporting a generic 500.
        case StatusCodeResult statusOnly when statusOnly.StatusCode >= 400:
            return StatusCode(statusOnly.StatusCode);
    }

    if (previewResult.Result is not OkObjectResult { Value: ProfileCvPreviewDto preview })
    {
        return StatusCode(StatusCodes.Status500InternalServerError, "The CV preview could not be prepared for PDF export.");
    }

    var artifact = await _cvPdfExporter.ExportAsync(new TailoredCvRenderResult(preview.TemplateId, preview.SuggestedFileName, preview.Html), cancellationToken);
    return File(artifact.Bytes, "application/pdf", artifact.FileName);
}
|
||
|
||
/// <summary>
/// Parses CV text into a structured profile. Uses the request text when supplied, otherwise the
/// user's stored CV text, and records the outcome as a "parse" extraction run.
/// </summary>
/// <param name="request">Optional request body; a missing or blank <c>Text</c> selects the stored CV text.</param>
/// <returns>
/// 200 with the structured CV, its sections, the word count and the run/profile identifiers;
/// 400 when there is no text to parse; 401 when the user cannot be resolved.
/// </returns>
[HttpPost("parse")]
public async Task<ActionResult<object>> Parse([FromBody] ParseCvRequest? request)
{
    var user = await _users.GetUserAsync(User);
    if (user is null)
    {
        return Unauthorized();
    }

    var suppliedText = request?.Text;
    var usingStoredCv = string.IsNullOrWhiteSpace(suppliedText);
    var source = usingStoredCv ? user.ProfileCvText : suppliedText;
    if (string.IsNullOrWhiteSpace(source))
    {
        return BadRequest("Add or import CV text before parsing sections.");
    }

    var abort = HttpContext.RequestAborted;
    var normalizedSource = await MaybeReconstructStructuredCvAsync(source, abort);
    var structuredCv = await BuildStructuredCvAsync(normalizedSource, abort);

    // Only when the stored CV was the input is the stored text replaced by its normalized form here.
    if (usingStoredCv)
    {
        user.ProfileCvText = normalizedSource;
    }

    await ApplyTextExtractionRunAsync(user, "parse", source, normalizedSource, structuredCv, user.CurrentCvUploadArtifactId, abort);

    return Ok(new
    {
        structuredCv,
        sections = structuredCv.Sections,
        totalWords = CountWords(normalizedSource),
        extractionRunId = user.CurrentCvExtractionRunId,
        profileVersion = user.CurrentCvProfileVersion
    });
}
|
||
|
||
/// <summary>Queues an "improve" extraction run for the current user's stored CV text.</summary>
/// <returns>202 with the queued run id; 400 when the user has no CV text; 401 when the user cannot be resolved.</returns>
[HttpPost("improve")]
public async Task<IActionResult> Improve()
{
    var currentUser = await _users.GetUserAsync(User);
    if (currentUser is null)
    {
        return Unauthorized();
    }

    if (string.IsNullOrWhiteSpace(currentUser.ProfileCvText))
    {
        return BadRequest("Add or import CV text before improving it.");
    }

    var abort = HttpContext.RequestAborted;
    var queuedRun = await CreateQueuedRunAsync(currentUser.Id, currentUser.CurrentCvUploadArtifactId, "improve", abort);
    await _cvProcessingQueue.EnqueueAsync(queuedRun.Id, abort);
    return Accepted(new { queued = true, extractionRunId = queuedRun.Id, status = queuedRun.Status });
}
|
||
|
||
/// <summary>
/// Chooses the text handed to the AI for a rewrite. For a named section, the first structured
/// section with that name (case-insensitive) is used when it has content. In every other case
/// the trimmed source text is used, or, when that is blank, all structured sections joined as
/// "## Name" blocks.
/// </summary>
/// <param name="sectionName">Section to rewrite, or null/blank for the whole CV.</param>
/// <param name="sourceText">Raw CV text, possibly null or blank.</param>
/// <param name="structuredCv">Structured profile whose sections serve as the alternative source.</param>
/// <returns>The text to rewrite.</returns>
private static string BuildRewriteSourceText(string? sectionName, string? sourceText, StructuredCvProfile structuredCv)
{
    if (!string.IsNullOrWhiteSpace(sectionName))
    {
        foreach (var candidate in structuredCv.Sections)
        {
            if (!string.Equals(candidate.Name, sectionName, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // Only the first name match is considered; an empty one falls through to the whole-CV text.
            if (!string.IsNullOrWhiteSpace(candidate.Content))
            {
                return $"## {candidate.Name}\n{candidate.Content}";
            }

            break;
        }
    }

    if (!string.IsNullOrWhiteSpace(sourceText))
    {
        return sourceText.Trim();
    }

    return string.Join("\n\n", structuredCv.Sections.Select(section => $"## {section.Name}\n{section.Content}"));
}
|
||
|
||
/// <summary>
/// Returns the one-sentence design description of a template for use in the rewrite instruction.
/// The id is matched case-insensitively; unknown ids get the ATS Minimal description.
/// </summary>
/// <param name="templateId">Template id; must not be null.</param>
/// <returns>The description sentence for the template.</returns>
private static string DescribeRewriteTemplate(string templateId)
{
    switch (templateId.ToLowerInvariant())
    {
        case "harvard":
            return "Harvard template: refined, traditional, strong hierarchy, restrained and credible.";
        case "auckland":
            return "Auckland template: modern sidebar layout, crisp highlights, confident but readable.";
        case "edinburgh":
            return "Edinburgh template: polished editorial layout with stronger visual personality and premium spacing.";
        case "monarch":
            return "Monarch template: executive, premium, high-contrast emphasis on summary and leadership signals.";
        case "fjord":
            return "Fjord template: calm technical layout with clear information density and practical scanability.";
        default:
            return "ATS Minimal template: clean, compact, scanner-friendly, and easy to tailor.";
    }
}
|
||
|
||
/// <summary>
/// Maps a requested template id to a supported one. Input is trimmed and lower-cased; the five
/// named templates are returned as-is and everything else (including null, "base" and
/// "legacy-text") becomes "ats-minimal".
/// </summary>
/// <param name="value">Requested template id; may be null.</param>
/// <returns>A supported template id.</returns>
private static string NormalizeTemplateId(string? value)
{
    var requested = (value ?? string.Empty).Trim().ToLowerInvariant();
    var isNamedTemplate = requested is "harvard" or "auckland" or "edinburgh" or "monarch" or "fjord";
    return isNamedTemplate ? requested : "ats-minimal";
}
|
||
|
||
/// <summary>
/// Normalizes a requested section name: blank becomes null, a known alias becomes its canonical
/// section name, and anything else is returned trimmed.
/// </summary>
/// <param name="value">Section name from the request; may be null.</param>
/// <returns>The canonical or trimmed name, or null when no section was requested.</returns>
private static string? NormalizeRewriteSectionName(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    var requested = value.Trim();
    if (SectionAliases.TryGetValue(requested, out var canonicalName))
    {
        return canonicalName;
    }

    return requested;
}
|
||
|
||
/// <summary>
/// Reads an optional integer that a client may send either as a JSON number or as a numeric
/// JSON string.
/// </summary>
/// <param name="value">The raw JSON element, or null when the property was absent.</param>
/// <returns>
/// The integer value, or null when the element is absent, is not a 32-bit integer, or is a
/// string that does not parse as one.
/// </returns>
private static int? ParseFlexibleNullableInt(JsonElement? value)
{
    if (value is not { } element)
    {
        return null;
    }

    return element.ValueKind switch
    {
        JsonValueKind.Number when element.TryGetInt32(out var number) => number,

        // The id is machine-readable data, so it is parsed with the invariant culture; the
        // culture-sensitive overload can reject input such as "-5" under some server cultures.
        JsonValueKind.String when int.TryParse(
            element.GetString(),
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out var parsed) => parsed,

        _ => null,
    };
}
|
||
|
||
// Matches a line that looks like a section heading: an optional markdown marker ("#" or "##"),
// a capitalised name made of letters, spaces, "&" and "/", and an optional trailing colon.
private static readonly Regex ReplaceSectionHeadingPattern = new(
    @"^(?<marker>##\s+|#\s+)?(?<name>[A-Z][A-Za-z &/]+):?\s*$",
    RegexOptions.Multiline | RegexOptions.Compiled);

/// <summary>
/// Replaces the named section of a CV text with a new draft, or appends the draft as a new
/// "## name" section when no heading with that name exists.
/// </summary>
/// <param name="source">Full CV text; null is treated as empty.</param>
/// <param name="sectionName">Name of the section to replace; compared case-insensitively against heading names.</param>
/// <param name="sectionDraft">New section content; when blank the trimmed source is returned unchanged.</param>
/// <returns>The CV text with the section replaced or appended, trimmed.</returns>
private static string ReplaceOrAppendCvSection(string source, string sectionName, string sectionDraft)
{
    var trimmedSource = (source ?? string.Empty).Trim();
    var trimmedDraft = (sectionDraft ?? string.Empty).Trim();
    if (trimmedDraft.Length == 0) return trimmedSource;

    var replacement = $"## {sectionName}\n{trimmedDraft}";
    if (trimmedSource.Length == 0) return replacement;

    var wantedName = sectionName.Trim();
    var headings = ReplaceSectionHeadingPattern.Matches(trimmedSource);

    var targetIndex = -1;
    for (var i = 0; i < headings.Count; i++)
    {
        if (string.Equals(headings[i].Groups["name"].Value.Trim(), wantedName, StringComparison.OrdinalIgnoreCase))
        {
            targetIndex = i;
            break;
        }
    }

    if (targetIndex < 0)
    {
        return $"{trimmedSource}\n\n{replacement}".Trim();
    }

    // Find where the target section ends. The heading pattern also matches ordinary content
    // lines such as a job title or company name on its own line. When the target heading is a
    // markdown heading, only another markdown heading may end the section; otherwise such a
    // content line would cut the section short and leave the old content behind the new draft.
    var targetIsMarkdown = headings[targetIndex].Groups["marker"].Success;
    var end = trimmedSource.Length;
    for (var i = targetIndex + 1; i < headings.Count; i++)
    {
        if (!targetIsMarkdown || headings[i].Groups["marker"].Success)
        {
            end = headings[i].Index;
            break;
        }
    }

    var start = headings[targetIndex].Index;
    var before = trimmedSource[..start].TrimEnd();
    var after = trimmedSource[end..].TrimStart();
    return string.Join("\n\n", new[] { before, replacement, after }.Where(part => !string.IsNullOrWhiteSpace(part))).Trim();
}
|
||
|
||
/// <summary>Builds the catalogue of available CV templates; a fresh array is created on every call.</summary>
/// <returns>The six template descriptors, starting with "ats-minimal".</returns>
private static IReadOnlyList<CvTemplateDescriptor> GetCvTemplateDescriptors()
{
    static CvTemplateDescriptor Template(string id, string title, string tone, string accentColor, string tagline, string summary, params string[] bullets) =>
        new(id, title, tone, accentColor, tagline, summary, new List<string>(bullets));

    var catalogue = new CvTemplateDescriptor[]
    {
        Template("ats-minimal", "ATS Minimal", "Scanner-friendly", "slate",
            "Compact, direct, and easy to parse.",
            "Best for broad application flows and recruiter scanning.",
            "Tight hierarchy", "Keyword-friendly", "Low visual risk"),
        Template("harvard", "Harvard", "Traditional", "brick",
            "Formal and restrained.",
            "Good for conservative hiring flows or academic-adjacent applications.",
            "Classic serif rhythm", "Strong chronology", "Credible tone"),
        Template("auckland", "Auckland", "Modern sidebar", "emerald",
            "Sharper highlights with a contemporary cadence.",
            "Pulls key strengths into a faster visual scan.",
            "Sidebar details", "Compact highlights", "Modern contrast"),
        Template("edinburgh", "Edinburgh", "Editorial", "plum",
            "More personality without losing clarity.",
            "Useful when the CV should feel polished and distinctive.",
            "Premium spacing", "Stronger personality", "Readable density"),
        Template("monarch", "Monarch", "Executive", "#7c2d12",
            "High-contrast leadership emphasis.",
            "Works well for senior, strategic, or client-facing roles.",
            "Executive summary weight", "Premium accenting", "Decision-maker friendly"),
        Template("fjord", "Fjord", "Technical", "#0f4c5c",
            "Calm, dense, technical layout.",
            "Optimized for engineering resumes with richer project and skills detail.",
            "Technical depth", "Dense but readable", "Practical hierarchy"),
    };

    return catalogue;
}
|
||
|
||
/// <summary>
/// Renders a CV document with the configured template renderer, choosing the candidate's display
/// name from the first non-blank of: first + last name, display name, user name, email, and
/// finally the placeholder "Your Name".
/// </summary>
/// <param name="document">Document to render; its template id selects the template.</param>
/// <param name="user">User whose name fields and avatar are used.</param>
/// <param name="targetRole">Role label passed to the renderer.</param>
/// <param name="companyName">Company name passed to the renderer; may be null.</param>
/// <returns>The renderer's result.</returns>
private TailoredCvRenderResult RenderProfileCv(TailoredCvDocument document, ApplicationUser user, string targetRole, string? companyName)
{
    var nameParts = new[] { user.FirstName?.Trim(), user.LastName?.Trim() };
    var fullName = string.Join(" ", nameParts.Where(part => !string.IsNullOrWhiteSpace(part)));

    // Ordered by preference; the first non-blank entry wins.
    var nameOptions = new[]
    {
        fullName,
        user.DisplayName?.Trim(),
        user.UserName?.Trim(),
        user.Email?.Trim(),
    };
    var candidateName = nameOptions.FirstOrDefault(option => !string.IsNullOrWhiteSpace(option)) ?? "Your Name";

    return _cvTemplateRenderer.Render(document, document.TemplateId, candidateName, targetRole, companyName, user.AvatarImageDataUrl);
}
|
||
|
||
/// <summary>
/// Converts a structured CV profile into a renderable document for the given template.
/// Certifications, projects and languages become custom sections (in that order), followed by
/// the profile's other sections.
/// </summary>
/// <param name="structuredCv">Profile to convert; it is normalized first.</param>
/// <param name="templateId">Template id stored on the document and used to pick the accent colour.</param>
/// <param name="targetRole">Second choice for the headline after the profile's own headline.</param>
/// <param name="fallbackHeadline">Third choice for the headline.</param>
/// <param name="companyName">Last choice for the headline.</param>
/// <returns>The normalized document.</returns>
private static TailoredCvDocument BuildMasterCvDocument(StructuredCvProfile structuredCv, string templateId, string? targetRole, string? fallbackHeadline, string? companyName)
{
    // Joins only the non-blank parts with the separator.
    static string JoinNonBlank(string separator, params string?[] parts) =>
        string.Join(separator, parts.Where(part => !string.IsNullOrWhiteSpace(part)));

    // Builds a titled custom section, dropping lines that ended up blank.
    static TailoredCvCustomSection ListSection(string title, IEnumerable<string> lines) => new()
    {
        Title = title,
        Items = lines.Where(line => !string.IsNullOrWhiteSpace(line)).ToList(),
    };

    static string AccentFor(string id) => id switch
    {
        "harvard" => "brick",
        "auckland" => "emerald",
        "edinburgh" => "plum",
        "monarch" => "#7c2d12",
        "fjord" => "#0f4c5c",
        _ => "slate",
    };

    var profile = StructuredCvProfileJson.Normalize(structuredCv);

    var customSections = new List<TailoredCvCustomSection>();
    if (profile.Certifications.Count > 0)
    {
        customSections.Add(ListSection(
            "Certifications",
            profile.Certifications.Select(item => JoinNonBlank(" | ", item.Name, item.Issuer, item.Location, item.Date))));
    }

    if (profile.Projects.Count > 0)
    {
        customSections.Add(ListSection(
            "Projects",
            profile.Projects.Select(item => JoinNonBlank(" | ", item.Name, item.Role, item.Location, FormatDateRangeForSection(item.Start, item.End, false)))));
    }

    if (profile.Languages.Count > 0)
    {
        customSections.Add(ListSection(
            "Languages",
            profile.Languages.Select(item => JoinNonBlank(": ", item.Name, item.Level))));
    }

    foreach (var other in profile.OtherSections)
    {
        customSections.Add(new TailoredCvCustomSection { Title = other.Title, Items = other.Items });
    }

    var experience = profile.Jobs
        .Select(job => new TailoredCvExperienceItem
        {
            Title = job.Title,
            Company = job.Company,
            Location = job.Location,
            Start = job.Start,
            End = job.End,
            IsCurrent = job.IsCurrent,
            Bullets = job.Bullets,
        })
        .ToList();

    var education = profile.Education
        .Select(entry => new TailoredCvEducationItem
        {
            Qualification = entry.Qualification,
            QualificationLevel = entry.QualificationLevel,
            Institution = entry.Institution,
            Location = entry.Location,
            Start = entry.Start,
            End = entry.End,
            Details = entry.Details,
        })
        .ToList();

    var draft = new TailoredCvDocument
    {
        TemplateId = templateId,
        // Headline precedence: profile headline, target role, fallback headline, company name.
        Headline = profile.Contact.Headline ?? targetRole ?? fallbackHeadline ?? companyName,
        Summary = profile.Summary,
        SelectedSkills = profile.Skills,
        Experience = experience,
        Education = education,
        CustomSections = customSections,
        RenderOptions = new TailoredCvRenderOptions
        {
            ShowPhoto = true,
            AccentColor = AccentFor(templateId),
            SectionOrder = new List<string> { "summary", "skills", "experience", "education", "custom" },
        }
    };

    return TailoredCvDraftJson.Normalize(draft);
}
|
||
|
||
/// <summary>
/// Builds a structured CV profile from CV text by layering several extraction strategies.
/// </summary>
/// <remarks>
/// Text that already looks like normalized markdown is parsed directly. Otherwise the text is
/// normalized and split into sections (falling back to <c>ClassifyBlocksAsync</c> when only a
/// "General" section is found), combined with the heuristic profile, and finally merged with the
/// AI extraction result. Job entries are then re-checked against a heuristic re-parse of the text.
/// </remarks>
/// <param name="text">CV text to structure.</param>
/// <param name="cancellationToken">Token forwarded to the asynchronous extraction steps.</param>
/// <returns>The normalized structured profile.</returns>
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    // Fast path: the text is already in the normalized markdown layout.
    if (LooksLikeNormalizedMarkdownCv(text))
    {
        var fromMarkdown = BuildStructuredCvFromNormalizedMarkdown(text);
        AnnotateStructuredCv(fromMarkdown, "normalized-markdown", 0.78);
        return StructuredCvProfileJson.Normalize(fromMarkdown);
    }

    var parseSource = NormalizeTextForStructuredParsing(text);

    var parsedSections = new List<StructuredCvSection>();
    foreach (var section in ParseSections(parseSource))
    {
        parsedSections.Add(new StructuredCvSection
        {
            Name = section.Name,
            Content = section.Content,
            WordCount = CountWords(section.Content),
        });
    }

    List<StructuredCvSection> fallbackSections = parsedSections;
    StructuredCvProfile? classifierFallback = null;

    // Only consult the block classifier when section parsing found nothing but "General".
    var onlyGeneralSections = parsedSections.All(section => string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase));
    if (onlyGeneralSections)
    {
        var classifiedBlocks = await ClassifyBlocksAsync(parseSource, cancellationToken);
        var classifierFoundStructure = classifiedBlocks.Any(block => !string.Equals(block.SectionName, "General", StringComparison.OrdinalIgnoreCase));
        if (classifierFoundStructure)
        {
            fallbackSections = BuildSectionsFromClassifiedBlocks(classifiedBlocks);
            classifierFallback = BuildStructuredCvFromClassifiedBlocks(classifiedBlocks);
        }
    }

    var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
    AnnotateStructuredCv(sectionFallback, "repair", 0.56);

    var heuristicFallback = BuildHeuristicStructuredCv(parseSource, text);
    AnnotateStructuredCv(heuristicFallback, "deterministic", 0.68);
    heuristicFallback.Sections = new List<StructuredCvSection>();

    var fallback = StructuredCvProfileJson.Merge(heuristicFallback, sectionFallback);
    if (classifierFallback is not null)
    {
        fallback = StructuredCvProfileJson.Merge(classifierFallback, fallback);
    }

    fallback.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email);

    var aiExtracted = await TryExtractStructuredCvAsync(parseSource, cancellationToken);
    var merged = StructuredCvProfileJson.Merge(aiExtracted, fallback);
    merged.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(merged.Contact.Email);

    if (!IsPlausibleLocationValue(merged.Contact.Location, merged.Contact.FullName))
    {
        merged.Contact.Location = PreferDetectedLocation(text, null, merged.Contact.FullName);
    }

    // Drop job entries whose title is really a person's name.
    merged.Jobs = merged.Jobs
        .Where(job => !LooksLikePersonName(job.Title ?? string.Empty))
        .ToList();

    var reparsedJobs = ParseJobsHeuristically(text)
        .Where(job => !LooksLikePersonName(job.Title ?? string.Empty))
        .ToList();

    var fullName = merged.Contact.FullName;
    var currentFirstTitle = merged.Jobs.FirstOrDefault()?.Title ?? string.Empty;
    var reparsedFirstTitle = reparsedJobs.FirstOrDefault()?.Title ?? string.Empty;

    // NOTE(review): merged.Jobs was just filtered with LooksLikePersonName, so the first
    // condition below looks like it can rarely (if ever) be true — confirm before removing.
    bool adoptReparsedJobs;
    if (LooksLikePersonName(currentFirstTitle)
        && LooksLikeRoleOrHeadline(reparsedFirstTitle)
        && ArePlausibleJobs(reparsedJobs, fullName))
    {
        adoptReparsedJobs = true;
    }
    else if (ArePlausibleJobs(merged.Jobs, fullName))
    {
        // Both candidates are usable: keep whichever scores higher, preferring the merged jobs on ties.
        adoptReparsedJobs = ScoreJobs(reparsedJobs, fullName) > ScoreJobs(merged.Jobs, fullName);
    }
    else
    {
        adoptReparsedJobs = ArePlausibleJobs(reparsedJobs, fullName);
    }

    if (adoptReparsedJobs)
    {
        merged.Jobs = reparsedJobs;
    }

    return StructuredCvProfileJson.Normalize(merged);
}
|
||
|
||
/// <summary>
/// Copies an uploaded CV file into the user's artifact folder and returns the artifact record
/// describing it (stored name, size, MIME type and SHA-256 of the stored bytes).
/// </summary>
/// <param name="user">Owner of the upload; <c>user.Id</c> names the per-user folder.</param>
/// <param name="file">The uploaded file.</param>
/// <param name="cancellationToken">Token used for the copy and hashing steps.</param>
/// <returns>A new, not yet persisted <c>CvUploadArtifact</c>.</returns>
/// <remarks>
/// If copying or hashing fails (including cancellation) the partially written file is removed on
/// a best-effort basis before the exception is rethrown, so failed uploads do not leave orphans.
/// </remarks>
private async Task<CvUploadArtifact> SaveUploadArtifactAsync(ApplicationUser user, IFormFile file, CancellationToken cancellationToken)
{
    // One timestamp for both the stored file name and the record so they always agree.
    var uploadedAtUtc = DateTimeOffset.UtcNow;

    var extension = Path.GetExtension(file.FileName ?? string.Empty);
    var userRoot = Path.Combine(_paths.CvArtifactsRoot, user.Id);
    Directory.CreateDirectory(userRoot);

    var storedFileName = $"{uploadedAtUtc:yyyyMMddHHmmss}-{Guid.NewGuid():N}{extension}";
    var storagePath = Path.Combine(userRoot, storedFileName);

    byte[] shaBytes;
    try
    {
        await using (var target = System.IO.File.Create(storagePath))
        await using (var source = file.OpenReadStream())
        {
            await source.CopyToAsync(target, cancellationToken);
        }

        // Hash what actually landed on disk rather than the request stream.
        await using var hashStream = System.IO.File.OpenRead(storagePath);
        shaBytes = await SHA256.HashDataAsync(hashStream, cancellationToken);
    }
    catch
    {
        // The streams above are already disposed here, so the file can be deleted.
        try
        {
            System.IO.File.Delete(storagePath);
        }
        catch (Exception cleanupEx) when (cleanupEx is IOException or UnauthorizedAccessException)
        {
            // Best effort only: never mask the original failure with a cleanup error.
            _logger.LogWarning(cleanupEx, "Could not remove partial CV upload {StoragePath}", storagePath);
        }

        throw;
    }

    return new CvUploadArtifact
    {
        OwnerUserId = user.Id,
        OriginalFileName = file.FileName ?? storedFileName,
        StoredFileName = storedFileName,
        MimeType = file.ContentType ?? "application/octet-stream",
        ByteSize = file.Length,
        Sha256 = Convert.ToHexString(shaBytes),
        StoragePath = storagePath,
        UploadedAtUtc = uploadedAtUtc,
    };
}
|
||
|
||
/// <summary>
/// Extracts text from an uploaded CV file and turns it into a structured profile.
/// </summary>
/// <remarks>
/// For the listed extensions the AI service is tried first; when it yields no text (or the
/// extension is not listed) the local <c>ExtractTextAsync</c> reader is used instead.
/// </remarks>
/// <param name="file">The uploaded (or re-opened) CV file.</param>
/// <param name="extension">File extension including the leading dot.</param>
/// <param name="cancellationToken">Token forwarded to the asynchronous steps that accept one.</param>
/// <returns>Raw text, normalized text and the structured profile.</returns>
/// <exception cref="InvalidOperationException">No text could be read from the file.</exception>
private async Task<ExtractionPipelineResult> ExtractStructuredCvFromFileAsync(IFormFile file, string extension, CancellationToken cancellationToken)
{
    string[] aiReadableExtensions = { ".pdf", ".docx", ".txt", ".md", ".png", ".jpg", ".jpeg", ".webp" };
    var canUseAiExtraction = Array.Exists(
        aiReadableExtensions,
        candidate => string.Equals(extension, candidate, StringComparison.OrdinalIgnoreCase));

    var text = string.Empty;
    if (canUseAiExtraction)
    {
        await using var uploadStream = file.OpenReadStream();
        var aiResult = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, cancellationToken);
        text = aiResult?.Text?.Trim() ?? string.Empty;
    }

    // Fall back to the local reader when the AI path produced nothing.
    if (string.IsNullOrWhiteSpace(text))
    {
        var locallyExtracted = await ExtractTextAsync(file, extension);
        text = locallyExtracted.Trim();
    }

    if (string.IsNullOrWhiteSpace(text))
    {
        throw new InvalidOperationException("The uploaded CV file could not be read or was empty.");
    }

    var reconstructed = await MaybeReconstructStructuredCvAsync(text, cancellationToken);
    var normalizedText = reconstructed.Trim();
    var structuredCv = await BuildStructuredCvAsync(normalizedText, cancellationToken);

    return new ExtractionPipelineResult(text, normalizedText, structuredCv);
}
|
||
|
||
/// <summary>
/// Records a new extraction run with status "applied" and makes its result the user's current CV profile.
/// </summary>
/// <remarks>
/// The run is saved first so that its generated id can be stamped into the profile metadata and
/// onto the user. If the user update fails, the run is marked "failed" and the error is rethrown.
/// </remarks>
/// <param name="user">User whose profile is updated.</param>
/// <param name="trigger">Trigger label stored on the run.</param>
/// <param name="rawText">Raw extracted text stored on the run.</param>
/// <param name="normalizedText">Normalized text stored on the run and the user.</param>
/// <param name="structuredCv">Profile to serialize; its metadata is updated in place.</param>
/// <param name="artifactId">Upload artifact the run belongs to, if any.</param>
/// <param name="cancellationToken">Token used for the database saves.</param>
/// <exception cref="InvalidOperationException">The user record could not be updated.</exception>
private async Task ApplyTextExtractionRunAsync(ApplicationUser user, string trigger, string rawText, string normalizedText, StructuredCvProfile structuredCv, int? artifactId, CancellationToken cancellationToken)
{
    var appliedRun = new CvExtractionRun
    {
        OwnerUserId = user.Id,
        ArtifactId = artifactId,
        Trigger = trigger,
        ParserVersion = ParserVersion,
        NormalizerVersion = NormalizerVersion,
        LlmPromptVersion = LlmPromptVersion,
        Status = "applied",
        RawExtractedText = rawText,
        NormalizedText = normalizedText,
        StartedAtUtc = DateTimeOffset.UtcNow,
        CompletedAtUtc = DateTimeOffset.UtcNow,
        AppliedAtUtc = DateTimeOffset.UtcNow,
    };

    // Save immediately: the run id is needed for the metadata below.
    _db.CvExtractionRuns.Add(appliedRun);
    await _db.SaveChangesAsync(cancellationToken);

    var nextProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
    structuredCv.Metadata.ProfileVersion = nextProfileVersion;
    structuredCv.Metadata.AppliedExtractionRunId = appliedRun.Id;
    structuredCv.Metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;

    var profileJson = StructuredCvProfileJson.Serialize(structuredCv);
    appliedRun.StructuredProfileJson = profileJson;

    user.ProfileCvText = normalizedText;
    user.ProfileCvStructureJson = profileJson;
    user.CurrentCvExtractionRunId = appliedRun.Id;
    user.CurrentCvProfileVersion = structuredCv.Metadata.ProfileVersion;
    if (artifactId is int currentArtifactId)
    {
        user.CurrentCvUploadArtifactId = currentArtifactId;
    }

    var updateResult = await _users.UpdateAsync(user);
    if (updateResult.Succeeded)
    {
        await _db.SaveChangesAsync(cancellationToken);
        return;
    }

    // The user could not be updated: flip the run to failed and surface the identity errors.
    appliedRun.Status = "failed";
    appliedRun.ErrorMessage = string.Join("; ", updateResult.Errors.Select(e => e.Description));
    await _db.SaveChangesAsync(cancellationToken);
    throw new InvalidOperationException(appliedRun.ErrorMessage);
}
|
||
|
||
/// <summary>
/// Creates and saves a new extraction run in the "queued" state.
/// </summary>
/// <param name="ownerUserId">Id of the user the run belongs to.</param>
/// <param name="artifactId">Upload artifact the run refers to, if any.</param>
/// <param name="trigger">Trigger label stored on the run.</param>
/// <param name="cancellationToken">Token used for the database save.</param>
/// <returns>The saved run.</returns>
private async Task<CvExtractionRun> CreateQueuedRunAsync(string ownerUserId, int? artifactId, string trigger, CancellationToken cancellationToken)
{
    var queuedRun = new CvExtractionRun
    {
        Status = "queued",
        Trigger = trigger,
        OwnerUserId = ownerUserId,
        ArtifactId = artifactId,
        ParserVersion = ParserVersion,
        NormalizerVersion = NormalizerVersion,
        LlmPromptVersion = LlmPromptVersion,
        StartedAtUtc = DateTimeOffset.UtcNow,
    };

    _db.CvExtractionRuns.Add(queuedRun);
    await _db.SaveChangesAsync(cancellationToken);

    return queuedRun;
}
|
||
|
||
/// <summary>
/// Executes a previously queued CV processing run ("rebuild", "improve" or "reprocess") and
/// records the outcome on the run.
/// </summary>
/// <remarks>
/// Returns silently when the run does not exist. Any failure inside the processing step is caught,
/// stored on the run as status "failed", reported to the user by email and logged; it is not rethrown.
/// The failure state is saved with <see cref="CancellationToken.None"/>: when the failure is itself a
/// cancellation, saving with the cancelled token would throw again and leave the run stuck in "running".
/// </remarks>
/// <param name="runId">Id of the run to process.</param>
/// <param name="cancellationToken">Token forwarded to the processing steps.</param>
public async Task ProcessQueuedRunAsync(int runId, CancellationToken cancellationToken)
{
    var run = await _db.CvExtractionRuns.FirstOrDefaultAsync(x => x.Id == runId, cancellationToken);
    if (run is null) return;

    var user = await _users.FindByIdAsync(run.OwnerUserId);
    if (user is null)
    {
        run.Status = "failed";
        run.ErrorMessage = "CV processing user was not found.";
        run.CompletedAtUtc = DateTimeOffset.UtcNow;
        await _db.SaveChangesAsync(cancellationToken);
        return;
    }

    run.Status = "running";
    run.ErrorMessage = null;
    await _db.SaveChangesAsync(cancellationToken);

    try
    {
        switch (run.Trigger)
        {
            case "rebuild":
            {
                if (string.IsNullOrWhiteSpace(user.ProfileCvText)) throw new InvalidOperationException("Add or import CV text before rebuilding it.");
                var rebuilt = await _aiService.SummarizeSectionAsync(
                    "Rewrite this CV into a stronger master CV with clear sections such as Professional Summary, Core Skills, Experience Highlights, and Selected Achievements. Preserve only factual claims, avoid inventing employers or metrics, and make the output clean and ready for tailoring to job applications. Return only the rebuilt CV text.",
                    user.ProfileCvText,
                    2200,
                    700);
                if (string.IsNullOrWhiteSpace(rebuilt)) throw new InvalidOperationException("The AI service could not rebuild your CV text right now.");

                // The rewritten text serves as both the raw and the normalized text of the run.
                var normalizedText = rebuilt.Trim();
                var structuredCv = await BuildStructuredCvAsync(normalizedText, cancellationToken);
                await ApplyQueuedRunResultAsync(run, user, normalizedText, normalizedText, structuredCv, run.ArtifactId, cancellationToken);
                break;
            }
            case "improve":
            {
                if (string.IsNullOrWhiteSpace(user.ProfileCvText)) throw new InvalidOperationException("Add or import CV text before improving it.");
                var improved = await _aiService.SummarizeSectionAsync(
                    "Rewrite this CV into a cleaner, better-structured master CV profile. Preserve factual claims, employers, skills, and measurable results. Improve clarity, tighten wording, use strong bullet-style phrasing, and keep it ready for further tailoring to specific roles. Return only the improved CV text.",
                    user.ProfileCvText,
                    1800,
                    500);
                if (string.IsNullOrWhiteSpace(improved)) throw new InvalidOperationException("The AI service could not improve your CV text right now.");

                var normalizedText = improved.Trim();
                var structuredCv = await BuildStructuredCvAsync(normalizedText, cancellationToken);
                await ApplyQueuedRunResultAsync(run, user, normalizedText, normalizedText, structuredCv, run.ArtifactId, cancellationToken);
                break;
            }
            case "reprocess":
            {
                // The artifact must belong to the same user as the run.
                var artifact = await _db.CvUploadArtifacts.FirstOrDefaultAsync(x => x.Id == run.ArtifactId && x.OwnerUserId == user.Id, cancellationToken);
                if (artifact is null) throw new InvalidOperationException("Upload a CV before reprocessing it.");
                if (string.IsNullOrWhiteSpace(artifact.StoragePath) || !System.IO.File.Exists(artifact.StoragePath))
                {
                    throw new InvalidOperationException("The stored CV artifact could not be found for reprocessing.");
                }

                // Wrap the stored file in a FormFile so the regular upload pipeline can be reused.
                await using var stream = System.IO.File.OpenRead(artifact.StoragePath);
                var file = new FormFile(stream, 0, stream.Length, "file", artifact.OriginalFileName)
                {
                    Headers = new HeaderDictionary(),
                    ContentType = artifact.MimeType
                };
                var extension = Path.GetExtension(artifact.OriginalFileName ?? string.Empty);
                var result = await ExtractStructuredCvFromFileAsync(file, extension, cancellationToken);
                await ApplyQueuedRunResultAsync(run, user, result.RawText, result.NormalizedText, result.StructuredCv, artifact.Id, cancellationToken);
                break;
            }
            default:
                throw new InvalidOperationException($"Unsupported CV processing trigger '{run.Trigger}'.");
        }

        await SendRunCompletionEmailAsync(user, run, true, cancellationToken);
    }
    catch (Exception ex)
    {
        run.Status = "failed";
        run.ErrorMessage = ex.Message;
        run.CompletedAtUtc = DateTimeOffset.UtcNow;

        // Deliberately not the caller's token: it may already be cancelled (and be the reason we
        // are here), in which case the save would throw and the run would never leave "running".
        await _db.SaveChangesAsync(CancellationToken.None);

        await SendRunCompletionEmailAsync(user, run, false, cancellationToken);
        _logger.LogWarning(ex, "CV processing run {RunId} failed for user {UserId}", run.Id, user.Id);
    }
}
|
||
|
||
/// <summary>
/// Stores the outcome of a queued run on the run itself and makes it the user's current CV profile.
/// </summary>
/// <remarks>
/// If the user update fails, the run is marked "failed", saved, and the error is rethrown.
/// </remarks>
/// <param name="run">The run being completed; updated in place.</param>
/// <param name="user">User whose profile is updated.</param>
/// <param name="rawText">Raw text stored on the run.</param>
/// <param name="normalizedText">Normalized text stored on the run and the user.</param>
/// <param name="structuredCv">Profile to serialize; its metadata is updated in place.</param>
/// <param name="artifactId">Upload artifact to mark as current, if any.</param>
/// <param name="cancellationToken">Token used for the database saves.</param>
/// <exception cref="InvalidOperationException">The user record could not be updated.</exception>
private async Task ApplyQueuedRunResultAsync(CvExtractionRun run, ApplicationUser user, string rawText, string normalizedText, StructuredCvProfile structuredCv, int? artifactId, CancellationToken cancellationToken)
{
    var metadata = structuredCv.Metadata;
    metadata.ProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
    metadata.AppliedExtractionRunId = run.Id;
    metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;

    var profileJson = StructuredCvProfileJson.Serialize(structuredCv);

    run.RawExtractedText = rawText;
    run.NormalizedText = normalizedText;
    run.StructuredProfileJson = profileJson;
    run.Status = "applied";
    run.CompletedAtUtc = DateTimeOffset.UtcNow;
    // Applied and completed share the same instant.
    run.AppliedAtUtc = run.CompletedAtUtc;

    user.ProfileCvText = normalizedText;
    user.ProfileCvStructureJson = profileJson;
    user.CurrentCvExtractionRunId = run.Id;
    user.CurrentCvProfileVersion = structuredCv.Metadata.ProfileVersion;
    if (artifactId is int currentArtifactId)
    {
        user.CurrentCvUploadArtifactId = currentArtifactId;
    }

    var updateResult = await _users.UpdateAsync(user);
    if (updateResult.Succeeded)
    {
        await _db.SaveChangesAsync(cancellationToken);
        return;
    }

    run.Status = "failed";
    run.ErrorMessage = string.Join("; ", updateResult.Errors.Select(e => e.Description));
    await _db.SaveChangesAsync(cancellationToken);
    throw new InvalidOperationException(run.ErrorMessage);
}
|
||
|
||
/// <summary>
/// Emails the user the outcome of a CV processing run. Does nothing when the user has no email
/// address; send failures are logged as warnings and never propagate.
/// </summary>
/// <param name="user">Recipient.</param>
/// <param name="run">The run whose id, trigger, status and timestamps are reported.</param>
/// <param name="success">Selects the success or the failure wording.</param>
/// <param name="cancellationToken">Token forwarded to the email sender.</param>
private async Task SendRunCompletionEmailAsync(ApplicationUser user, CvExtractionRun run, bool success, CancellationToken cancellationToken)
{
    if (string.IsNullOrWhiteSpace(user.Email))
    {
        return;
    }

    string subject;
    string body;
    if (success)
    {
        subject = $"Your CV {run.Trigger} is complete";
        body = $"Your CV {run.Trigger} request finished successfully.\n\nRun ID: {run.Id}\nStatus: {run.Status}\nCompleted: {run.CompletedAtUtc:O}\n";
    }
    else
    {
        subject = $"Your CV {run.Trigger} failed";
        body = $"Your CV {run.Trigger} request failed.\n\nRun ID: {run.Id}\nStatus: {run.Status}\nError: {run.ErrorMessage}\nCompleted: {run.CompletedAtUtc:O}\n";
    }

    try
    {
        await _emailSender.SendAsync(user.Email, subject, body, cancellationToken);
    }
    catch (Exception ex)
    {
        // Notification is best effort; the run outcome is already recorded.
        _logger.LogWarning(ex, "CV processing completion email failed for run {RunId} user {UserId}", run.Id, user.Id);
    }
}
|
||
|
||
/// <summary>
/// Writes field-level metadata for every populated field of <paramref name="profile"/>.
/// </summary>
/// <remarks>
/// Each populated field gets an entry keyed by its path (for example "contact.email") with the
/// given method and confidence, a source snippet of at most 180 characters, review state
/// "suggested" and a shared timestamp. List fields are represented by their first entry.
/// Existing entries with the same key are overwritten; blank fields are left untouched.
/// </remarks>
/// <param name="profile">Profile to annotate in place.</param>
/// <param name="method">Extraction method label stored on each entry.</param>
/// <param name="confidence">Confidence value stored on each entry.</param>
private static void AnnotateStructuredCv(StructuredCvProfile profile, string method, double confidence)
{
    const int maxSnippetLength = 180;

    var stampedAt = DateTimeOffset.UtcNow;
    profile.Metadata ??= new StructuredCvMetadata();
    profile.Metadata.Fields ??= new Dictionary<string, StructuredCvFieldMetadata>();

    var contact = profile.Contact;
    var candidates = new (string Key, string? Value)[]
    {
        ("contact.fullName", contact.FullName),
        ("contact.headline", contact.Headline),
        ("contact.email", contact.Email),
        ("contact.phone", contact.Phone),
        ("contact.location", contact.Location),
        ("contact.website", contact.Website),
        ("contact.linkedIn", contact.LinkedIn),
        ("summary", profile.Summary.FirstOrDefault()),
        ("skills", profile.Skills.FirstOrDefault()),
        ("languages", profile.Languages.FirstOrDefault()?.Name),
        ("interests", profile.Interests.FirstOrDefault()),
        ("jobs", profile.Jobs.FirstOrDefault()?.Title ?? profile.Jobs.FirstOrDefault()?.Company),
        ("education", profile.Education.FirstOrDefault()?.Qualification ?? profile.Education.FirstOrDefault()?.Institution),
    };

    foreach (var (key, value) in candidates)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            continue;
        }

        profile.Metadata.Fields[key] = new StructuredCvFieldMetadata
        {
            Confidence = confidence,
            Method = method,
            SourceSnippet = value.Length <= maxSnippetLength ? value : value.Substring(0, maxSnippetLength),
            ReviewState = "suggested",
            LastUpdatedAtUtc = stampedAt,
        };
    }
}
|
||
|
||
/// <summary>
/// Asks the AI service to extract the CV into the structured JSON shape and parses the reply.
/// </summary>
/// <param name="text">CV text sent to the AI service.</param>
/// <param name="cancellationToken">Currently not forwarded to the AI call.</param>
/// <returns>
/// The parsed profile annotated with method "llm", or <c>null</c> when the reply is empty,
/// contains no JSON object, or parses to a profile without meaningful content.
/// </returns>
private async Task<StructuredCvProfile?> TryExtractStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    const string extractionPrompt = "Extract this CV into structured JSON. Return only valid JSON with this exact top-level shape: { \"version\": \"1\", \"contact\": { \"fullName\": string|null, \"headline\": string|null, \"email\": string|null, \"phone\": string|null, \"location\": string|null, \"website\": string|null, \"linkedin\": string|null }, \"summary\": string[], \"jobs\": [{ \"title\": string|null, \"company\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"isCurrent\": boolean, \"bullets\": string[], \"skills\": string[] }], \"education\": [{ \"qualification\": string|null, \"qualificationLevel\": \"Secondary\"|\"Diploma/Certificate\"|\"Bachelor\"|\"Master\"|\"PhD\"|\"Other\"|null, \"institution\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"details\": string[] }], \"certifications\": [{ \"name\": string|null, \"issuer\": string|null, \"location\": string|null, \"date\": string|null, \"details\": string[] }], \"projects\": [{ \"name\": string|null, \"role\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"bullets\": string[], \"skills\": string[] }], \"skills\": string[], \"languages\": [{ \"name\": string|null, \"level\": string|null, \"notes\": string|null }], \"interests\": string[], \"otherSections\": [{ \"title\": string|null, \"items\": string[] }] }. Preserve facts only. Do not invent anything. If a field is unknown, use null or an empty array. Keep wording close to the source. Profile location should only be the candidate's current/home location. Education location must be the institution location. Work location must be employer/job location. Never place skill lists such as Python or Ruby into location fields. Preserve the original qualification text in education. Set qualificationLevel to the normalized enum when you can infer it, otherwise null. Put unmatched content in otherSections.";

    var reply = await _aiService.SummarizeSectionAsync(extractionPrompt, text, 3200, 900);
    if (string.IsNullOrWhiteSpace(reply))
    {
        return null;
    }

    // The model may wrap the JSON in a code fence or surrounding prose.
    var json = ExtractJsonObject(reply);
    if (string.IsNullOrWhiteSpace(json))
    {
        return null;
    }

    var profile = StructuredCvProfileJson.Deserialize(json);
    if (!IsMeaningfullyStructured(profile))
    {
        return null;
    }

    AnnotateStructuredCv(profile, "llm", 0.82);
    return profile;
}
|
||
|
||
/// <summary>
/// Tells whether a parsed profile carries any usable content: a full name or at least one entry
/// in any of its list sections.
/// </summary>
/// <remarks>
/// Certifications and projects count as content too; both are part of the structured shape, so a
/// profile that only has those must not be discarded as empty.
/// </remarks>
/// <param name="profile">Profile to inspect.</param>
/// <returns><c>true</c> when at least one field or section is populated.</returns>
private static bool IsMeaningfullyStructured(StructuredCvProfile profile)
{
    return !string.IsNullOrWhiteSpace(profile.Contact.FullName)
        || profile.Summary.Count > 0
        || profile.Jobs.Count > 0
        || profile.Education.Count > 0
        || profile.Certifications.Count > 0
        || profile.Projects.Count > 0
        || profile.Skills.Count > 0
        || profile.Languages.Count > 0
        || profile.Interests.Count > 0
        || profile.OtherSections.Count > 0;
}
|
||
|
||
/// <summary>
/// Pulls the outermost <c>{ ... }</c> span out of a raw reply, tolerating a surrounding
/// markdown code fence and leading or trailing prose.
/// </summary>
/// <param name="raw">Raw reply text.</param>
/// <returns>
/// The text from the first '{' to the last '}' inclusive, or <c>null</c> when there is no such span.
/// </returns>
private static string? ExtractJsonObject(string raw)
{
    var candidate = raw.Trim();
    if (candidate.StartsWith("```", StringComparison.Ordinal))
    {
        // Strip an opening ``` / ```json fence and a closing ``` fence.
        candidate = Regex.Replace(candidate, "^```(?:json)?\\s*|\\s*```$", string.Empty, RegexOptions.IgnoreCase);
    }

    var open = candidate.IndexOf('{');
    var close = candidate.LastIndexOf('}');

    return open >= 0 && close > open
        ? candidate.Substring(open, close - open + 1)
        : null;
}
|
||
|
||
/// <summary>
/// Guesses the candidate's full name from the first six non-empty lines of the CV text.
/// </summary>
/// <remarks>
/// Lines containing '@' or digits, or whose length is outside 4–80 characters, are skipped.
/// A line made of a name followed by a known role title (for example "Jane Doe Web Developer")
/// yields only the name part.
/// </remarks>
/// <param name="source">CV text.</param>
/// <returns>The guessed name, or <c>null</c> when no line looks like a name.</returns>
private static string? GuessFullName(string source)
{
    var normalized = source.Replace("\r\n", "\n");
    foreach (var line in normalized.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).Take(6))
    {
        var cleaned = line.Trim().TrimStart('#').Trim();

        // Re-insert the space lost when extraction glued words together ("JaneDoe" -> "Jane Doe").
        cleaned = Regex.Replace(cleaned, @"(?<=[a-z])(?=[A-Z])", " ");
        if (cleaned.Length < 4 || cleaned.Length > 80) continue;
        if (cleaned.Contains('@') || Regex.IsMatch(cleaned, @"\d")) continue;

        // The name part is matched lazily ({1,3}?) so that the optional role suffix gets a chance
        // to match. With a greedy quantifier the name swallowed the role words whenever name and
        // role together were at most four words, and the role was never stripped.
        var nameMatch = Regex.Match(cleaned, @"^(?<name>[A-Z][A-Za-z'`.-]+(?:\s+[A-Z][A-Za-z'`.-]+){1,3}?)(?:\s+(?:Real Estate Agent|Store Manager|Web Developer|Developer|Engineer|Consultant|Specialist|Analyst).*)?$", RegexOptions.IgnoreCase);
        if (nameMatch.Success)
        {
            return nameMatch.Groups["name"].Value.Trim();
        }

        // Case-sensitive fallback that also accepts five-word names.
        if (!Regex.IsMatch(cleaned, @"^[A-Z][A-Za-z'`.-]+(?:\s+[A-Z][A-Za-z'`.-]+){1,4}$")) continue;
        return cleaned;
    }

    return null;
}
|
||
|
||
/// <summary>
/// Derives a display name from the local part of an email address by splitting it on
/// '.', '_' and '-' and capitalizing each piece ("john.smith@x.org" becomes "John Smith").
/// </summary>
/// <param name="email">Email address, may be null or blank.</param>
/// <returns>
/// The derived name, or <c>null</c> when the address is missing, has no '@', or its local part
/// yields fewer than two pieces.
/// </returns>
private static string? GuessFullNameFromEmail(string? email)
{
    if (string.IsNullOrWhiteSpace(email))
    {
        return null;
    }

    var atIndex = email.IndexOf('@');
    if (atIndex < 0)
    {
        return null;
    }

    var localPart = email.Substring(0, atIndex).Trim();
    if (localPart.Length == 0)
    {
        return null;
    }

    var words = new List<string>();
    foreach (var token in Regex.Split(localPart, @"[._-]+"))
    {
        var piece = token.Trim();
        if (piece.Length == 0)
        {
            continue;
        }

        words.Add(char.ToUpperInvariant(piece[0]) + piece[1..].ToLowerInvariant());
    }

    // A single piece ("jsmith") is not enough to call it a full name.
    return words.Count >= 2 ? string.Join(" ", words) : null;
}
|
||
|
||
/// <summary>
/// Prepares CV text for section parsing. Text that looks like a flattened extraction gets
/// markdown-style "## Heading" lines, bullet lines and date-range lines re-inserted; any other
/// text is only newline-normalized and trimmed.
/// </summary>
/// <remarks>
/// Section aliases are replaced in a single pass. Replacing them one alias at a time re-matched
/// words inside headings that had just been inserted (for example "experience" inside
/// "## Work Experience"), which produced stray headings such as "## Work".
/// </remarks>
/// <param name="source">CV text, may be null or blank.</param>
/// <returns>The prepared text, or an empty string for blank input.</returns>
private static string NormalizeTextForStructuredParsing(string source)
{
    if (string.IsNullOrWhiteSpace(source)) return string.Empty;

    var text = source.Replace("\r\n", "\n").Trim();
    if (!LooksLikeFlattenedCvExtraction(text)) return text;

    // Step 1: letter-spaced headings ("S K I L L S") become canonical heading lines.
    text = Regex.Replace(text, @"\b([A-Z](?:\s+[A-Z]){2,})\b", match =>
    {
        var collapsed = Regex.Replace(match.Value, @"\s+", string.Empty);
        foreach (var alias in SectionAliases)
        {
            var aliasLettersOnly = Regex.Replace(alias.Key, @"[^A-Za-z]", string.Empty);
            if (collapsed.Equals(aliasLettersOnly, StringComparison.OrdinalIgnoreCase))
            {
                return $"\n\n## {alias.Value}\n";
            }
        }

        return match.Value;
    });

    // Step 2: plain alias words become heading lines. All aliases go into one alternation,
    // longest first, so "work experience" wins over "experience" at the same position and the
    // inserted heading text is never scanned again.
    var canonicalHeadings = new HashSet<string>(SectionAliases.Values, StringComparer.OrdinalIgnoreCase);
    var aliasAlternation = string.Join("|", SectionAliases.Keys.OrderByDescending(key => key.Length).Select(Regex.Escape));
    var beforeHeadings = text;
    text = Regex.Replace(
        beforeHeadings,
        $@"(?<!#)\b(?:{aliasAlternation})\b",
        match =>
        {
            // Leave lines alone that already are a section heading (inserted by step 1 or
            // present in the input).
            var lineStart = beforeHeadings.LastIndexOf('\n', match.Index) + 1;
            var lineEnd = beforeHeadings.IndexOf('\n', match.Index);
            if (lineEnd < 0) lineEnd = beforeHeadings.Length;
            var line = beforeHeadings[lineStart..lineEnd].Trim();
            if (line.StartsWith('#'))
            {
                var title = line.TrimStart('#').Trim();
                if (canonicalHeadings.Contains(title) || SectionAliases.ContainsKey(title))
                {
                    return match.Value;
                }
            }

            // TryGetValue guards against case-insensitive regex matches that are not equal to
            // any key under ordinal comparison.
            return SectionAliases.TryGetValue(match.Value, out var heading)
                ? $"\n\n## {heading}\n"
                : match.Value;
        },
        RegexOptions.IgnoreCase);

    // Step 3: restore line breaks around "+" items, bullet glyphs and year ranges.
    text = Regex.Replace(text, @"\s+\+\s+", "\n+ ");
    text = Regex.Replace(text, @"\s*([•●▪◦])\s*", "\n- ");
    text = Regex.Replace(text, @"\s+(\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))\b", "\n$1\n", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"\n{3,}", "\n\n");

    return text.Trim();
}
|
||
|
||
/// <summary>
/// Builds a structured profile using only regular expressions and section parsing (no AI calls).
/// </summary>
/// <remarks>
/// Contact details and skills are detected in <paramref name="rawSource"/>; section-based fields
/// (summary, interests, languages, education, certifications, projects, jobs) are read from the
/// sections parsed out of <paramref name="parseSource"/>, with letter-spaced heading patterns in
/// the raw text as an alternative source for summary and interests.
/// </remarks>
/// <param name="parseSource">Text prepared for section parsing.</param>
/// <param name="rawSource">Original text used for regex-based detection.</param>
/// <returns>The normalized heuristic profile.</returns>
private static StructuredCvProfile BuildHeuristicStructuredCv(string parseSource, string rawSource)
{
    var profile = new StructuredCvProfile();
    var normalized = parseSource.Replace("\r\n", "\n").Trim();
    var contact = profile.Contact;

    // --- Contact details detected directly in the raw text ---
    contact.Email = NullIfWhitespace(Regex.Match(rawSource, @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", RegexOptions.IgnoreCase).Value);
    contact.Phone = NormalizeDetectedPhone(Regex.Match(rawSource, @"(?<!\w)(?:\+?\d[\d\s().-]{6,}\d)", RegexOptions.IgnoreCase).Value);
    contact.Website = ExtractPreferredWebsite(rawSource, contact.Email);
    contact.LinkedIn = NullIfWhitespace(Regex.Match(rawSource, @"(?:linkedin(?:\.com)?/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]+)", RegexOptions.IgnoreCase).Value);
    contact.FullName = GuessFullName(rawSource) ?? GuessFullNameFromEmail(contact.Email);

    var sections = ParseSections(normalized);

    // Content of the first section with the given name (null/blank when there is none).
    string? ContentOf(string sectionName) => sections.FirstOrDefault(section => section.Name == sectionName).Content;

    // --- Location and headline ---
    var contactContent = ContentOf("Contact");
    if (string.IsNullOrWhiteSpace(contactContent))
    {
        var locationCandidate = NullIfWhitespace(Regex.Match(rawSource, @"\b[A-Z][a-z]+(?:[\s-][A-Z][a-z]+)*(?:,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*){1,2}\b").Value);
        contact.Location = PreferDetectedLocation(rawSource, locationCandidate, contact.FullName);
    }
    else
    {
        var contactFallback = StructuredCvProfileJson.FromSections(new[] { new StructuredCvSection { Name = "Contact", Content = contactContent } });
        contact.Location = PreferDetectedLocation(contactContent, contactFallback.Contact.Location, contact.FullName);
        contact.Headline ??= CleanHeadline(contactFallback.Contact.Headline, contact.FullName);
    }

    if (string.IsNullOrWhiteSpace(contact.Location))
    {
        // Last resort: a "City" or "City, Country" shaped line among the first ten lines.
        bool LooksLikeLocationLine(string line) =>
            !line.Contains('@')
            && !Regex.IsMatch(line, @"https?://|www\.", RegexOptions.IgnoreCase)
            && Regex.IsMatch(line, @"^[A-Z][A-Za-z.' -]+(?:,\s*[A-Z][A-Za-z.' -]+)?$")
            && !line.Contains("Skills", StringComparison.OrdinalIgnoreCase)
            && !line.Contains("Summary", StringComparison.OrdinalIgnoreCase)
            && !line.Contains("Developer", StringComparison.OrdinalIgnoreCase)
            && !line.Contains("Agent", StringComparison.OrdinalIgnoreCase)
            && !string.Equals(line, contact.FullName, StringComparison.OrdinalIgnoreCase);

        contact.Location = normalized
            .Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            .Take(10)
            .ToList()
            .FirstOrDefault(LooksLikeLocationLine);
    }

    if (!string.IsNullOrWhiteSpace(contact.Location))
    {
        // Cut off a trailing "Skills ..." run that bled into the location.
        contact.Location = Regex.Replace(contact.Location, @"\bSkills\b.*$", string.Empty, RegexOptions.IgnoreCase).Trim(' ', ',');
    }

    // --- Summary: the letter-spaced heading in the raw text wins over the parsed section ---
    var summaryContent = ContentOf("Professional Summary");
    var flattenedSummary = Regex.Match(
        rawSource,
        @"(?:A\s+B\s+O\s+U\s+T\s+M\s+E|P\s+R\s+O\s+F\s+I\s+L\s+E|S\s+U\s+M\s+M\s+A\s+R\s+Y)\s*(?<body>.*?)(?=(?:I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S|E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|C\s+O\s+N\s+T\s+A\s+C\s+T|$))",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    string? summaryBody = null;
    if (flattenedSummary.Success)
    {
        summaryBody = flattenedSummary.Groups["body"].Value;
    }
    else if (!string.IsNullOrWhiteSpace(summaryContent))
    {
        summaryBody = summaryContent;
    }

    if (summaryBody is not null)
    {
        profile.Summary = SplitSentences(summaryBody, 5)
            .Where(item => !Regex.IsMatch(item, @"^:?\s*https?://", RegexOptions.IgnoreCase))
            .ToList();
    }

    // --- Interests: parsed section first, letter-spaced heading as the alternative ---
    var interestsContent = ContentOf("Interests");
    if (string.IsNullOrWhiteSpace(interestsContent))
    {
        var flattenedInterests = Regex.Match(
            rawSource,
            @"I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S\s*(?<body>.*?)(?=(?:E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|C\s+O\s+N\s+T\s+A\s+C\s+T|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|$))",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
        if (flattenedInterests.Success)
        {
            profile.Interests = SplitSentences(flattenedInterests.Groups["body"].Value, 4);
        }
    }
    else
    {
        profile.Interests = SplitListLike(interestsContent);
    }

    // --- Languages: the whole raw text is scanned when there is no Languages section ---
    var languagesContent = ContentOf("Languages");
    profile.Languages = ParseLanguagesHeuristically(string.IsNullOrWhiteSpace(languagesContent) ? rawSource : languagesContent);

    // --- Skills (case-insensitively de-duplicated) ---
    profile.Skills = new HashSet<string>(ExtractSkillsHeuristically(rawSource), StringComparer.OrdinalIgnoreCase).ToList();

    // --- Education, certifications, projects ---
    var educationContent = ContentOf("Education");
    if (!string.IsNullOrWhiteSpace(educationContent))
    {
        profile.Education = ParseEducationHeuristically(educationContent);
    }

    var certificationsContent = ContentOf("Certifications");
    if (!string.IsNullOrWhiteSpace(certificationsContent))
    {
        profile.Certifications = StructuredCvProfileJson.FromSections(new[] { new StructuredCvSection { Name = "Certifications", Content = certificationsContent } }).Certifications;
    }

    var projectsContent = ContentOf("Projects");
    if (!string.IsNullOrWhiteSpace(projectsContent))
    {
        profile.Projects = StructuredCvProfileJson.FromSections(new[] { new StructuredCvSection { Name = "Projects", Content = projectsContent } }).Projects;
    }

    // --- Jobs: the Work Experience section, otherwise the whole prepared text ---
    var experienceContent = ContentOf("Work Experience");
    if (!string.IsNullOrWhiteSpace(experienceContent))
    {
        profile.Jobs = ParseJobsHeuristically(experienceContent);
    }
    else if (profile.Jobs.Count == 0)
    {
        profile.Jobs = ParseJobsHeuristically(normalized);
    }

    // --- A "General" section can stand in for a missing summary ---
    var generalContent = ContentOf("General");
    if (profile.OtherSections.Count == 0
        && profile.Summary.Count == 0
        && !string.IsNullOrWhiteSpace(generalContent))
    {
        profile.Summary = SplitSentences(generalContent, 3);
    }

    return StructuredCvProfileJson.Normalize(profile);
}
|
||
|
||
/// <summary>
/// Splits text into sentences at whitespace following '.', '!' or '?', keeping only sentences
/// longer than 20 characters and at most <paramref name="limit"/> of them.
/// </summary>
/// <param name="content">Text to split; CRLF line breaks are treated as spaces.</param>
/// <param name="limit">Maximum number of sentences to return.</param>
/// <returns>The trimmed sentences in their original order.</returns>
private static List<string> SplitSentences(string content, int limit)
{
    var sentences = new List<string>();
    var fragments = Regex.Split(content.Replace("\r\n", " "), @"(?<=[.!?])\s+");

    foreach (var fragment in fragments)
    {
        if (sentences.Count >= limit)
        {
            break;
        }

        var sentence = fragment.Trim();
        if (sentence.Length > 20)
        {
            sentences.Add(sentence);
        }
    }

    return sentences;
}
|
||
|
||
/// <summary>
/// Known skill phrases that are searched for in CV text. Matches are reported using the
/// spelling given here and in this order.
/// </summary>
private static readonly string[] ConservativeSkillHints =
{
    // Technical
    "C#",
    ".NET",
    "ASP.NET",
    "SQL",
    "JavaScript",
    "TypeScript",
    "Python",
    "Ruby on Rails",
    "Ruby",
    "React",
    "Azure",
    "Azure DevOps",
    "GitHub",
    "CI/CD",
    "HTML5",
    "CSS",
    "MySQL",
    "PHP OOP",

    // Business and soft skills
    "Project management",
    "Revenue generation",
    "Business development",
    "Effective marketing",
    "Organisational capacity",
    "Operability and commitment",
    "Attention to Detail",
    "Property Valuation",
    "Retail Market Analysis",
    "Client Relationship Management",
    "Digital Marketing"
};
|
||
|
||
/// <summary>
/// Splits list-like text (lines, commas, semicolons, bullet glyphs, runs of two or more
/// whitespace characters) into distinct items.
/// </summary>
/// <param name="content">Text to split.</param>
/// <returns>
/// Items longer than one character with leading '-', '•', '*' and spaces removed; duplicates are
/// dropped case-insensitively and the first occurrence is kept.
/// </returns>
private static List<string> SplitListLike(string content)
{
    var separators = new[] { '\n', ',', ';', '•', '●' };
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var items = new List<string>();

    var chunks = content
        .Replace("\r\n", "\n")
        .Split(separators, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    foreach (var chunk in chunks)
    {
        // Column-style layouts separate items with wide gaps instead of punctuation.
        var pieces = chunk.Contains(" ", StringComparison.Ordinal)
            ? Regex.Split(chunk, @"\s{2,}")
            : new[] { chunk };

        foreach (var piece in pieces)
        {
            var item = piece.Trim().TrimStart('-', '•', '*', ' ');
            if (item.Length > 1 && seen.Add(item))
            {
                items.Add(item);
            }
        }
    }

    return items;
}
|
||
|
||
/// <summary>
/// Lazily yields every entry of <c>ConservativeSkillHints</c> that occurs in <paramref name="content"/>
/// as a standalone token, i.e. not directly preceded or followed by an ASCII letter or digit.
/// Matching is case-insensitive and results follow the order of the hint list.
/// </summary>
/// <param name="content">Text to scan.</param>
private static IEnumerable<string> ExtractConservativeSkills(string content)
{
    bool IsMentioned(string skill)
    {
        var pattern = $@"(?<![A-Za-z0-9]){Regex.Escape(skill)}(?![A-Za-z0-9])";
        return Regex.IsMatch(content, pattern, RegexOptions.IgnoreCase);
    }

    return ConservativeSkillHints.Where(IsMentioned);
}
|
||
|
||
/// <summary>
/// Joins the bullets with newlines and returns the known skills found in the combined text,
/// de-duplicated case-insensitively.
/// </summary>
/// <param name="bullets">Bullet lines of a single job entry.</param>
private static List<string> ExtractSkillsFromBullets(IEnumerable<string> bullets)
{
    var combined = string.Join("\n", bullets);
    var unique = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var skills = new List<string>();

    foreach (var skill in ExtractConservativeSkills(combined))
    {
        if (unique.Add(skill))
        {
            skills.Add(skill);
        }
    }

    return skills;
}
|
||
|
||
/// <summary>
/// Lazily yields skills found in <paramref name="content"/>: first the known skill hints, then the
/// entries of the first skills-like heading block (text between a heading such as "Skills" or
/// "Competencies" and the next recognised section keyword or the end of the text).
/// Each skill is yielded once, compared case-insensitively.
/// </summary>
/// <param name="content">CV text to scan.</param>
private static IEnumerable<string> ExtractSkillsHeuristically(string content)
{
    var alreadyReturned = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    foreach (var known in ExtractConservativeSkills(content))
    {
        if (!alreadyReturned.Add(known))
        {
            continue;
        }

        yield return known;
    }

    var skillsBlock = Regex.Match(
        content,
        @"(?:Highlights|Core Skills|Skills|Technical Skills|Skill Highlights|Competencies)\s*(?<body>.*?)(?=(?:Experience|Education|Languages|Interests|Projects|Certifications|$))",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    if (!skillsBlock.Success)
    {
        yield break;
    }

    foreach (var entry in SplitListLike(skillsBlock.Groups["body"].Value))
    {
        var candidate = entry.Trim();

        // Keep only entries that look like a short phrase: 3-80 characters with at least three letters.
        var acceptableLength = candidate.Length >= 3 && candidate.Length <= 80;
        if (!acceptableLength || candidate.Count(char.IsLetter) < 3)
        {
            continue;
        }

        if (alreadyReturned.Add(candidate))
        {
            yield return candidate;
        }
    }
}
|
||
|
||
/// <summary>
/// Validates a detected phone string.
/// </summary>
/// <param name="value">Candidate phone text.</param>
/// <returns>
/// The trimmed value, or <c>null</c> when it is blank, has fewer than seven digits, or looks like
/// raw numeric residue (more than 18 digits, contains " -", and has neither '+' nor '(').
/// </returns>
private static string? NormalizeDetectedPhone(string? value)
{
    if (NullIfWhitespace(value) is not { } phone)
    {
        return null;
    }

    var digitCount = phone.Count(char.IsDigit);
    if (digitCount < 7)
    {
        return null;
    }

    var hasPhonePunctuation = phone.Contains('+') || phone.Contains('(');
    var resemblesCoordinates = phone.Contains(" -") && digitCount > 18 && !hasPhonePunctuation;

    return resemblesCoordinates ? null : phone;
}
|
||
|
||
/// <summary>
/// Validates a detected website string.
/// </summary>
/// <param name="value">Candidate website text.</param>
/// <param name="email">Detected e-mail address; not consulted by this method.</param>
/// <returns>
/// The trimmed value when it starts with "http", otherwise the parsed host name; <c>null</c> when the
/// value is blank, has no dot, contains '@', equals "gmail.com", or cannot be parsed as an absolute URI
/// with a dotted host.
/// </returns>
private static string? NormalizeDetectedWebsite(string? value, string? email)
{
    var site = NullIfWhitespace(value);
    if (site is null
        || !site.Contains('.', StringComparison.Ordinal)
        || site.Contains('@')
        || site.Equals("gmail.com", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    var hasScheme = site.StartsWith("http", StringComparison.OrdinalIgnoreCase);
    var absolute = hasScheme ? site : $"https://{site}";

    if (!Uri.TryCreate(absolute, UriKind.Absolute, out var parsed))
    {
        return null;
    }

    var host = parsed.Host;
    if (string.IsNullOrWhiteSpace(host) || !host.Contains('.', StringComparison.Ordinal))
    {
        return null;
    }

    return hasScheme ? site : host;
}
|
||
|
||
/// <summary>
/// Scans raw CV text for the first URL or host name that can serve as the candidate's website.
/// </summary>
/// <param name="rawSource">Raw CV text.</param>
/// <param name="email">Detected e-mail address; forwarded to <c>NormalizeDetectedWebsite</c>.</param>
/// <returns>
/// The first match that passes <c>NormalizeDetectedWebsite</c> and is not a linkedin.com address,
/// or <c>null</c> when there is none.
/// </returns>
private static string? ExtractPreferredWebsite(string rawSource, string? email)
{
    var hostMatches = Regex.Matches(
        rawSource,
        @"\b(?:https?://)?(?:www\.)?[A-Z0-9.-]+\.[A-Z]{2,}(?:/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]*)?",
        RegexOptions.IgnoreCase);

    foreach (Match match in hostMatches)
    {
        // The pattern also matches the two halves of an e-mail address ("user.name" and "domain.com"
        // around the '@'). Those are not websites, so skip any match that touches an '@'.
        var matchEnd = match.Index + match.Length;
        var followsAt = match.Index > 0 && rawSource[match.Index - 1] == '@';
        var precedesAt = matchEnd < rawSource.Length && rawSource[matchEnd] == '@';
        if (followsAt || precedesAt)
        {
            continue;
        }

        var candidate = NormalizeDetectedWebsite(match.Value, email);
        if (candidate is null)
        {
            continue;
        }

        if (candidate.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        return candidate;
    }

    return null;
}
|
||
|
||
/// <summary>
/// Chooses a location for the profile: the supplied fallback when it is plausible, otherwise the
/// first plausible line among the first ten non-empty lines of <paramref name="source"/>.
/// </summary>
/// <param name="source">CV text to scan.</param>
/// <param name="fallback">Previously detected location, if any.</param>
/// <param name="fullName">Candidate's full name, passed to <c>IsPlausibleLocationValue</c>.</param>
/// <returns>The chosen location, or <c>null</c> when nothing plausible is found.</returns>
private static string? PreferDetectedLocation(string source, string? fallback, string? fullName = null)
{
    // Drops a trailing "Hobbies ..." / "Education ..." fragment that got glued onto the value.
    string StripTrailingSectionText(string text)
    {
        return Regex
            .Replace(text, @",?\s*(Hobbies|Education)\b.*$", string.Empty, RegexOptions.IgnoreCase)
            .Trim(' ', ',');
    }

    var normalizedFallback = NullIfWhitespace(fallback);
    if (normalizedFallback is not null)
    {
        normalizedFallback = StripTrailingSectionText(normalizedFallback);
    }

    if (IsPlausibleLocationValue(normalizedFallback, fullName))
    {
        return normalizedFallback;
    }

    var lines = source
        .Replace("\r\n", "\n")
        .Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    foreach (var rawLine in lines.Take(10))
    {
        var line = StripTrailingSectionText(rawLine);
        if (IsPlausibleLocationValue(line, fullName))
        {
            return line;
        }
    }

    // The fallback was already rejected above, so there is nothing left to return.
    return null;
}
|
||
|
||
/// <summary>
/// Heuristic check for whether a string looks like a location.
/// </summary>
/// <param name="value">Candidate text.</param>
/// <param name="fullName">Candidate's full name; a value equal to it, or starting with it followed by a space, is rejected.</param>
/// <returns>
/// <c>true</c> for a street-style value ("12 High Street"), a comma-separated list of two or three
/// capitalised parts, or one to three capitalised words; <c>false</c> for blanks, role titles,
/// section keywords, e-mail/URL text, values with five or more digits, or values over 80 characters.
/// </returns>
private static bool IsPlausibleLocationValue(string? value, string? fullName)
{
    var candidate = NullIfWhitespace(value);
    if (candidate is null || LooksLikeRoleOrHeadline(candidate))
    {
        return false;
    }

    if (!string.IsNullOrWhiteSpace(fullName))
    {
        var isName = candidate.Equals(fullName, StringComparison.OrdinalIgnoreCase);
        var startsWithName = candidate.StartsWith(fullName + " ", StringComparison.OrdinalIgnoreCase);
        if (isName || startsWithName)
        {
            return false;
        }
    }

    var sectionWords = new[] { "Education", "Hobbies", "Skills", "Summary" };
    if (sectionWords.Any(word => candidate.Contains(word, StringComparison.OrdinalIgnoreCase)))
    {
        return false;
    }

    var looksLikeContact = candidate.Contains('@')
        || Regex.IsMatch(candidate, @"https?://|www\.", RegexOptions.IgnoreCase);
    if (looksLikeContact)
    {
        return false;
    }

    if (candidate.Count(char.IsDigit) >= 5)
    {
        return false;
    }

    // Street-style value: leading number followed by text.
    if (Regex.IsMatch(candidate, @"^\d+\s+.+"))
    {
        return true;
    }

    var normalized = Regex.Replace(candidate, @"\s+", " ").Trim(' ', ',');
    if (normalized.Length > 80)
    {
        return false;
    }

    // "City, Region" or "City, Region, Country".
    if (Regex.IsMatch(normalized, @"^[A-Z][A-Za-z.' -]+,\s*[A-Z][A-Za-z.' -]+(?:,\s*[A-Z][A-Za-z.' -]+)?$"))
    {
        return true;
    }

    // One to three capitalised words that do not read like a job title.
    return Regex.IsMatch(normalized, @"^[A-Z][A-Za-z.' -]+(?:\s+[A-Z][A-Za-z.' -]+){0,2}$")
        && !LooksLikeRoleOrHeadline(normalized);
}
|
||
|
||
/// <summary>
/// Returns <c>true</c> when <paramref name="value"/> contains one of a fixed set of job-title
/// keywords (for example "developer", "manager", "sales") as a whole word, ignoring case.
/// </summary>
private static bool LooksLikeRoleOrHeadline(string value)
{
    const string RoleKeywords = @"\b(real estate agent|developer|engineer|manager|consultant|specialist|analyst|designer|technician|administrator|architect|director|coordinator|assistant|lead|owner|founder|recruiter|teacher|writer|producer|officer|supervisor|sales)\b";

    var roleMatch = Regex.Match(value, RoleKeywords, RegexOptions.IgnoreCase);
    return roleMatch.Success;
}
|
||
|
||
/// <summary>
/// Returns <c>true</c> when <paramref name="value"/> consists of two to four capitalised words
/// (letters, apostrophes, backticks, dots and hyphens only) and contains no job-title keyword.
/// </summary>
private static bool LooksLikePersonName(string value)
{
    if (LooksLikeRoleOrHeadline(value))
    {
        return false;
    }

    return Regex.IsMatch(value, @"^[A-Z][A-Za-z'`.-]+(?:\s+[A-Z][A-Za-z'`.-]+){1,3}$");
}
|
||
|
||
/// <summary>
/// Returns <c>true</c> when the list contains at least one job accepted by <c>IsPlausibleJob</c>.
/// A null or empty list yields <c>false</c>.
/// </summary>
private static bool ArePlausibleJobs(List<StructuredCvJob>? jobs, string? fullName)
{
    if (jobs is not { Count: > 0 })
    {
        return false;
    }

    foreach (var job in jobs)
    {
        if (IsPlausibleJob(job, fullName))
        {
            return true;
        }
    }

    return false;
}
|
||
|
||
/// <summary>
/// Scores a parsed job list; the score is based mostly on the first entry.
/// </summary>
/// <remarks>
/// Points: first job plausible +5, its title contains a role keyword +4, it has a company +2,
/// it has a start or end date +2, it has bullets +2, plus one point per job up to three.
/// </remarks>
/// <returns>The total score, or 0 for a null or empty list.</returns>
private static int ScoreJobs(List<StructuredCvJob>? jobs, string? fullName)
{
    if (jobs is not { Count: > 0 })
    {
        return 0;
    }

    var lead = jobs[0];
    var total = IsPlausibleJob(lead, fullName) ? 5 : 0;

    var hasRoleTitle = !string.IsNullOrWhiteSpace(lead.Title) && LooksLikeRoleOrHeadline(lead.Title);
    var hasCompany = !string.IsNullOrWhiteSpace(lead.Company);
    var hasDates = !string.IsNullOrWhiteSpace(lead.Start) || !string.IsNullOrWhiteSpace(lead.End);
    var hasBullets = lead.Bullets.Count > 0;

    total += hasRoleTitle ? 4 : 0;
    total += hasCompany ? 2 : 0;
    total += hasDates ? 2 : 0;
    total += hasBullets ? 2 : 0;

    return total + Math.Min(jobs.Count, 3);
}
|
||
|
||
/// <summary>
/// Decides whether a parsed job entry is credible.
/// </summary>
/// <param name="job">Entry to inspect.</param>
/// <param name="fullName">Candidate's full name; a title equal to it is rejected.</param>
/// <returns>
/// <c>false</c> for a null job, or a title that is the candidate's name, looks like a person name,
/// contains e-mail/URL text, or is just a date range. A job without a title is accepted only when it
/// has other evidence (company, location, dates or bullets); a titled job needs either such evidence
/// or a role keyword in the title.
/// </returns>
private static bool IsPlausibleJob(StructuredCvJob? job, string? fullName)
{
    if (job is null)
    {
        return false;
    }

    var title = NullIfWhitespace(job.Title);
    var hasEvidence = NullIfWhitespace(job.Company) is not null
        || NullIfWhitespace(job.Location) is not null
        || !string.IsNullOrWhiteSpace(job.Start)
        || !string.IsNullOrWhiteSpace(job.End)
        || job.Bullets.Count > 0;

    if (title is null)
    {
        return hasEvidence;
    }

    var repeatsName = !string.IsNullOrWhiteSpace(fullName)
        && title.Equals(fullName, StringComparison.OrdinalIgnoreCase);
    if (repeatsName || LooksLikePersonName(title))
    {
        return false;
    }

    var looksLikeContact = title.Contains('@')
        || Regex.IsMatch(title, @"https?://|www\.", RegexOptions.IgnoreCase);
    if (looksLikeContact)
    {
        return false;
    }

    // A title that is nothing but a date range is a mis-assigned dates line.
    var isBareDateRange = Regex.IsMatch(
        title,
        @"^(?:\d{2}/\d{4}|\d{4})\s*(?:[-–]|to)\s*(?:\d{2}/\d{4}|\d{4}|Present|Current)$",
        RegexOptions.IgnoreCase);
    if (isBareDateRange)
    {
        return false;
    }

    return hasEvidence || LooksLikeRoleOrHeadline(title);
}
|
||
|
||
/// <summary>
/// Returns the trimmed headline, or <c>null</c> when it is blank, equals the candidate's full name
/// (ignoring case), contains '@', or has more than three digits.
/// </summary>
private static string? CleanHeadline(string? value, string? fullName)
{
    var headline = NullIfWhitespace(value);
    if (headline is null)
    {
        return null;
    }

    var repeatsName = !string.IsNullOrWhiteSpace(fullName)
        && headline.Equals(fullName, StringComparison.OrdinalIgnoreCase);
    var looksLikeContact = headline.Contains('@') || headline.Count(char.IsDigit) > 3;

    return repeatsName || looksLikeContact ? null : headline;
}
|
||
|
||
/// <summary>
/// Extracts spoken languages from free text. The text is cut at newlines, commas, semicolons and
/// sentence ends; a fragment contributes languages only when <c>HumanLanguageCatalog.ExtractLevel</c>
/// finds a level in it, and every language named in that fragment receives that level.
/// </summary>
/// <param name="content">Text of a languages section.</param>
/// <returns>One entry per language name (case-insensitive); the first occurrence wins.</returns>
private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
{
    var fragments = Regex.Split(content.Replace("\r\n", "\n"), @"[\n,;]+|(?<=[.!?])\s+")
        .Select(fragment => fragment.Trim())
        .Where(fragment => fragment.Length > 1);

    var seenNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var detected = new List<StructuredCvLanguage>();

    foreach (var fragment in fragments)
    {
        var level = HumanLanguageCatalog.ExtractLevel(fragment);
        if (level is null)
        {
            continue;
        }

        foreach (var name in HumanLanguageCatalog.ExtractLanguageNames(fragment))
        {
            if (seenNames.Add(name))
            {
                detected.Add(new StructuredCvLanguage { Name = name, Level = level });
            }
        }
    }

    return detected;
}
|
||
|
||
/// <summary>
/// Parses the text of an education section into structured entries.
/// </summary>
/// <remarks>
/// The text is cut into blocks at blank lines, before "### " headings, and before qualification
/// keywords (Bachelor, MSc, PhD, ...). Each block is first handed to
/// <c>StructuredCvProfileJson.FromSections</c>; when that yields nothing, a line-based fallback is
/// used: a line starting with "+ " is the institution, the first line that is neither an institution
/// nor a leading year range is the qualification, and lines starting with "- " are details.
/// </remarks>
/// <param name="content">Raw education section text.</param>
/// <returns>The parsed education entries in source order.</returns>
private static List<StructuredCvEducation> ParseEducationHeuristically(string content)
{
    var normalized = content.Replace("\r\n", "\n").Trim();

    // The leading \b is essential: without it the case-insensitive lookahead also fires inside words
    // that merely end in a keyword ("Diplo|ma", "M|BA", "Dra|ma") and tears them apart.
    var blocks = Regex.Split(
            normalized,
            @"\n\s*\n|(?=###\s+)|(?=\b(?:Bachelor|Master|Doctor|Associate|Diploma|Certificate|BSc|BA|MSc|MA|PhD)\b)",
            RegexOptions.IgnoreCase)
        .Select(block => block.Trim())
        .Where(block => block.Length > 0)
        .ToList();

    var items = new List<StructuredCvEducation>();
    foreach (var block in blocks)
    {
        var candidate = StructuredCvProfileJson
            .FromSections(new[] { new StructuredCvSection { Name = "Education", Content = block } })
            .Education;
        if (candidate.Count > 0)
        {
            items.AddRange(candidate);
            continue;
        }

        var lines = block
            .Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            .ToList();
        if (lines.Count == 0)
        {
            continue;
        }

        var dateMatch = Regex.Match(block, @"\b(\d{4})\s*[-–]\s*(\d{4}|Present|Current)\b", RegexOptions.IgnoreCase);
        var institutionLine = lines
            .FirstOrDefault(line => line.StartsWith("+ ", StringComparison.Ordinal))
            ?.TrimStart('+', ' ');

        // Fall back to the first line when every line is an institution or a dates line.
        var qualificationLine = lines.FirstOrDefault(line =>
                !line.StartsWith("+ ", StringComparison.Ordinal) && !Regex.IsMatch(line, @"^\d{4}\s*[-–]"))
            ?? lines[0];

        items.Add(new StructuredCvEducation
        {
            Qualification = TitleCasePreservingAcronyms(qualificationLine),
            QualificationLevel = InferQualificationLevel(qualificationLine),
            Institution = TitleCasePreservingAcronyms(institutionLine),
            Start = dateMatch.Success ? dateMatch.Groups[1].Value : null,
            End = dateMatch.Success ? dateMatch.Groups[2].Value : null,
            Details = lines
                .Where(line => line.StartsWith("- ", StringComparison.Ordinal))
                .Select(line => line[2..].Trim())
                .ToList(),
        });
    }

    return items;
}
|
||
|
||
/// <summary>
/// Parses the text of a work-experience section into structured jobs.
/// </summary>
/// <remarks>
/// Strategies are tried in order and the first that produces a result wins:
/// 1. <c>StructuredCvProfileJson.FromSections</c>, accepted when <c>ArePlausibleJobs</c> approves it.
/// 2. A line holding "title + date range"; the next line is taken as the company (single job).
/// 3. A standalone date line followed by a role line and a company line (single job).
/// 4. A date line preceded by a title line and a location line (single job).
/// 5. Repeated "ALL-CAPS TITLE / year range / body" blocks, where a "+ ..." line is the employer.
/// Date ranges may be separated by '-', '–' or the word "to".
/// </remarks>
/// <param name="content">Raw work-experience text.</param>
/// <returns>The parsed jobs; empty when no strategy matches.</returns>
private static List<StructuredCvJob> ParseJobsHeuristically(string content)
{
    const string DateRangePattern = @"(\d{2}/\d{4}|\d{4})\s*(?:to|[-–])\s*(\d{2}/\d{4}|\d{4}|Present|Current)";

    // Everything from an (optionally dash-prefixed) date range to the end of the line. It accepts the
    // same separators and MM/YYYY forms as the detection patterns, so whatever detection recognises
    // is also removed from the title.
    const string TrailingDateRangePattern = @"\s*[-–]?\s*(?:\d{2}/)?\d{4}\s*(?:to|[-–])\s*(?:\d{2}/\d{4}|\d{4}|Present|Current).*$";

    bool IsOngoing(string? end)
    {
        return string.Equals(end, "Present", StringComparison.OrdinalIgnoreCase)
            || string.Equals(end, "Current", StringComparison.OrdinalIgnoreCase);
    }

    // Splits a dates line into start and end. Dash-separated lines keep the text on each side of the
    // dash; lines that use "to" fall back to the date-range pattern so the end date is not lost.
    (string? Start, string? End) SplitDateLine(string datesLine)
    {
        var parts = Regex.Split(datesLine, @"\s*[-–]\s*");
        if (parts.Length > 1)
        {
            return (parts[0], parts[1]);
        }

        var range = Regex.Match(datesLine, DateRangePattern, RegexOptions.IgnoreCase);
        if (range.Success)
        {
            return (range.Groups[1].Value, range.Groups[2].Value);
        }

        return (parts[0], null);
    }

    var normalized = content.Replace("\r\n", "\n").Trim();

    // Strategy 1: the structured parser.
    var structured = StructuredCvProfileJson
        .FromSections(new[] { new StructuredCvSection { Name = "Work Experience", Content = normalized } })
        .Jobs;
    if (ArePlausibleJobs(structured, null))
    {
        return structured;
    }

    var simpleLines = normalized.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    // Strategy 2: title and dates share one line.
    var inlineDateIndex = Array.FindIndex(
        simpleLines,
        line => Regex.IsMatch(line, @".+\d{2}/\d{4}\s+to\s+\d{2}/\d{4}", RegexOptions.IgnoreCase)
            || Regex.IsMatch(line, @".+\d{4}\s*(?:[-–]|to)\s*(?:\d{4}|Present|Current)", RegexOptions.IgnoreCase));
    if (inlineDateIndex >= 0)
    {
        var inlineLine = simpleLines[inlineDateIndex];
        var titleLine = Regex.Replace(inlineLine, TrailingDateRangePattern, string.Empty, RegexOptions.IgnoreCase).Trim();
        var companyOrLocation = inlineDateIndex + 1 < simpleLines.Length ? simpleLines[inlineDateIndex + 1] : null;
        var datesMatch = Regex.Match(inlineLine, DateRangePattern, RegexOptions.IgnoreCase);
        var start = datesMatch.Success ? datesMatch.Groups[1].Value : null;
        var end = datesMatch.Success ? datesMatch.Groups[2].Value : null;
        var bullets = simpleLines.Skip(inlineDateIndex + 2).Where(line => line.Length > 12).ToList();

        if (!string.IsNullOrWhiteSpace(titleLine))
        {
            return new List<StructuredCvJob>
            {
                new StructuredCvJob
                {
                    Title = titleLine,
                    Company = companyOrLocation,
                    Start = start,
                    End = end,
                    IsCurrent = IsOngoing(end),
                    Bullets = bullets,
                    Skills = ExtractSkillsFromBullets(bullets),
                }
            };
        }
    }

    var dateIndex = Array.FindIndex(
        simpleLines,
        line => Regex.IsMatch(line, @"(?:\d{2}/\d{4}|\d{4})\s*(?:[-–]|to)\s*(?:\d{2}/\d{4}|\d{4}|Present|Current)", RegexOptions.IgnoreCase));
    if (dateIndex >= 0)
    {
        // Strategy 3: dates line, then role line, then company line.
        if (dateIndex + 2 < simpleLines.Length && LooksLikeRoleOrHeadline(simpleLines[dateIndex + 1]))
        {
            var (start, end) = SplitDateLine(simpleLines[dateIndex]);
            var bullets = SplitSentences(string.Join(" ", simpleLines.Skip(dateIndex + 3)), 6);

            return new List<StructuredCvJob>
            {
                new StructuredCvJob
                {
                    Title = simpleLines[dateIndex + 1],
                    Company = simpleLines[dateIndex + 2],
                    Start = start,
                    End = end,
                    IsCurrent = IsOngoing(end),
                    Bullets = bullets,
                    Skills = ExtractSkillsFromBullets(bullets),
                }
            };
        }

        // Strategy 4: title line, location line, then dates line.
        if (dateIndex >= 2)
        {
            var (start, end) = SplitDateLine(simpleLines[dateIndex]);
            var bullets = simpleLines.Skip(dateIndex + 1).Where(line => line.Length > 12).ToList();

            return new List<StructuredCvJob>
            {
                new StructuredCvJob
                {
                    Title = simpleLines[dateIndex - 2],
                    Location = simpleLines[dateIndex - 1],
                    Start = start,
                    End = end,
                    IsCurrent = IsOngoing(end),
                    Bullets = bullets,
                    Skills = ExtractSkillsFromBullets(bullets),
                }
            };
        }
    }

    // Strategy 5: repeated ALL-CAPS title / year range / body blocks.
    var pattern = new Regex(
        @"(?<title>[A-Z][A-Z\s/&-]{3,})\s*\n(?<dates>\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))(?<body>.*?)(?=(?:\n[A-Z][A-Z\s/&-]{3,}\s*\n\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))|\z)",
        RegexOptions.Singleline);
    var jobs = new List<StructuredCvJob>();

    foreach (Match match in pattern.Matches(normalized))
    {
        var body = match.Groups["body"].Value.Trim();
        var employer = NullIfWhitespace(Regex.Match(body, @"\+\s*([^\n]+)").Groups[1].Value);
        var dates = Regex.Split(match.Groups["dates"].Value, @"\s*[-–]\s*");
        var end = dates.Skip(1).FirstOrDefault();
        var bullets = SplitSentences(Regex.Replace(body, @"\+\s*[^\n]+", string.Empty), 6);

        jobs.Add(new StructuredCvJob
        {
            Title = TitleCasePreservingAcronyms(match.Groups["title"].Value),
            Company = employer,
            Start = NullIfWhitespace(dates.FirstOrDefault()),
            End = NullIfWhitespace(end),
            IsCurrent = IsOngoing(end),
            Bullets = bullets,
            Skills = ExtractSkillsFromBullets(bullets),
        });
    }

    return jobs;
}
|
||
|
||
/// <summary>
/// Title-cases each space-separated word, leaving words of up to three characters that are entirely
/// upper-case untouched so short acronyms survive.
/// </summary>
/// <param name="value">Text to convert.</param>
/// <returns>The converted text with single spaces between words, or <c>null</c> for blank input.</returns>
private static string? TitleCasePreservingAcronyms(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    var builder = new StringBuilder();
    foreach (var token in value.Trim().Split(' ', StringSplitOptions.RemoveEmptyEntries))
    {
        if (builder.Length > 0)
        {
            builder.Append(' ');
        }

        var isShortAcronym = token.Length <= 3 && token.All(char.IsUpper);
        if (isShortAcronym)
        {
            builder.Append(token);
        }
        else
        {
            builder.Append(char.ToUpperInvariant(token[0]));
            builder.Append(token[1..].ToLowerInvariant());
        }
    }

    return builder.ToString();
}
|
||
|
||
/// <summary>
/// Maps a qualification title to a coarse level by keyword, checking from highest to lowest.
/// </summary>
/// <param name="value">Qualification text.</param>
/// <returns>
/// "PhD", "Master", "Bachelor", "Diploma/Certificate", "Secondary", or "Other" when no keyword
/// matches; <c>null</c> for blank input.
/// </returns>
private static string? InferQualificationLevel(string? value)
{
    var text = value?.Trim();
    if (string.IsNullOrWhiteSpace(text))
    {
        return null;
    }

    // Rules are evaluated top to bottom; the first matching pattern decides the level.
    var rules = new (string Pattern, string Level)[]
    {
        (@"\b(phd|doctorate|dphil)\b", "PhD"),
        (@"\b(master(?:'s)?|msc|m\.sc|ma|m\.a|mba|meng)\b", "Master"),
        (@"\b(bachelor(?:'s)?|bsc|b\.sc|ba|b\.a|beng|degree)\b", "Bachelor"),
        (@"\b(diploma|certificate|certification|nvq|btec|level\s*\d+|apprenticeship|associate)\b", "Diploma/Certificate"),
        (@"\b(gcse|a-?level|secondary|high school)\b", "Secondary"),
    };

    foreach (var (pattern, level) in rules)
    {
        if (Regex.IsMatch(text, pattern, RegexOptions.IgnoreCase))
        {
            return level;
        }
    }

    return "Other";
}
|
||
|
||
/// <summary>
/// Counts whitespace-separated words; returns 0 for null or blank text.
/// </summary>
private static int CountWords(string? text) =>
    string.IsNullOrWhiteSpace(text)
        ? 0
        : text.Trim().Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries).Length;
|
||
|
||
/// <summary>
/// Returns the trimmed value, or <c>null</c> when the input is null, empty or only whitespace.
/// </summary>
private static string? NullIfWhitespace(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value.Trim();
}
|
||
|
||
/// <summary>
/// Splits CV text into named sections. A line recognised by <c>CanonicalizeSectionHeading</c> starts
/// a new section under the canonical name; text before the first heading goes to "General".
/// </summary>
/// <param name="source">CV text.</param>
/// <returns>
/// The non-empty sections in source order; the same heading may appear more than once. When nothing
/// is collected, a single "General" section holding the trimmed source is returned.
/// </returns>
private static List<(string Name, string Content)> ParseSections(string source)
{
    var collected = new List<(string Name, string Content)>();
    var activeHeading = "General";
    var buffer = new List<string>();

    // Stores the buffered lines under the active heading (unless blank) and resets the buffer.
    void Commit()
    {
        var body = string.Join("\n", buffer).Trim();
        if (!string.IsNullOrWhiteSpace(body))
        {
            collected.Add((activeHeading, body));
        }

        buffer.Clear();
    }

    foreach (var rawLine in source.Replace("\r\n", "\n").Split('\n'))
    {
        var heading = CanonicalizeSectionHeading(rawLine.Trim());
        if (heading is null)
        {
            // Keep the untrimmed line so indentation inside a section survives.
            buffer.Add(rawLine);
            continue;
        }

        Commit();
        activeHeading = heading;
    }

    Commit();

    if (collected.Count == 0)
    {
        return new List<(string Name, string Content)> { ("General", source.Trim()) };
    }

    return collected;
}
|
||
|
||
/// <summary>
/// Merges classified blocks into one section per section name, in order of first appearance.
/// Blocks sharing a name are joined with a blank line and the word count is recomputed.
/// </summary>
/// <param name="classifiedBlocks">Blocks in document order.</param>
/// <returns>The merged sections, without any whose content is blank.</returns>
private static List<StructuredCvSection> BuildSectionsFromClassifiedBlocks(List<ClassifiedCvBlock> classifiedBlocks)
{
    var buckets = new List<StructuredCvSection>();

    foreach (var block in classifiedBlocks)
    {
        var bucket = buckets.Find(candidate => candidate.Name == block.SectionName);
        if (bucket is not null)
        {
            bucket.Content = $"{bucket.Content}\n\n{block.Content}".Trim();
            bucket.WordCount = CountWords(bucket.Content);
            continue;
        }

        buckets.Add(new StructuredCvSection
        {
            Name = block.SectionName,
            Content = block.Content,
            WordCount = CountWords(block.Content),
        });
    }

    buckets.RemoveAll(section => string.IsNullOrWhiteSpace(section.Content));
    return buckets;
}
|
||
|
||
/// <summary>
/// Assembles a structured CV profile from classifier output.
/// </summary>
/// <remarks>
/// Blocks are routed by section name: summaries and skills are accumulated, each work-experience or
/// education block becomes one job or education entry, and every other non-blank block becomes an
/// "other" section. Field metadata is recorded for the values that were filled in, and the profile
/// is annotated with the average classifier confidence (0.74 when no block reports one).
/// </remarks>
/// <param name="classifiedBlocks">Blocks in document order.</param>
/// <returns>The profile after <c>StructuredCvProfileJson.Normalize</c>.</returns>
private static StructuredCvProfile BuildStructuredCvFromClassifiedBlocks(List<ClassifiedCvBlock> classifiedBlocks)
{
    var profile = new StructuredCvProfile();
    var timestamp = DateTimeOffset.UtcNow;
    var summaryLines = new List<string>();
    var skillSet = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    foreach (var block in classifiedBlocks)
    {
        if (block.SectionName == "Professional Summary")
        {
            // Prefer the classifier's own summary lines; otherwise split the block text.
            var summaryItems = block.Classification?.Summary is { Count: > 0 }
                ? block.Classification.Summary
                : SplitClassifierContent(block.Content, 5);
            summaryLines.AddRange(summaryItems);
            ApplyClassifierFieldMetadata(profile, "summary", summaryLines.FirstOrDefault(), block, timestamp);
        }
        else if (block.SectionName == "Skills")
        {
            var skillItems = block.Classification?.Skills is { Count: > 0 }
                ? block.Classification.Skills.Where(skill => !string.IsNullOrWhiteSpace(skill)).Select(skill => skill.Trim())
                : SplitClassifierSkills(block.Content);
            skillSet.UnionWith(skillItems);
            ApplyClassifierFieldMetadata(profile, "skills", skillSet.FirstOrDefault(), block, timestamp);
        }
        else if (block.SectionName == "Work Experience")
        {
            var job = BuildJobFromClassifiedBlock(block);
            if (job is null)
            {
                continue;
            }

            var position = profile.Jobs.Count;
            profile.Jobs.Add(job);
            ApplyClassifierFieldMetadata(profile, $"jobs[{position}].title", job.Title, block, timestamp);
            ApplyClassifierFieldMetadata(profile, $"jobs[{position}].company", job.Company, block, timestamp);
            ApplyClassifierFieldMetadata(profile, $"jobs[{position}].location", job.Location, block, timestamp);
        }
        else if (block.SectionName == "Education")
        {
            var education = BuildEducationFromClassifiedBlock(block);
            if (education is null)
            {
                continue;
            }

            var position = profile.Education.Count;
            profile.Education.Add(education);
            ApplyClassifierFieldMetadata(profile, $"education[{position}].qualification", education.Qualification, block, timestamp);
            ApplyClassifierFieldMetadata(profile, $"education[{position}].institution", education.Institution, block, timestamp);
        }
        else if (!string.IsNullOrWhiteSpace(block.Content))
        {
            profile.OtherSections.Add(new StructuredCvOtherSection
            {
                Title = block.SectionName,
                Items = SplitClassifierContent(block.Content, 6)
            });
        }
    }

    profile.Summary = summaryLines.Distinct(StringComparer.OrdinalIgnoreCase).ToList();
    profile.Skills = skillSet.ToList();
    profile.Sections = BuildSectionsFromClassifiedBlocks(classifiedBlocks);

    var averageConfidence = classifiedBlocks
        .Select(block => block.Classification?.Confidence)
        .Where(confidence => confidence.HasValue)
        .Select(confidence => confidence!.Value)
        .DefaultIfEmpty(0.74)
        .Average();
    AnnotateStructuredCv(profile, "classifier", averageConfidence);

    return StructuredCvProfileJson.Normalize(profile);
}
|
||
|
||
/// <summary>
/// Converts one classified work-experience block into a job entry.
/// </summary>
/// <param name="block">Block classified as work experience.</param>
/// <returns>
/// The job after <c>StructuredCvProfileJson.Normalize</c>, or <c>null</c> when the block has no
/// classification or normalisation leaves no job.
/// </returns>
private static StructuredCvJob? BuildJobFromClassifiedBlock(ClassifiedCvBlock block)
{
    if (block.Classification is not { } classified)
    {
        return null;
    }

    // Use the classifier's bullets and skills when it supplied any; otherwise derive them from the raw block.
    List<string> bullets;
    if (classified.Bullets is { Count: > 0 })
    {
        bullets = classified.Bullets
            .Where(bullet => !string.IsNullOrWhiteSpace(bullet))
            .Select(bullet => bullet.Trim())
            .ToList();
    }
    else
    {
        bullets = SplitClassifierContent(block.OriginalBlock, 6);
    }

    List<string> skills;
    if (classified.Skills is { Count: > 0 })
    {
        skills = classified.Skills
            .Where(skill => !string.IsNullOrWhiteSpace(skill))
            .Select(skill => skill.Trim())
            .ToList();
    }
    else
    {
        skills = SplitClassifierSkills(block.OriginalBlock);
    }

    var endsNow = string.Equals(classified.End, "Present", StringComparison.OrdinalIgnoreCase)
        || string.Equals(classified.End, "Current", StringComparison.OrdinalIgnoreCase);

    var draft = new StructuredCvJob
    {
        Title = NullIfWhitespace(classified.Title),
        Company = NullIfWhitespace(classified.Company),
        Location = NullIfWhitespace(classified.Location),
        Start = NullIfWhitespace(classified.Start),
        End = NullIfWhitespace(classified.End),
        IsCurrent = endsNow,
        Bullets = bullets,
        Skills = skills
    };

    var wrapper = new StructuredCvProfile { Jobs = new List<StructuredCvJob> { draft } };
    return StructuredCvProfileJson.Normalize(wrapper).Jobs.FirstOrDefault();
}
|
||
|
||
/// <summary>
/// Converts one classified education block into an education entry. The classifier's title becomes
/// the qualification and its company becomes the institution.
/// </summary>
/// <param name="block">Block classified as education.</param>
/// <returns>
/// The entry after <c>StructuredCvProfileJson.Normalize</c>, or <c>null</c> when the block has no
/// classification or normalisation leaves no entry.
/// </returns>
private static StructuredCvEducation? BuildEducationFromClassifiedBlock(ClassifiedCvBlock block)
{
    if (block.Classification is not { } classified)
    {
        return null;
    }

    // Use the classifier's bullets as details when present; otherwise split the raw block text.
    List<string> details;
    if (classified.Bullets is { Count: > 0 })
    {
        details = classified.Bullets
            .Where(bullet => !string.IsNullOrWhiteSpace(bullet))
            .Select(bullet => bullet.Trim())
            .ToList();
    }
    else
    {
        details = SplitClassifierContent(block.OriginalBlock, 5);
    }

    var draft = new StructuredCvEducation
    {
        Qualification = NullIfWhitespace(classified.Title),
        Institution = NullIfWhitespace(classified.Company),
        Location = NullIfWhitespace(classified.Location),
        Start = NullIfWhitespace(classified.Start),
        End = NullIfWhitespace(classified.End),
        Details = details
    };

    var wrapper = new StructuredCvProfile { Education = new List<StructuredCvEducation> { draft } };
    return StructuredCvProfileJson.Normalize(wrapper).Education.FirstOrDefault();
}
|
||
|
||
/// <summary>
/// Splits block text into short items: one per line or bullet, with lines that contain ". " further
/// split into sentences. Leading list markers are removed and items of two characters or fewer are dropped.
/// </summary>
/// <param name="content">Raw block text.</param>
/// <param name="limit">Maximum number of items to return.</param>
/// <returns>Up to <paramref name="limit"/> unique items (case-insensitive), in source order.</returns>
private static List<string> SplitClassifierContent(string content, int limit)
{
    return content
        .Replace("\r\n", "\n")
        .Split(new[] { '\n', '•' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
        .SelectMany(line => line.Contains(". ", StringComparison.Ordinal)
            ? Regex.Split(line, @"(?<=[.!?])\s+")
            : new[] { line })
        .Select(item => item.Trim().TrimStart('-', '•', '*', '+', ' '))
        .Where(item => item.Length > 2)
        // De-duplicate before applying the limit so repeated lines do not use up the budget.
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .Take(limit)
        .ToList();
}
|
||
|
||
/// <summary>
/// Splits block text into skill candidates at newlines, commas, semicolons and bullets.
/// Leading list markers are removed; entries are kept when they are 2-48 characters long, are not
/// date-like and contain no '@'.
/// </summary>
/// <param name="content">Raw block text.</param>
/// <returns>The unique candidates (case-insensitive), in source order.</returns>
private static List<string> SplitClassifierSkills(string content)
{
    var separators = new[] { '\n', ',', ';', '•' };
    var unique = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var skills = new List<string>();

    var rawEntries = content
        .Replace("\r\n", "\n")
        .Split(separators, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    foreach (var rawEntry in rawEntries)
    {
        var skill = rawEntry.Trim().TrimStart('-', '•', '*', '+', ' ');

        if (skill.Length <= 1 || skill.Length > 48)
        {
            continue;
        }

        if (LooksLikeDateLikeValue(skill) || skill.Contains('@'))
        {
            continue;
        }

        if (unique.Add(skill))
        {
            skills.Add(skill);
        }
    }

    return skills;
}
|
||
|
||
/// <summary>
/// Returns <c>true</c> when the whole value is a date token (a four-digit year, "Month YYYY",
/// "Present" or "Current") or a dash-separated range of two such tokens, ignoring case.
/// </summary>
private static bool LooksLikeDateLikeValue(string value)
{
    const string DateOrRangePattern = @"^(?:\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|Present|Current)(?:\s*[-–]\s*(?:\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|Present|Current))?$";

    var dateMatch = Regex.Match(value, DateOrRangePattern, RegexOptions.IgnoreCase);
    return dateMatch.Success;
}
|
||
|
||
/// <summary>
/// Records classifier provenance for one profile field. Does nothing when the field value is blank.
/// </summary>
/// <param name="profile">Profile whose metadata map is updated.</param>
/// <param name="key">Field key, for example "summary" or "jobs[0].title".</param>
/// <param name="value">Field value; only checked for being non-blank.</param>
/// <param name="block">Block the value came from.</param>
/// <param name="now">Timestamp stored as the last update time.</param>
private static void ApplyClassifierFieldMetadata(StructuredCvProfile profile, string key, string? value, ClassifiedCvBlock block, DateTimeOffset now)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return;
    }

    // Keep at most the first 180 characters of the source block as the snippet.
    var snippet = block.OriginalBlock;
    if (snippet.Length > 180)
    {
        snippet = snippet[..180];
    }

    var isGeneralBlock = string.Equals(block.SectionName, "General", StringComparison.OrdinalIgnoreCase);

    var metadata = new StructuredCvFieldMetadata
    {
        Confidence = block.Classification?.Confidence ?? 0.74,
        Method = "classifier",
        SourceSnippet = snippet,
        SourceBlockId = $"block-{block.Index}",
        ReviewState = isGeneralBlock ? "needs-review" : "suggested",
        LastUpdatedAtUtc = now,
    };

    profile.Metadata.Fields[key] = metadata;
}
|
||
|
||
/// <summary>
/// Splits CV text into blank-line separated blocks of at least 24 characters, classifies each one
/// with the AI classifier (one awaited call at a time, in order), and builds a normalised text
/// rendering for each block.
/// </summary>
/// <remarks>
/// The classifier's section label is mapped through <c>SectionAliases</c>; a missing label or "Other"
/// becomes "General". Work-experience and education blocks are rendered as a "### title" line, a
/// "company | location | dates" line and "- " bullets. Skills blocks become one skill per line and
/// summary blocks become "- " bullets. When a rendering would be empty, the raw block text is kept.
/// </remarks>
/// <param name="parseSource">CV text to classify.</param>
/// <param name="cancellationToken">Token passed to each classifier call.</param>
/// <returns>One result per block, numbered from 1 in document order.</returns>
private async Task<List<ClassifiedCvBlock>> ClassifyBlocksAsync(string parseSource, CancellationToken cancellationToken)
{
    // Renders a job or education entry as heading, meta line and bullets.
    List<string> ComposeEntryLines(string? title, string? company, string? location, string? dateRange, IEnumerable<string>? bullets)
    {
        var entry = new List<string>();
        if (!string.IsNullOrWhiteSpace(title))
        {
            entry.Add($"### {title.Trim()}");
        }

        var meta = string.Join(" | ", new[] { company, location, dateRange }.Where(part => !string.IsNullOrWhiteSpace(part)));
        if (!string.IsNullOrWhiteSpace(meta))
        {
            entry.Add(meta);
        }

        if (bullets is not null)
        {
            entry.AddRange(bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}"));
        }

        return entry;
    }

    var paragraphs = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n")
        .Select(paragraph => paragraph.Trim())
        .Where(paragraph => paragraph.Length >= 24)
        .ToList();

    var classified = new List<ClassifiedCvBlock>();
    var position = 0;

    foreach (var paragraph in paragraphs)
    {
        position++;
        var classification = await _cvAiClassifier.ClassifyBlockAsync(paragraph, cancellationToken);

        var sectionName = classification?.Section;
        if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical))
        {
            sectionName = canonical;
        }

        if (string.IsNullOrWhiteSpace(sectionName) || string.Equals(sectionName, "Other", StringComparison.OrdinalIgnoreCase))
        {
            sectionName = "General";
        }

        var isWork = string.Equals(sectionName, "Work Experience", StringComparison.OrdinalIgnoreCase);
        var isEducation = string.Equals(sectionName, "Education", StringComparison.OrdinalIgnoreCase);
        var content = paragraph;

        if ((isWork || isEducation) && classification is not null)
        {
            // Only work entries can be flagged as ongoing; education always formats the stated end.
            var ongoing = isWork
                && (string.Equals(classification.End, "Present", StringComparison.OrdinalIgnoreCase)
                    || string.Equals(classification.End, "Current", StringComparison.OrdinalIgnoreCase));
            var dateRange = FormatDateRangeForSection(classification.Start, classification.End, ongoing);
            var entry = ComposeEntryLines(classification.Title, classification.Company, classification.Location, dateRange, classification.Bullets);
            if (entry.Count > 0)
            {
                content = string.Join("\n", entry);
            }
        }
        else if (string.Equals(sectionName, "Skills", StringComparison.OrdinalIgnoreCase))
        {
            var skills = classification?.Skills is { Count: > 0 }
                ? classification.Skills.Where(skill => !string.IsNullOrWhiteSpace(skill)).Select(skill => skill.Trim()).ToList()
                : SplitClassifierSkills(paragraph);
            if (skills.Count > 0)
            {
                content = string.Join("\n", skills);
            }
        }
        else if (string.Equals(sectionName, "Professional Summary", StringComparison.OrdinalIgnoreCase))
        {
            // Prefer explicit summary lines, then bullets; otherwise keep the raw block text.
            IEnumerable<string> summarySource;
            if (classification?.Summary is { Count: > 0 })
            {
                summarySource = classification.Summary;
            }
            else if (classification?.Bullets is { Count: > 0 })
            {
                summarySource = classification.Bullets;
            }
            else
            {
                summarySource = Enumerable.Empty<string>();
            }

            var summaryLines = summarySource
                .Where(line => !string.IsNullOrWhiteSpace(line))
                .Select(line => $"- {line.Trim()}")
                .ToList();
            if (summaryLines.Count > 0)
            {
                content = string.Join("\n", summaryLines);
            }
        }

        classified.Add(new ClassifiedCvBlock(position, paragraph, sectionName, content, classification));
    }

    return classified;
}
|
||
|
||
/// <summary>
/// Formats a start/end pair as a single display string for a CV section entry.
/// </summary>
/// <param name="start">Start date text; may be null or blank.</param>
/// <param name="end">End date text; may be null or blank.</param>
/// <param name="isCurrent">When true, the end of the range is always rendered as "Present".</param>
/// <returns>
/// <c>null</c> when both values are blank; <paramref name="end"/> alone when only the start is blank;
/// otherwise "start - end", where a missing or blank end (or <paramref name="isCurrent"/>) renders as "Present".
/// </returns>
private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent)
{
    var hasStart = !string.IsNullOrWhiteSpace(start);
    var hasEnd = !string.IsNullOrWhiteSpace(end);

    if (!hasStart)
    {
        return hasEnd ? end : null;
    }

    // A blank end is treated exactly like a null one: without this, an empty string
    // slipped past the null-coalescing fallback and produced a dangling "start - ".
    var closing = isCurrent || !hasEnd ? "Present" : end;
    return $"{start} - {closing}";
}
|
||
|
||
/// <summary>
/// Returns CV text in a sectioned form, calling the AI services only when the input needs it.
/// </summary>
/// <remarks>
/// Order of attempts:
/// 1. If the CV_FORCE_AI_NORMALIZER environment variable equals "true", the AI normalizer is tried first.
/// 2. Text that does not look flattened and already shows section signals is returned as-is (trimmed).
/// 3. Otherwise the text is sent to the AI service for reconstruction.
/// 4. If the reconstruction still looks flattened or has no section signals, the AI normalizer is used as a fallback.
/// </remarks>
/// <param name="text">Raw extracted CV text.</param>
/// <param name="cancellationToken">Token forwarded to the AI normalizer calls.</param>
/// <returns>The best available version of the CV text, trimmed.</returns>
private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    const string reconstructionPrompt = "Reconstruct this CV text extracted from a PDF into a clean, readable master CV in markdown. Preserve facts only. Recover clear sections such as Contact, Professional Summary, Work Experience, Education, Skills, Languages, and Interests when present. Split contact details onto their own lines, turn noisy all-caps/spaced headings into normal headings, keep dates with the correct roles and employers, and remove layout/OCR artifacts. Do not invent employers, titles, dates, or metrics. Return only the reconstructed CV text.";

    var trimmedInput = text.Trim();

    var forceFlag = Environment.GetEnvironmentVariable("CV_FORCE_AI_NORMALIZER");
    if (string.Equals(forceFlag, "true", StringComparison.OrdinalIgnoreCase))
    {
        var forcedResult = await _cvAiNormalizer.NormalizeAsync(trimmedInput, cancellationToken);
        var forcedText = forcedResult?.NormalizedText;
        if (!string.IsNullOrWhiteSpace(forcedText))
        {
            return forcedText.Trim();
        }
    }

    var inputIsFlattened = LooksLikeFlattenedCvExtraction(trimmedInput);
    var inputHasSignals = HasRecoverableSectionSignals(trimmedInput);
    if (inputHasSignals && !inputIsFlattened)
    {
        // Already structured well enough; skip the AI round-trips.
        return trimmedInput;
    }

    var reconstructed = await _aiService.SummarizeSectionAsync(reconstructionPrompt, trimmedInput, 2800, 900);

    // An empty reconstruction falls back to the original input as the candidate.
    var candidate = string.IsNullOrWhiteSpace(reconstructed) ? trimmedInput : reconstructed.Trim();

    var candidateIsUsable = !LooksLikeFlattenedCvExtraction(candidate) && HasRecoverableSectionSignals(candidate);
    if (candidateIsUsable)
    {
        return candidate;
    }

    // The normalizer is given the original input, not the failed reconstruction.
    var fallbackResult = await _cvAiNormalizer.NormalizeAsync(trimmedInput, cancellationToken);
    var fallbackText = fallbackResult?.NormalizedText;
    return string.IsNullOrWhiteSpace(fallbackText) ? candidate : fallbackText.Trim();
}
|
||
|
||
/// <summary>
/// Heuristic check for CV text whose line structure was lost during extraction.
/// </summary>
/// <param name="text">Extracted CV text.</param>
/// <returns>
/// <c>true</c> when any of these hold: at most 6 non-blank lines with at least 500 characters;
/// at least 3 letter-spaced capital runs (e.g. "S K I L L S"); at least 3 known section aliases
/// within at most 12 non-blank lines; or " + " together with a bullet glyph within at most 10 non-blank lines.
/// Blank input returns <c>false</c>.
/// </returns>
private static bool LooksLikeFlattenedCvExtraction(string text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return false;
    }

    var unified = text.Replace("\r\n", "\n");

    var populatedLines = 0;
    foreach (var line in unified.Split('\n'))
    {
        if (!string.IsNullOrWhiteSpace(line))
        {
            populatedLines++;
        }
    }

    // Lots of text squeezed onto very few lines.
    if (populatedLines <= 6 && unified.Length >= 500)
    {
        return true;
    }

    // Several letter-spaced headings.
    if (Regex.Matches(unified, @"\b(?:[A-Z]\s){3,}[A-Z]\b").Count >= 3)
    {
        return true;
    }

    // Many section names mentioned but hardly any line breaks.
    if (populatedLines <= 12)
    {
        var aliasHits = SectionAliases.Keys.Count(alias => unified.Contains(alias, StringComparison.OrdinalIgnoreCase));
        if (aliasHits >= 3)
        {
            return true;
        }
    }

    // Inline " + " separators combined with bullet glyphs on a short text.
    if (populatedLines > 10 || !unified.Contains(" + "))
    {
        return false;
    }

    return Regex.Matches(unified, @"[•●▪◦]").Count > 0;
}
|
||
|
||
/// <summary>
/// Tells whether the text contains at least one level-1 markdown heading ("# Name") naming a
/// standard CV section, matched case-insensitively on a line of its own.
/// </summary>
/// <param name="text">Candidate CV text.</param>
/// <returns><c>true</c> when such a heading line exists; <c>false</c> for blank input or no match.</returns>
private static bool LooksLikeNormalizedMarkdownCv(string text) =>
    !string.IsNullOrWhiteSpace(text)
    && Regex.IsMatch(text, @"(?im)^#\s+(Contact|Professional Summary|Work Experience|Education|Skills|Languages|Interests)\s*$");
|
||
|
||
/// <summary>
/// Builds a structured CV profile from markdown text that is already split into sections.
/// </summary>
/// <remarks>
/// Parses the sections, builds the profile from them, then post-processes it: fills a missing
/// full name, re-evaluates the contact location, condenses the summary, orders the skills,
/// cleans the interests and drops "Detail not specified" placeholder lines from job bullets
/// and education details.
/// </remarks>
/// <param name="text">Normalized markdown CV text.</param>
/// <returns>The populated profile, with <c>Sections</c> set to the parsed sections.</returns>
private static StructuredCvProfile BuildStructuredCvFromNormalizedMarkdown(string text)
{
    const string placeholderMarker = "Detail not specified";

    var sections = new List<StructuredCvSection>();
    foreach (var parsed in ParseSections(text))
    {
        sections.Add(new StructuredCvSection
        {
            Name = parsed.Name,
            Content = parsed.Content,
            WordCount = CountWords(parsed.Content),
        });
    }

    var profile = StructuredCvProfileJson.FromSections(sections);
    profile.Sections = sections;

    // Fall back to guessing the name from the text, then from the e-mail address.
    if (string.IsNullOrWhiteSpace(profile.Contact.FullName))
    {
        profile.Contact.FullName = GuessFullName(text) ?? GuessFullNameFromEmail(profile.Contact.Email);
    }

    // Prefer the Contact section as the location source; use the whole text when it is missing.
    var locationSource = sections.FirstOrDefault(section => section.Name == "Contact")?.Content ?? text;
    profile.Contact.Location = PreferDetectedLocation(locationSource, profile.Contact.Location, profile.Contact.FullName);

    profile.Summary = CondenseSummary(profile.Summary);
    profile.Skills = OrderSkills(profile.Skills);
    profile.Interests = CleanInterestItems(profile.Interests);

    foreach (var job in profile.Jobs)
    {
        job.Bullets = job.Bullets
            .Where(bullet => !bullet.Contains(placeholderMarker, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    foreach (var education in profile.Education)
    {
        education.Details = education.Details
            .Where(detail => !detail.Contains(placeholderMarker, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    return profile;
}
|
||
|
||
/// <summary>
/// Collapses a multi-line summary into a single space-joined entry.
/// </summary>
/// <param name="summary">Summary lines.</param>
/// <returns>
/// The same list instance when it has zero or one entries; otherwise a new list holding the
/// joined, trimmed text, or an empty list when that text is blank.
/// </returns>
private static List<string> CondenseSummary(List<string> summary)
{
    if (summary.Count > 1)
    {
        var merged = string.Join(" ", summary).Trim();
        var condensed = new List<string>();

        // After Trim(), an empty string is the only possible "blank" result.
        if (merged.Length > 0)
        {
            condensed.Add(merged);
        }

        return condensed;
    }

    return summary;
}
|
||
|
||
/// <summary>
/// Removes case-insensitive duplicates from the skills (keeping the first spelling seen)
/// and returns them sorted case-insensitively.
/// </summary>
/// <param name="skills">Skill names, possibly with duplicates.</param>
/// <returns>A new, de-duplicated and sorted list; the input list is not modified.</returns>
private static List<string> OrderSkills(List<string> skills)
{
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var unique = new List<string>();

    foreach (var skill in skills)
    {
        if (seen.Add(skill))
        {
            unique.Add(skill);
        }
    }

    // No two remaining entries compare equal under this comparer, so sort stability is irrelevant.
    unique.Sort(StringComparer.OrdinalIgnoreCase);
    return unique;
}
|
||
|
||
/// <summary>
/// Filters out interest entries that mention "linkedin" or "realtor" (case-insensitive)
/// or that contain a URL-like fragment ("http://", "https://" or "www.").
/// </summary>
/// <param name="interests">Interest entries.</param>
/// <returns>A new list with the remaining entries in their original order.</returns>
private static List<string> CleanInterestItems(List<string> interests)
{
    var kept = new List<string>();

    foreach (var item in interests)
    {
        if (item.Contains("linkedin", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (item.Contains("realtor", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (Regex.IsMatch(item, @"https?://|www\.", RegexOptions.IgnoreCase))
        {
            continue;
        }

        kept.Add(item);
    }

    return kept;
}
|
||
|
||
/// <summary>
/// Maps a heading-like line to its canonical section name using <c>SectionAliases</c>.
/// </summary>
/// <remarks>
/// Leading '#' markers and trailing ':' are stripped before the lookup. Lines that are empty,
/// longer than 60 characters, contain a '.', or contain a run of two spaces are treated as
/// body text rather than headings.
/// </remarks>
/// <param name="line">A single line of CV text.</param>
/// <returns>The canonical section name, or <c>null</c> when the line is not a known heading.</returns>
private static string? CanonicalizeSectionHeading(string line)
{
    if (string.IsNullOrWhiteSpace(line))
    {
        return null;
    }

    var heading = line.Trim();
    if (heading.StartsWith("#", StringComparison.Ordinal))
    {
        heading = heading.TrimStart('#').Trim();
    }

    heading = heading.TrimEnd(':').Trim();
    if (heading.Length == 0 || heading.Length > 60)
    {
        return null;
    }

    // Reject only runs of two spaces (a sign of column/layout text). Rejecting any single
    // space would make every multi-word alias ("work experience", "professional summary", ...)
    // unreachable.
    if (heading.Contains('.') || heading.Contains("  ", StringComparison.Ordinal))
    {
        return null;
    }

    return SectionAliases.TryGetValue(heading, out var canonical) ? canonical : null;
}
|
||
|
||
/// <summary>
/// Checks whether the text shows any sign of section structure.
/// </summary>
/// <param name="text">CV text to inspect.</param>
/// <returns>
/// <c>true</c> when parsing yields a section not named "General", or when a line starts with a
/// well-known section name, optionally prefixed by a single '#'.
/// </returns>
private static bool HasRecoverableSectionSignals(string text)
{
    foreach (var section in ParseSections(text))
    {
        if (!string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    // Plain heading at the start of a line.
    if (Regex.IsMatch(text, @"(?im)^\s*(Contact|Professional Summary|Summary|Work Experience|Experience|Education|Skills|Languages|Interests)\s*:?"))
    {
        return true;
    }

    // Markdown heading at the start of a line.
    return Regex.IsMatch(text, @"(?im)^\s*#\s*(Contact|Professional Summary|Summary|Work Experience|Experience|Education|Skills|Languages|Interests)");
}
|
||
|
||
/// <summary>
/// Best-effort plain-text extraction from an uploaded CV file, chosen by file extension.
/// </summary>
/// <remarks>
/// <list type="bullet">
/// <item><description>.txt / .md: read as text (UTF-8 unless a byte-order mark says otherwise) and trimmed.</description></item>
/// <item><description>.pdf: string literals of <c>Tj</c> / <c>TJ</c> operators are pulled out of the raw bytes with regular expressions; if none are found the raw decoded bytes are used. Control characters are blanked and whitespace collapsed.</description></item>
/// <item><description>.docx: <c>word/document.xml</c> is read from the archive, tags are replaced by spaces, entities decoded and whitespace collapsed.</description></item>
/// <item><description>Any other extension: an empty string.</description></item>
/// </list>
/// </remarks>
/// <param name="file">The uploaded file.</param>
/// <param name="extension">File extension including the leading dot; compared case-insensitively.</param>
/// <returns>The extracted text, or an empty string when nothing could be extracted.</returns>
private static async Task<string> ExtractTextAsync(IFormFile file, string extension)
{
    if (string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase) || string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase))
    {
        using var stream = file.OpenReadStream();
        using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true);
        return (await reader.ReadToEndAsync()).Trim();
    }

    await using var memory = new MemoryStream();
    await file.CopyToAsync(memory);
    var bytes = memory.ToArray();

    if (string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase))
    {
        var raw = Encoding.UTF8.GetString(bytes);
        var textMatches = Regex.Matches(raw, @"\((.*?)\)Tj", RegexOptions.Singleline)
            .Select(match => match.Groups[1].Value)
            .Concat(Regex.Matches(raw, @"\[(.*?)\]TJ", RegexOptions.Singleline)
                .SelectMany(match => Regex.Matches(match.Groups[1].Value, @"\((.*?)\)", RegexOptions.Singleline).Select(x => x.Groups[1].Value)))
            .Where(value => !string.IsNullOrWhiteSpace(value))
            .Select(UnescapePdfLiteral)
            .ToList();

        // No text operators found: fall back to the raw decoded bytes.
        var joined = textMatches.Count > 0 ? string.Join(" ", textMatches) : raw;
        var scrubbed = Regex.Replace(joined, @"[\x00-\x08\x0B\x0C\x0E-\x1F]", " ");
        return Regex.Replace(scrubbed, @"\s+", " ").Trim();
    }

    if (string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase))
    {
        // leaveOpen: false, so disposing the archive also disposes the backing stream.
        using var archive = new System.IO.Compression.ZipArchive(new MemoryStream(bytes), System.IO.Compression.ZipArchiveMode.Read, leaveOpen: false);
        var entry = archive.GetEntry("word/document.xml");
        if (entry is null)
        {
            return string.Empty;
        }

        using var entryStream = entry.Open();
        using var reader = new StreamReader(entryStream, Encoding.UTF8);
        var xml = await reader.ReadToEndAsync();
        var withoutTags = Regex.Replace(xml, "<[^>]+>", " ");
        var decoded = System.Net.WebUtility.HtmlDecode(withoutTags) ?? string.Empty;
        return Regex.Replace(decoded, @"\s+", " ").Trim();
    }

    return string.Empty;

    // Regex.Unescape throws ArgumentException on escape sequences it does not recognise.
    // PDF literals come from arbitrary uploaded bytes, so keep the raw literal instead of
    // letting one bad sequence abort the whole extraction.
    static string UnescapePdfLiteral(string value)
    {
        try
        {
            return Regex.Unescape(value);
        }
        catch (ArgumentException)
        {
            return value;
        }
    }
}
|
||
}
|