using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using JobTrackerApi.Data;
using JobTrackerApi.Services;
using JobTrackerApi.Models;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Identity;
using Microsoft.AspNetCore.Mvc;
using Microsoft.EntityFrameworkCore;

namespace JobTrackerApi.Controllers;

/// <summary>
/// Authenticated API (route <c>api/profile-cv</c>, authentication scheme <c>local</c>) for the
/// master CV of the signed-in user: file upload, reprocessing, section parsing and AI-assisted
/// rewriting. Every operation that changes the stored CV records a <c>CvExtractionRun</c>.
/// </summary>
/// <remarks>
/// NOTE(review): this copy of the file looks damaged and is reproduced token-for-token.
/// Generic type arguments appear to be missing in many declarations (for example the
/// <c>HashSet</c> and <c>Dictionary</c> fields and several <c>Task</c> return types), a few
/// string literals are split across two physical lines, and part of the text between
/// <c>NormalizeTextForStructuredParsing</c> and <c>SplitSentences</c> is missing.
/// Restore those parts from version control before building.
/// </remarks>
[ApiController]
[Route("api/profile-cv")]
[Authorize(AuthenticationSchemes = "local")]
public sealed class ProfileCvController : ControllerBase
{
    /// <summary>File extensions accepted by <c>Upload</c>; compared case-insensitively.</summary>
    private static readonly HashSet AllowedExtensions = new(StringComparer.OrdinalIgnoreCase)
    {
        ".txt", ".md", ".pdf", ".docx", ".png", ".jpg", ".jpeg", ".webp",
    };

    /// <summary>
    /// Maps known heading aliases (keys, compared case-insensitively) to the canonical
    /// section name (values) used throughout the parser.
    /// </summary>
    private static readonly Dictionary SectionAliases = new(StringComparer.OrdinalIgnoreCase)
    {
        ["professional summary"] = "Professional Summary",
        ["summary"] = "Professional Summary",
        ["profile"] = "Professional Summary",
        ["about me"] = "Professional Summary",
        ["contact"] = "Contact",
        ["contact details"] = "Contact",
        ["core skills"] = "Skills",
        ["skills"] = "Skills",
        ["technical skills"] = "Skills",
        ["experience"] = "Work Experience",
        ["experience highlights"] = "Work Experience",
        ["work experience"] = "Work Experience",
        ["employment history"] = "Work Experience",
        ["selected achievements"] = "Selected Achievements",
        ["achievements"] = "Selected Achievements",
        ["projects"] = "Projects",
        ["education"] = "Education",
        ["certifications"] = "Certifications",
        ["certificates"] = "Certifications",
        ["languages"] = "Languages",
        ["interests"] = "Interests",
    };

    /// <summary>Upload size cap in bytes (5 * 1024 * 1024).</summary>
    private const long MaxFileSizeBytes = 5 * 1024 * 1024;

    // Version tags copied onto every CvExtractionRun created by this controller.
    private const string ParserVersion = "m005-s01";
    private const string NormalizerVersion = "m005-s01";
    private const string LlmPromptVersion = "m005-s01";

    private readonly UserManager _users;
    private readonly ISummarizerService _aiService;
    private readonly ICvAiClassifier _cvAiClassifier;
    private readonly JobTrackerContext _db;
    private readonly AppPaths _paths;

    /// <summary>Stores the injected services.</summary>
    /// <param name="users">Identity user manager used to resolve and update the current user.</param>
    /// <param name="aiService">Service used for text extraction and AI rewriting.</param>
    /// <param name="db">EF Core context holding upload artifacts and extraction runs.</param>
    /// <param name="paths">Provides <c>CvArtifactsRoot</c>, the root folder for stored uploads.</param>
    /// <param name="cvAiClassifier">Optional classifier; <c>NoOpCvAiClassifier.Instance</c> is used when null.</param>
    public ProfileCvController(UserManager users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths, ICvAiClassifier? cvAiClassifier = null)
    {
        _users = users;
        _aiService = aiService;
        _cvAiClassifier = cvAiClassifier ?? NoOpCvAiClassifier.Instance;
        _db = db;
        _paths = paths;
    }

    /// <summary>Request body for <c>RewriteSection</c>.</summary>
    public sealed record RewriteSectionRequest(string SectionName, string? Style, string? TargetRole);

    /// <summary>Request body for <c>Parse</c>; when <c>Text</c> is blank the stored CV text is parsed.</summary>
    public sealed record ParseCvRequest(string? Text);

    /// <summary>Result of the file extraction pipeline: raw text, normalized text and the structured profile.</summary>
    private sealed record ExtractionPipelineResult(string RawText, string NormalizedText, StructuredCvProfile StructuredCv);

    /// <summary>A block of CV text together with the section it was assigned to and the optional classifier output.</summary>
    private sealed record ClassifiedCvBlock(int Index, string OriginalBlock, string SectionName, string Content, CvBlockClassificationResult? Classification);

    /// <summary>Projection of a <c>CvExtractionRun</c> returned by <c>GetRuns</c>.</summary>
    public sealed record CvExtractionRunListItem(
        int Id,
        string Trigger,
        string Status,
        string? ArtifactFileName,
        DateTimeOffset StartedAtUtc,
        DateTimeOffset? CompletedAtUtc,
        DateTimeOffset? AppliedAtUtc,
        string ParserVersion,
        string NormalizerVersion,
        string LlmPromptVersion,
        string? ErrorMessage);

    /// <summary>
    /// Imports a CV file: validates size and extension, stores the file as an artifact, records a
    /// run with status <c>running</c>, extracts and structures the text, then writes the result
    /// onto the user and marks the run <c>applied</c>.
    /// </summary>
    /// <param name="file">Uploaded form file.</param>
    /// <returns>
    /// 401 when no user is resolved; 400 for a missing, empty, oversized or unsupported file, or
    /// when the user update fails; otherwise 200 with the structured CV and the new identifiers.
    /// Exceptions thrown during extraction are rethrown after the run is marked <c>failed</c>.
    /// </returns>
    [HttpPost("upload")]
    [RequestSizeLimit(MaxFileSizeBytes)]
    public async Task Upload([FromForm] IFormFile file)
    {
        var user = await _users.GetUserAsync(User);
        if (user is null) return Unauthorized();
        if (file is null || file.Length == 0) return BadRequest("Select a CV file to upload.");
        if (file.Length > MaxFileSizeBytes) return BadRequest("CV import file is too large. Keep it under 5 MB.");

        var extension = Path.GetExtension(file.FileName ?? string.Empty);
        if (!AllowedExtensions.Contains(extension))
        {
            return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
        }

        // The artifact is saved first so that artifact.Id is available for the run below.
        var artifact = await SaveUploadArtifactAsync(user, file, HttpContext.RequestAborted);
        _db.CvUploadArtifacts.Add(artifact);
        await _db.SaveChangesAsync(HttpContext.RequestAborted);

        // The run is persisted as running before extraction so that a failure can be recorded against it.
        var run = new CvExtractionRun
        {
            OwnerUserId = user.Id,
            ArtifactId = artifact.Id,
            Trigger = "upload",
            ParserVersion = ParserVersion,
            NormalizerVersion = NormalizerVersion,
            LlmPromptVersion = LlmPromptVersion,
            Status = "running",
            StartedAtUtc = DateTimeOffset.UtcNow,
        };
        _db.CvExtractionRuns.Add(run);
        await _db.SaveChangesAsync(HttpContext.RequestAborted);

        try
        {
            var result = await ExtractStructuredCvFromFileAsync(file, extension, HttpContext.RequestAborted);
            result.StructuredCv.Metadata.ProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
            result.StructuredCv.Metadata.AppliedExtractionRunId = run.Id;
            result.StructuredCv.Metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;
            var structuredJson = StructuredCvProfileJson.Serialize(result.StructuredCv);

            run.RawExtractedText = result.RawText;
            run.NormalizedText = result.NormalizedText;
            run.StructuredProfileJson = structuredJson;
            run.Status = "applied";
            run.CompletedAtUtc = DateTimeOffset.UtcNow;
            run.AppliedAtUtc = run.CompletedAtUtc;

            user.ProfileCvText = result.NormalizedText;
            user.ProfileCvStructureJson = structuredJson;
            user.CurrentCvUploadArtifactId = artifact.Id;
            user.CurrentCvExtractionRunId = run.Id;
            user.CurrentCvProfileVersion = result.StructuredCv.Metadata.ProfileVersion;

            var update = await _users.UpdateAsync(user);
            if (!update.Succeeded)
            {
                // NOTE(review): the run keeps the CompletedAtUtc and AppliedAtUtc values assigned
                // above even though its status becomes failed here - confirm that is intended.
                run.Status = "failed";
                run.ErrorMessage = string.Join("; ", update.Errors.Select(e => e.Description));
                await _db.SaveChangesAsync(HttpContext.RequestAborted);
                return BadRequest(run.ErrorMessage);
            }

            await _db.SaveChangesAsync(HttpContext.RequestAborted);
            return Ok(new
            {
                imported = true,
                characters = result.NormalizedText.Length,
                structuredCv = result.StructuredCv,
                sections = result.StructuredCv.Sections,
                artifactId = artifact.Id,
                extractionRunId = run.Id,
                profileVersion = result.StructuredCv.Metadata.ProfileVersion,
            });
        }
        catch (Exception ex)
        {
            run.Status = "failed";
            run.ErrorMessage = ex.Message;
            run.CompletedAtUtc = DateTimeOffset.UtcNow;
            // NOTE(review): this save reuses the request-aborted token. If the exception was caused
            // by the request being cancelled, this save is likely to be cancelled as well, so the
            // failed status would not be persisted and the original exception would be replaced.
            // Consider CancellationToken.None here.
            await _db.SaveChangesAsync(HttpContext.RequestAborted);
            throw;
        }
    }

    /// <summary>
    /// Returns up to 10 extraction runs owned by the current user, ordered by
    /// <c>StartedAtUtc</c> descending. The query is read-only (<c>AsNoTracking</c>).
    /// </summary>
    [HttpGet("runs")]
    public async Task>> GetRuns()
    {
        var user = await _users.GetUserAsync(User);
        if (user is null) return Unauthorized();

        var runs = await _db.CvExtractionRuns
            .AsNoTracking()
            .Where(x => x.OwnerUserId == user.Id)
            .OrderByDescending(x => x.StartedAtUtc)
            .Take(10)
            .Select(x => new CvExtractionRunListItem(
                x.Id,
                x.Trigger,
                x.Status,
                x.Artifact != null ? x.Artifact.OriginalFileName : null,
                x.StartedAtUtc,
                x.CompletedAtUtc,
                x.AppliedAtUtc,
                x.ParserVersion,
                x.NormalizerVersion,
                x.LlmPromptVersion,
                x.ErrorMessage))
            .ToListAsync(HttpContext.RequestAborted);

        return Ok(runs);
    }

    /// <summary>
    /// Re-runs extraction on the most recently uploaded artifact of the current user (by
    /// <c>UploadedAtUtc</c>), reading the file from its stored path, and applies the result as a
    /// new run with trigger <c>reprocess</c>.
    /// </summary>
    /// <returns>401 without a user; 400 when there is no artifact or its file is missing; otherwise 200.</returns>
    [HttpPost("reprocess")]
    public async Task Reprocess()
    {
        var user = await _users.GetUserAsync(User);
        if (user is null) return Unauthorized();

        var artifact = await _db.CvUploadArtifacts
            .OrderByDescending(x => x.UploadedAtUtc)
            .FirstOrDefaultAsync(x => x.OwnerUserId == user.Id, HttpContext.RequestAborted);
        if (artifact is null) return BadRequest("Upload a CV before reprocessing it.");
        if (string.IsNullOrWhiteSpace(artifact.StoragePath) || !System.IO.File.Exists(artifact.StoragePath))
        {
            return BadRequest("The stored CV artifact could not be found for reprocessing.");
        }

        // Wrap the stored file in a FormFile so the same pipeline used by Upload can consume it.
        await using var stream = System.IO.File.OpenRead(artifact.StoragePath);
        var file = new FormFile(stream, 0, stream.Length, "file", artifact.OriginalFileName)
        {
            Headers = new HeaderDictionary(),
            ContentType = artifact.MimeType
        };
        var extension = Path.GetExtension(artifact.OriginalFileName ?? string.Empty);
        var result = await ExtractStructuredCvFromFileAsync(file, extension, HttpContext.RequestAborted);
        await ApplyTextExtractionRunAsync(user, "reprocess", result.RawText, result.NormalizedText, result.StructuredCv, artifact.Id, HttpContext.RequestAborted);

        return Ok(new
        {
            reprocessed = true,
            artifactId = artifact.Id,
            extractionRunId = user.CurrentCvExtractionRunId,
            profileVersion = user.CurrentCvProfileVersion,
            structuredCv = result.StructuredCv,
            sections = result.StructuredCv.Sections,
        });
    }

    /// <summary>
    /// Asks the AI service to rewrite the stored CV text into a sectioned master CV, then
    /// re-structures the trimmed result and applies it as a new run with trigger <c>rebuild</c>.
    /// </summary>
    /// <returns>401 without a user; 400 when there is no stored CV text or the AI result is blank; otherwise 200.</returns>
    [HttpPost("rebuild")]
    public async Task Rebuild()
    {
        var user = await _users.GetUserAsync(User);
        if (user is null) return Unauthorized();
        if (string.IsNullOrWhiteSpace(user.ProfileCvText)) return BadRequest("Add or import CV text before rebuilding it.");

        var rebuilt = await _aiService.SummarizeSectionAsync(
            "Rewrite this CV into a stronger master CV with clear sections such as Professional Summary, Core Skills, Experience Highlights, and Selected Achievements. Preserve only factual claims, avoid inventing employers or metrics, and make the output clean and ready for tailoring to job applications. 
Return only the rebuilt CV text.",
            user.ProfileCvText,
            2200,
            700);
        if (string.IsNullOrWhiteSpace(rebuilt))
        {
            return BadRequest("The AI service could not rebuild your CV text right now.");
        }

        user.ProfileCvText = rebuilt.Trim();
        var structuredCv = await BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted);
        await ApplyTextExtractionRunAsync(user, "rebuild", user.ProfileCvText, user.ProfileCvText, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);

        return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections, extractionRunId = user.CurrentCvExtractionRunId, profileVersion = user.CurrentCvProfileVersion });
    }

    /// <summary>
    /// Asks the AI service to rewrite a single named section of the stored CV text and returns
    /// the rewritten text. Nothing is persisted by this action.
    /// </summary>
    /// <param name="request">
    /// Section name (defaults to <c>Professional Summary</c> when blank), style (defaults to
    /// <c>balanced</c> when blank) and an optional target role. These values are placed into
    /// the prompt text after trimming.
    /// </param>
    /// <returns>401 without a user; 400 when there is no stored CV text or the AI result is blank; otherwise 200.</returns>
    [HttpPost("rewrite-section")]
    public async Task RewriteSection([FromBody] RewriteSectionRequest request)
    {
        var user = await _users.GetUserAsync(User);
        if (user is null) return Unauthorized();
        if (string.IsNullOrWhiteSpace(user.ProfileCvText)) return BadRequest("Add or import CV text before rewriting a section.");

        var sectionName = string.IsNullOrWhiteSpace(request.SectionName) ? "Professional Summary" : request.SectionName.Trim();
        var style = string.IsNullOrWhiteSpace(request.Style) ? "balanced" : request.Style.Trim();
        var targetRole = string.IsNullOrWhiteSpace(request.TargetRole) ? null : request.TargetRole.Trim();

        var rewritten = await _aiService.SummarizeSectionAsync(
            $"Rewrite only the '{sectionName}' section of this CV. Preserve facts, avoid inventing employers or metrics, and output only the rewritten section text. Style: {style}. {(targetRole is not null ? $"Target role: {targetRole}." 
: "Make it broadly reusable for future tailoring.")}",
            user.ProfileCvText,
            900,
            180);
        if (string.IsNullOrWhiteSpace(rewritten))
        {
            return BadRequest("The AI service could not rewrite that CV section right now.");
        }

        return Ok(new { sectionName, style, targetRole, text = rewritten.Trim() });
    }

    /// <summary>
    /// Parses CV text into a structured profile and applies it as a new run with trigger
    /// <c>parse</c>. The request text is used when it is not blank; otherwise the stored CV
    /// text is used.
    /// </summary>
    /// <remarks>
    /// In both cases <c>ApplyTextExtractionRunAsync</c> assigns the parsed source to
    /// <c>user.ProfileCvText</c>, so text supplied in the request replaces the stored CV text.
    /// </remarks>
    /// <returns>401 without a user; 400 when no text is available; otherwise 200 with sections and word count.</returns>
    [HttpPost("parse")]
    public async Task> Parse([FromBody] ParseCvRequest? request)
    {
        var user = await _users.GetUserAsync(User);
        if (user is null) return Unauthorized();

        var source = string.IsNullOrWhiteSpace(request?.Text) ? user.ProfileCvText : request!.Text;
        if (string.IsNullOrWhiteSpace(source)) return BadRequest("Add or import CV text before parsing sections.");

        var structuredCv = await BuildStructuredCvAsync(source, HttpContext.RequestAborted);
        // NOTE(review): when the request text is blank, source already is user.ProfileCvText, so
        // this assignment does not change the value; it looks redundant.
        if (string.IsNullOrWhiteSpace(request?.Text))
        {
            user.ProfileCvText = source;
        }

        await ApplyTextExtractionRunAsync(user, "parse", source, source, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);
        return Ok(new { structuredCv, sections = structuredCv.Sections, totalWords = CountWords(source), extractionRunId = user.CurrentCvExtractionRunId, profileVersion = user.CurrentCvProfileVersion });
    }

    /// <summary>
    /// Asks the AI service for a cleaner version of the stored CV text, then re-structures the
    /// trimmed result and applies it as a new run with trigger <c>improve</c>.
    /// </summary>
    /// <returns>401 without a user; 400 when there is no stored CV text or the AI result is blank; otherwise 200.</returns>
    [HttpPost("improve")]
    public async Task Improve()
    {
        var user = await _users.GetUserAsync(User);
        if (user is null) return Unauthorized();
        if (string.IsNullOrWhiteSpace(user.ProfileCvText)) return BadRequest("Add or import CV text before improving it.");

        var improved = await _aiService.SummarizeSectionAsync(
            "Rewrite this CV into a cleaner, better-structured master CV profile. Preserve factual claims, employers, skills, and measurable results. Improve clarity, tighten wording, use strong bullet-style phrasing, and keep it ready for further tailoring to specific roles. 
Return only the improved CV text.",
            user.ProfileCvText,
            1800,
            500);
        if (string.IsNullOrWhiteSpace(improved))
        {
            return BadRequest("The AI service could not improve your CV text right now.");
        }

        user.ProfileCvText = improved.Trim();
        var structuredCv = await BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted);
        await ApplyTextExtractionRunAsync(user, "improve", user.ProfileCvText, user.ProfileCvText, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);

        return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections, extractionRunId = user.CurrentCvExtractionRunId, profileVersion = user.CurrentCvProfileVersion });
    }

    /// <summary>
    /// Builds a structured CV profile from plain text by combining several sources: heading-based
    /// section parsing, block classification (only when no section other than <c>General</c> was
    /// found), regex heuristics, and an AI JSON extraction.
    /// </summary>
    /// <remarks>
    /// The sources are combined with <c>StructuredCvProfileJson.Merge</c>; presumably its first
    /// argument takes precedence - confirm against that helper. The full name is back-filled from
    /// <c>GuessFullName</c> or <c>GuessFullNameFromEmail</c> when still null.
    /// </remarks>
    /// <param name="text">CV text to structure.</param>
    /// <param name="cancellationToken">Passed to the classifier and AI extraction calls.</param>
    private async Task BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
    {
        var parseSource = NormalizeTextForStructuredParsing(text);
        var parsedSections = ParseSections(parseSource)
            .Select(section => new StructuredCvSection
            {
                Name = section.Name,
                Content = section.Content,
                WordCount = CountWords(section.Content),
            })
            .ToList();

        var hasRealSections = parsedSections.Any(section => !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase));
        List classifiedBlocks = new();
        List fallbackSections = parsedSections;
        StructuredCvProfile? classifierFallback = null;

        // Block classification is only attempted when heading parsing found nothing but General.
        if (!hasRealSections)
        {
            classifiedBlocks = await ClassifyBlocksAsync(parseSource, cancellationToken);
            var hasMeaningfulClassifierStructure = classifiedBlocks.Any(block => !string.Equals(block.SectionName, "General", StringComparison.OrdinalIgnoreCase));
            if (hasMeaningfulClassifierStructure)
            {
                fallbackSections = BuildSectionsFromClassifiedBlocks(classifiedBlocks);
                classifierFallback = BuildStructuredCvFromClassifiedBlocks(classifiedBlocks);
            }
        }

        var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
        AnnotateStructuredCv(sectionFallback, "repair", 0.56);
        var heuristicFallback = BuildHeuristicStructuredCv(parseSource, text);
        AnnotateStructuredCv(heuristicFallback, "deterministic", 0.68);
        // The heuristic profile contributes fields only; its section list is cleared before merging.
        heuristicFallback.Sections = new List();

        var fallback = StructuredCvProfileJson.Merge(heuristicFallback, sectionFallback);
        if (classifierFallback is not null)
        {
            fallback = StructuredCvProfileJson.Merge(classifierFallback, fallback);
        }

        fallback.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email);

        // extracted may be null (see TryExtractStructuredCvAsync); Merge is called with it either way.
        var extracted = await TryExtractStructuredCvAsync(parseSource, cancellationToken);
        var merged = StructuredCvProfileJson.Merge(extracted, fallback);
        merged.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(merged.Contact.Email);
        return StructuredCvProfileJson.Normalize(merged);
    }

    /// <summary>
    /// Copies the upload to <c>{CvArtifactsRoot}/{user.Id}/{UTC timestamp}-{guid}{extension}</c>,
    /// re-reads the stored file to compute its SHA-256, and returns a new
    /// <c>CvUploadArtifact</c> describing it. The artifact is not added to the DbContext here.
    /// </summary>
    /// <param name="user">Owner of the artifact; <c>user.Id</c> is used as the folder name.</param>
    /// <param name="file">Uploaded file; its name supplies the stored extension and original file name.</param>
    /// <param name="cancellationToken">Passed to the copy and hash operations.</param>
    private async Task SaveUploadArtifactAsync(ApplicationUser user, IFormFile file, CancellationToken cancellationToken)
    {
        var extension = Path.GetExtension(file.FileName ?? string.Empty);
        var userRoot = Path.Combine(_paths.CvArtifactsRoot, user.Id);
        Directory.CreateDirectory(userRoot);

        // The stored name is generated; only the extension is taken from the client file name.
        var storedFileName = $"{DateTimeOffset.UtcNow:yyyyMMddHHmmss}-{Guid.NewGuid():N}{extension}";
        var storagePath = Path.Combine(userRoot, storedFileName);

        await using (var target = System.IO.File.Create(storagePath))
        await using (var source = file.OpenReadStream())
        {
            await source.CopyToAsync(target, cancellationToken);
        }

        await using var hashStream = System.IO.File.OpenRead(storagePath);
        var shaBytes = await SHA256.HashDataAsync(hashStream, cancellationToken);

        return new CvUploadArtifact
        {
            OwnerUserId = user.Id,
            OriginalFileName = file.FileName ?? storedFileName,
            StoredFileName = storedFileName,
            MimeType = file.ContentType ?? "application/octet-stream",
            ByteSize = file.Length,
            Sha256 = Convert.ToHexString(shaBytes),
            StoragePath = storagePath,
            UploadedAtUtc = DateTimeOffset.UtcNow,
        };
    }

    /// <summary>
    /// Extracts text from a CV file and structures it. The AI text extraction is tried first;
    /// when it yields blank text, <c>ExtractTextAsync(file, extension)</c> is used instead.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the extensions checked for <c>canUseAiExtraction</c> are the same eight
    /// values listed in <c>AllowedExtensions</c>, so for files accepted by <c>Upload</c> the
    /// else branch is not reached; the check could use that set.
    /// </remarks>
    /// <exception cref="InvalidOperationException">Both extraction attempts produced blank text.</exception>
    private async Task ExtractStructuredCvFromFileAsync(IFormFile file, string extension, CancellationToken cancellationToken)
    {
        string text;
        var canUseAiExtraction =
            string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(extension, ".png", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(extension, ".jpg", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(extension, ".jpeg", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(extension, ".webp", StringComparison.OrdinalIgnoreCase);

        if (canUseAiExtraction)
        {
            await using var uploadStream = file.OpenReadStream();
            var extracted = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, cancellationToken);
            text = extracted?.Text?.Trim() ?? string.Empty;
        }
        else
        {
            text = string.Empty;
        }

        if (string.IsNullOrWhiteSpace(text))
        {
            text = (await ExtractTextAsync(file, extension)).Trim();
        }

        if (string.IsNullOrWhiteSpace(text))
        {
            throw new InvalidOperationException("The uploaded CV file could not be read or was empty.");
        }

        var normalizedText = (await MaybeReconstructStructuredCvAsync(text, cancellationToken)).Trim();
        var structuredCv = await BuildStructuredCvAsync(normalizedText, cancellationToken);
        return new ExtractionPipelineResult(text, normalizedText, structuredCv);
    }

    /// <summary>
    /// Records a run that is already marked <c>applied</c>, stamps the structured profile with
    /// the next profile version and the run id, and writes text, JSON and current-run pointers
    /// onto the user.
    /// </summary>
    /// <param name="user">User whose CV fields are updated in place.</param>
    /// <param name="trigger">Stored as the run trigger (for example reprocess, rebuild, parse, improve).</param>
    /// <param name="rawText">Stored as <c>RawExtractedText</c> on the run.</param>
    /// <param name="normalizedText">Stored on the run and assigned to <c>user.ProfileCvText</c>.</param>
    /// <param name="structuredCv">Profile to serialize; its metadata is mutated here.</param>
    /// <param name="artifactId">When it has a value, also becomes the current upload artifact of the user.</param>
    /// <param name="cancellationToken">Passed to every <c>SaveChangesAsync</c> call.</param>
    /// <exception cref="InvalidOperationException">
    /// The user update failed; the run is saved with status <c>failed</c> and the joined error text first.
    /// </exception>
    private async Task ApplyTextExtractionRunAsync(ApplicationUser user, string trigger, string rawText, string normalizedText, StructuredCvProfile structuredCv, int? artifactId, CancellationToken cancellationToken)
    {
        var run = new CvExtractionRun
        {
            OwnerUserId = user.Id,
            ArtifactId = artifactId,
            Trigger = trigger,
            ParserVersion = ParserVersion,
            NormalizerVersion = NormalizerVersion,
            LlmPromptVersion = LlmPromptVersion,
            Status = "applied",
            RawExtractedText = rawText,
            NormalizedText = normalizedText,
            StartedAtUtc = DateTimeOffset.UtcNow,
            CompletedAtUtc = DateTimeOffset.UtcNow,
            AppliedAtUtc = DateTimeOffset.UtcNow,
        };
        _db.CvExtractionRuns.Add(run);
        // Saved before the profile is serialized because run.Id is written into the metadata below.
        await _db.SaveChangesAsync(cancellationToken);

        structuredCv.Metadata.ProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
        structuredCv.Metadata.AppliedExtractionRunId = run.Id;
        structuredCv.Metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;
        var structuredJson = StructuredCvProfileJson.Serialize(structuredCv);

        run.StructuredProfileJson = structuredJson;
        user.ProfileCvText = normalizedText;
        user.ProfileCvStructureJson = structuredJson;
        user.CurrentCvExtractionRunId = run.Id;
        user.CurrentCvProfileVersion = structuredCv.Metadata.ProfileVersion;
        if (artifactId.HasValue)
        {
            user.CurrentCvUploadArtifactId = artifactId.Value;
        }

        var update = await _users.UpdateAsync(user);
        if (!update.Succeeded)
        {
            // NOTE(review): AppliedAtUtc and CompletedAtUtc stay set on a run that ends up failed.
            run.Status = "failed";
            run.ErrorMessage = string.Join("; ", update.Errors.Select(e => e.Description));
            await _db.SaveChangesAsync(cancellationToken);
            throw new InvalidOperationException(run.ErrorMessage);
        }

        await _db.SaveChangesAsync(cancellationToken);
    }

    /// <summary>
    /// Writes field-level metadata (method, confidence, source snippet) into
    /// <c>profile.Metadata.Fields</c> for each top-level field that currently has a non-blank
    /// value. Existing entries under the same keys are replaced.
    /// </summary>
    /// <param name="profile">Profile to annotate; <c>Metadata</c> and <c>Metadata.Fields</c> are created when null.</param>
    /// <param name="method">Stored as the extraction method of every entry written.</param>
    /// <param name="confidence">Stored as the confidence of every entry written.</param>
    private static void AnnotateStructuredCv(StructuredCvProfile profile, string method, double confidence)
    {
        var now = DateTimeOffset.UtcNow;
        profile.Metadata ??= new StructuredCvMetadata();
        profile.Metadata.Fields ??= new Dictionary();

        // Records metadata for key only when value is non-blank; the snippet is capped at 180 characters.
        void SetIf(string key, string? value)
        {
            if (string.IsNullOrWhiteSpace(value)) return;
            profile.Metadata.Fields[key] = new StructuredCvFieldMetadata
            {
                Confidence = confidence,
                Method = method,
                SourceSnippet = value.Length > 180 ? value[..180] : value,
                ReviewState = "suggested",
                LastUpdatedAtUtc = now,
            };
        }

        SetIf("contact.fullName", profile.Contact.FullName);
        SetIf("contact.headline", profile.Contact.Headline);
        SetIf("contact.email", profile.Contact.Email);
        SetIf("contact.phone", profile.Contact.Phone);
        SetIf("contact.location", profile.Contact.Location);
        SetIf("contact.website", profile.Contact.Website);
        SetIf("contact.linkedIn", profile.Contact.LinkedIn);
        // For list-valued fields only the first element is used as the snippet.
        SetIf("summary", profile.Summary.FirstOrDefault());
        SetIf("skills", profile.Skills.FirstOrDefault());
        SetIf("languages", profile.Languages.FirstOrDefault()?.Name);
        SetIf("interests", profile.Interests.FirstOrDefault());
        SetIf("jobs", profile.Jobs.FirstOrDefault()?.Title ?? profile.Jobs.FirstOrDefault()?.Company);
        SetIf("education", profile.Education.FirstOrDefault()?.Qualification ?? profile.Education.FirstOrDefault()?.Institution);
    }

    /// <summary>
    /// Asks the AI service to return the CV as JSON in a fixed shape and deserializes it.
    /// </summary>
    /// <returns>
    /// The parsed profile annotated with method <c>llm</c> and confidence 0.82, or null when the
    /// response is blank, contains no JSON object, or deserializes to a profile with no content.
    /// </returns>
    private async Task TryExtractStructuredCvAsync(string text, CancellationToken cancellationToken)
    {
        var structuredJson = await _aiService.SummarizeSectionAsync(
            "Extract this CV into structured JSON. Return only valid JSON with this exact top-level shape: { \"version\": \"1\", \"contact\": { \"fullName\": string|null, \"headline\": string|null, \"email\": string|null, \"phone\": string|null, \"location\": string|null, \"website\": string|null, \"linkedin\": string|null }, \"summary\": string[], \"jobs\": [{ \"title\": string|null, \"company\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"isCurrent\": boolean, \"bullets\": string[], \"skills\": string[] }], \"education\": [{ \"qualification\": string|null, \"institution\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"details\": string[] }], \"skills\": string[], \"languages\": [{ \"name\": string|null, \"level\": string|null, \"notes\": string|null }], \"interests\": string[], \"otherSections\": [{ \"title\": string|null, \"items\": string[] }] }. Preserve facts only. 
Do not invent anything. If a field is unknown, use null or an empty array. Keep wording close to the source. Put unmatched content in otherSections.",
            text,
            3200,
            900);
        if (string.IsNullOrWhiteSpace(structuredJson)) return null;

        var extracted = ExtractJsonObject(structuredJson);
        if (string.IsNullOrWhiteSpace(extracted)) return null;

        var parsed = StructuredCvProfileJson.Deserialize(extracted);
        if (!IsMeaningfullyStructured(parsed)) return null;

        AnnotateStructuredCv(parsed, "llm", 0.82);
        return parsed;
    }

    /// <summary>
    /// True when the profile has a non-blank full name or at least one entry in any of its
    /// summary, jobs, education, skills, languages, interests or other-sections lists.
    /// </summary>
    private static bool IsMeaningfullyStructured(StructuredCvProfile profile)
    {
        return !string.IsNullOrWhiteSpace(profile.Contact.FullName)
            || profile.Summary.Count > 0
            || profile.Jobs.Count > 0
            || profile.Education.Count > 0
            || profile.Skills.Count > 0
            || profile.Languages.Count > 0
            || profile.Interests.Count > 0
            || profile.OtherSections.Count > 0;
    }

    /// <summary>
    /// Pulls the outermost JSON object out of an AI response: a leading or trailing code fence
    /// is removed first, then the text from the first opening brace to the last closing brace
    /// is returned.
    /// </summary>
    /// <returns>The candidate JSON text, or null when no opening brace precedes a closing brace.</returns>
    private static string? ExtractJsonObject(string raw)
    {
        var trimmed = raw.Trim();
        if (trimmed.StartsWith("```", StringComparison.Ordinal))
        {
            trimmed = Regex.Replace(trimmed, "^```(?:json)?\\s*|\\s*```$", string.Empty, RegexOptions.IgnoreCase);
        }

        var start = trimmed.IndexOf('{');
        var end = trimmed.LastIndexOf('}');
        if (start < 0 || end <= start) return null;
        return trimmed[start..(end + 1)];
    }

    /// <summary>
    /// Looks for a line that resembles a personal name among the first six non-empty lines.
    /// </summary>
    /// <remarks>
    /// A line qualifies when, after trimming and removing leading hash characters, it is 4 to 80
    /// characters long, contains no at-sign and no digit, and consists of two to five words that
    /// each start with an uppercase ASCII letter.
    /// </remarks>
    /// <returns>The first qualifying line, or null.</returns>
    private static string? GuessFullName(string source)
    {
        var normalized = source.Replace("\r\n", "\n");
        foreach (var line in normalized.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).Take(6))
        {
            var cleaned = line.Trim().TrimStart('#').Trim();
            if (cleaned.Length < 4 || cleaned.Length > 80) continue;
            if (cleaned.Contains('@') || Regex.IsMatch(cleaned, @"\d")) continue;
            if (!Regex.IsMatch(cleaned, @"^[A-Z][A-Za-z'`.-]+(?:\s+[A-Z][A-Za-z'`.-]+){1,4}$")) continue;
            return cleaned;
        }

        return null;
    }

    /// <summary>
    /// Derives a display name from the local part of an email address by splitting on dots,
    /// underscores and hyphens and capitalising each part.
    /// </summary>
    /// <returns>The joined name when at least two parts are found; otherwise null.</returns>
    private static string? GuessFullNameFromEmail(string? email)
    {
        if (string.IsNullOrWhiteSpace(email) || !email.Contains('@')) return null;
        var localPart = email[..email.IndexOf('@')].Trim();
        if (string.IsNullOrWhiteSpace(localPart)) return null;

        var parts = Regex.Split(localPart, @"[._-]+")
            .Select(part => part.Trim())
            .Where(part => part.Length > 0)
            .Select(part => char.ToUpperInvariant(part[0]) + part[1..].ToLowerInvariant())
            .ToList();
        return parts.Count >= 2 ? string.Join(" ", parts) : null;
    }

    /// <summary>
    /// Prepares text for section parsing. Blank input yields an empty string; text that
    /// <c>LooksLikeFlattenedCvExtraction</c> rejects is returned trimmed with CRLF replaced by LF.
    /// Otherwise runs of three or more single uppercase letters separated by whitespace are
    /// collapsed and, when they spell a known section alias, replaced by a markdown heading
    /// carrying the canonical section name.
    /// </summary>
    /// <remarks>
    /// NOTE(review): in this copy of the file the text after the start of the regex pattern in
    /// the final loop is missing, up to an expression that belongs to a later method. The rest
    /// of this method and the beginning of the following method cannot be seen; the damaged
    /// span below is reproduced unchanged and is deliberately left without comments.
    /// </remarks>
    private static string NormalizeTextForStructuredParsing(string source)
    {
        if (string.IsNullOrWhiteSpace(source)) return string.Empty;
        var text = source.Replace("\r\n", "\n").Trim();
        if (!LooksLikeFlattenedCvExtraction(text)) return text;

        text = Regex.Replace(text, @"\b([A-Z](?:\s+[A-Z]){2,})\b", match =>
        {
            var collapsed = Regex.Replace(match.Value, @"\s+", string.Empty);
            foreach (var alias in SectionAliases)
            {
                var aliasLettersOnly = Regex.Replace(alias.Key, @"[^A-Za-z]", string.Empty);
                if (collapsed.Equals(aliasLettersOnly, StringComparison.OrdinalIgnoreCase))
                {
                    return $"\n\n## {alias.Value}\n";
                }
            }

            return match.Value;
        });

        foreach (var alias in SectionAliases.OrderByDescending(pair => pair.Key.Length))
        {
            text = Regex.Replace( text, $@"(? 
section.Name == "Professional Summary"); var flattenedSummary = Regex.Match( rawSource, @"(?:A\s+B\s+O\s+U\s+T\s+M\s+E|P\s+R\s+O\s+F\s+I\s+L\s+E|S\s+U\s+M\s+M\s+A\s+R\s+Y)\s*(?.*?)(?=(?:I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S|E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|C\s+O\s+N\s+T\s+A\s+C\s+T|$))", RegexOptions.IgnoreCase | RegexOptions.Singleline); if (flattenedSummary.Success) { profile.Summary = SplitSentences(flattenedSummary.Groups["body"].Value, 5); } else if (!string.IsNullOrWhiteSpace(summarySection.Content)) { profile.Summary = SplitSentences(summarySection.Content, 5); } var interestsSection = sections.FirstOrDefault(section => section.Name == "Interests"); if (!string.IsNullOrWhiteSpace(interestsSection.Content)) { profile.Interests = SplitListLike(interestsSection.Content); } else { var flattenedInterests = Regex.Match( rawSource, @"I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S\s*(?.*?)(?=(?:E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|C\s+O\s+N\s+T\s+A\s+C\s+T|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|$))", RegexOptions.IgnoreCase | RegexOptions.Singleline); if (flattenedInterests.Success) { profile.Interests = SplitSentences(flattenedInterests.Groups["body"].Value, 4); } } var languagesSection = sections.FirstOrDefault(section => section.Name == "Languages"); if (!string.IsNullOrWhiteSpace(languagesSection.Content)) { profile.Languages = ParseLanguagesHeuristically(languagesSection.Content); } else { profile.Languages = ParseLanguagesHeuristically(rawSource); } var skills = new HashSet(StringComparer.OrdinalIgnoreCase); foreach (Match match in Regex.Matches(rawSource, @"(? 
section.Name == "Education");
        if (!string.IsNullOrWhiteSpace(educationSection.Content))
        {
            profile.Education = ParseEducationHeuristically(educationSection.Content);
        }

        var experienceSection = sections.FirstOrDefault(section => section.Name == "Work Experience");
        if (!string.IsNullOrWhiteSpace(experienceSection.Content))
        {
            profile.Jobs = ParseJobsHeuristically(experienceSection.Content);
        }

        if (profile.OtherSections.Count == 0 && sections.Any(section => section.Name == "General"))
        {
            var general = sections.First(section => section.Name == "General");
            if (!string.IsNullOrWhiteSpace(general.Content) && profile.Summary.Count == 0)
            {
                profile.Summary = SplitSentences(general.Content, 3);
            }
        }

        return StructuredCvProfileJson.Normalize(profile);
    }

    /// <summary>
    /// Splits text at whitespace that follows a period, exclamation mark or question mark, and
    /// returns the first <paramref name="limit"/> trimmed pieces that are longer than 20 characters.
    /// </summary>
    private static List SplitSentences(string content, int limit)
    {
        return Regex.Split(content.Replace("\r\n", " "), @"(?<=[.!?])\s+")
            .Select(value => value.Trim())
            .Where(value => value.Length > 20)
            .Take(limit)
            .ToList();
    }

    /// <summary>
    /// Splits list-like text on newlines, commas and semicolons, strips leading bullet characters,
    /// drops items of one character or less, and removes case-insensitive duplicates.
    /// </summary>
    private static List SplitListLike(string content)
    {
        return content
            .Replace("\r\n", "\n")
            .Split(new[] { '\n', ',', ';' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            .Select(item => item.Trim().TrimStart('-', '•', '*', ' '))
            .Where(item => item.Length > 1)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>
    /// Scans text fragments (split on newlines, commas, semicolons and sentence ends) for spoken
    /// languages. A fragment contributes only when <c>HumanLanguageCatalog.ExtractLevel</c>
    /// returns a level; every language name found in that fragment gets that level. The first
    /// entry per language name (case-insensitive) is kept.
    /// </summary>
    private static List ParseLanguagesHeuristically(string content)
    {
        var languages = new List();
        var candidates = Regex.Split(content.Replace("\r\n", "\n"), @"[\n,;]+|(?<=[.!?])\s+")
            .Select(item => item.Trim())
            .Where(item => item.Length > 1);

        foreach (var candidate in candidates)
        {
            var level = HumanLanguageCatalog.ExtractLevel(candidate);
            if (level is null) continue;
            foreach (var name in HumanLanguageCatalog.ExtractLanguageNames(candidate))
            {
                languages.Add(new StructuredCvLanguage { Name = name, Level = level });
            }
        }

        return languages
            .GroupBy(language => language.Name, StringComparer.OrdinalIgnoreCase)
            .Select(group => group.First())
            .ToList();
    }

    /// <summary>
    /// Parses an education section into entries, one per blank-line-separated block.
    /// </summary>
    /// <remarks>
    /// Within a block: the institution is the first line starting with a plus sign and a space;
    /// the qualification is the first line that neither starts that way nor begins with a
    /// four-digit year and a dash (falling back to the first line); the first year range found
    /// supplies start and end; lines starting with a hyphen and a space become details.
    /// </remarks>
    private static List ParseEducationHeuristically(string content)
    {
        var blocks = Regex.Split(content, @"\n\s*\n")
            .Select(block => block.Trim())
            .Where(block => block.Length > 0)
            .ToList();
        var items = new List();

        foreach (var block in blocks)
        {
            var lines = block.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
            if (lines.Count == 0) continue;

            var dateMatch = Regex.Match(block, @"\b(\d{4})\s*[-–]\s*(\d{4}|Present|Current)\b", RegexOptions.IgnoreCase);
            var institutionLine = lines.FirstOrDefault(line => line.StartsWith("+ ", StringComparison.Ordinal))?.TrimStart('+', ' ');
            var qualificationLine = lines.FirstOrDefault(line => !line.StartsWith("+ ", StringComparison.Ordinal) && !Regex.IsMatch(line, @"^\d{4}\s*[-–]"));
            if (qualificationLine is null && lines.Count > 0) qualificationLine = lines[0];
            if (qualificationLine is null && institutionLine is null) continue;

            items.Add(new StructuredCvEducation
            {
                Qualification = TitleCasePreservingAcronyms(qualificationLine),
                Institution = TitleCasePreservingAcronyms(institutionLine),
                Start = dateMatch.Success ? dateMatch.Groups[1].Value : null,
                End = dateMatch.Success ? dateMatch.Groups[2].Value : null,
                Details = lines.Where(line => line.StartsWith("- ", StringComparison.Ordinal)).Select(line => line[2..].Trim()).ToList(),
            });
        }

        return items;
    }

    /// <summary>
    /// Parses a work-experience section into jobs. Each job is expected as an uppercase title
    /// line, followed by a line with a year range, followed by body text up to the next such
    /// title or the end of the text.
    /// </summary>
    /// <remarks>
    /// The employer is taken from the first body line introduced by a plus sign; the remaining
    /// body is split into at most six sentences used as bullets; skills are bullet fragments
    /// that exactly match a fixed list of technology names.
    /// NOTE(review): the pattern as shown begins with an opening parenthesis and question mark
    /// directly followed by a character class, while the code reads a group named title - the
    /// group name appears to be missing in this copy; confirm against version control.
    /// </remarks>
    private static List ParseJobsHeuristically(string content)
    {
        var normalized = content.Replace("\r\n", "\n");
        var pattern = new Regex(@"(?[A-Z][A-Z\s/&-]{3,})\s*\n(?<dates>\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))(?<body>.*?)(?=(?:\n[A-Z][A-Z\s/&-]{3,}\s*\n\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))|\z)", RegexOptions.Singleline);
        var jobs = new List<StructuredCvJob>();

        foreach (Match match in pattern.Matches(normalized))
        {
            var body = match.Groups["body"].Value.Trim();
            var employer = NullIfWhitespace(Regex.Match(body, @"\+\s*([^\n]+)").Groups[1].Value);
            var dates = Regex.Split(match.Groups["dates"].Value, @"\s*[-–]\s*");
            // Employer lines are removed from the body before it is split into bullet sentences.
            var bullets = SplitSentences(Regex.Replace(body, @"\+\s*[^\n]+", string.Empty), 6);

            jobs.Add(new StructuredCvJob
            {
                Title = TitleCasePreservingAcronyms(match.Groups["title"].Value),
                Company = employer,
                Start = NullIfWhitespace(dates.FirstOrDefault()),
                End = NullIfWhitespace(dates.Skip(1).FirstOrDefault()),
                IsCurrent = string.Equals(dates.Skip(1).FirstOrDefault(), "present", StringComparison.OrdinalIgnoreCase) || string.Equals(dates.Skip(1).FirstOrDefault(), "current", StringComparison.OrdinalIgnoreCase),
                Bullets = bullets,
                Skills = bullets.SelectMany(SplitListLike).Where(item => Regex.IsMatch(item, @"^(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)$", RegexOptions.IgnoreCase)).Distinct(StringComparer.OrdinalIgnoreCase).ToList(),
            });
        }

        return jobs;
    }

    /// <summary>
    /// Title-cases each space-separated word, except that words of up to three characters made
    /// only of uppercase letters are kept as they are.
    /// </summary>
    /// <returns>The converted text, or null when the input is null or blank.</returns>
    private static string? TitleCasePreservingAcronyms(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;
        var words = value.Trim()
            .Split(' ', StringSplitOptions.RemoveEmptyEntries)
            .Select(word => word.Length <= 3 && word.All(char.IsUpper) ? word : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant())
            .ToArray();
        return string.Join(" ", words);
    }

    /// <summary>Counts whitespace-separated words; null or blank text counts as 0.</summary>
    private static int CountWords(string? text)
    {
        if (string.IsNullOrWhiteSpace(text)) return 0;
        // A null separator array makes Split use whitespace characters as delimiters.
        return text.Trim().Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries).Length;
    }

    /// <summary>Returns the trimmed value, or null when the value is null or blank.</summary>
    private static string? NullIfWhitespace(string? value)
    {
        return string.IsNullOrWhiteSpace(value) ? null : value.Trim();
    }

    /// <summary>
    /// Splits text into named sections. Lines for which <c>CanonicalizeSectionHeading</c> returns
    /// a name start a new section; lines before the first heading go to a section named
    /// <c>General</c>. Sections with blank content are dropped.
    /// </summary>
    /// <returns>
    /// The sections in order of appearance; when none has content, a single <c>General</c>
    /// section holding the trimmed source.
    /// </returns>
    private static List<(string Name, string Content)> ParseSections(string source)
    {
        var lines = source.Replace("\r\n", "\n").Split('\n');
        var sections = new List<(string Name, List<string> Lines)>();
        var currentName = "General";
        var currentLines = new List<string>();

        // Closes the section being collected: stores a copy of its lines when it has content, then resets the buffer.
        void Flush()
        {
            var content = string.Join("\n", currentLines).Trim();
            if (!string.IsNullOrWhiteSpace(content))
            {
                sections.Add((currentName, new List<string>(currentLines)));
            }

            currentLines.Clear();
        }

        foreach (var raw in lines)
        {
            var line = raw.Trim();
            var canonicalHeading = CanonicalizeSectionHeading(line);
            if (canonicalHeading is not null)
            {
                Flush();
                currentName = canonicalHeading;
                continue;
            }

            // The untrimmed line is stored so that indentation inside a section is preserved.
            currentLines.Add(raw);
        }

        Flush();

        if (sections.Count == 0)
        {
            return new List<(string Name, string Content)> { ("General", source.Trim()) };
        }

        return sections
            .Select(section => (section.Name, string.Join("\n", section.Lines).Trim()))
            .Where(section => !string.IsNullOrWhiteSpace(section.Item2))
            .ToList();
    }

    /// <summary>
    /// Groups classified blocks by section name, in order of first appearance. Blocks with the
    /// same section name are joined with a blank line; sections with blank content are dropped.
    /// </summary>
    private static List<StructuredCvSection> BuildSectionsFromClassifiedBlocks(List<ClassifiedCvBlock> classifiedBlocks)
    {
        var sectionBuckets = new List<StructuredCvSection>();
        foreach (var block in classifiedBlocks)
        {
            var existing = sectionBuckets.FirstOrDefault(section => section.Name == block.SectionName);
            if (existing is null)
            {
                sectionBuckets.Add(new StructuredCvSection { Name = block.SectionName, Content = block.Content, WordCount = CountWords(block.Content) });
            }
            else
            {
                existing.Content = $"{existing.Content}\n\n{block.Content}".Trim();
                existing.WordCount = CountWords(existing.Content);
            }
        }

        return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
    }

    /// <summary>
    /// Builds a structured profile from classified blocks: summary and skills are accumulated
    /// across blocks, each work-experience or education block becomes one entry, and any other
    /// non-blank block becomes an entry in <c>OtherSections</c>.
    /// </summary>
    /// <remarks>
    /// The profile is annotated with method <c>classifier</c> and the average confidence of the
    /// blocks that carry one (0.74 when none does).
    /// NOTE(review): that final <c>AnnotateStructuredCv</c> call assigns entries for keys such as
    /// summary and skills; if <c>ApplyClassifierFieldMetadata</c> (not shown here) writes the
    /// same keys, its per-block entries are replaced - confirm that is intended.
    /// </remarks>
    private static StructuredCvProfile BuildStructuredCvFromClassifiedBlocks(List<ClassifiedCvBlock> classifiedBlocks)
    {
        var profile = new StructuredCvProfile();
        var now = DateTimeOffset.UtcNow;
        var summary = new List<string>();
        var skills = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (var block in classifiedBlocks)
        {
            switch (block.SectionName)
            {
                case "Professional Summary":
                    foreach (var item in SplitClassifierContent(block.Content, 5))
                    {
                        summary.Add(item);
                    }

                    ApplyClassifierFieldMetadata(profile, "summary", summary.FirstOrDefault(), block, now);
                    break;
                case "Skills":
                    foreach (var item in SplitClassifierSkills(block.Content))
                    {
                        skills.Add(item);
                    }

                    ApplyClassifierFieldMetadata(profile, "skills", skills.FirstOrDefault(), block, now);
                    break;
                case "Work Experience":
                    var job = BuildJobFromClassifiedBlock(block);
                    if (job is not null)
                    {
                        // index is the position the job takes once added, used in the metadata keys.
                        var index = profile.Jobs.Count;
                        profile.Jobs.Add(job);
                        ApplyClassifierFieldMetadata(profile, $"jobs[{index}].title", job.Title, block, now);
                        ApplyClassifierFieldMetadata(profile, $"jobs[{index}].company", job.Company, block, now);
                        ApplyClassifierFieldMetadata(profile, $"jobs[{index}].location", job.Location, block, now);
                    }

                    break;
                case "Education":
                    var education = BuildEducationFromClassifiedBlock(block);
                    if (education is not null)
                    {
                        var index = profile.Education.Count;
                        profile.Education.Add(education);
                        ApplyClassifierFieldMetadata(profile, $"education[{index}].qualification", education.Qualification, block, now);
                        ApplyClassifierFieldMetadata(profile, $"education[{index}].institution", education.Institution, block, now);
                    }

                    break;
                default:
                    if (!string.IsNullOrWhiteSpace(block.Content))
                    {
                        profile.OtherSections.Add(new StructuredCvOtherSection { Title = block.SectionName, Items = SplitClassifierContent(block.Content, 6) });
                    }

                    break;
            }
        }

        profile.Summary = summary.Distinct(StringComparer.OrdinalIgnoreCase).ToList();
        profile.Skills = skills.ToList();
        profile.Sections = BuildSectionsFromClassifiedBlocks(classifiedBlocks);

        var averageConfidence = classifiedBlocks
            .Select(block => block.Classification?.Confidence)
            .Where(value => value.HasValue)
            .Select(value => value!.Value)
            .DefaultIfEmpty(0.74)
            .Average();
        AnnotateStructuredCv(profile, "classifier", averageConfidence);
        return StructuredCvProfileJson.Normalize(profile);
    }

    /// <summary>
    /// Converts a classified work-experience block into a job entry, or returns null when the
    /// block has no classification.
    /// </summary>
    /// <remarks>
    /// Bullets come from the classification when it has any; otherwise the original block text
    /// is split into at most six items. The job is passed through
    /// <c>StructuredCvProfileJson.Normalize</c> inside a temporary profile, so the result may
    /// also be null if normalization removes it.
    /// NOTE(review): <c>IsCurrent</c> compares the untrimmed <c>classification.End</c>, while
    /// <c>End</c> itself is trimmed - surrounding whitespace would make the two disagree.
    /// </remarks>
    private static StructuredCvJob? BuildJobFromClassifiedBlock(ClassifiedCvBlock block)
    {
        var classification = block.Classification;
        if (classification is null) return null;

        var bullets = classification.Bullets is { Count: > 0 }
            ? classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => bullet.Trim()).ToList()
            : SplitClassifierContent(block.OriginalBlock, 6);

        var job = new StructuredCvJob
        {
            Title = NullIfWhitespace(classification.Title),
            Company = NullIfWhitespace(classification.Company),
            Location = NullIfWhitespace(classification.Location),
            Start = NullIfWhitespace(classification.Start),
            End = NullIfWhitespace(classification.End),
            IsCurrent = string.Equals(classification.End, "Present", StringComparison.OrdinalIgnoreCase) || string.Equals(classification.End, "Current", StringComparison.OrdinalIgnoreCase),
            Bullets = bullets,
            Skills = SplitClassifierSkills(block.OriginalBlock)
        };

        return StructuredCvProfileJson.Normalize(new StructuredCvProfile { Jobs = new List<StructuredCvJob> { job } }).Jobs.FirstOrDefault();
    }

    private static StructuredCvEducation? BuildEducationFromClassifiedBlock(ClassifiedCvBlock block)
    {
        var classification = block.Classification;
        if (classification is null) return null;

        var education = new StructuredCvEducation
        {
            Qualification = NullIfWhitespace(classification.Title),
            Institution = NullIfWhitespace(classification.Company),
            Location = NullIfWhitespace(classification.Location),
            Start = NullIfWhitespace(classification.Start),
            End = NullIfWhitespace(classification.End),
            Details = classification.Bullets is { Count: > 0 } ? 
classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => bullet.Trim()).ToList() : SplitClassifierContent(block.OriginalBlock, 5) }; return StructuredCvProfileJson.Normalize(new StructuredCvProfile { Education = new List<StructuredCvEducation> { education } }).Education.FirstOrDefault(); } private static List<string> SplitClassifierContent(string content, int limit) { return content .Replace("\r\n", "\n") .Split(new[] { '\n', '•' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries) .SelectMany(line => line.Contains(". ", StringComparison.Ordinal) ? Regex.Split(line, @"(?<=[.!?])\s+") : new[] { line }) .Select(item => item.Trim().TrimStart('-', '•', '*', '+', ' ')) .Where(item => item.Length > 2) .Take(limit) .Distinct(StringComparer.OrdinalIgnoreCase) .ToList(); } private static List<string> SplitClassifierSkills(string content) { return content .Replace("\r\n", "\n") .Split(new[] { '\n', ',', ';', '•' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries) .Select(item => item.Trim().TrimStart('-', '•', '*', '+', ' ')) .Where(item => item.Length > 1 && item.Length <= 48 && !LooksLikeDateLikeValue(item) && !item.Contains('@')) .Distinct(StringComparer.OrdinalIgnoreCase) .ToList(); } private static bool LooksLikeDateLikeValue(string value) { return Regex.IsMatch(value, @"^(?:\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|Present|Current)(?:\s*[-–]\s*(?:\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|Present|Current))?$", RegexOptions.IgnoreCase); } private static void ApplyClassifierFieldMetadata(StructuredCvProfile profile, string key, string? 
value, ClassifiedCvBlock block, DateTimeOffset now) { if (string.IsNullOrWhiteSpace(value)) return; profile.Metadata.Fields[key] = new StructuredCvFieldMetadata { Confidence = block.Classification?.Confidence ?? 0.74, Method = "classifier", SourceSnippet = block.OriginalBlock.Length > 180 ? block.OriginalBlock[..180] : block.OriginalBlock, SourceBlockId = $"block-{block.Index}", ReviewState = string.Equals(block.SectionName, "General", StringComparison.OrdinalIgnoreCase) ? "needs-review" : "suggested", LastUpdatedAtUtc = now, }; } private async Task<List<ClassifiedCvBlock>> ClassifyBlocksAsync(string parseSource, CancellationToken cancellationToken) { var blocks = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n") .Select(block => block.Trim()) .Where(block => block.Length >= 24) .ToList(); if (blocks.Count == 0) return new List<ClassifiedCvBlock>(); var results = new List<ClassifiedCvBlock>(); for (var index = 0; index < blocks.Count; index++) { var block = blocks[index]; var classification = await _cvAiClassifier.ClassifyBlockAsync(block, cancellationToken); var sectionName = classification?.Section; if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical)) { sectionName = canonical; } if (string.IsNullOrWhiteSpace(sectionName) || string.Equals(sectionName, "Other", StringComparison.OrdinalIgnoreCase)) { sectionName = "General"; } var content = block; if (string.Equals(sectionName, "Work Experience", StringComparison.OrdinalIgnoreCase) && classification is not null) { var lines = new List<string>(); if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}"); var endIsCurrent = string.Equals(classification.End, "Present", StringComparison.OrdinalIgnoreCase) || string.Equals(classification.End, "Current", StringComparison.OrdinalIgnoreCase); var dateRange = FormatDateRangeForSection(classification.Start, classification.End, endIsCurrent); var meta = string.Join(" | 
", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value))); if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta); if (classification.Bullets is not null) { lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}")); } if (lines.Count > 0) content = string.Join("\n", lines); } else if (string.Equals(sectionName, "Education", StringComparison.OrdinalIgnoreCase) && classification is not null) { var lines = new List<string>(); if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}"); var dateRange = FormatDateRangeForSection(classification.Start, classification.End, false); var meta = string.Join(" | ", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value))); if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta); if (classification.Bullets is not null) { lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}")); } if (lines.Count > 0) content = string.Join("\n", lines); } else if (string.Equals(sectionName, "Skills", StringComparison.OrdinalIgnoreCase)) { var items = SplitClassifierSkills(block); if (items.Count > 0) content = string.Join("\n", items); } else if (string.Equals(sectionName, "Professional Summary", StringComparison.OrdinalIgnoreCase) && classification?.Bullets is { Count: > 0 }) { content = string.Join("\n", classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}")); } results.Add(new ClassifiedCvBlock(index + 1, block, sectionName, content, classification)); } return results; } private static string? FormatDateRangeForSection(string? start, string? 
end, bool isCurrent) { if (string.IsNullOrWhiteSpace(start) && string.IsNullOrWhiteSpace(end)) return null; if (string.IsNullOrWhiteSpace(start)) return end; return $"{start} - {(isCurrent ? "Present" : end ?? "Present")}"; } private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken) { var normalized = text.Trim(); if (!LooksLikeFlattenedCvExtraction(normalized)) { return normalized; } var reconstructed = await _aiService.SummarizeSectionAsync( "Reconstruct this CV text extracted from a PDF into a clean, readable master CV in markdown. Preserve facts only. Recover clear sections such as Contact, Professional Summary, Work Experience, Education, Skills, Languages, and Interests when present. Split contact details onto their own lines, turn noisy all-caps/spaced headings into normal headings, keep dates with the correct roles and employers, and remove layout/OCR artifacts. Do not invent employers, titles, dates, or metrics. Return only the reconstructed CV text.", normalized, 2800, 900); return string.IsNullOrWhiteSpace(reconstructed) ? normalized : reconstructed.Trim(); } private static bool LooksLikeFlattenedCvExtraction(string text) { if (string.IsNullOrWhiteSpace(text)) return false; var normalized = text.Replace("\r\n", "\n"); var lineCount = normalized.Split('\n').Count(line => !string.IsNullOrWhiteSpace(line)); var spacedHeadingCount = Regex.Matches(normalized, @"\b(?:[A-Z]\s){3,}[A-Z]\b").Count; var knownHeadingHits = SectionAliases.Keys.Count(alias => normalized.Contains(alias, StringComparison.OrdinalIgnoreCase)); var bulletCount = Regex.Matches(normalized, @"[•●▪◦]").Count; return (lineCount <= 6 && normalized.Length >= 500) || spacedHeadingCount >= 3 || (knownHeadingHits >= 3 && lineCount <= 12) || (normalized.Contains(" + ") && bulletCount > 0 && lineCount <= 10); } private static string? 
CanonicalizeSectionHeading(string line) { if (string.IsNullOrWhiteSpace(line)) return null; var normalized = line.Trim(); if (normalized.StartsWith("#", StringComparison.Ordinal)) { normalized = normalized.TrimStart('#').Trim(); } normalized = normalized.TrimEnd(':').Trim(); if (normalized.Length == 0 || normalized.Length > 60) return null; if (normalized.Contains('.') || normalized.Contains(" ")) return null; return SectionAliases.TryGetValue(normalized, out var canonical) ? canonical : null; } private static async Task<string> ExtractTextAsync(IFormFile file, string extension) { if (string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase) || string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase)) { using var stream = file.OpenReadStream(); using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true); return (await reader.ReadToEndAsync()).Trim(); } await using var memory = new MemoryStream(); await file.CopyToAsync(memory); var bytes = memory.ToArray(); if (string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase)) { var raw = Encoding.UTF8.GetString(bytes); var textMatches = Regex.Matches(raw, @"\((.*?)\)Tj", RegexOptions.Singleline) .Select(match => match.Groups[1].Value) .Concat(Regex.Matches(raw, @"\[(.*?)\]TJ", RegexOptions.Singleline) .SelectMany(match => Regex.Matches(match.Groups[1].Value, @"\((.*?)\)", RegexOptions.Singleline).Select(x => x.Groups[1].Value))) .Where(value => !string.IsNullOrWhiteSpace(value)) .Select(value => Regex.Unescape(value)) .ToList(); var joined = textMatches.Count > 0 ? 
string.Join(" ", textMatches) : raw; var scrubbed = Regex.Replace(joined, @"[\x00-\x08\x0B\x0C\x0E-\x1F]", " "); return Regex.Replace(scrubbed, @"\s+", " ").Trim(); } if (string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase)) { using var archive = new System.IO.Compression.ZipArchive(new MemoryStream(bytes), System.IO.Compression.ZipArchiveMode.Read, leaveOpen: false); var entry = archive.GetEntry("word/document.xml"); if (entry is null) return string.Empty; using var entryStream = entry.Open(); using var reader = new StreamReader(entryStream, Encoding.UTF8); var xml = await reader.ReadToEndAsync(); var withoutTags = Regex.Replace(xml, "<[^>]+>", " "); var decoded = System.Net.WebUtility.HtmlDecode(withoutTags) ?? string.Empty; return Regex.Replace(decoded, @"\s+", " ").Trim(); } return string.Empty; } }