From 8f8a34ad9c578e403689afea9dbaa36c075b4362 Mon Sep 17 00:00:00 2001 From: cesnimda Date: Sat, 28 Mar 2026 15:01:32 +0100 Subject: [PATCH] Add typed structured CV extraction --- .../ProfileCvControllerTests.cs | 220 +++++++- .../Controllers/JobApplicationsController.cs | 116 +++-- .../Controllers/ProfileCvController.cs | 211 ++++++-- Models/StructuredCvProfile.cs | 68 +++ Models/StructuredCvProfileJson.cs | 491 ++++++++++++++++++ 5 files changed, 1029 insertions(+), 77 deletions(-) create mode 100644 Models/StructuredCvProfile.cs create mode 100644 Models/StructuredCvProfileJson.cs diff --git a/JobTrackerApi.Tests/ProfileCvControllerTests.cs b/JobTrackerApi.Tests/ProfileCvControllerTests.cs index 0b42bcd..76ba7a6 100644 --- a/JobTrackerApi.Tests/ProfileCvControllerTests.cs +++ b/JobTrackerApi.Tests/ProfileCvControllerTests.cs @@ -1,12 +1,12 @@ using System.Security.Claims; using System.Text; +using System.Text.Json; using JobTrackerApi.Controllers; using JobTrackerApi.Models; using JobTrackerApi.Services; using Microsoft.AspNetCore.Http; using Microsoft.AspNetCore.Identity; using Microsoft.AspNetCore.Mvc; -using Microsoft.Extensions.Configuration; using Microsoft.Extensions.Logging.Abstractions; using Microsoft.Extensions.Options; using Moq; @@ -36,6 +36,200 @@ public sealed class ProfileCvControllerTests Assert.True((badRequest.Value?.ToString() ?? string.Empty).Contains("supported", StringComparison.OrdinalIgnoreCase)); } + [Fact] + public async Task Upload_reconstructs_flattened_pdf_cv_before_save() + { + var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government. 
I N T E R E S T S E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1."; + var reconstructed = "# Connor Babbington\n\n## Contact\nconnor.babbington@cesnimda.co.uk\ncesnimda.co.uk\n+47 41 33 44 70\nTønsberg, Norway\n\n## Professional Summary\nMid-level system developer with eight years of experience in UK local government.\n\n## Work Experience\n### System Developer\nWarwickshire County Council\nUK\n2015 - 2023\n- Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript.\n\n## Education\n### Extended Diploma NVQ Level 3 in ICT\nWarwickshire College\n2012 - 2015\n\n## Languages\nEnglish: Native\nNorwegian: A2/B1"; + var structuredJson = """ + { + "version": "1", + "contact": { + "fullName": "Connor Babbington", + "email": "connor.babbington@cesnimda.co.uk", + "phone": "+47 41 33 44 70", + "location": "Tønsberg, Norway", + "website": "cesnimda.co.uk" + }, + "summary": ["Mid-level system developer with eight years of experience in UK local government."], + "jobs": [ + { + "title": "System Developer", + "company": "Warwickshire County Council", + "location": "UK", + "start": "2015", + "end": "2023", + "isCurrent": false, + "bullets": ["Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript."], + "skills": ["C#", "Python", "Ruby on Rails", "SQL", "JavaScript"] + } + ], + "education": [ + { + "qualification": "Extended Diploma NVQ Level 3 in ICT", + "institution": "Warwickshire College", + "start": "2012", + "end": "2015", + "details": [] + } + ], + "skills": ["C#", "Python", "Ruby on Rails", "SQL", "JavaScript"], + "languages": [ + { "name": "English", "level": "Native" }, + { "name": "Norwegian", "level": "A2/B1" } + ], + "interests": [], + 
"otherSections": [] + } + """; + + var user = new ApplicationUser(); + var userManager = CreateUserManager(); + userManager.Setup(x => x.GetUserAsync(It.IsAny())).ReturnsAsync(user); + userManager.Setup(x => x.UpdateAsync(user)).ReturnsAsync(IdentityResult.Success); + var aiService = new Mock(); + aiService + .Setup(x => x.ExtractTextAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) + .ReturnsAsync(new AiTextExtractionResult(rawExtraction, false, "application/pdf", 1, rawExtraction.Length, "Resume.en.pdf")); + aiService + .Setup(x => x.SummarizeSectionAsync(It.Is(instruction => instruction.Contains("Reconstruct this CV text extracted from a PDF", StringComparison.Ordinal)), rawExtraction, 2800, 900)) + .ReturnsAsync(reconstructed); + aiService + .Setup(x => x.SummarizeSectionAsync(It.Is(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), reconstructed, 3200, 900)) + .ReturnsAsync(structuredJson); + + var controller = new ProfileCvController(userManager.Object, aiService.Object) + { + ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() } + }; + + var bytes = Encoding.UTF8.GetBytes("fake pdf bytes"); + var file = new FormFile(new MemoryStream(bytes), 0, bytes.Length, "file", "Resume.en.pdf") + { + Headers = new HeaderDictionary(), + ContentType = "application/pdf" + }; + + var result = await controller.Upload(file); + + Assert.IsType(result); + Assert.Equal(reconstructed, user.ProfileCvText); + + var structured = StructuredCvProfileJson.Deserialize(user.ProfileCvStructureJson); + Assert.Equal("Connor Babbington", structured.Contact.FullName); + Assert.Single(structured.Summary); + Assert.Single(structured.Jobs); + Assert.Equal("System Developer", structured.Jobs[0].Title); + Assert.Single(structured.Education); + Assert.Equal("Extended Diploma NVQ Level 3 in ICT", structured.Education[0].Qualification); + Assert.Contains(structured.Sections, section => section.Name == "Contact"); 
+ Assert.Contains(structured.Sections, section => section.Name == "Professional Summary"); + Assert.Contains(structured.Sections, section => section.Name == "Work Experience"); + Assert.Contains(structured.Sections, section => section.Name == "Education"); + } + + [Fact] + public async Task Parse_returns_structured_cv_and_persists_it() + { + var user = new ApplicationUser + { + ProfileCvText = "# Connor Babbington\n\n## Contact\nconnor@example.com\n+47 41 33 44 70\n\n## Professional Summary\nBuilt backend systems.\n\n## Work Experience\n### System Developer\nWarwickshire County Council\n2015 - 2023\n- Built APIs\n\n## Education\n### Warwickshire College\n2012 - 2015" + }; + var structuredJson = """ + { + "version": "1", + "contact": { + "fullName": "Connor Babbington", + "email": "connor@example.com", + "phone": "+47 41 33 44 70" + }, + "summary": ["Built backend systems."], + "jobs": [ + { + "title": "System Developer", + "company": "Warwickshire County Council", + "start": "2015", + "end": "2023", + "isCurrent": false, + "bullets": ["Built APIs"], + "skills": [".NET"] + } + ], + "education": [ + { + "qualification": "Warwickshire College", + "start": "2012", + "end": "2015", + "details": [] + } + ], + "skills": [".NET"], + "languages": [], + "interests": [], + "otherSections": [] + } + """; + + var userManager = CreateUserManager(); + userManager.Setup(x => x.GetUserAsync(It.IsAny())).ReturnsAsync(user); + userManager.Setup(x => x.UpdateAsync(user)).ReturnsAsync(IdentityResult.Success); + var aiService = new Mock(); + aiService + .Setup(x => x.SummarizeSectionAsync(It.Is(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), user.ProfileCvText, 3200, 900)) + .ReturnsAsync(structuredJson); + + var controller = new ProfileCvController(userManager.Object, aiService.Object) + { + ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() } + }; + + var result = await controller.Parse(new 
ProfileCvController.ParseCvRequest(user.ProfileCvText)); + + var ok = Assert.IsType(result.Result); + var json = JsonSerializer.Serialize(ok.Value); + Assert.Contains("structuredCv", json, StringComparison.OrdinalIgnoreCase); + Assert.Contains("Connor Babbington", json); + Assert.Contains("System Developer", json); + + var structured = StructuredCvProfileJson.Deserialize(user.ProfileCvStructureJson); + Assert.Equal("Connor Babbington", structured.Contact.FullName); + Assert.Single(structured.Jobs); + Assert.Equal("System Developer", structured.Jobs[0].Title); + } + + [Fact] + public async Task Parse_falls_back_to_section_parsing_when_ai_json_is_invalid() + { + var user = new ApplicationUser + { + ProfileCvText = "# Connor Babbington\n\n## Professional Summary\nBuilt backend systems.\n\n## Skills\n.NET\nSQL\nAzure" + }; + var userManager = CreateUserManager(); + userManager.Setup(x => x.GetUserAsync(It.IsAny())).ReturnsAsync(user); + userManager.Setup(x => x.UpdateAsync(user)).ReturnsAsync(IdentityResult.Success); + var aiService = new Mock(); + aiService + .Setup(x => x.SummarizeSectionAsync(It.Is(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), user.ProfileCvText, 3200, 900)) + .ReturnsAsync("not-json"); + + var controller = new ProfileCvController(userManager.Object, aiService.Object) + { + ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() } + }; + + var result = await controller.Parse(new ProfileCvController.ParseCvRequest(user.ProfileCvText)); + + var ok = Assert.IsType(result.Result); + var json = JsonSerializer.Serialize(ok.Value); + Assert.Contains("Professional Summary", json); + + var structured = StructuredCvProfileJson.Deserialize(user.ProfileCvStructureJson); + Assert.Contains("Built backend systems.", structured.Summary); + Assert.Contains(".NET", structured.Skills); + Assert.Contains("SQL", structured.Skills); + Assert.Equal("Connor Babbington", 
structured.Contact.FullName); + } + [Fact] public async Task Upload_accepts_markdown_cv_and_saves_text() { @@ -46,18 +240,38 @@ public sealed class ProfileCvControllerTests var aiService = new Mock(); aiService .Setup(x => x.ExtractTextAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) - .ReturnsAsync(new AiTextExtractionResult("# CV\nBuilt APIs and UIs", false, "text/markdown", null, 22, "resume.md")); + .ReturnsAsync(new AiTextExtractionResult("# Connor Babbington\n\n## Professional Summary\nBuilt APIs and UIs", false, "text/markdown", null, 62, "resume.md")); + aiService + .Setup(x => x.SummarizeSectionAsync(It.Is(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), It.IsAny(), 3200, 900)) + .ReturnsAsync(""" + { + "version":"1", + "contact":{"fullName":"Connor Babbington"}, + "summary":["Built APIs and UIs"], + "jobs":[], + "education":[], + "skills":[], + "languages":[], + "interests":[], + "otherSections":[] + } + """); var controller = new ProfileCvController(userManager.Object, aiService.Object) { ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() } }; - var file = new FormFile(new MemoryStream(Encoding.UTF8.GetBytes("# CV\nBuilt APIs and UIs")), 0, 23, "file", "resume.md"); + var file = new FormFile(new MemoryStream(Encoding.UTF8.GetBytes("# Connor Babbington\n\n## Professional Summary\nBuilt APIs and UIs")), 0, 62, "file", "resume.md") + { + Headers = new HeaderDictionary(), + ContentType = "text/markdown" + }; var result = await controller.Upload(file); Assert.IsType(result); Assert.Contains("Built APIs", user.ProfileCvText); + Assert.Equal("Connor Babbington", StructuredCvProfileJson.Deserialize(user.ProfileCvStructureJson).Contact.FullName); } private static Mock> CreateUserManager() diff --git a/JobTrackerApi/Controllers/JobApplicationsController.cs b/JobTrackerApi/Controllers/JobApplicationsController.cs index 6129517..916061f 100644 --- 
a/JobTrackerApi/Controllers/JobApplicationsController.cs +++ b/JobTrackerApi/Controllers/JobApplicationsController.cs @@ -58,45 +58,99 @@ namespace JobTrackerApi.Controllers return "Hi there,"; } - private sealed record CvSectionRecord(string? Name, string? Content, int? WordCount); - private static string BuildStructuredCvContext(ApplicationUser? user) { - if (string.IsNullOrWhiteSpace(user?.ProfileCvStructureJson)) return string.Empty; + var structured = StructuredCvProfileJson.Deserialize(user?.ProfileCvStructureJson); + var blocks = new List(); - try + var contactLines = new List(); + if (!string.IsNullOrWhiteSpace(structured.Contact.FullName)) contactLines.Add($"Name: {structured.Contact.FullName}"); + if (!string.IsNullOrWhiteSpace(structured.Contact.Headline)) contactLines.Add($"Headline: {structured.Contact.Headline}"); + if (!string.IsNullOrWhiteSpace(structured.Contact.Email)) contactLines.Add($"Email: {structured.Contact.Email}"); + if (!string.IsNullOrWhiteSpace(structured.Contact.Location)) contactLines.Add($"Location: {structured.Contact.Location}"); + if (!string.IsNullOrWhiteSpace(structured.Contact.LinkedIn)) contactLines.Add($"LinkedIn: {structured.Contact.LinkedIn}"); + if (contactLines.Count > 0) blocks.Add($"Contact:\n{string.Join("\n", contactLines)}"); + + if (structured.Summary.Count > 0) { - var sections = JsonSerializer.Deserialize>(user.ProfileCvStructureJson); - if (sections is null || sections.Count == 0) return string.Empty; + blocks.Add($"Summary:\n- {string.Join("\n- ", structured.Summary.Take(4))}"); + } - var preferredOrder = new[] + if (structured.Skills.Count > 0) + { + blocks.Add($"Skills:\n{string.Join(", ", structured.Skills.Take(16))}"); + } + + if (structured.Jobs.Count > 0) + { + var jobBlocks = structured.Jobs.Take(3).Select(job => { - "Professional Summary", - "Core Skills", - "Experience Highlights", - "Selected Achievements", - "Projects", - "Education", - "Certifications", - }; - - var ordered = preferredOrder - 
.Select(name => sections.FirstOrDefault(section => string.Equals(section.Name?.Trim(), name, StringComparison.OrdinalIgnoreCase))) - .Where(section => section is not null) - .Concat(sections.Where(section => !preferredOrder.Contains(section.Name ?? string.Empty, StringComparer.OrdinalIgnoreCase))) - .Where(section => !string.IsNullOrWhiteSpace(section?.Content)) - .Take(6) - .Select(section => $"{section!.Name}:\n{section.Content!.Trim()}") - .ToList(); - - return ordered.Count > 0 - ? $"Structured CV sections:\n{string.Join("\n\n", ordered)}" - : string.Empty; + var header = string.Join(" | ", new[] { job.Title, job.Company, job.Location, FormatStructuredDateRange(job.Start, job.End, job.IsCurrent) }.Where(value => !string.IsNullOrWhiteSpace(value))); + var bullets = job.Bullets.Take(3).Select(bullet => $"- {bullet}"); + return string.Join("\n", new[] { header }.Concat(bullets).Where(value => !string.IsNullOrWhiteSpace(value))); + }).Where(value => !string.IsNullOrWhiteSpace(value)).ToList(); + if (jobBlocks.Count > 0) blocks.Add($"Work Experience:\n{string.Join("\n\n", jobBlocks)}"); } - catch + + if (structured.Education.Count > 0) { - return string.Empty; + var items = structured.Education.Take(3).Select(education => string.Join(" | ", new[] { education.Qualification, education.Institution, education.Location, FormatStructuredDateRange(education.Start, education.End, false) }.Where(value => !string.IsNullOrWhiteSpace(value)))); + blocks.Add($"Education:\n- {string.Join("\n- ", items)}"); } + + if (structured.Languages.Count > 0) + { + var items = structured.Languages.Take(5).Select(language => string.Join(": ", new[] { language.Name, language.Level }.Where(value => !string.IsNullOrWhiteSpace(value)))); + blocks.Add($"Languages:\n- {string.Join("\n- ", items)}"); + } + + if (structured.OtherSections.Count > 0) + { + var items = structured.OtherSections.Take(2) + .Where(section => !string.IsNullOrWhiteSpace(section.Title) && section.Items.Count > 0) + 
.Select(section => $"{section.Title}: {string.Join("; ", section.Items.Take(4))}") + .ToList(); + if (items.Count > 0) blocks.Add($"Other sections:\n- {string.Join("\n- ", items)}"); + } + + if (blocks.Count == 0 && structured.Sections.Count > 0) + { + blocks.AddRange(structured.Sections.Take(6).Select(section => $"{section.Name}:\n{section.Content}")); + } + + return blocks.Count > 0 + ? $"Structured CV:\n{string.Join("\n\n", blocks)}" + : string.Empty; + } + + private static string BuildCvSearchCorpus(ApplicationUser? user) + { + var structured = StructuredCvProfileJson.Deserialize(user?.ProfileCvStructureJson); + var parts = new List(); + if (!string.IsNullOrWhiteSpace(user?.ProfileCvText)) parts.Add(user.ProfileCvText!); + if (!string.IsNullOrWhiteSpace(structured.Contact.Headline)) parts.Add(structured.Contact.Headline!); + if (structured.Summary.Count > 0) parts.Add(string.Join("\n", structured.Summary)); + if (structured.Skills.Count > 0) parts.Add(string.Join("\n", structured.Skills)); + if (structured.Jobs.Count > 0) + { + parts.Add(string.Join("\n", structured.Jobs.SelectMany(job => new[] { job.Title, job.Company, job.Location }.Where(value => !string.IsNullOrWhiteSpace(value)).Concat(job.Bullets).Concat(job.Skills)))); + } + if (structured.Education.Count > 0) + { + parts.Add(string.Join("\n", structured.Education.SelectMany(education => new[] { education.Qualification, education.Institution, education.Location }.Where(value => !string.IsNullOrWhiteSpace(value)).Concat(education.Details)))); + } + if (structured.Languages.Count > 0) + { + parts.Add(string.Join("\n", structured.Languages.Select(language => string.Join(" ", new[] { language.Name, language.Level, language.Notes }.Where(value => !string.IsNullOrWhiteSpace(value)))))); + } + return string.Join("\n", parts.Where(part => !string.IsNullOrWhiteSpace(part))); + } + + private static string? FormatStructuredDateRange(string? start, string? 
end, bool isCurrent) + { + if (string.IsNullOrWhiteSpace(start) && string.IsNullOrWhiteSpace(end)) return null; + if (string.IsNullOrWhiteSpace(start)) return end; + return $"{start} - {(isCurrent ? "Present" : end ?? "Present")}"; } private async Task> BuildListFromAiAsync(string instruction, string context, CancellationToken cancellationToken, string fallbackPrefix) @@ -1729,7 +1783,7 @@ namespace JobTrackerApi.Controllers return BadRequest("This job does not have enough description or notes to compare against your CV."); } - var normalizedCv = cvText.ToLowerInvariant(); + var normalizedCv = BuildCvSearchCorpus(user).ToLowerInvariant(); var jobTags = SkillTagger.Detect(jobText).Distinct(StringComparer.OrdinalIgnoreCase).ToList(); var strengths = jobTags.Where(tag => normalizedCv.Contains(tag.ToLowerInvariant())).Take(8).ToList(); var gaps = jobTags.Where(tag => !normalizedCv.Contains(tag.ToLowerInvariant())).Take(8).ToList(); diff --git a/JobTrackerApi/Controllers/ProfileCvController.cs b/JobTrackerApi/Controllers/ProfileCvController.cs index 6123617..05335ce 100644 --- a/JobTrackerApi/Controllers/ProfileCvController.cs +++ b/JobTrackerApi/Controllers/ProfileCvController.cs @@ -26,6 +26,31 @@ public sealed class ProfileCvController : ControllerBase ".webp", }; + private static readonly Dictionary SectionAliases = new(StringComparer.OrdinalIgnoreCase) + { + ["professional summary"] = "Professional Summary", + ["summary"] = "Professional Summary", + ["profile"] = "Professional Summary", + ["about me"] = "Professional Summary", + ["contact"] = "Contact", + ["contact details"] = "Contact", + ["core skills"] = "Skills", + ["skills"] = "Skills", + ["technical skills"] = "Skills", + ["experience"] = "Work Experience", + ["experience highlights"] = "Work Experience", + ["work experience"] = "Work Experience", + ["employment history"] = "Work Experience", + ["selected achievements"] = "Selected Achievements", + ["achievements"] = "Selected Achievements", + ["projects"] = 
"Projects", + ["education"] = "Education", + ["certifications"] = "Certifications", + ["certificates"] = "Certifications", + ["languages"] = "Languages", + ["interests"] = "Interests", + }; + private const long MaxFileSizeBytes = 5 * 1024 * 1024; private readonly UserManager _users; @@ -39,7 +64,6 @@ public sealed class ProfileCvController : ControllerBase public sealed record RewriteSectionRequest(string SectionName, string? Style, string? TargetRole); public sealed record ParseCvRequest(string? Text); - public sealed record ParsedCvSectionDto(string Name, string Content, int WordCount); [HttpPost("upload")] [RequestSizeLimit(MaxFileSizeBytes)] @@ -86,16 +110,18 @@ public sealed class ProfileCvController : ControllerBase return BadRequest("The uploaded CV file could not be read or was empty."); } + text = (await MaybeReconstructStructuredCvAsync(text, HttpContext.RequestAborted)).Trim(); + var structuredCv = await BuildStructuredCvAsync(text, HttpContext.RequestAborted); + user.ProfileCvText = text; - user.ProfileCvStructureJson = JsonSerializer.Serialize( - ParseSections(text).Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content))).ToList()); + user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv); var result = await _users.UpdateAsync(user); if (!result.Succeeded) { return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description))); } - return Ok(new { imported = true, characters = text.Length }); + return Ok(new { imported = true, characters = text.Length, structuredCv, sections = structuredCv.Sections }); } [HttpPost("rebuild")] @@ -117,15 +143,15 @@ public sealed class ProfileCvController : ControllerBase } user.ProfileCvText = rebuilt.Trim(); - user.ProfileCvStructureJson = JsonSerializer.Serialize( - ParseSections(user.ProfileCvText).Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content))).ToList()); + var structuredCv = await 
BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted); + user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv); var result = await _users.UpdateAsync(user); if (!result.Succeeded) { return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description))); } - return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText }); + return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections }); } [HttpPost("rewrite-section")] @@ -162,18 +188,15 @@ public sealed class ProfileCvController : ControllerBase var source = string.IsNullOrWhiteSpace(request?.Text) ? user.ProfileCvText : request!.Text; if (string.IsNullOrWhiteSpace(source)) return BadRequest("Add or import CV text before parsing sections."); - var sections = ParseSections(source) - .Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content))) - .ToList(); - - user.ProfileCvStructureJson = JsonSerializer.Serialize(sections); + var structuredCv = await BuildStructuredCvAsync(source, HttpContext.RequestAborted); + user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv); var update = await _users.UpdateAsync(user); if (!update.Succeeded) { return BadRequest(string.Join("; ", update.Errors.Select(e => e.Description))); } - return Ok(new { sections, totalWords = CountWords(source) }); + return Ok(new { structuredCv, sections = structuredCv.Sections, totalWords = CountWords(source) }); } [HttpPost("improve")] @@ -195,15 +218,91 @@ public sealed class ProfileCvController : ControllerBase } user.ProfileCvText = improved.Trim(); - user.ProfileCvStructureJson = JsonSerializer.Serialize( - ParseSections(user.ProfileCvText).Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content))).ToList()); + var structuredCv = await 
BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted); + user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv); var result = await _users.UpdateAsync(user); if (!result.Succeeded) { return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description))); } - return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText }); + return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections }); + } + + private async Task BuildStructuredCvAsync(string text, CancellationToken cancellationToken) + { + var fallbackSections = ParseSections(text) + .Select(section => new StructuredCvSection + { + Name = section.Name, + Content = section.Content, + WordCount = CountWords(section.Content), + }) + .ToList(); + + var fallback = StructuredCvProfileJson.FromSections(fallbackSections); + fallback.Contact.FullName ??= GuessFullName(text); + var extracted = await TryExtractStructuredCvAsync(text, cancellationToken); + var merged = StructuredCvProfileJson.Merge(extracted, fallback); + merged.Contact.FullName ??= GuessFullName(text); + return StructuredCvProfileJson.Normalize(merged); + } + + private async Task TryExtractStructuredCvAsync(string text, CancellationToken cancellationToken) + { + var structuredJson = await _aiService.SummarizeSectionAsync( + "Extract this CV into structured JSON. 
Return only valid JSON with this exact top-level shape: { \"version\": \"1\", \"contact\": { \"fullName\": string|null, \"headline\": string|null, \"email\": string|null, \"phone\": string|null, \"location\": string|null, \"website\": string|null, \"linkedin\": string|null }, \"summary\": string[], \"jobs\": [{ \"title\": string|null, \"company\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"isCurrent\": boolean, \"bullets\": string[], \"skills\": string[] }], \"education\": [{ \"qualification\": string|null, \"institution\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"details\": string[] }], \"skills\": string[], \"languages\": [{ \"name\": string|null, \"level\": string|null, \"notes\": string|null }], \"interests\": string[], \"otherSections\": [{ \"title\": string|null, \"items\": string[] }] }. Preserve facts only. Do not invent anything. If a field is unknown, use null or an empty array. Keep wording close to the source. Put unmatched content in otherSections.", + text, + 3200, + 900); + + if (string.IsNullOrWhiteSpace(structuredJson)) return null; + var extracted = ExtractJsonObject(structuredJson); + if (string.IsNullOrWhiteSpace(extracted)) return null; + + var parsed = StructuredCvProfileJson.Deserialize(extracted); + return IsMeaningfullyStructured(parsed) ? parsed : null; + } + + private static bool IsMeaningfullyStructured(StructuredCvProfile profile) + { + return !string.IsNullOrWhiteSpace(profile.Contact.FullName) + || profile.Summary.Count > 0 + || profile.Jobs.Count > 0 + || profile.Education.Count > 0 + || profile.Skills.Count > 0 + || profile.Languages.Count > 0 + || profile.Interests.Count > 0 + || profile.OtherSections.Count > 0; + } + + private static string? 
ExtractJsonObject(string raw) + { + var trimmed = raw.Trim(); + if (trimmed.StartsWith("```", StringComparison.Ordinal)) + { + trimmed = Regex.Replace(trimmed, "^```(?:json)?\\s*|\\s*```$", string.Empty, RegexOptions.IgnoreCase); + } + + var start = trimmed.IndexOf('{'); + var end = trimmed.LastIndexOf('}'); + if (start < 0 || end <= start) return null; + return trimmed[start..(end + 1)]; + } + + private static string? GuessFullName(string source) + { + var normalized = source.Replace("\r\n", "\n"); + foreach (var line in normalized.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).Take(6)) + { + var cleaned = line.Trim().TrimStart('#').Trim(); + if (cleaned.Length < 4 || cleaned.Length > 80) continue; + if (cleaned.Contains('@') || Regex.IsMatch(cleaned, @"\d")) continue; + if (!Regex.IsMatch(cleaned, @"^[A-Z][A-Za-z'`.-]+(?:\s+[A-Z][A-Za-z'`.-]+){1,4}$")) continue; + return cleaned; + } + + return null; } private static int CountWords(string? text) @@ -215,25 +314,6 @@ public sealed class ProfileCvController : ControllerBase private static List<(string Name, string Content)> ParseSections(string source) { var lines = source.Replace("\r\n", "\n").Split('\n'); - var aliases = new Dictionary(StringComparer.OrdinalIgnoreCase) - { - ["professional summary"] = "Professional Summary", - ["summary"] = "Professional Summary", - ["profile"] = "Professional Summary", - ["core skills"] = "Core Skills", - ["skills"] = "Core Skills", - ["technical skills"] = "Core Skills", - ["experience"] = "Experience Highlights", - ["experience highlights"] = "Experience Highlights", - ["work experience"] = "Experience Highlights", - ["selected achievements"] = "Selected Achievements", - ["achievements"] = "Selected Achievements", - ["projects"] = "Projects", - ["education"] = "Education", - ["certifications"] = "Certifications", - ["certificates"] = "Certifications", - }; - var sections = new List<(string Name, List Lines)>(); var currentName = "General"; 
var currentLines = new List();
@@ -251,16 +331,11 @@ public sealed class ProfileCvController : ControllerBase
         foreach (var raw in lines)
         {
             var line = raw.Trim();
-            var normalized = line.TrimEnd(':').Trim();
-            var looksLikeHeading = normalized.Length > 0
-                && normalized.Length <= 40
-                && !normalized.Contains('.')
-                && aliases.ContainsKey(normalized.ToLowerInvariant());
-
-            if (looksLikeHeading)
+            var canonicalHeading = CanonicalizeSectionHeading(line); // replaces the inline alias lookup removed above
+            if (canonicalHeading is not null)
             {
                 Flush();
-                currentName = aliases[normalized.ToLowerInvariant()];
+                currentName = canonicalHeading;
                 continue;
             }

@@ -280,6 +355,56 @@ public sealed class ProfileCvController : ControllerBase
             .ToList();
     }

+    private async Task MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
+    { // Returns the trimmed input as-is unless it looks like a flattened PDF extraction; in that case the AI service is asked to rebuild it as markdown.
+        var normalized = text.Trim();
+        if (!LooksLikeFlattenedCvExtraction(normalized))
+        {
+            return normalized;
+        }
+
+        var reconstructed = await _aiService.SummarizeSectionAsync( // NOTE(review): cancellationToken is not passed to this call — confirm whether SummarizeSectionAsync accepts one
+            "Reconstruct this CV text extracted from a PDF into a clean, readable master CV in markdown. Preserve facts only. Recover clear sections such as Contact, Professional Summary, Work Experience, Education, Skills, Languages, and Interests when present. Split contact details onto their own lines, turn noisy all-caps/spaced headings into normal headings, keep dates with the correct roles and employers, and remove layout/OCR artifacts. Do not invent employers, titles, dates, or metrics. Return only the reconstructed CV text.",
+            normalized,
+            2800, // presumably size limits for the request/response — TODO confirm against SummarizeSectionAsync
+            900);
+
+        return string.IsNullOrWhiteSpace(reconstructed) ? 
normalized : reconstructed.Trim(); // a blank AI response falls back to the trimmed input
+    }
+    /// <summary>Heuristic: true when the text has the shape of an extraction that lost its line structure (long text on few lines, letter-spaced headings, several known headings on few lines, or " + " separators together with bullet glyphs).</summary>
+    private static bool LooksLikeFlattenedCvExtraction(string text)
+    {
+        if (string.IsNullOrWhiteSpace(text)) return false;
+
+        var normalized = text.Replace("\r\n", "\n");
+        var lineCount = normalized.Split('\n').Count(line => !string.IsNullOrWhiteSpace(line)); // non-blank lines only
+        var spacedHeadingCount = Regex.Matches(normalized, @"\b(?:[A-Z]\s){3,}[A-Z]\b").Count; // runs such as "E D U C A T I O N"
+        var knownHeadingHits = SectionAliases.Keys.Count(alias => normalized.Contains(alias, StringComparison.OrdinalIgnoreCase)); // SectionAliases is declared outside this chunk
+        var bulletCount = Regex.Matches(normalized, @"[•●▪◦]").Count;
+
+        return (lineCount <= 6 && normalized.Length >= 500)
+            || spacedHeadingCount >= 3
+            || (knownHeadingHits >= 3 && lineCount <= 12)
+            || (normalized.Contains(" + ") && bulletCount > 0 && lineCount <= 10);
+    }
+    /// <summary>Maps a heading line (optional leading '#', optional trailing ':') to its canonical section name via SectionAliases, or returns null when the line is not a recognised heading.</summary>
+    private static string? CanonicalizeSectionHeading(string line)
+    {
+        if (string.IsNullOrWhiteSpace(line)) return null;
+
+        var normalized = line.Trim();
+        if (normalized.StartsWith("#", StringComparison.Ordinal))
+        {
+            normalized = normalized.TrimStart('#').Trim();
+        }
+
+        normalized = normalized.TrimEnd(':').Trim();
+        if (normalized.Length == 0 || normalized.Length > 60) return null;
+        if (normalized.Contains('.') || normalized.Contains(" ")) return null; // NOTE(review): as shown, a single space rejects every multi-word heading; whitespace in this copy is collapsed, so the literal may originally have been longer — confirm against the repository
+
+        return SectionAliases.TryGetValue(normalized, out var canonical) ? 
canonical : null;
+    }
+
     private static async Task ExtractTextAsync(IFormFile file, string extension)
     {
         if (string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase) || string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase))
diff --git a/Models/StructuredCvProfile.cs b/Models/StructuredCvProfile.cs
new file mode 100644
index 0000000..a390532
--- /dev/null
+++ b/Models/StructuredCvProfile.cs
@@ -0,0 +1,68 @@
+namespace JobTrackerApi.Models;
+/// <summary>Root of the typed CV model: contact details, typed lists (jobs, education, languages, ...) and the text sections they correspond to.</summary>
+public sealed class StructuredCvProfile
+{
+    public string Version { get; set; } = "1"; // schema version marker; defaults to "1"
+    public StructuredCvContact Contact { get; set; } = new();
+    public List Summary { get; set; } = new();
+    public List Jobs { get; set; } = new();
+    public List Education { get; set; } = new();
+    public List Skills { get; set; } = new();
+    public List Languages { get; set; } = new();
+    public List Interests { get; set; } = new();
+    public List OtherSections { get; set; } = new();
+    public List Sections { get; set; } = new();
+}
+/// <summary>Contact block of a CV; every field is optional.</summary>
+public sealed class StructuredCvContact
+{
+    public string? FullName { get; set; }
+    public string? Headline { get; set; }
+    public string? Email { get; set; }
+    public string? Phone { get; set; }
+    public string? Location { get; set; }
+    public string? Website { get; set; }
+    public string? LinkedIn { get; set; }
+}
+/// <summary>One work-experience entry. Start and End are free-text strings, not parsed dates.</summary>
+public sealed class StructuredCvJob
+{
+    public string? Title { get; set; }
+    public string? Company { get; set; }
+    public string? Location { get; set; }
+    public string? Start { get; set; }
+    public string? End { get; set; }
+    public bool IsCurrent { get; set; }
+    public List Bullets { get; set; } = new();
+    public List Skills { get; set; } = new();
+}
+/// <summary>One education entry. Start and End are free-text strings, not parsed dates.</summary>
+public sealed class StructuredCvEducation
+{
+    public string? Qualification { get; set; }
+    public string? Institution { get; set; }
+    public string? Location { get; set; }
+    public string? Start { get; set; }
+    public string? 
End { get; set; }
+    public List Details { get; set; } = new();
+}
+/// <summary>A language with an optional proficiency level and free-text notes.</summary>
+public sealed class StructuredCvLanguage
+{
+    public string? Name { get; set; }
+    public string? Level { get; set; }
+    public string? Notes { get; set; }
+}
+/// <summary>A titled list of items for CV content that has no dedicated typed property.</summary>
+public sealed class StructuredCvOtherSection
+{
+    public string? Title { get; set; }
+    public List Items { get; set; } = new();
+}
+/// <summary>A named block of CV text together with its word count.</summary>
+public sealed class StructuredCvSection
+{
+    public string Name { get; set; } = string.Empty;
+    public string Content { get; set; } = string.Empty;
+    public int WordCount { get; set; }
+}
diff --git a/Models/StructuredCvProfileJson.cs b/Models/StructuredCvProfileJson.cs
new file mode 100644
index 0000000..3d8fcee
--- /dev/null
+++ b/Models/StructuredCvProfileJson.cs
@@ -0,0 +1,491 @@
+using System.Text.Json;
+using System.Text.Json.Serialization;
+using System.Text.RegularExpressions;
+
+namespace JobTrackerApi.Models;
+/// <summary>JSON serialization, normalization and section-text parsing helpers for StructuredCvProfile.</summary>
+public static class StructuredCvProfileJson
+{
+    private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web) // web defaults; null properties are omitted when writing
+    {
+        PropertyNameCaseInsensitive = true,
+        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
+    };
+    /// <summary>Creates a new, normalized profile with no content.</summary>
+    public static StructuredCvProfile Empty() => Normalize(new StructuredCvProfile());
+    /// <summary>Parses stored JSON into a normalized profile. A JSON array is read as a list of sections and converted with FromSections; a JSON object is read as a profile; blank, invalid or any other JSON yields Empty().</summary>
+    public static StructuredCvProfile Deserialize(string? json)
+    {
+        if (string.IsNullOrWhiteSpace(json)) return Empty();
+
+        try
+        {
+            using var doc = JsonDocument.Parse(json);
+            if (doc.RootElement.ValueKind == JsonValueKind.Array)
+            {
+                var sections = JsonSerializer.Deserialize>(json, SerializerOptions) ?? new List(); // NOTE(review): generic type arguments look stripped in this copy of the patch (here and elsewhere) — restore from the repository before applying
+                return FromSections(sections);
+            }
+
+            if (doc.RootElement.ValueKind != JsonValueKind.Object) return Empty();
+            var profile = JsonSerializer.Deserialize(json, SerializerOptions) ?? new StructuredCvProfile();
+            return Normalize(profile);
+        }
+        catch // best effort: any failure, including one thrown while parsing sections, silently produces an empty profile
+        {
+            return Empty();
+        }
+    }
+    /// <summary>Serializes the normalized form of the profile; a null profile serializes as an empty profile.</summary>
+    public static string Serialize(StructuredCvProfile? 
profile)
+    {
+        return JsonSerializer.Serialize(Normalize(profile), SerializerOptions);
+    }
+    /// <summary>Fills gaps in preferred from fallback: each null contact field and each empty list is taken from the fallback; populated values in preferred always win.</summary>
+    public static StructuredCvProfile Merge(StructuredCvProfile? preferred, StructuredCvProfile? fallback)
+    {
+        var primary = Normalize(preferred); // Normalize mutates and returns its argument, so a non-null preferred is modified in place and is the object returned
+        var secondary = Normalize(fallback);
+
+        primary.Contact.FullName ??= secondary.Contact.FullName;
+        primary.Contact.Headline ??= secondary.Contact.Headline;
+        primary.Contact.Email ??= secondary.Contact.Email;
+        primary.Contact.Phone ??= secondary.Contact.Phone;
+        primary.Contact.Location ??= secondary.Contact.Location;
+        primary.Contact.Website ??= secondary.Contact.Website;
+        primary.Contact.LinkedIn ??= secondary.Contact.LinkedIn;
+
+        if (primary.Summary.Count == 0) primary.Summary = secondary.Summary; // lists are adopted whole (same list instance), never merged item by item
+        if (primary.Jobs.Count == 0) primary.Jobs = secondary.Jobs;
+        if (primary.Education.Count == 0) primary.Education = secondary.Education;
+        if (primary.Skills.Count == 0) primary.Skills = secondary.Skills;
+        if (primary.Languages.Count == 0) primary.Languages = secondary.Languages;
+        if (primary.Interests.Count == 0) primary.Interests = secondary.Interests;
+        if (primary.OtherSections.Count == 0) primary.OtherSections = secondary.OtherSections;
+        if (primary.Sections.Count == 0) primary.Sections = secondary.Sections;
+
+        return Normalize(primary); // NOTE(review): Normalize keeps existing non-empty Sections, so lists adopted from the fallback above are not reflected in primary.Sections when it was already populated — confirm this is intended
+    }
+    /// <summary>Builds a typed profile from named text sections by dispatching each section, by name, to the matching parser; the normalized input sections are kept as the profile's Sections.</summary>
+    public static StructuredCvProfile FromSections(IEnumerable? 
sections)
+    {
+        var normalizedSections = NormalizeSections(sections);
+        var profile = new StructuredCvProfile
+        {
+            Sections = normalizedSections,
+        };
+
+        foreach (var section in normalizedSections)
+        {
+            switch (section.Name.Trim().ToLowerInvariant()) // names are compared in lower case; a repeated section kind overwrites the earlier value (assignment, not append)
+            {
+                case "contact":
+                    ApplyContact(profile.Contact, section.Content);
+                    break;
+                case "professional summary":
+                case "summary":
+                    profile.Summary = SplitList(section.Content); // SplitList also splits on commas, so comma-containing sentences become several items
+                    break;
+                case "skills":
+                case "core skills":
+                case "technical skills":
+                    profile.Skills = SplitList(section.Content);
+                    break;
+                case "languages":
+                    profile.Languages = ParseLanguages(section.Content);
+                    break;
+                case "interests":
+                    profile.Interests = SplitList(section.Content);
+                    break;
+                case "work experience":
+                case "experience":
+                case "employment history":
+                    profile.Jobs = ParseJobs(section.Content);
+                    break;
+                case "education":
+                    profile.Education = ParseEducation(section.Content);
+                    break;
+                default:
+                    profile.OtherSections.Add(new StructuredCvOtherSection // any other section name is preserved as an "other" section
+                    {
+                        Title = section.Name,
+                        Items = SplitList(section.Content),
+                    });
+                    break;
+            }
+        }
+
+        return Normalize(profile);
+    }
+    /// <summary>Cleans a profile in place and returns it (a new empty profile for null): trims strings, removes blank and duplicate list entries, drops entries that carry no data, and rebuilds Sections from the typed data when no non-blank sections are present.</summary>
+    public static StructuredCvProfile Normalize(StructuredCvProfile? profile)
+    {
+        profile ??= new StructuredCvProfile();
+        profile.Version = string.IsNullOrWhiteSpace(profile.Version) ? "1" : profile.Version.Trim();
+        profile.Contact ??= new StructuredCvContact();
+        profile.Summary = CleanList(profile.Summary);
+        profile.Jobs = (profile.Jobs ?? new List())
+            .Select(NormalizeJob)
+            .Where(job => !string.IsNullOrWhiteSpace(job.Title) // keep a job only if it has a title, a company or at least one bullet
+                || !string.IsNullOrWhiteSpace(job.Company)
+                || job.Bullets.Count > 0)
+            .ToList();
+        profile.Education = (profile.Education ?? 
new List())
+            .Select(NormalizeEducation)
+            .Where(education => !string.IsNullOrWhiteSpace(education.Qualification) // keep an entry only if it has a qualification, an institution or details
+                || !string.IsNullOrWhiteSpace(education.Institution)
+                || education.Details.Count > 0)
+            .ToList();
+        profile.Skills = CleanList(profile.Skills);
+        profile.Languages = (profile.Languages ?? new List())
+            .Select(NormalizeLanguage)
+            .Where(language => !string.IsNullOrWhiteSpace(language.Name)) // a language without a name is dropped
+            .ToList();
+        profile.Interests = CleanList(profile.Interests);
+        profile.OtherSections = (profile.OtherSections ?? new List())
+            .Select(section => new StructuredCvOtherSection
+            {
+                Title = TrimOrNull(section?.Title),
+                Items = CleanList(section?.Items),
+            })
+            .Where(section => !string.IsNullOrWhiteSpace(section.Title) || section.Items.Count > 0)
+            .ToList();
+
+        var normalizedSections = NormalizeSections(profile.Sections);
+        profile.Sections = normalizedSections.Count > 0 ? normalizedSections : BuildSections(profile); // existing sections win; they are only generated from the typed data when none are present
+        return profile;
+    }
+    /// <summary>Trims a job's text fields (blank becomes null), cleans its lists, and sets IsCurrent when End reads "present" or "current" (case-insensitive). Mutates and returns the given job; null yields a new empty job.</summary>
+    private static StructuredCvJob NormalizeJob(StructuredCvJob? job)
+    {
+        job ??= new StructuredCvJob();
+        job.Title = TrimOrNull(job.Title);
+        job.Company = TrimOrNull(job.Company);
+        job.Location = TrimOrNull(job.Location);
+        job.Start = TrimOrNull(job.Start);
+        job.End = TrimOrNull(job.End);
+        job.Bullets = CleanList(job.Bullets);
+        job.Skills = CleanList(job.Skills);
+        job.IsCurrent = job.IsCurrent || string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase); // an explicit true is never cleared
+        return job;
+    }
+    /// <summary>Trims an education entry's text fields (blank becomes null) and cleans its details list. Mutates and returns the given entry; null yields a new empty entry.</summary>
+    private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? 
education)
+    {
+        education ??= new StructuredCvEducation();
+        education.Qualification = TrimOrNull(education.Qualification);
+        education.Institution = TrimOrNull(education.Institution);
+        education.Location = TrimOrNull(education.Location);
+        education.Start = TrimOrNull(education.Start);
+        education.End = TrimOrNull(education.End);
+        education.Details = CleanList(education.Details);
+        return education;
+    }
+    /// <summary>Trims a language's fields (blank becomes null). Mutates and returns the given entry; null yields a new empty entry.</summary>
+    private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language)
+    {
+        language ??= new StructuredCvLanguage();
+        language.Name = TrimOrNull(language.Name);
+        language.Level = TrimOrNull(language.Level);
+        language.Notes = TrimOrNull(language.Notes);
+        return language;
+    }
+    /// <summary>Copies sections into new objects with trimmed name ("General" when blank) and trimmed content, recomputes WordCount when it is not positive, and drops sections whose content is blank.</summary>
+    private static List NormalizeSections(IEnumerable? sections)
+    {
+        return (sections ?? Array.Empty())
+            .Select(section => new StructuredCvSection
+            {
+                Name = string.IsNullOrWhiteSpace(section?.Name) ? "General" : section.Name.Trim(),
+                Content = section?.Content?.Trim() ?? string.Empty,
+                WordCount = section?.WordCount is > 0 ? 
section.WordCount : CountWords(section?.Content),
+            })
+            .Where(section => !string.IsNullOrWhiteSpace(section.Content))
+            .ToList();
+    }
+    /// <summary>Renders the typed profile as ordered text sections: Contact, Professional Summary, Work Experience, Education, Skills, Languages, Interests, then the other sections. Sections with no content are omitted.</summary>
+    private static List BuildSections(StructuredCvProfile profile)
+    {
+        var sections = new List();
+
+        var contactLines = new List();
+        AddIf(contactLines, profile.Contact.FullName);
+        AddIf(contactLines, profile.Contact.Headline);
+        AddIf(contactLines, profile.Contact.Email);
+        AddIf(contactLines, profile.Contact.Phone);
+        AddIf(contactLines, profile.Contact.Location);
+        AddIf(contactLines, profile.Contact.Website);
+        AddIf(contactLines, profile.Contact.LinkedIn);
+        AddSectionIfAny(sections, "Contact", contactLines);
+        AddSectionIfAny(sections, "Professional Summary", profile.Summary);
+
+        if (profile.Jobs.Count > 0)
+        {
+            var lines = new List();
+            foreach (var job in profile.Jobs)
+            {
+                AddIf(lines, $"### {job.Title}".Trim()); // NOTE(review): with a null Title this still adds a bare "###" line, since the interpolated text is never blank
+                var meta = string.Join(" | ", new[] { job.Company, job.Location, FormatDateRange(job.Start, job.End, job.IsCurrent) }.Where(value => !string.IsNullOrWhiteSpace(value))); // company, location and dates share one pipe-separated line
+                AddIf(lines, meta);
+                lines.AddRange(job.Bullets.Select(bullet => $"- {bullet}"));
+                if (job.Skills.Count > 0)
+                {
+                    lines.Add($"Skills: {string.Join(", ", job.Skills)}");
+                }
+                if (lines.Count > 0 && !string.IsNullOrWhiteSpace(lines[^1])) lines.Add(string.Empty); // blank separator; AddSectionIfAny filters blank lines out again, so it does not reach the section text
+            }
+            AddSectionIfAny(sections, "Work Experience", lines);
+        }
+
+        if (profile.Education.Count > 0)
+        {
+            var lines = new List();
+            foreach (var education in profile.Education)
+            {
+                AddIf(lines, $"### {education.Qualification}".Trim());
+                var meta = string.Join(" | ", new[] { education.Institution, education.Location, FormatDateRange(education.Start, education.End, false) }.Where(value => !string.IsNullOrWhiteSpace(value)));
+                AddIf(lines, meta);
+                lines.AddRange(education.Details.Select(detail => $"- {detail}"));
+                if (lines.Count > 0 && !string.IsNullOrWhiteSpace(lines[^1])) lines.Add(string.Empty);
+            }
+            AddSectionIfAny(sections, "Education", lines);
+        }
+
+        
AddSectionIfAny(sections, "Skills", profile.Skills);
+
+        if (profile.Languages.Count > 0)
+        {
+            AddSectionIfAny(sections, "Languages", profile.Languages.Select(language =>
+            {
+                var value = language.Name ?? string.Empty; // rendered as "Name: Level (Notes)", with the optional parts omitted when blank
+                if (!string.IsNullOrWhiteSpace(language.Level)) value += $": {language.Level}";
+                if (!string.IsNullOrWhiteSpace(language.Notes)) value += $" ({language.Notes})";
+                return value;
+            }).ToList());
+        }
+
+        AddSectionIfAny(sections, "Interests", profile.Interests);
+
+        foreach (var other in profile.OtherSections)
+        {
+            AddSectionIfAny(sections, other.Title ?? "Other", other.Items);
+        }
+
+        return NormalizeSections(sections);
+    }
+    /// <summary>Appends a section whose content is the non-blank, trimmed lines joined with '\n'; does nothing when no content remains.</summary>
+    private static void AddSectionIfAny(List sections, string name, IEnumerable? lines)
+    {
+        var content = string.Join("\n", (lines ?? Array.Empty()).Where(line => !string.IsNullOrWhiteSpace(line)).Select(line => line.Trim())).Trim();
+        if (string.IsNullOrWhiteSpace(content)) return;
+        sections.Add(new StructuredCvSection { Name = name, Content = content, WordCount = CountWords(content) });
+    }
+    /// <summary>Adds the trimmed value to the list unless it is null or blank.</summary>
+    private static void AddIf(List lines, string? value)
+    {
+        if (!string.IsNullOrWhiteSpace(value)) lines.Add(value.Trim());
+    }
+    /// <summary>Fills contact fields that are still null from free-text contact lines; values already set are never overwritten.</summary>
+    private static void ApplyContact(StructuredCvContact contact, string content)
+    { // NOTE(review): the middle of this method is missing from this copy of the patch (the text breaks off inside the phone pattern) — restore it from the repository
+        var lines = content.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
+        contact.Email ??= Regex.Match(content, @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", RegexOptions.IgnoreCase).Value.NullIfWhitespace();
+        contact.Phone ??= Regex.Match(content, @"(? 
!line.Contains('@') && !line.Contains("linkedin", StringComparison.OrdinalIgnoreCase) && !line.Equals(contact.Website, StringComparison.OrdinalIgnoreCase) && !line.Equals(contact.Phone, StringComparison.OrdinalIgnoreCase)).ToList(); // NOTE(review): the beginning of this statement is cut off in this copy of the patch
+        if (leftovers.Count > 0) contact.FullName ??= leftovers[0].Trim(); // remaining lines are assigned by position: name, headline, location
+        if (leftovers.Count > 1) contact.Headline ??= leftovers[1].Trim();
+        if (leftovers.Count > 2) contact.Location ??= leftovers[2].Trim();
+    }
+    /// <summary>Parses list items of the form "Name", "Name: Level" or "Name: Level (notes)" into languages; items without a name are dropped.</summary>
+    private static List ParseLanguages(string content)
+    {
+        return SplitList(content)
+            .Select(item =>
+            {
+                var name = item;
+                string? level = null;
+                string? notes = null;
+
+                var colonIndex = item.IndexOf(':');
+                if (colonIndex > 0) // a colon at position 0 is not treated as a separator
+                {
+                    name = item[..colonIndex].Trim();
+                    var remainder = item[(colonIndex + 1)..].Trim();
+                    var noteMatch = Regex.Match(remainder, @"^(.*?)\s*\((.*?)\)$"); // "level (notes)" with the parenthesised part at the end
+                    if (noteMatch.Success)
+                    {
+                        level = noteMatch.Groups[1].Value.NullIfWhitespace();
+                        notes = noteMatch.Groups[2].Value.NullIfWhitespace();
+                    }
+                    else
+                    {
+                        level = remainder.NullIfWhitespace();
+                    }
+                }
+
+                return new StructuredCvLanguage { Name = name.NullIfWhitespace(), Level = level, Notes = notes };
+            })
+            .Where(language => !string.IsNullOrWhiteSpace(language.Name))
+            .ToList();
+    }
+    /// <summary>Splits a work-experience section into blocks and parses each one, discarding blocks that produce no job.</summary>
+    private static List ParseJobs(string content)
+    {
+        var blocks = SplitBlocks(content);
+        return blocks.Select(ParseJobBlock).Where(job => job is not null).Select(job => job!).ToList();
+    }
+    /// <summary>Parses one block of a work-experience section into a job; returns null when the block yields no title, company or bullets.</summary>
+    private static StructuredCvJob? 
ParseJobBlock(string block)
+    {
+        // Layout of a job block: the first line is the title (leading '#' markers removed), the non-bullet
+        // lines that follow are metadata (company, location, date range), and bullet lines are achievements.
+        var lines = block.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
+        if (lines.Count == 0) return null;
+
+        var job = new StructuredCvJob();
+        if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
+        job.Title = lines[0].NullIfWhitespace();
+
+        var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();
+        // First metadata line that contains a year / "Present" / "Current", optionally followed by "- <end>".
+        var dateValue = metadata.Select(line => Regex.Match(line, @"(?:(?:\w+\s+)?\d{4}|Present|Current)(?:\s*[-–]\s*(?:(?:\w+\s+)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace()).FirstOrDefault(value => value is not null);
+        if (!string.IsNullOrWhiteSpace(dateValue))
+        {
+            var parts = Regex.Split(dateValue, "\\s*[-–]\\s*");
+            job.Start = parts.FirstOrDefault().NullIfWhitespace();
+            job.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
+            job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
+        }
+
+        // Remaining metadata fields are positional: company first, then location.
+        var metadataFields = SplitMetadataFields(metadata, dateValue);
+        if (metadataFields.Count > 0) job.Company = metadataFields[0];
+        if (metadataFields.Count > 1) job.Location = metadataFields[1];
+
+        job.Bullets = lines.Where(IsBullet).Select(line => line.Trim().TrimStart('-', '•', '*', ' ')).Where(line => !string.IsNullOrWhiteSpace(line)).ToList();
+        job.Skills = lines
+            .Where(line => line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
+            .SelectMany(line => SplitList(line[(line.IndexOf(':') + 1)..]))
+            .ToList();
+
+        return string.IsNullOrWhiteSpace(job.Title) && string.IsNullOrWhiteSpace(job.Company) && job.Bullets.Count == 0 ? null : job;
+    }
+
+    /// <summary>
+    /// Turns the metadata lines of a job or education block into individual fields: the matched date text
+    /// (if any) is removed, each line is split on '|' (the separator BuildSections writes), and separator
+    /// characters and blanks are trimmed away.
+    /// </summary>
+    /// <param name="metadata">Non-bullet lines that follow the block's heading line.</param>
+    /// <param name="dateValue">The date text found in the metadata, or null when the block has no date.</param>
+    /// <returns>The non-empty fields in their original order.</returns>
+    private static List<string> SplitMetadataFields(IEnumerable<string> metadata, string? dateValue)
+    {
+        return metadata
+            // string.Replace throws ArgumentException for an empty search string, so only strip a date that was actually found.
+            .Select(line => string.IsNullOrEmpty(dateValue) ? line : line.Replace(dateValue, string.Empty))
+            .SelectMany(line => line.Split('|'))
+            .Select(field => field.Trim(' ', '|', ',', '-'))
+            .Where(field => !string.IsNullOrWhiteSpace(field))
+            .ToList();
+    }
+
+    /// <summary>Splits an education section into blocks and parses each one, discarding blocks that produce no entry.</summary>
+    private static List<StructuredCvEducation> ParseEducation(string content)
+    {
+        var blocks = SplitBlocks(content);
+        return blocks.Select(ParseEducationBlock).Where(education => education is not null).Select(education => education!).ToList();
+    }
+
+    /// <summary>
+    /// Parses one block of an education section: the first line is the qualification (leading '#' markers
+    /// removed), the following non-bullet lines are institution, location and date range, and bullet lines
+    /// are details. Returns null when the block yields no qualification, institution or details.
+    /// </summary>
+    private static StructuredCvEducation? ParseEducationBlock(string block)
+    {
+        var lines = block.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
+        if (lines.Count == 0) return null;
+
+        var education = new StructuredCvEducation();
+        if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
+        education.Qualification = lines[0].NullIfWhitespace();
+
+        var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();
+        // Unlike jobs, an education date must start with a year; only the end may be "Present"/"Current".
+        var dateValue = metadata.Select(line => Regex.Match(line, @"(?:(?:\w+\s+)?\d{4})(?:\s*[-–]\s*(?:(?:\w+\s+)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace()).FirstOrDefault(value => value is not null);
+        if (!string.IsNullOrWhiteSpace(dateValue))
+        {
+            var parts = Regex.Split(dateValue, "\\s*[-–]\\s*");
+            education.Start = parts.FirstOrDefault().NullIfWhitespace();
+            education.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
+        }
+
+        // Remaining metadata fields are positional: institution first, then location.
+        var metadataFields = SplitMetadataFields(metadata, dateValue);
+        if (metadataFields.Count > 0) education.Institution = metadataFields[0];
+        if (metadataFields.Count > 1) education.Location = metadataFields[1];
+
+        education.Details = lines.Skip(1).Where(IsBullet).Select(line => line.Trim().TrimStart('-', '•', '*', ' ')).Where(line => !string.IsNullOrWhiteSpace(line)).ToList();
+        return string.IsNullOrWhiteSpace(education.Qualification) && string.IsNullOrWhiteSpace(education.Institution) && education.Details.Count == 0 ? 
null : education;
+    }
+    /// <summary>Splits section text into entry blocks: before each line that starts with "###" when the text contains "### ", otherwise at blank lines. Blank blocks are dropped.</summary>
+    private static List SplitBlocks(string content)
+    {
+        var normalized = content.Replace("\r\n", "\n").Trim();
+        if (string.IsNullOrWhiteSpace(normalized)) return new List();
+
+        if (normalized.Contains("### ", StringComparison.Ordinal))
+        {
+            return Regex.Split(normalized, @"(?=^###\s+)" , RegexOptions.Multiline) // zero-width split, so each heading stays at the top of its block
+                .Select(block => block.Trim())
+                .Where(block => !string.IsNullOrWhiteSpace(block))
+                .ToList();
+        }
+
+        return Regex.Split(normalized, @"\n\s*\n")
+            .Select(block => block.Trim())
+            .Where(block => !string.IsNullOrWhiteSpace(block))
+            .ToList();
+    }
+    /// <summary>True when the line, ignoring leading whitespace, starts with '-', '•' or '*'.</summary>
+    private static bool IsBullet(string value)
+    {
+        var trimmed = value.TrimStart();
+        return trimmed.StartsWith("-", StringComparison.Ordinal)
+            || trimmed.StartsWith("•", StringComparison.Ordinal)
+            || trimmed.StartsWith("*", StringComparison.Ordinal);
+    }
+    /// <summary>Splits text into list items: one per line, with comma-containing lines further split on commas unless the line starts with '-'; bullet markers are stripped and blank or duplicate (case-insensitive) items removed.</summary>
+    private static List SplitList(string? content)
+    {
+        if (string.IsNullOrWhiteSpace(content)) return new List();
+
+        return content
+            .Replace("\r\n", "\n")
+            .Split('\n', StringSplitOptions.RemoveEmptyEntries)
+            .SelectMany(line => line.Contains(',') && !line.TrimStart().StartsWith("-", StringComparison.Ordinal) // only '-' bullets are exempt; '•' and '*' lines containing commas are still split
+                ? line.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
+                : new[] { line })
+            .Select(item => item.Trim().TrimStart('-', '•', '*', ' '))
+            .Where(item => !string.IsNullOrWhiteSpace(item))
+            .Distinct(StringComparer.OrdinalIgnoreCase)
+            .ToList();
+    }
+    /// <summary>Returns the values trimmed, without blank entries and without case-insensitive duplicates; null input yields an empty list.</summary>
+    private static List CleanList(IEnumerable? values)
+    {
+        return (values ?? Array.Empty())
+            .Select(value => value?.Trim() ?? string.Empty)
+            .Where(value => !string.IsNullOrWhiteSpace(value))
+            .Distinct(StringComparer.OrdinalIgnoreCase)
+            .ToList();
+    }
+    /// <summary>Counts whitespace-separated words; 0 for null or blank text.</summary>
+    private static int CountWords(string? content)
+    {
+        if (string.IsNullOrWhiteSpace(content)) return 0;
+        return content.Trim().Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries).Length; // a null separator array splits on white-space characters
+    }
+    /// <summary>Returns the trimmed value, or null when it is null or blank.</summary>
+    private static string? TrimOrNull(string? 
value)
+    {
+        return string.IsNullOrWhiteSpace(value) ? null : value.Trim();
+    }
+    /// <summary>Formats a date range as "start - end": null when both are blank, end alone when start is blank, and "Present" in place of the end when isCurrent is set or end is null.</summary>
+    private static string? FormatDateRange(string? start, string? end, bool isCurrent)
+    {
+        if (string.IsNullOrWhiteSpace(start) && string.IsNullOrWhiteSpace(end)) return null;
+        if (string.IsNullOrWhiteSpace(start)) return end;
+        return $"{start} - {(isCurrent ? "Present" : end ?? "Present")}"; // a null end is shown as "Present" even when isCurrent is false
+    }
+    /// <summary>Extension-method form of TrimOrNull: the trimmed value, or null when it is null or blank.</summary>
+    private static string? NullIfWhitespace(this string? value)
+    {
+        return string.IsNullOrWhiteSpace(value) ? null : value.Trim();
+    }
+}