diff --git a/JobTrackerApi.Tests/ProfileCvControllerTests.cs b/JobTrackerApi.Tests/ProfileCvControllerTests.cs index 34f62c4..51f7635 100644 --- a/JobTrackerApi.Tests/ProfileCvControllerTests.cs +++ b/JobTrackerApi.Tests/ProfileCvControllerTests.cs @@ -617,9 +617,47 @@ public sealed class ProfileCvControllerTests Assert.Contains("Atlas Systems", matchedJob!.Company ?? string.Empty, StringComparison.Ordinal); Assert.Contains("Python", structured.Skills); Assert.Contains("SQL", structured.Skills); + Assert.Equal("classifier", structured.Metadata.Fields["jobs[0].title"].Method); + Assert.Equal("block-1", structured.Metadata.Fields["jobs[0].title"].SourceBlockId); classifier.Verify(x => x.ClassifyBlockAsync(It.IsAny(), It.IsAny()), Times.AtLeastOnce()); } + [Fact] + public async Task Parse_uses_classifier_fallback_for_education_blocks_without_real_sections() + { + var source = "BSc Computer Science\nUniversity of Oslo\nOslo\n2016 - 2019\nGraduated with focus on distributed systems."; + var user = new ApplicationUser { Id = "user-1", ProfileCvText = source }; + var userManager = CreateUserManager(); + userManager.Setup(x => x.GetUserAsync(It.IsAny())).ReturnsAsync(user); + userManager.Setup(x => x.UpdateAsync(user)).ReturnsAsync(IdentityResult.Success); + var aiService = new Mock(); + aiService + .Setup(x => x.SummarizeSectionAsync(It.Is(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), source, 3200, 900)) + .ReturnsAsync("not-json"); + + var classifier = new Mock(); + classifier + .Setup(x => x.ClassifyBlockAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(new CvBlockClassificationResult("Education", 0.87, "education block", "BSc Computer Science", "University of Oslo", "Oslo", "2016", "2019", new List { "Graduated with focus on distributed systems." })); + + await using var db = CreateDb(); + var paths = CreatePaths(); + var controller = CreateController(userManager.Object, aiService.Object, db, paths, classifier.Object); + + var result = await controller.Parse(new ProfileCvController.ParseCvRequest(source)); + + var ok = Assert.IsType(result.Result); + var json = JsonSerializer.Serialize(ok.Value); + Assert.Contains("BSc Computer Science", json); + Assert.Contains("University of Oslo", json); + + var structured = StructuredCvProfileJson.Deserialize(user.ProfileCvStructureJson); + Assert.Single(structured.Education); + Assert.Equal("BSc Computer Science", structured.Education[0].Qualification); + Assert.Equal("University of Oslo", structured.Education[0].Institution); + Assert.Equal("classifier", structured.Metadata.Fields["education[0].qualification"].Method); + } + [Fact] public async Task Parse_keeps_general_fallback_when_classifier_returns_nothing() { diff --git a/JobTrackerApi/Controllers/ProfileCvController.cs b/JobTrackerApi/Controllers/ProfileCvController.cs index e5259a0..94abc86 100644 --- a/JobTrackerApi/Controllers/ProfileCvController.cs +++ b/JobTrackerApi/Controllers/ProfileCvController.cs @@ -78,6 +78,7 @@ public sealed class ProfileCvController : ControllerBase public sealed record ParseCvRequest(string? Text); private sealed record ExtractionPipelineResult(string RawText, string NormalizedText, StructuredCvProfile StructuredCv); + private sealed record ClassifiedCvBlock(int Index, string OriginalBlock, string SectionName, string Content, CvBlockClassificationResult? Classification); public sealed record CvExtractionRunListItem( int Id, string Trigger, @@ -340,7 +341,30 @@ public sealed class ProfileCvController : ControllerBase private async Task BuildStructuredCvAsync(string text, CancellationToken cancellationToken) { var parseSource = NormalizeTextForStructuredParsing(text); - var fallbackSections = await BuildFallbackSectionsAsync(parseSource, cancellationToken); + var parsedSections = ParseSections(parseSource) + .Select(section => new StructuredCvSection + { + Name = section.Name, + Content = section.Content, + WordCount = CountWords(section.Content), + }) + .ToList(); + var hasRealSections = parsedSections.Any(section => !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase)); + + List classifiedBlocks = new(); + List fallbackSections = parsedSections; + StructuredCvProfile? classifierFallback = null; + + if (!hasRealSections) + { + classifiedBlocks = await ClassifyBlocksAsync(parseSource, cancellationToken); + var hasMeaningfulClassifierStructure = classifiedBlocks.Any(block => !string.Equals(block.SectionName, "General", StringComparison.OrdinalIgnoreCase)); + if (hasMeaningfulClassifierStructure) + { + fallbackSections = BuildSectionsFromClassifiedBlocks(classifiedBlocks); + classifierFallback = BuildStructuredCvFromClassifiedBlocks(classifiedBlocks); + } + } var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections); AnnotateStructuredCv(sectionFallback, "repair", 0.56); @@ -348,6 +372,10 @@ public sealed class ProfileCvController : ControllerBase AnnotateStructuredCv(heuristicFallback, "deterministic", 0.68); heuristicFallback.Sections = new List(); var fallback = StructuredCvProfileJson.Merge(heuristicFallback, sectionFallback); + if (classifierFallback is not null) + { + fallback = StructuredCvProfileJson.Merge(classifierFallback, fallback); + } fallback.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email); var extracted = await TryExtractStructuredCvAsync(parseSource, cancellationToken); var merged = StructuredCvProfileJson.Merge(extracted, fallback); @@ -874,36 +902,202 @@ public sealed class ProfileCvController : ControllerBase .ToList(); } - private async Task> BuildFallbackSectionsAsync(string parseSource, CancellationToken cancellationToken) + private static List BuildSectionsFromClassifiedBlocks(List classifiedBlocks) { - var parsed = ParseSections(parseSource) - .Select(section => new StructuredCvSection + var sectionBuckets = new List(); + foreach (var block in classifiedBlocks) + { + var existing = sectionBuckets.FirstOrDefault(section => section.Name == block.SectionName); + if (existing is null) { - Name = section.Name, - Content = section.Content, - WordCount = CountWords(section.Content), - }) - .ToList(); + sectionBuckets.Add(new StructuredCvSection { Name = block.SectionName, Content = block.Content, WordCount = CountWords(block.Content) }); + } + else + { + existing.Content = $"{existing.Content}\n\n{block.Content}".Trim(); + existing.WordCount = CountWords(existing.Content); + } + } - var hasRealSections = parsed.Any(section => !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase)); - if (hasRealSections) return parsed; - - var aiSections = await ClassifyBlocksIntoSectionsAsync(parseSource, cancellationToken); - return aiSections.Count > 0 ? aiSections : parsed; + return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList(); } - private async Task> ClassifyBlocksIntoSectionsAsync(string parseSource, CancellationToken cancellationToken) + private static StructuredCvProfile BuildStructuredCvFromClassifiedBlocks(List classifiedBlocks) + { + var profile = new StructuredCvProfile(); + var now = DateTimeOffset.UtcNow; + var summary = new List(); + var skills = new HashSet(StringComparer.OrdinalIgnoreCase); + + foreach (var block in classifiedBlocks) + { + switch (block.SectionName) + { + case "Professional Summary": + foreach (var item in SplitClassifierContent(block.Content, 5)) + { + summary.Add(item); + } + ApplyClassifierFieldMetadata(profile, "summary", summary.FirstOrDefault(), block, now); + break; + case "Skills": + foreach (var item in SplitClassifierSkills(block.Content)) + { + skills.Add(item); + } + ApplyClassifierFieldMetadata(profile, "skills", skills.FirstOrDefault(), block, now); + break; + case "Work Experience": + var job = BuildJobFromClassifiedBlock(block); + if (job is not null) + { + var index = profile.Jobs.Count; + profile.Jobs.Add(job); + ApplyClassifierFieldMetadata(profile, $"jobs[{index}].title", job.Title, block, now); + ApplyClassifierFieldMetadata(profile, $"jobs[{index}].company", job.Company, block, now); + ApplyClassifierFieldMetadata(profile, $"jobs[{index}].location", job.Location, block, now); + } + break; + case "Education": + var education = BuildEducationFromClassifiedBlock(block); + if (education is not null) + { + var index = profile.Education.Count; + profile.Education.Add(education); + ApplyClassifierFieldMetadata(profile, $"education[{index}].qualification", education.Qualification, block, now); + ApplyClassifierFieldMetadata(profile, $"education[{index}].institution", education.Institution, block, now); + } + break; + default: + if (!string.IsNullOrWhiteSpace(block.Content)) + { + profile.OtherSections.Add(new StructuredCvOtherSection + { + Title = block.SectionName, + Items = SplitClassifierContent(block.Content, 6) + }); + } + break; + } + } + + profile.Summary = summary.Distinct(StringComparer.OrdinalIgnoreCase).ToList(); + profile.Skills = skills.ToList(); + profile.Sections = BuildSectionsFromClassifiedBlocks(classifiedBlocks); + + var averageConfidence = classifiedBlocks + .Select(block => block.Classification?.Confidence) + .Where(value => value.HasValue) + .Select(value => value!.Value) + .DefaultIfEmpty(0.74) + .Average(); + AnnotateStructuredCv(profile, "classifier", averageConfidence); + return StructuredCvProfileJson.Normalize(profile); + } + + private static StructuredCvJob? BuildJobFromClassifiedBlock(ClassifiedCvBlock block) + { + var classification = block.Classification; + if (classification is null) return null; + + var bullets = classification.Bullets is { Count: > 0 } + ? classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => bullet.Trim()).ToList() + : SplitClassifierContent(block.OriginalBlock, 6); + + var job = new StructuredCvJob + { + Title = NullIfWhitespace(classification.Title), + Company = NullIfWhitespace(classification.Company), + Location = NullIfWhitespace(classification.Location), + Start = NullIfWhitespace(classification.Start), + End = NullIfWhitespace(classification.End), + IsCurrent = string.Equals(classification.End, "Present", StringComparison.OrdinalIgnoreCase) || string.Equals(classification.End, "Current", StringComparison.OrdinalIgnoreCase), + Bullets = bullets, + Skills = SplitClassifierSkills(block.OriginalBlock) + }; + + return StructuredCvProfileJson.Normalize(new StructuredCvProfile { Jobs = new List { job } }).Jobs.FirstOrDefault(); + } + + private static StructuredCvEducation? BuildEducationFromClassifiedBlock(ClassifiedCvBlock block) + { + var classification = block.Classification; + if (classification is null) return null; + + var education = new StructuredCvEducation + { + Qualification = NullIfWhitespace(classification.Title), + Institution = NullIfWhitespace(classification.Company), + Location = NullIfWhitespace(classification.Location), + Start = NullIfWhitespace(classification.Start), + End = NullIfWhitespace(classification.End), + Details = classification.Bullets is { Count: > 0 } + ? classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => bullet.Trim()).ToList() + : SplitClassifierContent(block.OriginalBlock, 5) + }; + + return StructuredCvProfileJson.Normalize(new StructuredCvProfile { Education = new List { education } }).Education.FirstOrDefault(); + } + + private static List SplitClassifierContent(string content, int limit) + { + return content + .Replace("\r\n", "\n") + .Split(new[] { '\n', '•' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries) + .SelectMany(line => line.Contains(". ", StringComparison.Ordinal) + ? Regex.Split(line, @"(?<=[.!?])\s+") + : new[] { line }) + .Select(item => item.Trim().TrimStart('-', '•', '*', '+', ' ')) + .Where(item => item.Length > 2) + .Take(limit) + .Distinct(StringComparer.OrdinalIgnoreCase) + .ToList(); + } + + private static List SplitClassifierSkills(string content) + { + return content + .Replace("\r\n", "\n") + .Split(new[] { '\n', ',', ';', '•' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries) + .Select(item => item.Trim().TrimStart('-', '•', '*', '+', ' ')) + .Where(item => item.Length > 1 && item.Length <= 48 && !LooksLikeDateLikeValue(item) && !item.Contains('@')) + .Distinct(StringComparer.OrdinalIgnoreCase) + .ToList(); + } + + private static bool LooksLikeDateLikeValue(string value) + { + return Regex.IsMatch(value, @"^(?:\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|Present|Current)(?:\s*[-–]\s*(?:\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|Present|Current))?$", RegexOptions.IgnoreCase); + } + + private static void ApplyClassifierFieldMetadata(StructuredCvProfile profile, string key, string? value, ClassifiedCvBlock block, DateTimeOffset now) + { + if (string.IsNullOrWhiteSpace(value)) return; + + profile.Metadata.Fields[key] = new StructuredCvFieldMetadata + { + Confidence = block.Classification?.Confidence ?? 0.74, + Method = "classifier", + SourceSnippet = block.OriginalBlock.Length > 180 ? block.OriginalBlock[..180] : block.OriginalBlock, + SourceBlockId = $"block-{block.Index}", + ReviewState = string.Equals(block.SectionName, "General", StringComparison.OrdinalIgnoreCase) ? "needs-review" : "suggested", + LastUpdatedAtUtc = now, + }; + } + + private async Task> ClassifyBlocksAsync(string parseSource, CancellationToken cancellationToken) { var blocks = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n") .Select(block => block.Trim()) .Where(block => block.Length >= 24) .ToList(); - if (blocks.Count == 0) return new List(); + if (blocks.Count == 0) return new List(); - var sectionBuckets = new List(); - foreach (var block in blocks) + var results = new List(); + for (var index = 0; index < blocks.Count; index++) { + var block = blocks[index]; var classification = await _cvAiClassifier.ClassifyBlockAsync(block, cancellationToken); var sectionName = classification?.Section; if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical)) @@ -931,20 +1125,33 @@ public sealed class ProfileCvController : ControllerBase } if (lines.Count > 0) content = string.Join("\n", lines); } + else if (string.Equals(sectionName, "Education", StringComparison.OrdinalIgnoreCase) && classification is not null) + { + var lines = new List(); + if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}"); + var dateRange = FormatDateRangeForSection(classification.Start, classification.End, false); + var meta = string.Join(" | ", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value))); + if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta); + if (classification.Bullets is not null) + { + lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}")); + } + if (lines.Count > 0) content = string.Join("\n", lines); + } + else if (string.Equals(sectionName, "Skills", StringComparison.OrdinalIgnoreCase)) + { + var items = SplitClassifierSkills(block); + if (items.Count > 0) content = string.Join("\n", items); + } + else if (string.Equals(sectionName, "Professional Summary", StringComparison.OrdinalIgnoreCase) && classification?.Bullets is { Count: > 0 }) + { + content = string.Join("\n", classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}")); + } - var existing = sectionBuckets.FirstOrDefault(section => section.Name == sectionName); - if (existing is null) - { - sectionBuckets.Add(new StructuredCvSection { Name = sectionName, Content = content, WordCount = CountWords(content) }); - } - else - { - existing.Content = $"{existing.Content}\n\n{content}".Trim(); - existing.WordCount = CountWords(existing.Content); - } + results.Add(new ClassifiedCvBlock(index + 1, block, sectionName, content, classification)); } - return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList(); + return results; } private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent)