diff --git a/JobTrackerApi.Tests/ProfileCvControllerTests.cs b/JobTrackerApi.Tests/ProfileCvControllerTests.cs index 76ba7a6..095d596 100644 --- a/JobTrackerApi.Tests/ProfileCvControllerTests.cs +++ b/JobTrackerApi.Tests/ProfileCvControllerTests.cs @@ -128,6 +128,53 @@ public sealed class ProfileCvControllerTests Assert.Contains(structured.Sections, section => section.Name == "Education"); } + [Fact] + public async Task Upload_populates_structured_fields_from_flattened_cv_when_ai_json_is_invalid() + { + var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1."; + + var user = new ApplicationUser(); + var userManager = CreateUserManager(); + userManager.Setup(x => x.GetUserAsync(It.IsAny())).ReturnsAsync(user); + userManager.Setup(x => x.UpdateAsync(user)).ReturnsAsync(IdentityResult.Success); + var aiService = new Mock(); + aiService + .Setup(x => x.ExtractTextAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) + .ReturnsAsync(new AiTextExtractionResult(rawExtraction, false, "application/pdf", 1, rawExtraction.Length, "Resume.en.pdf")); + aiService + .Setup(x => x.SummarizeSectionAsync(It.Is(instruction => instruction.Contains("Reconstruct this CV text extracted from a PDF", StringComparison.Ordinal)), rawExtraction, 2800, 900)) + .ReturnsAsync(string.Empty); + aiService + .Setup(x => x.SummarizeSectionAsync(It.Is(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), It.IsAny(), 3200, 900)) + .ReturnsAsync("not-json"); + + var controller = new ProfileCvController(userManager.Object, aiService.Object) + { + ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() } + }; + + var bytes = Encoding.UTF8.GetBytes("fake pdf bytes"); + var file = new FormFile(new MemoryStream(bytes), 0, bytes.Length, "file", "Resume.en.pdf") + { + Headers = new HeaderDictionary(), + ContentType = "application/pdf" + }; + + var result = await controller.Upload(file); + + Assert.IsType(result); + var structured = StructuredCvProfileJson.Deserialize(user.ProfileCvStructureJson); + Assert.Equal("Connor Babbington", structured.Contact.FullName); + Assert.Equal("connor.babbington@cesnimda.co.uk", structured.Contact.Email); + Assert.Equal("+47 41 33 44 70", structured.Contact.Phone); + Assert.Contains(structured.Summary, item => item.Contains("eight years of experience", StringComparison.OrdinalIgnoreCase)); + Assert.Contains(structured.Skills, item => item.Equals("C#", StringComparison.OrdinalIgnoreCase)); + Assert.Contains(structured.Interests, item => item.Contains("board games", StringComparison.OrdinalIgnoreCase) || item.Contains("cooking", StringComparison.OrdinalIgnoreCase)); + Assert.Contains(structured.Languages, item => item.Name != null && item.Name.Equals("English", StringComparison.OrdinalIgnoreCase)); + Assert.Contains(structured.Languages, item => item.Name != null && item.Name.StartsWith("Norwegian", StringComparison.OrdinalIgnoreCase)); + Assert.DoesNotContain(structured.Sections, section => section.Name == "General"); + } + [Fact] public async Task Parse_returns_structured_cv_and_persists_it() { diff --git a/JobTrackerApi/Controllers/ProfileCvController.cs b/JobTrackerApi/Controllers/ProfileCvController.cs index 05335ce..eaba01d 100644 --- a/JobTrackerApi/Controllers/ProfileCvController.cs +++ b/JobTrackerApi/Controllers/ProfileCvController.cs @@ -231,7 +231,8 @@ public sealed class ProfileCvController : ControllerBase private async Task BuildStructuredCvAsync(string text, CancellationToken cancellationToken) { - var fallbackSections = ParseSections(text) + var parseSource = NormalizeTextForStructuredParsing(text); + var fallbackSections = ParseSections(parseSource) .Select(section => new StructuredCvSection { Name = section.Name, @@ -240,11 +241,14 @@ public sealed class ProfileCvController : ControllerBase }) .ToList(); - var fallback = StructuredCvProfileJson.FromSections(fallbackSections); - fallback.Contact.FullName ??= GuessFullName(text); - var extracted = await TryExtractStructuredCvAsync(text, cancellationToken); + var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections); + var heuristicFallback = BuildHeuristicStructuredCv(parseSource, text); + heuristicFallback.Sections = new List(); + var fallback = StructuredCvProfileJson.Merge(heuristicFallback, sectionFallback); + fallback.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email); + var extracted = await TryExtractStructuredCvAsync(parseSource, cancellationToken); var merged = StructuredCvProfileJson.Merge(extracted, fallback); - merged.Contact.FullName ??= GuessFullName(text); + merged.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(merged.Contact.Email); return StructuredCvProfileJson.Normalize(merged); } @@ -305,12 +309,265 @@ public sealed class ProfileCvController : ControllerBase return null; } + private static string? GuessFullNameFromEmail(string? email) + { + if (string.IsNullOrWhiteSpace(email) || !email.Contains('@')) return null; + var localPart = email[..email.IndexOf('@')].Trim(); + if (string.IsNullOrWhiteSpace(localPart)) return null; + var parts = Regex.Split(localPart, @"[._-]+") + .Select(part => part.Trim()) + .Where(part => part.Length > 0) + .Select(part => char.ToUpperInvariant(part[0]) + part[1..].ToLowerInvariant()) + .ToList(); + return parts.Count >= 2 ? string.Join(" ", parts) : null; + } + + private static string NormalizeTextForStructuredParsing(string source) + { + if (string.IsNullOrWhiteSpace(source)) return string.Empty; + + var text = source.Replace("\r\n", "\n").Trim(); + if (!LooksLikeFlattenedCvExtraction(text)) return text; + + text = Regex.Replace(text, @"\b([A-Z](?:\s+[A-Z]){2,})\b", match => + { + var collapsed = Regex.Replace(match.Value, @"\s+", string.Empty); + foreach (var alias in SectionAliases) + { + var aliasLettersOnly = Regex.Replace(alias.Key, @"[^A-Za-z]", string.Empty); + if (collapsed.Equals(aliasLettersOnly, StringComparison.OrdinalIgnoreCase)) + { + return $"\n\n## {alias.Value}\n"; + } + } + + return match.Value; + }); + + foreach (var alias in SectionAliases.OrderByDescending(pair => pair.Key.Length)) + { + text = Regex.Replace( + text, + $@"(? section.Name == "Professional Summary"); + var flattenedSummary = Regex.Match( + rawSource, + @"(?:A\s+B\s+O\s+U\s+T\s+M\s+E|P\s+R\s+O\s+F\s+I\s+L\s+E|S\s+U\s+M\s+M\s+A\s+R\s+Y)\s*(?.*?)(?=(?:I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S|E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|C\s+O\s+N\s+T\s+A\s+C\s+T|$))", + RegexOptions.IgnoreCase | RegexOptions.Singleline); + if (flattenedSummary.Success) + { + profile.Summary = SplitSentences(flattenedSummary.Groups["body"].Value, 5); + } + else if (!string.IsNullOrWhiteSpace(summarySection.Content)) + { + profile.Summary = SplitSentences(summarySection.Content, 5); + } + + var interestsSection = sections.FirstOrDefault(section => section.Name == "Interests"); + if (!string.IsNullOrWhiteSpace(interestsSection.Content)) + { + profile.Interests = SplitListLike(interestsSection.Content); + } + else + { + var flattenedInterests = Regex.Match( + rawSource, + @"I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S\s*(?.*?)(?=(?:E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|C\s+O\s+N\s+T\s+A\s+C\s+T|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|$))", + RegexOptions.IgnoreCase | RegexOptions.Singleline); + if (flattenedInterests.Success) + { + profile.Interests = SplitSentences(flattenedInterests.Groups["body"].Value, 4); + } + } + + var languagesSection = sections.FirstOrDefault(section => section.Name == "Languages"); + if (!string.IsNullOrWhiteSpace(languagesSection.Content)) + { + profile.Languages = ParseLanguagesHeuristically(languagesSection.Content); + } + else + { + profile.Languages = ParseLanguagesHeuristically(rawSource); + } + + var skills = new HashSet(StringComparer.OrdinalIgnoreCase); + foreach (Match match in Regex.Matches(rawSource, @"(? section.Name == "Education"); + if (!string.IsNullOrWhiteSpace(educationSection.Content)) + { + profile.Education = ParseEducationHeuristically(educationSection.Content); + } + + var experienceSection = sections.FirstOrDefault(section => section.Name == "Work Experience"); + if (!string.IsNullOrWhiteSpace(experienceSection.Content)) + { + profile.Jobs = ParseJobsHeuristically(experienceSection.Content); + } + + if (profile.OtherSections.Count == 0 && sections.Any(section => section.Name == "General")) + { + var general = sections.First(section => section.Name == "General"); + if (!string.IsNullOrWhiteSpace(general.Content) && profile.Summary.Count == 0) + { + profile.Summary = SplitSentences(general.Content, 3); + } + } + + return StructuredCvProfileJson.Normalize(profile); + } + + private static List SplitSentences(string content, int limit) + { + return Regex.Split(content.Replace("\r\n", " "), @"(?<=[.!?])\s+") + .Select(value => value.Trim()) + .Where(value => value.Length > 20) + .Take(limit) + .ToList(); + } + + private static List SplitListLike(string content) + { + return content + .Replace("\r\n", "\n") + .Split(new[] { '\n', ',', ';' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries) + .Select(item => item.Trim().TrimStart('-', '•', '*', ' ')) + .Where(item => item.Length > 1) + .Distinct(StringComparer.OrdinalIgnoreCase) + .ToList(); + } + + private static List ParseLanguagesHeuristically(string content) + { + var languages = new List(); + foreach (Match match in Regex.Matches(content, @"\b(English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b(?:[^\n.,;:]*?\b(Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2|Native speaker)\b)?", RegexOptions.IgnoreCase)) + { + var name = NullIfWhitespace(match.Groups[1].Value); + var level = NullIfWhitespace(match.Groups[2].Value); + if (name is null) continue; + languages.Add(new StructuredCvLanguage { Name = name, Level = level }); + } + + return languages + .GroupBy(language => language.Name, StringComparer.OrdinalIgnoreCase) + .Select(group => group.First()) + .ToList(); + } + + private static List ParseEducationHeuristically(string content) + { + var blocks = Regex.Split(content, @"\n\s*\n") + .Select(block => block.Trim()) + .Where(block => block.Length > 0) + .ToList(); + + var items = new List(); + foreach (var block in blocks) + { + var lines = block.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList(); + if (lines.Count == 0) continue; + + var dateMatch = Regex.Match(block, @"\b(\d{4})\s*[-–]\s*(\d{4}|Present|Current)\b", RegexOptions.IgnoreCase); + var institutionLine = lines.FirstOrDefault(line => line.StartsWith("+ ", StringComparison.Ordinal))?.TrimStart('+', ' '); + var qualificationLine = lines.FirstOrDefault(line => !line.StartsWith("+ ", StringComparison.Ordinal) && !Regex.IsMatch(line, @"^\d{4}\s*[-–]")); + if (qualificationLine is null && lines.Count > 0) qualificationLine = lines[0]; + + if (qualificationLine is null && institutionLine is null) continue; + items.Add(new StructuredCvEducation + { + Qualification = TitleCasePreservingAcronyms(qualificationLine), + Institution = TitleCasePreservingAcronyms(institutionLine), + Start = dateMatch.Success ? dateMatch.Groups[1].Value : null, + End = dateMatch.Success ? dateMatch.Groups[2].Value : null, + Details = lines.Where(line => line.StartsWith("- ", StringComparison.Ordinal)).Select(line => line[2..].Trim()).ToList(), + }); + } + + return items; + } + + private static List ParseJobsHeuristically(string content) + { + var normalized = content.Replace("\r\n", "\n"); + var pattern = new Regex(@"(?[A-Z][A-Z\s/&-]{3,})\s*\n(?<dates>\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))(?<body>.*?)(?=(?:\n[A-Z][A-Z\s/&-]{3,}\s*\n\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))|\z)", RegexOptions.Singleline); + var jobs = new List<StructuredCvJob>(); + + foreach (Match match in pattern.Matches(normalized)) + { + var body = match.Groups["body"].Value.Trim(); + var employer = NullIfWhitespace(Regex.Match(body, @"\+\s*([^\n]+)").Groups[1].Value); + var dates = Regex.Split(match.Groups["dates"].Value, @"\s*[-–]\s*"); + var bullets = SplitSentences(Regex.Replace(body, @"\+\s*[^\n]+", string.Empty), 6); + + jobs.Add(new StructuredCvJob + { + Title = TitleCasePreservingAcronyms(match.Groups["title"].Value), + Company = employer, + Start = NullIfWhitespace(dates.FirstOrDefault()), + End = NullIfWhitespace(dates.Skip(1).FirstOrDefault()), + IsCurrent = string.Equals(dates.Skip(1).FirstOrDefault(), "present", StringComparison.OrdinalIgnoreCase) || string.Equals(dates.Skip(1).FirstOrDefault(), "current", StringComparison.OrdinalIgnoreCase), + Bullets = bullets, + Skills = bullets.SelectMany(SplitListLike).Where(item => Regex.IsMatch(item, @"^(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)$", RegexOptions.IgnoreCase)).Distinct(StringComparer.OrdinalIgnoreCase).ToList(), + }); + } + + return jobs; + } + + private static string? TitleCasePreservingAcronyms(string? value) + { + if (string.IsNullOrWhiteSpace(value)) return null; + + var words = value.Trim() + .Split(' ', StringSplitOptions.RemoveEmptyEntries) + .Select(word => word.Length <= 3 && word.All(char.IsUpper) + ? word + : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant()) + .ToArray(); + + return string.Join(" ", words); + } + private static int CountWords(string? text) { if (string.IsNullOrWhiteSpace(text)) return 0; return text.Trim().Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries).Length; } + private static string? NullIfWhitespace(string? value) + { + return string.IsNullOrWhiteSpace(value) ? null : value.Trim(); + } + private static List<(string Name, string Content)> ParseSections(string source) { var lines = source.Replace("\r\n", "\n").Split('\n'); diff --git a/Models/StructuredCvProfileJson.cs b/Models/StructuredCvProfileJson.cs index 3d8fcee..d5536f5 100644 --- a/Models/StructuredCvProfileJson.cs +++ b/Models/StructuredCvProfileJson.cs @@ -55,12 +55,24 @@ public static class StructuredCvProfileJson primary.Contact.Website ??= secondary.Contact.Website; primary.Contact.LinkedIn ??= secondary.Contact.LinkedIn; - if (primary.Summary.Count == 0) primary.Summary = secondary.Summary; + primary.Summary = primary.Summary.Count == 0 + ? secondary.Summary + : primary.Summary.Concat(secondary.Summary).Distinct(StringComparer.OrdinalIgnoreCase).ToList(); if (primary.Jobs.Count == 0) primary.Jobs = secondary.Jobs; if (primary.Education.Count == 0) primary.Education = secondary.Education; - if (primary.Skills.Count == 0) primary.Skills = secondary.Skills; - if (primary.Languages.Count == 0) primary.Languages = secondary.Languages; - if (primary.Interests.Count == 0) primary.Interests = secondary.Interests; + primary.Skills = primary.Skills.Count == 0 + ? secondary.Skills + : primary.Skills.Concat(secondary.Skills).Distinct(StringComparer.OrdinalIgnoreCase).ToList(); + primary.Languages = primary.Languages.Count == 0 + ? secondary.Languages + : primary.Languages + .Concat(secondary.Languages) + .GroupBy(language => language.Name ?? string.Empty, StringComparer.OrdinalIgnoreCase) + .Select(group => group.First()) + .ToList(); + primary.Interests = primary.Interests.Count == 0 + ? secondary.Interests + : primary.Interests.Concat(secondary.Interests).Distinct(StringComparer.OrdinalIgnoreCase).ToList(); if (primary.OtherSections.Count == 0) primary.OtherSections = secondary.OtherSections; if (primary.Sections.Count == 0) primary.Sections = secondary.Sections;