Improve structured CV fallback extraction

This commit is contained in:
2026-03-28 22:56:55 +01:00
parent 3b6588397e
commit d8ab312f59
3 changed files with 325 additions and 9 deletions
@@ -128,6 +128,53 @@ public sealed class ProfileCvControllerTests
Assert.Contains(structured.Sections, section => section.Name == "Education");
}
// Verifies that Upload still persists a populated structured CV when the AI
// reconstruction returns nothing and the AI JSON extraction returns text that is
// not JSON, i.e. when only the controller's own fallback parsing is available.
[Fact]
public async Task Upload_populates_structured_fields_from_flattened_cv_when_ai_json_is_invalid()
{
// Flattened PDF extraction: a single line where section headings appear as
// space-separated capital letters ("E D U C A T I O N", "A B O U T M E", ...).
var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1.";
var user = new ApplicationUser();
var userManager = CreateUserManager();
userManager.Setup(x => x.GetUserAsync(It.IsAny<ClaimsPrincipal>())).ReturnsAsync(user);
userManager.Setup(x => x.UpdateAsync(user)).ReturnsAsync(IdentityResult.Success);
var aiService = new Mock<ISummarizerService>();
// Text extraction succeeds and hands back the flattened text unchanged.
aiService
.Setup(x => x.ExtractTextAsync(It.IsAny<Stream>(), It.IsAny<string>(), It.IsAny<string?>(), It.IsAny<CancellationToken>()))
.ReturnsAsync(new AiTextExtractionResult(rawExtraction, false, "application/pdf", 1, rawExtraction.Length, "Resume.en.pdf"));
// The "reconstruct" prompt yields an empty string.
aiService
.Setup(x => x.SummarizeSectionAsync(It.Is<string>(instruction => instruction.Contains("Reconstruct this CV text extracted from a PDF", StringComparison.Ordinal)), rawExtraction, 2800, 900))
.ReturnsAsync(string.Empty);
// The "structured JSON" prompt yields a payload that is not JSON.
aiService
.Setup(x => x.SummarizeSectionAsync(It.Is<string>(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), It.IsAny<string>(), 3200, 900))
.ReturnsAsync("not-json");
var controller = new ProfileCvController(userManager.Object, aiService.Object)
{
ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() }
};
// The file bytes are placeholders; the extracted text comes from the mocked ExtractTextAsync.
var bytes = Encoding.UTF8.GetBytes("fake pdf bytes");
var file = new FormFile(new MemoryStream(bytes), 0, bytes.Length, "file", "Resume.en.pdf")
{
Headers = new HeaderDictionary(),
ContentType = "application/pdf"
};
var result = await controller.Upload(file);
Assert.IsType<OkObjectResult>(result);
// The structured profile is read back from the JSON stored on the user object.
var structured = StructuredCvProfileJson.Deserialize(user.ProfileCvStructureJson);
// The raw text contains no "Connor Babbington" string; presumably the name is
// derived from the e-mail local part - confirm against the controller.
Assert.Equal("Connor Babbington", structured.Contact.FullName);
Assert.Equal("connor.babbington@cesnimda.co.uk", structured.Contact.Email);
Assert.Equal("+47 41 33 44 70", structured.Contact.Phone);
Assert.Contains(structured.Summary, item => item.Contains("eight years of experience", StringComparison.OrdinalIgnoreCase));
Assert.Contains(structured.Skills, item => item.Equals("C#", StringComparison.OrdinalIgnoreCase));
Assert.Contains(structured.Interests, item => item.Contains("board games", StringComparison.OrdinalIgnoreCase) || item.Contains("cooking", StringComparison.OrdinalIgnoreCase));
Assert.Contains(structured.Languages, item => item.Name != null && item.Name.Equals("English", StringComparison.OrdinalIgnoreCase));
Assert.Contains(structured.Languages, item => item.Name != null && item.Name.StartsWith("Norwegian", StringComparison.OrdinalIgnoreCase));
// No section named "General" may remain in the persisted profile.
Assert.DoesNotContain(structured.Sections, section => section.Name == "General");
}
[Fact]
public async Task Parse_returns_structured_cv_and_persists_it()
{
@@ -231,7 +231,8 @@ public sealed class ProfileCvController : ControllerBase
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{
var fallbackSections = ParseSections(text)
var parseSource = NormalizeTextForStructuredParsing(text);
var fallbackSections = ParseSections(parseSource)
.Select(section => new StructuredCvSection
{
Name = section.Name,
@@ -240,11 +241,14 @@ public sealed class ProfileCvController : ControllerBase
})
.ToList();
var fallback = StructuredCvProfileJson.FromSections(fallbackSections);
fallback.Contact.FullName ??= GuessFullName(text);
var extracted = await TryExtractStructuredCvAsync(text, cancellationToken);
var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
var heuristicFallback = BuildHeuristicStructuredCv(parseSource, text);
heuristicFallback.Sections = new List<StructuredCvSection>();
var fallback = StructuredCvProfileJson.Merge(heuristicFallback, sectionFallback);
fallback.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email);
var extracted = await TryExtractStructuredCvAsync(parseSource, cancellationToken);
var merged = StructuredCvProfileJson.Merge(extracted, fallback);
merged.Contact.FullName ??= GuessFullName(text);
merged.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(merged.Contact.Email);
return StructuredCvProfileJson.Normalize(merged);
}
@@ -305,12 +309,265 @@ public sealed class ProfileCvController : ControllerBase
return null;
}
/// <summary>
/// Guesses a person's name from the local part of an e-mail address by splitting it
/// on '.', '_' and '-' and capitalising each piece ("connor.babbington@..." becomes
/// "Connor Babbington").
/// </summary>
/// <param name="email">E-mail address to inspect; may be null or blank.</param>
/// <returns>
/// The guessed name, or <c>null</c> when the address has no '@' or fewer than two
/// name-like pieces remain (for example "info@example.com").
/// </returns>
private static string? GuessFullNameFromEmail(string? email)
{
    if (string.IsNullOrWhiteSpace(email)) return null;

    var atIndex = email.IndexOf('@');
    if (atIndex < 0) return null;

    var localPart = email[..atIndex];

    // Drop a "+tag" sub-address so "john.smith+jobs@..." does not yield "Smith+jobs".
    var plusIndex = localPart.IndexOf('+');
    if (plusIndex >= 0) localPart = localPart[..plusIndex];

    var parts = Regex.Split(localPart, @"[._-]+")
        // Keep letters (and inner apostrophes) only: digits such as a birth year are
        // not part of a name, so "doe92" becomes "doe" and a purely numeric piece vanishes.
        .Select(part => new string(part.Where(ch => char.IsLetter(ch) || ch == '\'').ToArray()).Trim('\''))
        .Where(part => part.Length > 0)
        .Select(part => char.ToUpperInvariant(part[0]) + part[1..].ToLowerInvariant())
        .ToList();

    // A single piece ("info", "admin", "connor") is too weak a signal to call a full name.
    return parts.Count >= 2 ? string.Join(" ", parts) : null;
}
/// <summary>
/// Prepares extracted CV text for section parsing. Text that does not look like a
/// flattened extraction is returned with only line endings normalised and outer
/// whitespace trimmed. Flattened text gets "## Heading" lines inserted for known
/// section aliases and line breaks inserted before "+" markers, bullet glyphs and
/// year ranges.
/// </summary>
/// <param name="source">Raw extracted text; may be null or blank.</param>
/// <returns>The normalised text, or an empty string for blank input.</returns>
private static string NormalizeTextForStructuredParsing(string source)
{
    if (string.IsNullOrWhiteSpace(source)) return string.Empty;

    var text = source.Replace("\r\n", "\n").Trim();
    if (!LooksLikeFlattenedCvExtraction(text)) return text;

    // Pass 1: a run of single capital letters separated by whitespace ("S K I L L S")
    // becomes a heading when the whole collapsed run equals an alias (letters only).
    text = Regex.Replace(text, @"\b([A-Z](?:\s+[A-Z]){2,})\b", match =>
    {
        var collapsed = Regex.Replace(match.Value, @"\s+", string.Empty);
        foreach (var alias in SectionAliases)
        {
            var aliasLettersOnly = Regex.Replace(alias.Key, @"[^A-Za-z]", string.Empty);
            if (collapsed.Equals(aliasLettersOnly, StringComparison.OrdinalIgnoreCase))
            {
                return $"\n\n## {alias.Value}\n";
            }
        }

        return match.Value;
    });

    // Pass 2: plain-word aliases, longest first so multi-word aliases win over their
    // sub-words. Matches that sit on a line already starting with "##" are skipped:
    // the previous "(?<!#)" guard alone did not protect them, because the heading
    // text is separated from the hashes by a space, so a later alias could re-match
    // a word inside an inserted heading and split that heading in two.
    foreach (var alias in SectionAliases.OrderByDescending(pair => pair.Key.Length))
    {
        var heading = $"\n\n## {alias.Value}\n";
        var current = text;
        text = Regex.Replace(
            current,
            $@"(?<!#)\b{Regex.Escape(alias.Key)}\b",
            match => StartsHeadingLine(current, match.Index) ? match.Value : heading,
            RegexOptions.IgnoreCase);
    }

    text = Regex.Replace(text, @"\s+\+\s+", "\n+ ");
    text = Regex.Replace(text, @"\s*([•●▪◦])\s*", "\n- ");
    text = Regex.Replace(text, @"\s+(\d{4}\s*[-]\s*(?:\d{4}|Present|Current))\b", "\n$1\n", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"\n{3,}", "\n\n");
    return text.Trim();

    // True when the line containing position `index` begins with "##".
    static bool StartsHeadingLine(string value, int index)
    {
        var lineStart = index == 0 ? 0 : value.LastIndexOf('\n', index - 1) + 1;
        return string.CompareOrdinal(value, lineStart, "##", 0, 2) == 0;
    }
}
/// <summary>
/// Builds a structured CV profile using only regular expressions and the section
/// parser, without any AI call. Contact details, skills and spaced-letter headings
/// are searched in <paramref name="rawSource"/>; section-based fields come from
/// <paramref name="parseSource"/>.
/// </summary>
/// <param name="parseSource">Text prepared for section parsing.</param>
/// <param name="rawSource">The original extracted text.</param>
/// <returns>The profile after <c>StructuredCvProfileJson.Normalize</c>.</returns>
private static StructuredCvProfile BuildHeuristicStructuredCv(string parseSource, string rawSource)
{
    const string emailPattern = @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}";

    var profile = new StructuredCvProfile();
    var normalized = parseSource.Replace("\r\n", "\n").Trim();

    profile.Contact.Email = NullIfWhitespace(Regex.Match(rawSource, emailPattern, RegexOptions.IgnoreCase).Value);

    // The phone pattern also accepts "2012 - 2015"; skip candidates that are nothing
    // but year ranges so an employment/education period is not reported as a phone.
    var phoneCandidate = Regex.Matches(rawSource, @"(?<!\w)(?:\+?\d[\d\s().-]{6,}\d)", RegexOptions.IgnoreCase)
        .Cast<Match>()
        .FirstOrDefault(candidate => !Regex.IsMatch(candidate.Value, @"^(?:(?:19|20)\d{2}\s*-\s*(?:19|20)\d{2}\s*)+$"));
    profile.Contact.Phone = NullIfWhitespace(phoneCandidate?.Value);

    // Blank out e-mail addresses before looking for a website: otherwise the first
    // host-like token is usually the e-mail's own local part or domain
    // ("connor.babbington" in "connor.babbington@cesnimda.co.uk").
    var sourceWithoutEmails = Regex.Replace(rawSource, emailPattern, " ", RegexOptions.IgnoreCase);
    profile.Contact.Website = NullIfWhitespace(Regex.Match(sourceWithoutEmails, @"\b(?:https?://)?(?:www\.)?[A-Z0-9.-]+\.[A-Z]{2,}(?:/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]*)?", RegexOptions.IgnoreCase).Value);

    profile.Contact.LinkedIn = NullIfWhitespace(Regex.Match(rawSource, @"(?:linkedin(?:\.com)?/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]+)", RegexOptions.IgnoreCase).Value);
    profile.Contact.FullName = GuessFullName(rawSource) ?? GuessFullNameFromEmail(profile.Contact.Email);
    profile.Contact.Location = NullIfWhitespace(Regex.Match(rawSource, @"\b[A-Z][a-z]+(?:[\s-][A-Z][a-z]+)*,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b").Value);

    var sections = ParseSections(normalized);

    // Summary: prefer the text following a spaced-letter heading in the raw source
    // ("A B O U T M E", "P R O F I L E", "S U M M A R Y"), up to the next spaced heading.
    var summarySection = sections.FirstOrDefault(section => section.Name == "Professional Summary");
    var flattenedSummary = Regex.Match(
        rawSource,
        @"(?:A\s+B\s+O\s+U\s+T\s+M\s+E|P\s+R\s+O\s+F\s+I\s+L\s+E|S\s+U\s+M\s+M\s+A\s+R\s+Y)\s*(?<body>.*?)(?=(?:I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S|E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|C\s+O\s+N\s+T\s+A\s+C\s+T|$))",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    if (flattenedSummary.Success)
    {
        profile.Summary = SplitSentences(flattenedSummary.Groups["body"].Value, 5);
    }

    // Fall back to the parsed section when the spaced heading was absent OR matched
    // but produced no usable sentence (previously a match with an empty body
    // suppressed this fallback).
    if (profile.Summary.Count == 0 && !string.IsNullOrWhiteSpace(summarySection.Content))
    {
        profile.Summary = SplitSentences(summarySection.Content, 5);
    }

    var interestsSection = sections.FirstOrDefault(section => section.Name == "Interests");
    if (!string.IsNullOrWhiteSpace(interestsSection.Content))
    {
        profile.Interests = SplitListLike(interestsSection.Content);
    }
    else
    {
        var flattenedInterests = Regex.Match(
            rawSource,
            @"I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S\s*(?<body>.*?)(?=(?:E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|C\s+O\s+N\s+T\s+A\s+C\s+T|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|$))",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
        if (flattenedInterests.Success)
        {
            profile.Interests = SplitSentences(flattenedInterests.Groups["body"].Value, 4);
        }
    }

    // Languages: use the dedicated section when present, otherwise scan the whole text.
    var languagesSection = sections.FirstOrDefault(section => section.Name == "Languages");
    profile.Languages = !string.IsNullOrWhiteSpace(languagesSection.Content)
        ? ParseLanguagesHeuristically(languagesSection.Content)
        : ParseLanguagesHeuristically(rawSource);

    // Skills: fixed keyword list, de-duplicated case-insensitively.
    var skills = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (Match match in Regex.Matches(rawSource, @"(?<![A-Za-z0-9])(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)(?![A-Za-z0-9])", RegexOptions.IgnoreCase))
    {
        skills.Add(match.Value.Trim());
    }

    profile.Skills = skills.ToList();

    var educationSection = sections.FirstOrDefault(section => section.Name == "Education");
    if (!string.IsNullOrWhiteSpace(educationSection.Content))
    {
        profile.Education = ParseEducationHeuristically(educationSection.Content);
    }

    var experienceSection = sections.FirstOrDefault(section => section.Name == "Work Experience");
    if (!string.IsNullOrWhiteSpace(experienceSection.Content))
    {
        profile.Jobs = ParseJobsHeuristically(experienceSection.Content);
    }

    // Last resort: when no summary was found, take the first sentences of "General".
    if (profile.OtherSections.Count == 0 && sections.Any(section => section.Name == "General"))
    {
        var general = sections.First(section => section.Name == "General");
        if (!string.IsNullOrWhiteSpace(general.Content) && profile.Summary.Count == 0)
        {
            profile.Summary = SplitSentences(general.Content, 3);
        }
    }

    return StructuredCvProfileJson.Normalize(profile);
}
/// <summary>
/// Splits text into sentences at whitespace that follows '.', '!' or '?', keeps the
/// trimmed sentences longer than 20 characters and returns at most
/// <paramref name="limit"/> of them in their original order.
/// </summary>
/// <param name="content">Text to split.</param>
/// <param name="limit">Maximum number of sentences to return.</param>
private static List<string> SplitSentences(string content, int limit)
{
    var singleSpaced = content.Replace("\r\n", " ");
    var sentences = new List<string>();

    foreach (var fragment in Regex.Split(singleSpaced, @"(?<=[.!?])\s+"))
    {
        if (sentences.Count >= limit)
        {
            break;
        }

        var sentence = fragment.Trim();
        if (sentence.Length > 20)
        {
            sentences.Add(sentence);
        }
    }

    return sentences;
}
/// <summary>
/// Splits list-like text on newlines, commas and semicolons, strips leading bullet
/// markers ('-', '•', '*') and spaces from each item, drops items of one character
/// or less and removes case-insensitive duplicates, keeping the first occurrence.
/// </summary>
/// <param name="content">Text to split.</param>
private static List<string> SplitListLike(string content)
{
    var separators = new[] { '\n', ',', ';' };
    var pieces = content
        .Replace("\r\n", "\n")
        .Split(separators, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var items = new List<string>();
    foreach (var piece in pieces)
    {
        var item = piece.Trim().TrimStart('-', '•', '*', ' ');
        if (item.Length > 1 && seen.Add(item))
        {
            items.Add(item);
        }
    }

    return items;
}
/// <summary>
/// Finds known language names in free text together with an optional proficiency
/// level. The level may follow the name on the same clause ("Norwegian level A2")
/// or directly precede it ("Native English speaker"). One entry per language name
/// is returned (case-insensitive), keeping the first occurrence.
/// </summary>
/// <param name="content">Text to scan.</param>
private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
{
    // "Native speaker" is listed before "Native": with the shorter word first the
    // longer alternative could never match. The optional leading group captures a
    // qualifier written in front of the language name.
    const string pattern =
        @"\b(?:(?<lead>Native|Fluent|Advanced|Intermediate|Beginner)\s+)?" +
        @"(?<name>English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b" +
        @"(?:[^\n.,;:]*?\b(?<level>Native speaker|Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2)\b)?";

    var languages = new List<StructuredCvLanguage>();
    foreach (Match match in Regex.Matches(content, pattern, RegexOptions.IgnoreCase))
    {
        var name = NullIfWhitespace(match.Groups["name"].Value);
        if (name is null) continue;

        // Prefer the level stated after the name; fall back to the leading qualifier.
        var level = NullIfWhitespace(match.Groups["level"].Value)
            ?? NullIfWhitespace(match.Groups["lead"].Value);

        languages.Add(new StructuredCvLanguage { Name = name, Level = level });
    }

    return languages
        .GroupBy(language => language.Name, StringComparer.OrdinalIgnoreCase)
        .Select(group => group.First())
        .ToList();
}
/// <summary>
/// Parses an education section into entries. Blocks are separated by blank lines.
/// Within a block, a line starting with "+ " is the institution, lines starting
/// with "- " are details, the first "YYYY - YYYY|Present|Current" range gives the
/// start and end, and the first remaining line is the qualification.
/// </summary>
/// <param name="content">Content of the education section.</param>
/// <returns>One entry per block that has a qualification or an institution.</returns>
private static List<StructuredCvEducation> ParseEducationHeuristically(string content)
{
    var items = new List<StructuredCvEducation>();

    foreach (var rawBlock in Regex.Split(content, @"\n\s*\n"))
    {
        var block = rawBlock.Trim();
        if (block.Length == 0) continue;

        var lines = block.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        if (lines.Length == 0) continue;

        var dateMatch = Regex.Match(block, @"\b(\d{4})\s*[-]\s*(\d{4}|Present|Current)\b", RegexOptions.IgnoreCase);
        var institutionLine = lines.FirstOrDefault(line => line.StartsWith("+ ", StringComparison.Ordinal))?.TrimStart('+', ' ');

        // The qualification is the first line that is not an institution marker, a
        // detail bullet or a date line. There is deliberately no "use the first line"
        // fallback: that fallback turned "+ Institution" and "2012 - 2015" lines into
        // the qualification and made the guard below unreachable.
        var qualificationLine = lines.FirstOrDefault(line =>
            !line.StartsWith("+ ", StringComparison.Ordinal)
            && !line.StartsWith("- ", StringComparison.Ordinal)
            && !Regex.IsMatch(line, @"^\d{4}\s*[-]"));

        // A block with neither a qualification nor an institution carries no usable entry.
        if (qualificationLine is null && institutionLine is null) continue;

        items.Add(new StructuredCvEducation
        {
            Qualification = TitleCasePreservingAcronyms(qualificationLine),
            Institution = TitleCasePreservingAcronyms(institutionLine),
            Start = dateMatch.Success ? dateMatch.Groups[1].Value : null,
            End = dateMatch.Success ? dateMatch.Groups[2].Value : null,
            Details = lines
                .Where(line => line.StartsWith("- ", StringComparison.Ordinal))
                .Select(line => line[2..].Trim())
                .ToList(),
        });
    }

    return items;
}
/// <summary>
/// Parses a work-experience section into jobs. A job starts with an upper-case
/// title line followed by a "YYYY - YYYY|Present|Current" line; everything up to
/// the next such pair (or the end of the text) is the job body. The first "+ ..."
/// line of the body is the employer; the remaining text is split into bullets.
/// </summary>
/// <param name="content">Content of the work-experience section.</param>
private static List<StructuredCvJob> ParseJobsHeuristically(string content)
{
    const string jobPattern = @"(?<title>[A-Z][A-Z\s/&-]{3,})\s*\n(?<dates>\d{4}\s*[-]\s*(?:\d{4}|Present|Current))(?<body>.*?)(?=(?:\n[A-Z][A-Z\s/&-]{3,}\s*\n\d{4}\s*[-]\s*(?:\d{4}|Present|Current))|\z)";
    const string skillPattern = @"^(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)$";

    var jobs = new List<StructuredCvJob>();
    var unixLineEndings = content.Replace("\r\n", "\n");

    foreach (Match jobMatch in Regex.Matches(unixLineEndings, jobPattern, RegexOptions.Singleline))
    {
        var body = jobMatch.Groups["body"].Value.Trim();

        // Employer: text after the first '+' marker up to the end of that line.
        var employerText = Regex.Match(body, @"\+\s*([^\n]+)").Groups[1].Value;

        // Date range split into start and (optional) end around the hyphen.
        var dateParts = Regex.Split(jobMatch.Groups["dates"].Value, @"\s*[-]\s*");
        var startText = dateParts.FirstOrDefault();
        var endText = dateParts.Skip(1).FirstOrDefault();
        var endsNow = string.Equals(endText, "present", StringComparison.OrdinalIgnoreCase)
            || string.Equals(endText, "current", StringComparison.OrdinalIgnoreCase);

        // Bullets come from the body with every '+' employer line removed.
        var bodyWithoutEmployer = Regex.Replace(body, @"\+\s*[^\n]+", string.Empty);
        var bullets = SplitSentences(bodyWithoutEmployer, 6);

        // Skills: list items inside the bullets that are exactly one known keyword.
        var skills = bullets
            .SelectMany(SplitListLike)
            .Where(item => Regex.IsMatch(item, skillPattern, RegexOptions.IgnoreCase))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();

        jobs.Add(new StructuredCvJob
        {
            Title = TitleCasePreservingAcronyms(jobMatch.Groups["title"].Value),
            Company = NullIfWhitespace(employerText),
            Start = NullIfWhitespace(startText),
            End = NullIfWhitespace(endText),
            IsCurrent = endsNow,
            Bullets = bullets,
            Skills = skills,
        });
    }

    return jobs;
}
/// <summary>
/// Title-cases a space-separated phrase: each word gets an upper-case first letter
/// and lower-case remainder, except words of at most three characters that are
/// entirely upper-case, which are kept as they are.
/// </summary>
/// <param name="value">Phrase to convert; may be null or blank.</param>
/// <returns>The converted phrase joined by single spaces, or <c>null</c> for blank input.</returns>
private static string? TitleCasePreservingAcronyms(string? value)
{
    if (string.IsNullOrWhiteSpace(value)) return null;

    var tokens = value.Trim().Split(' ', StringSplitOptions.RemoveEmptyEntries);
    for (var i = 0; i < tokens.Length; i++)
    {
        var token = tokens[i];
        var isShortAcronym = token.Length <= 3 && token.All(char.IsUpper);
        if (!isShortAcronym)
        {
            tokens[i] = char.ToUpperInvariant(token[0]) + token[1..].ToLowerInvariant();
        }
    }

    return string.Join(" ", tokens);
}
/// <summary>
/// Counts the whitespace-separated words in a string.
/// </summary>
/// <param name="text">Text to inspect; may be null or blank.</param>
/// <returns>The number of words, or 0 for null or blank input.</returns>
private static int CountWords(string? text)
{
    if (string.IsNullOrWhiteSpace(text)) return 0;

    var wordCount = 0;
    var insideWord = false;
    foreach (var ch in text)
    {
        if (char.IsWhiteSpace(ch))
        {
            insideWord = false;
        }
        else if (!insideWord)
        {
            // First non-whitespace character after a gap starts a new word.
            insideWord = true;
            wordCount++;
        }
    }

    return wordCount;
}
/// <summary>
/// Returns the trimmed value, or <c>null</c> when the value is null, empty or
/// consists only of whitespace.
/// </summary>
/// <param name="value">Value to clean up.</param>
private static string? NullIfWhitespace(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value.Trim();
}
private static List<(string Name, string Content)> ParseSections(string source)
{
var lines = source.Replace("\r\n", "\n").Split('\n');
+16 -4
View File
@@ -55,12 +55,24 @@ public static class StructuredCvProfileJson
primary.Contact.Website ??= secondary.Contact.Website;
primary.Contact.LinkedIn ??= secondary.Contact.LinkedIn;
if (primary.Summary.Count == 0) primary.Summary = secondary.Summary;
primary.Summary = primary.Summary.Count == 0
? secondary.Summary
: primary.Summary.Concat(secondary.Summary).Distinct(StringComparer.OrdinalIgnoreCase).ToList();
if (primary.Jobs.Count == 0) primary.Jobs = secondary.Jobs;
if (primary.Education.Count == 0) primary.Education = secondary.Education;
if (primary.Skills.Count == 0) primary.Skills = secondary.Skills;
if (primary.Languages.Count == 0) primary.Languages = secondary.Languages;
if (primary.Interests.Count == 0) primary.Interests = secondary.Interests;
primary.Skills = primary.Skills.Count == 0
? secondary.Skills
: primary.Skills.Concat(secondary.Skills).Distinct(StringComparer.OrdinalIgnoreCase).ToList();
primary.Languages = primary.Languages.Count == 0
? secondary.Languages
: primary.Languages
.Concat(secondary.Languages)
.GroupBy(language => language.Name ?? string.Empty, StringComparer.OrdinalIgnoreCase)
.Select(group => group.First())
.ToList();
primary.Interests = primary.Interests.Count == 0
? secondary.Interests
: primary.Interests.Concat(secondary.Interests).Distinct(StringComparer.OrdinalIgnoreCase).ToList();
if (primary.OtherSections.Count == 0) primary.OtherSections = secondary.OtherSections;
if (primary.Sections.Count == 0) primary.Sections = secondary.Sections;