Improve structured CV fallback extraction

This commit is contained in:
2026-03-28 22:56:55 +01:00
parent 3b6588397e
commit d8ab312f59
3 changed files with 325 additions and 9 deletions
@@ -231,7 +231,8 @@ public sealed class ProfileCvController : ControllerBase
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{
var fallbackSections = ParseSections(text)
var parseSource = NormalizeTextForStructuredParsing(text);
var fallbackSections = ParseSections(parseSource)
.Select(section => new StructuredCvSection
{
Name = section.Name,
@@ -240,11 +241,14 @@ public sealed class ProfileCvController : ControllerBase
})
.ToList();
var fallback = StructuredCvProfileJson.FromSections(fallbackSections);
fallback.Contact.FullName ??= GuessFullName(text);
var extracted = await TryExtractStructuredCvAsync(text, cancellationToken);
var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
var heuristicFallback = BuildHeuristicStructuredCv(parseSource, text);
heuristicFallback.Sections = new List<StructuredCvSection>();
var fallback = StructuredCvProfileJson.Merge(heuristicFallback, sectionFallback);
fallback.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email);
var extracted = await TryExtractStructuredCvAsync(parseSource, cancellationToken);
var merged = StructuredCvProfileJson.Merge(extracted, fallback);
merged.Contact.FullName ??= GuessFullName(text);
merged.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(merged.Contact.Email);
return StructuredCvProfileJson.Normalize(merged);
}
@@ -305,12 +309,265 @@ public sealed class ProfileCvController : ControllerBase
return null;
}
/// <summary>
/// Derives a display name from the local part of an e-mail address,
/// e.g. <c>john.doe@example.com</c> becomes <c>John Doe</c>.
/// </summary>
/// <param name="email">The e-mail address to inspect; may be null or blank.</param>
/// <returns>
/// The capitalised name parts joined by single spaces, or <c>null</c> when the
/// address is missing, has no local part, or yields fewer than two name parts.
/// </returns>
private static string? GuessFullNameFromEmail(string? email)
{
    if (string.IsNullOrWhiteSpace(email)) return null;

    var atIndex = email.IndexOf('@');
    if (atIndex < 0) return null;

    var localPart = email[..atIndex].Trim();

    // Drop a plus-address tag ("john.doe+jobs") so the tag never ends up in the name.
    var plusIndex = localPart.IndexOf('+');
    if (plusIndex >= 0) localPart = localPart[..plusIndex];
    if (string.IsNullOrWhiteSpace(localPart)) return null;

    var digits = new[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
    var parts = Regex.Split(localPart, @"[._-]+")
        // Trailing digits ("doe1990") are account noise, not part of a name.
        .Select(part => part.Trim().TrimEnd(digits))
        // Keep only parts that still contain a letter; this also removes empty and digit-only parts.
        .Where(part => part.Any(char.IsLetter))
        .Select(part => char.ToUpperInvariant(part[0]) + part[1..].ToLowerInvariant())
        .ToList();

    // A single token (e.g. "admin") is too weak a signal to be presented as a full name.
    return parts.Count >= 2 ? string.Join(" ", parts) : null;
}
/// <summary>
/// Prepares CV text for section parsing. Text that does not look like a flattened
/// extraction (as decided by <c>LooksLikeFlattenedCvExtraction</c>) is returned with
/// only line endings normalised and outer whitespace trimmed. Flattened text is
/// re-broken: known section names become <c>## Heading</c> lines, and plus markers,
/// bullet glyphs and year ranges are moved onto their own lines.
/// </summary>
/// <param name="source">Raw CV text; may be null, empty or whitespace.</param>
/// <returns>The normalised text, or an empty string for blank input.</returns>
private static string NormalizeTextForStructuredParsing(string source)
{
    if (string.IsNullOrWhiteSpace(source)) return string.Empty;

    var text = source.Replace("\r\n", "\n").Trim();
    if (!LooksLikeFlattenedCvExtraction(text)) return text;

    // Letter-spaced headings ("E D U C A T I O N"): collapse the letters and compare
    // against each alias key with its non-letters removed.
    text = Regex.Replace(text, @"\b([A-Z](?:\s+[A-Z]){2,})\b", match =>
    {
        var collapsed = Regex.Replace(match.Value, @"\s+", string.Empty);
        foreach (var alias in SectionAliases)
        {
            var aliasLettersOnly = Regex.Replace(alias.Key, @"[^A-Za-z]", string.Empty);
            if (collapsed.Equals(aliasLettersOnly, StringComparison.OrdinalIgnoreCase))
            {
                return $"\n\n## {alias.Value}\n";
            }
        }

        return match.Value;
    });

    // Plain-text aliases are replaced in ONE pass over the text. Replacing alias by
    // alias re-scans headings inserted by an earlier replacement, so an alias key that
    // also occurs inside a heading's text would be replaced again and break the heading.
    // Longest keys come first so the alternation prefers the most specific alias.
    var aliasesByLength = SectionAliases
        .Where(pair => !string.IsNullOrEmpty(pair.Key))
        .OrderByDescending(pair => pair.Key.Length)
        .ToList();
    if (aliasesByLength.Count > 0)
    {
        var alternation = string.Join("|", aliasesByLength.Select(pair => Regex.Escape(pair.Key)));
        var input = text;
        text = Regex.Replace(
            input,
            $@"(?<!#)\b(?:{alternation})\b",
            match =>
            {
                // Leave words alone when they sit on a line that is already a heading
                // (for example one produced by the letter-spaced step above).
                var lineStart = match.Index == 0 ? 0 : input.LastIndexOf('\n', match.Index - 1) + 1;
                if (string.CompareOrdinal(input, lineStart, "## ", 0, 3) == 0) return match.Value;

                foreach (var alias in aliasesByLength)
                {
                    if (string.Equals(alias.Key, match.Value, StringComparison.OrdinalIgnoreCase))
                    {
                        return $"\n\n## {alias.Value}\n";
                    }
                }

                return match.Value;
            },
            RegexOptions.IgnoreCase);
    }

    // Put "+" markers, bullet glyphs and year ranges on their own lines.
    text = Regex.Replace(text, @"\s+\+\s+", "\n+ ");
    text = Regex.Replace(text, @"\s*([•●▪◦])\s*", "\n- ");
    text = Regex.Replace(text, @"\s+(\d{4}\s*[-]\s*(?:\d{4}|Present|Current))\b", "\n$1\n", RegexOptions.IgnoreCase);

    // Collapse runs of blank lines created by the insertions above.
    text = Regex.Replace(text, @"\n{3,}", "\n\n");
    return text.Trim();
}
/// <summary>
/// Builds a best-effort structured profile from CV text using regular expressions
/// and the sections returned by <c>ParseSections</c>.
/// </summary>
/// <param name="parseSource">Text that is split into sections (line endings are normalised here).</param>
/// <param name="rawSource">Text used for contact, skill and letter-spaced heading lookups.</param>
/// <returns>The profile after <c>StructuredCvProfileJson.Normalize</c>.</returns>
private static StructuredCvProfile BuildHeuristicStructuredCv(string parseSource, string rawSource)
{
    var profile = new StructuredCvProfile();
    var normalized = parseSource.Replace("\r\n", "\n").Trim();

    // Contact details: first match of each pattern in the raw text.
    profile.Contact.Email = NullIfWhitespace(Regex.Match(rawSource, @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", RegexOptions.IgnoreCase).Value);
    profile.Contact.Phone = NullIfWhitespace(Regex.Match(rawSource, @"(?<!\w)(?:\+?\d[\d\s().-]{6,}\d)", RegexOptions.IgnoreCase).Value);
    profile.Contact.Website = FindWebsiteOutsideEmail(rawSource);
    profile.Contact.LinkedIn = NullIfWhitespace(Regex.Match(rawSource, @"(?:linkedin(?:\.com)?/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]+)", RegexOptions.IgnoreCase).Value);
    profile.Contact.FullName = GuessFullName(rawSource) ?? GuessFullNameFromEmail(profile.Contact.Email);
    profile.Contact.Location = NullIfWhitespace(Regex.Match(rawSource, @"\b[A-Z][a-z]+(?:[\s-][A-Z][a-z]+)*,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b").Value);

    var sections = ParseSections(normalized);

    // Summary: prefer the body after a letter-spaced heading in the raw text. When that
    // yields no usable sentence, fall back to the parsed "Professional Summary" section.
    var summarySection = sections.FirstOrDefault(section => section.Name == "Professional Summary");
    var flattenedSummary = Regex.Match(
        rawSource,
        @"(?:A\s+B\s+O\s+U\s+T\s+M\s+E|P\s+R\s+O\s+F\s+I\s+L\s+E|S\s+U\s+M\s+M\s+A\s+R\s+Y)\s*(?<body>.*?)(?=(?:I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S|E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|C\s+O\s+N\s+T\s+A\s+C\s+T|$))",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    var summary = flattenedSummary.Success
        ? SplitSentences(flattenedSummary.Groups["body"].Value, 5)
        : new List<string>();
    if (summary.Count == 0 && !string.IsNullOrWhiteSpace(summarySection.Content))
    {
        summary = SplitSentences(summarySection.Content, 5);
    }

    if (summary.Count > 0)
    {
        profile.Summary = summary;
    }

    // Interests: parsed section first, otherwise the letter-spaced heading in the raw text.
    var interestsSection = sections.FirstOrDefault(section => section.Name == "Interests");
    if (!string.IsNullOrWhiteSpace(interestsSection.Content))
    {
        profile.Interests = SplitListLike(interestsSection.Content);
    }
    else
    {
        var flattenedInterests = Regex.Match(
            rawSource,
            @"I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S\s*(?<body>.*?)(?=(?:E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|C\s+O\s+N\s+T\s+A\s+C\s+T|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|$))",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
        if (flattenedInterests.Success)
        {
            profile.Interests = SplitSentences(flattenedInterests.Groups["body"].Value, 4);
        }
    }

    // Languages: scan the dedicated section when present, otherwise the whole raw text.
    var languagesSection = sections.FirstOrDefault(section => section.Name == "Languages");
    profile.Languages = !string.IsNullOrWhiteSpace(languagesSection.Content)
        ? ParseLanguagesHeuristically(languagesSection.Content)
        : ParseLanguagesHeuristically(rawSource);

    // Skills: fixed keyword list, de-duplicated case-insensitively.
    var skills = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (Match match in Regex.Matches(rawSource, @"(?<![A-Za-z0-9])(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)(?![A-Za-z0-9])", RegexOptions.IgnoreCase))
    {
        skills.Add(match.Value.Trim());
    }

    profile.Skills = skills.ToList();

    var educationSection = sections.FirstOrDefault(section => section.Name == "Education");
    if (!string.IsNullOrWhiteSpace(educationSection.Content))
    {
        profile.Education = ParseEducationHeuristically(educationSection.Content);
    }

    var experienceSection = sections.FirstOrDefault(section => section.Name == "Work Experience");
    if (!string.IsNullOrWhiteSpace(experienceSection.Content))
    {
        profile.Jobs = ParseJobsHeuristically(experienceSection.Content);
    }

    // Last resort: use the first sentences of the "General" section as the summary.
    if (profile.OtherSections.Count == 0 && sections.Any(section => section.Name == "General"))
    {
        var general = sections.First(section => section.Name == "General");
        if (!string.IsNullOrWhiteSpace(general.Content) && profile.Summary.Count == 0)
        {
            profile.Summary = SplitSentences(general.Content, 3);
        }
    }

    return StructuredCvProfileJson.Normalize(profile);
}

/// <summary>
/// Returns the first website-like token in <paramref name="rawSource"/> that is neither
/// part of an e-mail address nor a LinkedIn link, or <c>null</c> when there is none.
/// </summary>
private static string? FindWebsiteOutsideEmail(string rawSource)
{
    // Blank out e-mail addresses first: the website pattern otherwise matches fragments
    // such as "john.doe" or "gmail.com" taken from the address itself.
    var withoutEmails = Regex.Replace(rawSource, @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", " ", RegexOptions.IgnoreCase);
    foreach (Match candidate in Regex.Matches(withoutEmails, @"\b(?:https?://)?(?:www\.)?[A-Z0-9.-]+\.[A-Z]{2,}(?:/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]*)?", RegexOptions.IgnoreCase))
    {
        // LinkedIn links are captured separately by the caller.
        if (candidate.Value.Contains("linkedin", StringComparison.OrdinalIgnoreCase)) continue;

        var website = NullIfWhitespace(candidate.Value);
        if (website is not null) return website;
    }

    return null;
}
/// <summary>
/// Splits text into sentences at whitespace that follows '.', '!' or '?', and keeps
/// at most <paramref name="limit"/> trimmed sentences longer than 20 characters.
/// </summary>
/// <param name="content">Text to split; CRLF pairs are treated as spaces.</param>
/// <param name="limit">Maximum number of sentences to return.</param>
/// <returns>The qualifying sentences in their original order.</returns>
private static List<string> SplitSentences(string content, int limit)
{
    var sentences = new List<string>();
    var fragments = Regex.Split(content.Replace("\r\n", " "), @"(?<=[.!?])\s+");

    foreach (var fragment in fragments)
    {
        if (sentences.Count >= limit) break;

        var sentence = fragment.Trim();
        if (sentence.Length > 20)
        {
            sentences.Add(sentence);
        }
    }

    return sentences;
}
/// <summary>
/// Breaks list-like text into items on newlines, commas and semicolons, strips leading
/// bullet markers, drops items of one character or less, and removes case-insensitive
/// duplicates while keeping first occurrences in order.
/// </summary>
/// <param name="content">The text to split.</param>
/// <returns>The distinct items.</returns>
private static List<string> SplitListLike(string content)
{
    var separators = new[] { '\n', ',', ';' };
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var items = new List<string>();

    var candidates = content
        .Replace("\r\n", "\n")
        .Split(separators, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    foreach (var candidate in candidates)
    {
        var item = candidate.Trim().TrimStart('-', '•', '*', ' ');
        if (item.Length > 1 && seen.Add(item))
        {
            items.Add(item);
        }
    }

    return items;
}
/// <summary>
/// Finds known language names in the text and, when a proficiency keyword follows on
/// the same line before any '.', ',', ';' or ':', pairs the language with that level.
/// </summary>
/// <param name="content">Text to scan.</param>
/// <returns>One entry per language name (case-insensitive), keeping the first occurrence.</returns>
private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
{
    var languages = new List<StructuredCvLanguage>();

    // "Native speaker" must precede "Native" in the alternation: alternatives are tried
    // left to right, so listing it last meant the two-word form could never be captured.
    var matches = Regex.Matches(
        content,
        @"\b(English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b(?:[^\n.,;:]*?\b(Native speaker|Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2)\b)?",
        RegexOptions.IgnoreCase);

    foreach (Match match in matches)
    {
        var name = NullIfWhitespace(match.Groups[1].Value);
        var level = NullIfWhitespace(match.Groups[2].Value);
        if (name is null) continue;

        languages.Add(new StructuredCvLanguage { Name = name, Level = level });
    }

    return languages
        .GroupBy(language => language.Name, StringComparer.OrdinalIgnoreCase)
        .Select(group => group.First())
        .ToList();
}
/// <summary>
/// Splits education text into blank-line separated blocks and reads one entry per block:
/// the first "+ " line is the institution, the first line that is neither a "+ " line nor
/// starts with a year range is the qualification, the first year range gives start and
/// end, and "- " lines become details.
/// </summary>
/// <param name="content">Content of the education section.</param>
/// <returns>The parsed entries; blocks with neither a qualification nor an institution are skipped.</returns>
private static List<StructuredCvEducation> ParseEducationHeuristically(string content)
{
    var items = new List<StructuredCvEducation>();
    var blocks = Regex.Split(content, @"\n\s*\n")
        .Select(block => block.Trim())
        .Where(block => block.Length > 0);

    foreach (var block in blocks)
    {
        var lines = block.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
        if (lines.Count == 0) continue;

        var dateMatch = Regex.Match(block, @"\b(\d{4})\s*[-]\s*(\d{4}|Present|Current)\b", RegexOptions.IgnoreCase);
        var institutionLine = lines.FirstOrDefault(line => line.StartsWith("+ ", StringComparison.Ordinal))?.TrimStart('+', ' ');
        var qualificationLine = lines.FirstOrDefault(line => !line.StartsWith("+ ", StringComparison.Ordinal) && !Regex.IsMatch(line, @"^\d{4}\s*[-]"));

        // No fallback to lines[0]: when every line is an institution or date line, that
        // fallback copied "+ Institution" or the date range into Qualification and made
        // this guard unreachable. Such blocks now keep a null qualification, or are
        // skipped when there is no institution either.
        if (qualificationLine is null && institutionLine is null) continue;

        items.Add(new StructuredCvEducation
        {
            Qualification = TitleCasePreservingAcronyms(qualificationLine),
            Institution = TitleCasePreservingAcronyms(institutionLine),
            Start = dateMatch.Success ? dateMatch.Groups[1].Value : null,
            End = dateMatch.Success ? dateMatch.Groups[2].Value : null,
            Details = lines.Where(line => line.StartsWith("- ", StringComparison.Ordinal)).Select(line => line[2..].Trim()).ToList(),
        });
    }

    return items;
}
/// <summary>
/// Extracts jobs from experience text. A job starts with an upper-case title line that
/// is followed by a year-range line; everything up to the next such pair, or the end of
/// the text, is the job body. The first "+ " line in the body is read as the employer and
/// the remaining body is split into at most six sentence bullets.
/// </summary>
/// <param name="content">Content of the work experience section.</param>
/// <returns>The parsed jobs in document order.</returns>
private static List<StructuredCvJob> ParseJobsHeuristically(string content)
{
    const string skillPattern = @"^(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)$";
    var jobPattern = new Regex(@"(?<title>[A-Z][A-Z\s/&-]{3,})\s*\n(?<dates>\d{4}\s*[-]\s*(?:\d{4}|Present|Current))(?<body>.*?)(?=(?:\n[A-Z][A-Z\s/&-]{3,}\s*\n\d{4}\s*[-]\s*(?:\d{4}|Present|Current))|\z)", RegexOptions.Singleline);

    var results = new List<StructuredCvJob>();
    foreach (Match jobMatch in jobPattern.Matches(content.Replace("\r\n", "\n")))
    {
        var bodyText = jobMatch.Groups["body"].Value.Trim();
        var employerName = NullIfWhitespace(Regex.Match(bodyText, @"\+\s*([^\n]+)").Groups[1].Value);

        // Split "2019 - Present" into its start and end parts.
        var dateParts = Regex.Split(jobMatch.Groups["dates"].Value, @"\s*[-]\s*");
        var startText = dateParts.FirstOrDefault();
        var endText = dateParts.Skip(1).FirstOrDefault();
        var endsNow = string.Equals(endText, "present", StringComparison.OrdinalIgnoreCase)
            || string.Equals(endText, "current", StringComparison.OrdinalIgnoreCase);

        // Employer lines are removed before the body is split into bullets.
        var bodyWithoutEmployer = Regex.Replace(bodyText, @"\+\s*[^\n]+", string.Empty);
        var bulletList = SplitSentences(bodyWithoutEmployer, 6);

        // A skill is a bullet fragment that is exactly one of the known keywords.
        var skillList = bulletList
            .SelectMany(SplitListLike)
            .Where(item => Regex.IsMatch(item, skillPattern, RegexOptions.IgnoreCase))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();

        results.Add(new StructuredCvJob
        {
            Title = TitleCasePreservingAcronyms(jobMatch.Groups["title"].Value),
            Company = employerName,
            Start = NullIfWhitespace(startText),
            End = NullIfWhitespace(endText),
            IsCurrent = endsNow,
            Bullets = bulletList,
            Skills = skillList,
        });
    }

    return results;
}
/// <summary>
/// Title-cases each space-separated word, leaving words of up to three characters that
/// consist only of upper-case letters (e.g. "IT") unchanged.
/// </summary>
/// <param name="value">Text to convert; may be null or blank.</param>
/// <returns>The converted text joined by single spaces, or <c>null</c> for blank input.</returns>
private static string? TitleCasePreservingAcronyms(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    var tokens = value.Trim().Split(' ', StringSplitOptions.RemoveEmptyEntries);
    for (var index = 0; index < tokens.Length; index++)
    {
        var token = tokens[index];
        var isShortAcronym = token.Length <= 3 && token.All(char.IsUpper);
        if (!isShortAcronym)
        {
            tokens[index] = char.ToUpperInvariant(token[0]) + token[1..].ToLowerInvariant();
        }
    }

    return string.Join(" ", tokens);
}
/// <summary>Counts the whitespace-separated words in the text.</summary>
/// <param name="text">Text to inspect; may be null or blank.</param>
/// <returns>The number of words, or 0 for null or blank input.</returns>
private static int CountWords(string? text) =>
    string.IsNullOrWhiteSpace(text)
        ? 0
        // A null separator splits on whitespace; empty entries are removed, so
        // leading and trailing whitespace never add to the count.
        : text.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries).Length;
/// <summary>Trims the value, mapping null, empty or whitespace-only input to <c>null</c>.</summary>
/// <param name="value">The text to clean.</param>
/// <returns>The trimmed text, or <c>null</c> when there is nothing but whitespace.</returns>
private static string? NullIfWhitespace(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value.Trim();
}
private static List<(string Name, string Content)> ParseSections(string source)
{
var lines = source.Replace("\r\n", "\n").Split('\n');