Improve structured CV fallback extraction
This commit is contained in:
@@ -231,7 +231,8 @@ public sealed class ProfileCvController : ControllerBase
|
||||
|
||||
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
|
||||
{
|
||||
var fallbackSections = ParseSections(text)
|
||||
var parseSource = NormalizeTextForStructuredParsing(text);
|
||||
var fallbackSections = ParseSections(parseSource)
|
||||
.Select(section => new StructuredCvSection
|
||||
{
|
||||
Name = section.Name,
|
||||
@@ -240,11 +241,14 @@ public sealed class ProfileCvController : ControllerBase
|
||||
})
|
||||
.ToList();
|
||||
|
||||
var fallback = StructuredCvProfileJson.FromSections(fallbackSections);
|
||||
fallback.Contact.FullName ??= GuessFullName(text);
|
||||
var extracted = await TryExtractStructuredCvAsync(text, cancellationToken);
|
||||
var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
|
||||
var heuristicFallback = BuildHeuristicStructuredCv(parseSource, text);
|
||||
heuristicFallback.Sections = new List<StructuredCvSection>();
|
||||
var fallback = StructuredCvProfileJson.Merge(heuristicFallback, sectionFallback);
|
||||
fallback.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email);
|
||||
var extracted = await TryExtractStructuredCvAsync(parseSource, cancellationToken);
|
||||
var merged = StructuredCvProfileJson.Merge(extracted, fallback);
|
||||
merged.Contact.FullName ??= GuessFullName(text);
|
||||
merged.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(merged.Contact.Email);
|
||||
return StructuredCvProfileJson.Normalize(merged);
|
||||
}
|
||||
|
||||
@@ -305,12 +309,265 @@ public sealed class ProfileCvController : ControllerBase
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <summary>
/// Derives a display name from the local part of an e-mail address, e.g.
/// <c>john.doe@example.com</c> becomes <c>John Doe</c>.
/// </summary>
/// <param name="email">The e-mail address to inspect; may be null or blank.</param>
/// <returns>
/// The capitalised name parts joined by single spaces, or <c>null</c> when the
/// address is missing, contains no <c>@</c>, or yields fewer than two name parts.
/// </returns>
private static string? GuessFullNameFromEmail(string? email)
{
    if (string.IsNullOrWhiteSpace(email))
    {
        return null;
    }

    var atIndex = email.IndexOf('@');
    if (atIndex < 0)
    {
        return null;
    }

    var localPart = email[..atIndex].Trim();

    // A "+tag" sub-address (john.doe+jobs@...) is a mailbox routing hint, not part of the name.
    var plusIndex = localPart.IndexOf('+');
    if (plusIndex >= 0)
    {
        localPart = localPart[..plusIndex];
    }

    if (string.IsNullOrWhiteSpace(localPart))
    {
        return null;
    }

    var parts = Regex.Split(localPart, @"[._-]+")
        // Digits (birth years, disambiguating counters) would otherwise end up inside the name.
        .Select(part => Regex.Replace(part, @"\d+", string.Empty).Trim())
        .Where(part => part.Length > 0)
        .Select(part => char.ToUpperInvariant(part[0]) + part[1..].ToLowerInvariant())
        .ToList();

    // A single token ("admin", "jdoe") is too ambiguous to present as a full name.
    return parts.Count >= 2 ? string.Join(" ", parts) : null;
}
|
||||
|
||||
/// <summary>
/// Prepares extracted CV text for section parsing. Text that
/// <c>LooksLikeFlattenedCvExtraction</c> accepts is rewritten so that section
/// headings become <c>## Heading</c> lines and bullets, <c>+</c> markers and
/// year ranges start on their own lines; any other text is returned with only
/// line endings normalised and outer whitespace trimmed.
/// </summary>
/// <param name="source">Raw extracted CV text.</param>
/// <returns>The normalised text, or an empty string when <paramref name="source"/> is null or blank.</returns>
private static string NormalizeTextForStructuredParsing(string source)
{
    if (string.IsNullOrWhiteSpace(source))
    {
        return string.Empty;
    }

    var text = source.Replace("\r\n", "\n").Trim();
    if (!LooksLikeFlattenedCvExtraction(text))
    {
        return text;
    }

    // Letter-spaced headings such as "E D U C A T I O N": collapse the letters and, when they
    // spell a known alias (ignoring non-letters in the alias key), emit a heading line.
    text = Regex.Replace(text, @"\b([A-Z](?:\s+[A-Z]){2,})\b", match =>
    {
        var collapsed = Regex.Replace(match.Value, @"\s+", string.Empty);
        foreach (var alias in SectionAliases)
        {
            var aliasLettersOnly = Regex.Replace(alias.Key, @"[^A-Za-z]", string.Empty);
            if (collapsed.Equals(aliasLettersOnly, StringComparison.OrdinalIgnoreCase))
            {
                return $"\n\n## {alias.Value}\n";
            }
        }

        return match.Value;
    });

    // Longest alias keys first so a multi-word alias wins over a shorter alias it contains.
    foreach (var alias in SectionAliases.OrderByDescending(pair => pair.Key.Length))
    {
        var current = text;
        var heading = $"\n\n## {alias.Value}\n";

        // The (?<!#) lookbehind alone cannot protect an inserted heading, because the heading
        // text follows "## " (a space, not '#'). Without the explicit heading-line check a
        // shorter alias would match inside a heading produced by an earlier pass and split it.
        text = Regex.Replace(
            current,
            $@"(?<!#)\b{Regex.Escape(alias.Key)}\b",
            match => IsOnHeadingLine(current, match.Index) ? match.Value : heading,
            RegexOptions.IgnoreCase);
    }

    text = Regex.Replace(text, @"\s+\+\s+", "\n+ ");
    text = Regex.Replace(text, @"\s*([•●▪◦])\s*", "\n- ");
    text = Regex.Replace(text, @"\s+(\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))\b", "\n$1\n", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"\n{3,}", "\n\n");

    return text.Trim();

    // True when the line containing position `index` starts with the "## " heading marker.
    static bool IsOnHeadingLine(string input, int index)
    {
        var lineStart = index == 0 ? 0 : input.LastIndexOf('\n', index - 1) + 1;
        return string.CompareOrdinal(input, lineStart, "## ", 0, 3) == 0;
    }
}
|
||||
|
||||
/// <summary>
/// Builds a structured CV profile using only regular expressions and the
/// section parser. Contact details, skills and the letter-spaced
/// summary/interests fallbacks are read from <paramref name="rawSource"/>;
/// sections are parsed from <paramref name="parseSource"/>.
/// </summary>
/// <param name="parseSource">Text that is split into named sections via <c>ParseSections</c>.</param>
/// <param name="rawSource">The raw CV text used for contact, skill and letter-spaced heading matching.</param>
/// <returns>The populated profile after <c>StructuredCvProfileJson.Normalize</c>.</returns>
private static StructuredCvProfile BuildHeuristicStructuredCv(string parseSource, string rawSource)
{
    var profile = new StructuredCvProfile();
    var normalized = parseSource.Replace("\r\n", "\n").Trim();

    profile.Contact.Email = NullIfWhitespace(Regex.Match(rawSource, @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", RegexOptions.IgnoreCase).Value);

    // The negative lookahead skips hyphenated year ranges ("2015 - 2019", "2020-Present"),
    // which otherwise satisfy the digit/separator phone pattern.
    profile.Contact.Phone = NullIfWhitespace(Regex.Match(rawSource, @"(?<!\w)(?!(?:19|20)\d{2}\s*[-–]\s*(?:(?:19|20)\d{2}|Present|Current)\b)(?:\+?\d[\d\s().-]{6,}\d)", RegexOptions.IgnoreCase).Value);

    // The lookbehind and lookahead keep the host from being carved out of an e-mail address:
    // without them "john.doe@gmail.com" yields the website "john.doe".
    profile.Contact.Website = NullIfWhitespace(Regex.Match(rawSource, @"(?<![\w@.-])(?:https?://)?(?:www\.)?[A-Z0-9.-]+\.[A-Z]{2,}(?![\w.-]*@)(?:/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]*)?", RegexOptions.IgnoreCase).Value);

    profile.Contact.LinkedIn = NullIfWhitespace(Regex.Match(rawSource, @"(?:linkedin(?:\.com)?/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]+)", RegexOptions.IgnoreCase).Value);
    profile.Contact.FullName = GuessFullName(rawSource) ?? GuessFullNameFromEmail(profile.Contact.Email);
    profile.Contact.Location = NullIfWhitespace(Regex.Match(rawSource, @"\b[A-Z][a-z]+(?:[\s-][A-Z][a-z]+)*,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b").Value);

    var sections = ParseSections(normalized);

    // Summary: a letter-spaced heading in the raw text takes precedence over the parsed section.
    var summarySection = sections.FirstOrDefault(section => section.Name == "Professional Summary");
    var flattenedSummary = Regex.Match(
        rawSource,
        @"(?:A\s+B\s+O\s+U\s+T\s+M\s+E|P\s+R\s+O\s+F\s+I\s+L\s+E|S\s+U\s+M\s+M\s+A\s+R\s+Y)\s*(?<body>.*?)(?=(?:I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S|E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|C\s+O\s+N\s+T\s+A\s+C\s+T|$))",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    if (flattenedSummary.Success)
    {
        profile.Summary = SplitSentences(flattenedSummary.Groups["body"].Value, 5);
    }
    else if (!string.IsNullOrWhiteSpace(summarySection.Content))
    {
        profile.Summary = SplitSentences(summarySection.Content, 5);
    }

    // Interests: the parsed section wins; the letter-spaced raw heading is the fallback.
    var interestsSection = sections.FirstOrDefault(section => section.Name == "Interests");
    if (!string.IsNullOrWhiteSpace(interestsSection.Content))
    {
        profile.Interests = SplitListLike(interestsSection.Content);
    }
    else
    {
        var flattenedInterests = Regex.Match(
            rawSource,
            @"I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S\s*(?<body>.*?)(?=(?:E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|C\s+O\s+N\s+T\s+A\s+C\s+T|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|$))",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
        if (flattenedInterests.Success)
        {
            profile.Interests = SplitSentences(flattenedInterests.Groups["body"].Value, 4);
        }
    }

    // Languages: scan the dedicated section when present, otherwise the whole raw text.
    var languagesSection = sections.FirstOrDefault(section => section.Name == "Languages");
    if (!string.IsNullOrWhiteSpace(languagesSection.Content))
    {
        profile.Languages = ParseLanguagesHeuristically(languagesSection.Content);
    }
    else
    {
        profile.Languages = ParseLanguagesHeuristically(rawSource);
    }

    // Skills: fixed keyword list, de-duplicated case-insensitively; the first spelling seen is kept.
    var skills = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (Match match in Regex.Matches(rawSource, @"(?<![A-Za-z0-9])(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)(?![A-Za-z0-9])", RegexOptions.IgnoreCase))
    {
        skills.Add(match.Value.Trim());
    }
    profile.Skills = skills.ToList();

    var educationSection = sections.FirstOrDefault(section => section.Name == "Education");
    if (!string.IsNullOrWhiteSpace(educationSection.Content))
    {
        profile.Education = ParseEducationHeuristically(educationSection.Content);
    }

    var experienceSection = sections.FirstOrDefault(section => section.Name == "Work Experience");
    if (!string.IsNullOrWhiteSpace(experienceSection.Content))
    {
        profile.Jobs = ParseJobsHeuristically(experienceSection.Content);
    }

    // Last resort: when no summary was found, use the first sentences of the "General" section.
    if (profile.OtherSections.Count == 0 && sections.Any(section => section.Name == "General"))
    {
        var general = sections.First(section => section.Name == "General");
        if (!string.IsNullOrWhiteSpace(general.Content) && profile.Summary.Count == 0)
        {
            profile.Summary = SplitSentences(general.Content, 3);
        }
    }

    return StructuredCvProfileJson.Normalize(profile);
}
|
||||
|
||||
/// <summary>
/// Splits text on whitespace that follows <c>.</c>, <c>!</c> or <c>?</c> and
/// returns the first <paramref name="limit"/> trimmed pieces that are longer
/// than 20 characters.
/// </summary>
/// <param name="content">The text to split; CRLF pairs are replaced by a space first.</param>
/// <param name="limit">Maximum number of pieces to return.</param>
/// <returns>The qualifying pieces in their original order.</returns>
private static List<string> SplitSentences(string content, int limit)
{
    var singleLine = content.Replace("\r\n", " ");
    var candidates = Regex.Split(singleLine, @"(?<=[.!?])\s+");

    var sentences = new List<string>();
    foreach (var candidate in candidates)
    {
        if (sentences.Count >= limit)
        {
            break;
        }

        var sentence = candidate.Trim();
        if (sentence.Length > 20)
        {
            sentences.Add(sentence);
        }
    }

    return sentences;
}
|
||||
|
||||
/// <summary>
/// Breaks list-style text into items on newlines, commas and semicolons,
/// strips leading bullet characters (<c>-</c>, <c>•</c>, <c>*</c>) and spaces,
/// drops items of one character or less, and removes case-insensitive
/// duplicates while keeping the first occurrence.
/// </summary>
/// <param name="content">The list-like text to split.</param>
/// <returns>The distinct items in their original order.</returns>
private static List<string> SplitListLike(string content)
{
    var separators = new[] { '\n', ',', ';' };
    var rawItems = content
        .Replace("\r\n", "\n")
        .Split(separators, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var items = new List<string>();
    foreach (var rawItem in rawItems)
    {
        var item = rawItem.Trim().TrimStart('-', '•', '*', ' ');
        if (item.Length <= 1)
        {
            continue;
        }

        if (seen.Add(item))
        {
            items.Add(item);
        }
    }

    return items;
}
|
||||
|
||||
/// <summary>
/// Finds known language names in <paramref name="content"/> and, when a
/// proficiency keyword follows on the same line before any <c>.</c>,
/// <c>,</c>, <c>;</c> or <c>:</c>, records it as the level.
/// </summary>
/// <param name="content">Text to scan; either a languages section or the whole CV.</param>
/// <returns>One entry per language name (case-insensitive); the first occurrence wins.</returns>
private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
{
    var languages = new List<StructuredCvLanguage>();

    // "Native speaker" must be listed before "Native": regex alternation takes the first
    // alternative that matches, so the longer phrase is unreachable when it comes last.
    foreach (Match match in Regex.Matches(content, @"\b(English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b(?:[^\n.,;:]*?\b(Native speaker|Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2)\b)?", RegexOptions.IgnoreCase))
    {
        var name = NullIfWhitespace(match.Groups[1].Value);
        var level = NullIfWhitespace(match.Groups[2].Value);
        if (name is null)
        {
            continue;
        }

        languages.Add(new StructuredCvLanguage { Name = name, Level = level });
    }

    return languages
        .GroupBy(language => language.Name, StringComparer.OrdinalIgnoreCase)
        .Select(group => group.First())
        .ToList();
}
|
||||
|
||||
/// <summary>
/// Converts the text of an education section into entries, one per
/// blank-line-separated block. Within a block the first <c>+ </c> line is the
/// institution, the first line that is neither a <c>+ </c> line nor starts
/// with a year range is the qualification, <c>- </c> lines are details, and
/// the first year range found supplies start and end.
/// </summary>
/// <param name="content">The education section text.</param>
/// <returns>The parsed education entries in document order.</returns>
private static List<StructuredCvEducation> ParseEducationHeuristically(string content)
{
    var entries = new List<StructuredCvEducation>();

    foreach (var rawBlock in Regex.Split(content, @"\n\s*\n"))
    {
        var block = rawBlock.Trim();
        if (block.Length == 0)
        {
            continue;
        }

        var lines = block.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        if (lines.Length == 0)
        {
            continue;
        }

        string? institution = null;
        string? qualification = null;
        var details = new List<string>();

        foreach (var line in lines)
        {
            if (line.StartsWith("+ ", StringComparison.Ordinal))
            {
                // Only the first "+ " line names the institution.
                institution ??= line.TrimStart('+', ' ');
            }
            else if (qualification is null && !Regex.IsMatch(line, @"^\d{4}\s*[-–]"))
            {
                qualification = line;
            }

            if (line.StartsWith("- ", StringComparison.Ordinal))
            {
                details.Add(line[2..].Trim());
            }
        }

        // A non-empty block always has a first line, so a qualification is always available.
        qualification ??= lines[0];

        var years = Regex.Match(block, @"\b(\d{4})\s*[-–]\s*(\d{4}|Present|Current)\b", RegexOptions.IgnoreCase);
        string? start = null;
        string? end = null;
        if (years.Success)
        {
            start = years.Groups[1].Value;
            end = years.Groups[2].Value;
        }

        entries.Add(new StructuredCvEducation
        {
            Qualification = TitleCasePreservingAcronyms(qualification),
            Institution = TitleCasePreservingAcronyms(institution),
            Start = start,
            End = end,
            Details = details,
        });
    }

    return entries;
}
|
||||
|
||||
/// <summary>
/// Extracts jobs from the text of a work-experience section. A job starts at
/// an upper-case title line that is directly followed by a year range, and
/// its body runs until the next such title or the end of the text.
/// </summary>
/// <param name="content">The work-experience section text.</param>
/// <returns>The parsed jobs in document order.</returns>
private static List<StructuredCvJob> ParseJobsHeuristically(string content)
{
    const string skillPattern = @"^(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)$";

    var jobPattern = new Regex(@"(?<title>[A-Z][A-Z\s/&-]{3,})\s*\n(?<dates>\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))(?<body>.*?)(?=(?:\n[A-Z][A-Z\s/&-]{3,}\s*\n\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))|\z)", RegexOptions.Singleline);
    var results = new List<StructuredCvJob>();

    foreach (Match jobMatch in jobPattern.Matches(content.Replace("\r\n", "\n")))
    {
        var body = jobMatch.Groups["body"].Value.Trim();

        // "2019 - Present" -> ["2019", "Present"]
        var dateParts = Regex.Split(jobMatch.Groups["dates"].Value, @"\s*[-–]\s*");
        var startText = dateParts.Length > 0 ? dateParts[0] : null;
        var endText = dateParts.Length > 1 ? dateParts[1] : null;

        // The employer is the remainder of the first "+" line; all "+" lines are removed
        // from the body before it is split into bullets.
        var employerMatch = Regex.Match(body, @"\+\s*([^\n]+)");
        var bodyWithoutEmployer = Regex.Replace(body, @"\+\s*[^\n]+", string.Empty);
        var bullets = SplitSentences(bodyWithoutEmployer, 6);

        // A bullet item counts as a skill only when the whole item is one of the known keywords.
        var seenSkills = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        var jobSkills = new List<string>();
        foreach (var bullet in bullets)
        {
            foreach (var item in SplitListLike(bullet))
            {
                if (Regex.IsMatch(item, skillPattern, RegexOptions.IgnoreCase) && seenSkills.Add(item))
                {
                    jobSkills.Add(item);
                }
            }
        }

        var endsNow = string.Equals(endText, "present", StringComparison.OrdinalIgnoreCase)
            || string.Equals(endText, "current", StringComparison.OrdinalIgnoreCase);

        results.Add(new StructuredCvJob
        {
            Title = TitleCasePreservingAcronyms(jobMatch.Groups["title"].Value),
            Company = NullIfWhitespace(employerMatch.Groups[1].Value),
            Start = NullIfWhitespace(startText),
            End = NullIfWhitespace(endText),
            IsCurrent = endsNow,
            Bullets = bullets,
            Skills = jobSkills,
        });
    }

    return results;
}
|
||||
|
||||
/// <summary>
/// Capitalises the first character of every space-separated word and
/// lower-cases the rest, except for words of at most three characters that
/// are entirely upper-case, which are kept unchanged.
/// </summary>
/// <param name="value">The text to convert; may be null or blank.</param>
/// <returns>The converted words joined by single spaces, or <c>null</c> for null or blank input.</returns>
private static string? TitleCasePreservingAcronyms(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    var tokens = value.Trim().Split(' ', StringSplitOptions.RemoveEmptyEntries);
    for (var i = 0; i < tokens.Length; i++)
    {
        var token = tokens[i];
        var isShortAcronym = token.Length <= 3 && token.All(char.IsUpper);
        if (!isShortAcronym)
        {
            tokens[i] = char.ToUpperInvariant(token[0]) + token[1..].ToLowerInvariant();
        }
    }

    return string.Join(" ", tokens);
}
|
||||
|
||||
/// <summary>
/// Counts the whitespace-separated words in <paramref name="text"/>.
/// </summary>
/// <param name="text">The text to measure; may be null or blank.</param>
/// <returns>The number of runs of non-whitespace characters; 0 for null or blank input.</returns>
private static int CountWords(string? text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return 0;
    }

    var total = 0;
    var insideWord = false;
    foreach (var ch in text)
    {
        if (char.IsWhiteSpace(ch))
        {
            insideWord = false;
        }
        else if (!insideWord)
        {
            // First character of a new run of non-whitespace characters.
            insideWord = true;
            total++;
        }
    }

    return total;
}
|
||||
|
||||
/// <summary>
/// Trims <paramref name="value"/> and maps null, empty or whitespace-only
/// input to <c>null</c>.
/// </summary>
/// <param name="value">The text to clean up.</param>
/// <returns>The trimmed text, or <c>null</c> when there is nothing but whitespace.</returns>
private static string? NullIfWhitespace(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value.Trim();
}
|
||||
|
||||
private static List<(string Name, string Content)> ParseSections(string source)
|
||||
{
|
||||
var lines = source.Replace("\r\n", "\n").Split('\n');
|
||||
|
||||
Reference in New Issue
Block a user