Improve classifier fallback for flat CV parsing

This commit is contained in:
2026-04-01 11:00:53 +02:00
parent 517c42250d
commit b283f8b9d2
2 changed files with 274 additions and 29 deletions
+236 -29
View File
@@ -78,6 +78,7 @@ public sealed class ProfileCvController : ControllerBase
// NOTE(review): presumably the request payload for the CV parse endpoint; the consuming action is not visible here. Text is nullable.
public sealed record ParseCvRequest(string? Text);
// NOTE(review): appears to bundle the raw text, its normalized form and the resulting structured profile — confirm against the pipeline code that creates it.
private sealed record ExtractionPipelineResult(string RawText, string NormalizedText, StructuredCvProfile StructuredCv);
// One block of CV text after classification.
//   Index          - 1-based position of the block (the producer passes index + 1).
//   OriginalBlock  - the trimmed block text that was handed to the classifier.
//   SectionName    - the section name assigned to the block.
//   Content        - the section content derived for the block.
//   Classification - the classifier result for the block; may be null.
private sealed record ClassifiedCvBlock(int Index, string OriginalBlock, string SectionName, string Content, CvBlockClassificationResult? Classification);
public sealed record CvExtractionRunListItem(
int Id,
string Trigger,
@@ -340,7 +341,30 @@ public sealed class ProfileCvController : ControllerBase
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{
var parseSource = NormalizeTextForStructuredParsing(text);
var fallbackSections = await BuildFallbackSectionsAsync(parseSource, cancellationToken);
var parsedSections = ParseSections(parseSource)
.Select(section => new StructuredCvSection
{
Name = section.Name,
Content = section.Content,
WordCount = CountWords(section.Content),
})
.ToList();
var hasRealSections = parsedSections.Any(section => !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase));
List<ClassifiedCvBlock> classifiedBlocks = new();
List<StructuredCvSection> fallbackSections = parsedSections;
StructuredCvProfile? classifierFallback = null;
if (!hasRealSections)
{
classifiedBlocks = await ClassifyBlocksAsync(parseSource, cancellationToken);
var hasMeaningfulClassifierStructure = classifiedBlocks.Any(block => !string.Equals(block.SectionName, "General", StringComparison.OrdinalIgnoreCase));
if (hasMeaningfulClassifierStructure)
{
fallbackSections = BuildSectionsFromClassifiedBlocks(classifiedBlocks);
classifierFallback = BuildStructuredCvFromClassifiedBlocks(classifiedBlocks);
}
}
var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
AnnotateStructuredCv(sectionFallback, "repair", 0.56);
@@ -348,6 +372,10 @@ public sealed class ProfileCvController : ControllerBase
AnnotateStructuredCv(heuristicFallback, "deterministic", 0.68);
heuristicFallback.Sections = new List<StructuredCvSection>();
var fallback = StructuredCvProfileJson.Merge(heuristicFallback, sectionFallback);
if (classifierFallback is not null)
{
fallback = StructuredCvProfileJson.Merge(classifierFallback, fallback);
}
fallback.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email);
var extracted = await TryExtractStructuredCvAsync(parseSource, cancellationToken);
var merged = StructuredCvProfileJson.Merge(extracted, fallback);
@@ -874,36 +902,202 @@ public sealed class ProfileCvController : ControllerBase
.ToList();
}
private async Task<List<StructuredCvSection>> BuildFallbackSectionsAsync(string parseSource, CancellationToken cancellationToken)
private static List<StructuredCvSection> BuildSectionsFromClassifiedBlocks(List<ClassifiedCvBlock> classifiedBlocks)
{
var parsed = ParseSections(parseSource)
.Select(section => new StructuredCvSection
var sectionBuckets = new List<StructuredCvSection>();
foreach (var block in classifiedBlocks)
{
var existing = sectionBuckets.FirstOrDefault(section => section.Name == block.SectionName);
if (existing is null)
{
Name = section.Name,
Content = section.Content,
WordCount = CountWords(section.Content),
})
.ToList();
sectionBuckets.Add(new StructuredCvSection { Name = block.SectionName, Content = block.Content, WordCount = CountWords(block.Content) });
}
else
{
existing.Content = $"{existing.Content}\n\n{block.Content}".Trim();
existing.WordCount = CountWords(existing.Content);
}
}
var hasRealSections = parsed.Any(section => !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase));
if (hasRealSections) return parsed;
var aiSections = await ClassifyBlocksIntoSectionsAsync(parseSource, cancellationToken);
return aiSections.Count > 0 ? aiSections : parsed;
return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
}
private async Task<List<StructuredCvSection>> ClassifyBlocksIntoSectionsAsync(string parseSource, CancellationToken cancellationToken)
/// <summary>
/// Builds a structured CV profile directly from classifier-labelled blocks.
/// </summary>
/// <param name="classifiedBlocks">Blocks to fold into the profile, processed in list order.</param>
/// <returns>The profile after it has been passed through <c>StructuredCvProfileJson.Normalize</c>.</returns>
/// <remarks>
/// Section names are compared with ordinal, case-sensitive equality. Blocks whose section is not
/// "Professional Summary", "Skills", "Work Experience" or "Education" are recorded under
/// <c>OtherSections</c> when their content is not blank.
/// </remarks>
private static StructuredCvProfile BuildStructuredCvFromClassifiedBlocks(List<ClassifiedCvBlock> classifiedBlocks)
{
    var result = new StructuredCvProfile();
    var stamp = DateTimeOffset.UtcNow;
    var summaryLines = new List<string>();
    var skillSet = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    foreach (var current in classifiedBlocks)
    {
        var sectionName = current.SectionName;

        if (sectionName == "Professional Summary")
        {
            summaryLines.AddRange(SplitClassifierContent(current.Content, 5));
            // The value checked here is the first summary line collected so far,
            // while the metadata itself is stamped with the current block.
            ApplyClassifierFieldMetadata(result, "summary", summaryLines.FirstOrDefault(), current, stamp);
        }
        else if (sectionName == "Skills")
        {
            skillSet.UnionWith(SplitClassifierSkills(current.Content));
            ApplyClassifierFieldMetadata(result, "skills", skillSet.FirstOrDefault(), current, stamp);
        }
        else if (sectionName == "Work Experience")
        {
            var parsedJob = BuildJobFromClassifiedBlock(current);
            if (parsedJob is null)
            {
                continue;
            }

            // Capture the slot before adding so the metadata keys point at the new entry.
            var jobSlot = result.Jobs.Count;
            result.Jobs.Add(parsedJob);
            ApplyClassifierFieldMetadata(result, $"jobs[{jobSlot}].title", parsedJob.Title, current, stamp);
            ApplyClassifierFieldMetadata(result, $"jobs[{jobSlot}].company", parsedJob.Company, current, stamp);
            ApplyClassifierFieldMetadata(result, $"jobs[{jobSlot}].location", parsedJob.Location, current, stamp);
        }
        else if (sectionName == "Education")
        {
            var parsedEducation = BuildEducationFromClassifiedBlock(current);
            if (parsedEducation is null)
            {
                continue;
            }

            var educationSlot = result.Education.Count;
            result.Education.Add(parsedEducation);
            ApplyClassifierFieldMetadata(result, $"education[{educationSlot}].qualification", parsedEducation.Qualification, current, stamp);
            ApplyClassifierFieldMetadata(result, $"education[{educationSlot}].institution", parsedEducation.Institution, current, stamp);
        }
        else if (!string.IsNullOrWhiteSpace(current.Content))
        {
            var other = new StructuredCvOtherSection
            {
                Title = sectionName,
                Items = SplitClassifierContent(current.Content, 6)
            };
            result.OtherSections.Add(other);
        }
    }

    result.Summary = summaryLines.Distinct(StringComparer.OrdinalIgnoreCase).ToList();
    result.Skills = skillSet.ToList();
    result.Sections = BuildSectionsFromClassifiedBlocks(classifiedBlocks);

    // Mean confidence over the blocks that report one; 0.74 when none do.
    var reportedConfidences = classifiedBlocks
        .Select(entry => entry.Classification?.Confidence)
        .Where(score => score.HasValue)
        .Select(score => score!.Value);
    var meanConfidence = reportedConfidences.DefaultIfEmpty(0.74).Average();

    AnnotateStructuredCv(result, "classifier", meanConfidence);
    return StructuredCvProfileJson.Normalize(result);
}
/// <summary>
/// Converts a classified block into a job entry.
/// </summary>
/// <param name="block">The block to convert.</param>
/// <returns>
/// <c>null</c> when the block has no classification; otherwise the first job returned after
/// normalizing a profile that contains only the new job (which may itself be <c>null</c>).
/// </returns>
private static StructuredCvJob? BuildJobFromClassifiedBlock(ClassifiedCvBlock block)
{
    if (block.Classification is not { } details)
    {
        return null;
    }

    // Use the classifier's bullets when it supplied any; otherwise split the raw block text.
    List<string> bulletItems;
    if (details.Bullets is { Count: > 0 } supplied)
    {
        bulletItems = supplied
            .Where(entry => !string.IsNullOrWhiteSpace(entry))
            .Select(entry => entry.Trim())
            .ToList();
    }
    else
    {
        bulletItems = SplitClassifierContent(block.OriginalBlock, 6);
    }

    var endMarker = details.End;
    var stillInRole =
        string.Equals(endMarker, "Present", StringComparison.OrdinalIgnoreCase)
        || string.Equals(endMarker, "Current", StringComparison.OrdinalIgnoreCase);

    var candidate = new StructuredCvJob
    {
        Title = NullIfWhitespace(details.Title),
        Company = NullIfWhitespace(details.Company),
        Location = NullIfWhitespace(details.Location),
        Start = NullIfWhitespace(details.Start),
        End = NullIfWhitespace(endMarker),
        IsCurrent = stillInRole,
        Bullets = bulletItems,
        Skills = SplitClassifierSkills(block.OriginalBlock)
    };

    // Run the job through the profile normalizer by wrapping it in a one-job profile.
    var wrapper = new StructuredCvProfile { Jobs = new List<StructuredCvJob> { candidate } };
    return StructuredCvProfileJson.Normalize(wrapper).Jobs.FirstOrDefault();
}
/// <summary>
/// Converts a classified block into an education entry.
/// </summary>
/// <param name="block">The block to convert.</param>
/// <returns>
/// <c>null</c> when the block has no classification; otherwise the first education entry returned
/// after normalizing a profile that contains only the new entry (which may itself be <c>null</c>).
/// </returns>
private static StructuredCvEducation? BuildEducationFromClassifiedBlock(ClassifiedCvBlock block)
{
    if (block.Classification is not { } details)
    {
        return null;
    }

    // Use the classifier's bullets when it supplied any; otherwise split the raw block text.
    List<string> detailLines;
    if (details.Bullets is { Count: > 0 } supplied)
    {
        detailLines = supplied
            .Where(entry => !string.IsNullOrWhiteSpace(entry))
            .Select(entry => entry.Trim())
            .ToList();
    }
    else
    {
        detailLines = SplitClassifierContent(block.OriginalBlock, 5);
    }

    // The classifier's Title and Company fields are mapped to Qualification and Institution.
    var candidate = new StructuredCvEducation
    {
        Qualification = NullIfWhitespace(details.Title),
        Institution = NullIfWhitespace(details.Company),
        Location = NullIfWhitespace(details.Location),
        Start = NullIfWhitespace(details.Start),
        End = NullIfWhitespace(details.End),
        Details = detailLines
    };

    var wrapper = new StructuredCvProfile { Education = new List<StructuredCvEducation> { candidate } };
    return StructuredCvProfileJson.Normalize(wrapper).Education.FirstOrDefault();
}
/// <summary>
/// Splits free-form block text into short, de-duplicated items.
/// </summary>
/// <param name="content">Text to split. Blank input yields an empty list.</param>
/// <param name="limit">Maximum number of items to return. Values of zero or less yield an empty list.</param>
/// <returns>
/// Up to <paramref name="limit"/> distinct items (case-insensitive), in order of first appearance,
/// with leading bullet characters removed and items of two characters or fewer dropped.
/// </returns>
private static List<string> SplitClassifierContent(string content, int limit)
{
    if (string.IsNullOrWhiteSpace(content) || limit <= 0)
    {
        return new List<string>();
    }

    return content
        .Replace("\r\n", "\n")
        .Split(new[] { '\n', '•' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
        // Only lines that contain ". " are broken into sentences; other lines stay whole.
        .SelectMany(line => line.Contains(". ", StringComparison.Ordinal)
            ? Regex.Split(line, @"(?<=[.!?])\s+")
            : new[] { line })
        .Select(item => item.Trim().TrimStart('-', '•', '*', '+', ' '))
        .Where(item => item.Length > 2)
        // De-duplicate BEFORE applying the cap so repeated lines do not use up the limit
        // and crowd out later distinct items.
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .Take(limit)
        .ToList();
}
/// <summary>
/// Extracts skill-like tokens from block text.
/// </summary>
/// <param name="content">Text to split on newlines, commas, semicolons and bullet characters.</param>
/// <returns>
/// Distinct tokens (case-insensitive, first occurrence kept) that are 2 to 48 characters long,
/// do not look like a date value and do not contain '@'.
/// </returns>
private static List<string> SplitClassifierSkills(string content)
{
    var separators = new[] { '\n', ',', ';', '•' };
    var fragments = content
        .Replace("\r\n", "\n")
        .Split(separators, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    var alreadySeen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var collected = new List<string>();

    foreach (var fragment in fragments)
    {
        var candidate = fragment.Trim().TrimStart('-', '•', '*', '+', ' ');

        if (candidate.Length <= 1 || candidate.Length > 48)
        {
            continue;
        }

        if (LooksLikeDateLikeValue(candidate) || candidate.Contains('@'))
        {
            continue;
        }

        // HashSet.Add returns false for repeats, so only the first occurrence is kept.
        if (alreadySeen.Add(candidate))
        {
            collected.Add(candidate);
        }
    }

    return collected;
}
/// <summary>
/// Tells whether the whole value is a date-like token or a range of two such tokens.
/// </summary>
/// <param name="value">Candidate text; it must match in full, not merely contain a date.</param>
/// <returns>
/// <c>true</c> for a four-digit year, "Month YYYY" (full or abbreviated English month name),
/// "Present" or "Current", optionally followed by a dash and a second such token.
/// </returns>
/// <remarks>
/// The range separator may be an ASCII hyphen, an en dash (U+2013) or an em dash (U+2014).
/// Matching is case-insensitive and culture-invariant, so the result does not depend on the
/// current thread culture.
/// </remarks>
private static bool LooksLikeDateLikeValue(string value)
{
    const string Month = @"(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)";
    const string Point = @"(?:\d{4}|" + Month + @"\s+\d{4}|Present|Current)";
    // Accept en and em dashes as well as the ASCII hyphen as the range separator.
    const string Pattern = "^" + Point + @"(?:\s*[-\u2013\u2014]\s*" + Point + ")?$";

    return Regex.IsMatch(value, Pattern, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
}
/// <summary>
/// Records classifier provenance for one profile field.
/// </summary>
/// <param name="profile">Profile whose <c>Metadata.Fields</c> entry is written.</param>
/// <param name="key">Field key; an existing entry under the same key is replaced.</param>
/// <param name="value">The field's value. Nothing is written when it is null or whitespace.</param>
/// <param name="block">Block the value came from; supplies confidence, snippet, block id and review state.</param>
/// <param name="now">Timestamp stored as the last-updated time.</param>
private static void ApplyClassifierFieldMetadata(StructuredCvProfile profile, string key, string? value, ClassifiedCvBlock block, DateTimeOffset now)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return;
    }

    // Keep at most the first 180 characters of the source block as the snippet.
    var sourceText = block.OriginalBlock;
    var snippet = sourceText.Length > 180 ? sourceText[..180] : sourceText;

    // Blocks in the "General" section are flagged for review; all others are suggestions.
    var isGeneralSection = string.Equals(block.SectionName, "General", StringComparison.OrdinalIgnoreCase);
    var reviewState = isGeneralSection ? "needs-review" : "suggested";

    var entry = new StructuredCvFieldMetadata
    {
        Confidence = block.Classification?.Confidence ?? 0.74,
        Method = "classifier",
        SourceSnippet = snippet,
        SourceBlockId = $"block-{block.Index}",
        ReviewState = reviewState,
        LastUpdatedAtUtc = now,
    };

    profile.Metadata.Fields[key] = entry;
}
private async Task<List<ClassifiedCvBlock>> ClassifyBlocksAsync(string parseSource, CancellationToken cancellationToken)
{
var blocks = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n")
.Select(block => block.Trim())
.Where(block => block.Length >= 24)
.ToList();
if (blocks.Count == 0) return new List<StructuredCvSection>();
if (blocks.Count == 0) return new List<ClassifiedCvBlock>();
var sectionBuckets = new List<StructuredCvSection>();
foreach (var block in blocks)
var results = new List<ClassifiedCvBlock>();
for (var index = 0; index < blocks.Count; index++)
{
var block = blocks[index];
var classification = await _cvAiClassifier.ClassifyBlockAsync(block, cancellationToken);
var sectionName = classification?.Section;
if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical))
@@ -931,20 +1125,33 @@ public sealed class ProfileCvController : ControllerBase
}
if (lines.Count > 0) content = string.Join("\n", lines);
}
else if (string.Equals(sectionName, "Education", StringComparison.OrdinalIgnoreCase) && classification is not null)
{
var lines = new List<string>();
if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}");
var dateRange = FormatDateRangeForSection(classification.Start, classification.End, false);
var meta = string.Join(" | ", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value)));
if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta);
if (classification.Bullets is not null)
{
lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}"));
}
if (lines.Count > 0) content = string.Join("\n", lines);
}
else if (string.Equals(sectionName, "Skills", StringComparison.OrdinalIgnoreCase))
{
var items = SplitClassifierSkills(block);
if (items.Count > 0) content = string.Join("\n", items);
}
else if (string.Equals(sectionName, "Professional Summary", StringComparison.OrdinalIgnoreCase) && classification?.Bullets is { Count: > 0 })
{
content = string.Join("\n", classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}"));
}
var existing = sectionBuckets.FirstOrDefault(section => section.Name == sectionName);
if (existing is null)
{
sectionBuckets.Add(new StructuredCvSection { Name = sectionName, Content = content, WordCount = CountWords(content) });
}
else
{
existing.Content = $"{existing.Content}\n\n{content}".Trim();
existing.WordCount = CountWords(existing.Content);
}
results.Add(new ClassifiedCvBlock(index + 1, block, sectionName, content, classification));
}
return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
return results;
}
private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent)