Improve classifier fallback for flat CV parsing
This commit is contained in:
@@ -617,9 +617,47 @@ public sealed class ProfileCvControllerTests
|
||||
Assert.Contains("Atlas Systems", matchedJob!.Company ?? string.Empty, StringComparison.Ordinal);
|
||||
Assert.Contains("Python", structured.Skills);
|
||||
Assert.Contains("SQL", structured.Skills);
|
||||
Assert.Equal("classifier", structured.Metadata.Fields["jobs[0].title"].Method);
|
||||
Assert.Equal("block-1", structured.Metadata.Fields["jobs[0].title"].SourceBlockId);
|
||||
classifier.Verify(x => x.ClassifyBlockAsync(It.IsAny<string>(), It.IsAny<CancellationToken>()), Times.AtLeastOnce());
|
||||
}
|
||||
|
||||
// Verifies that a CV without section headings, whose block the classifier labels
// "Education", is stored as a single education entry attributed to the classifier.
[Fact]
public async Task Parse_uses_classifier_fallback_for_education_blocks_without_real_sections()
{
    // Arrange: a heading-less CV made of one education block.
    var source = "BSc Computer Science\nUniversity of Oslo\nOslo\n2016 - 2019\nGraduated with focus on distributed systems.";
    var user = new ApplicationUser { Id = "user-1", ProfileCvText = source };
    var userManager = CreateUserManager();
    userManager.Setup(x => x.GetUserAsync(It.IsAny<ClaimsPrincipal>())).ReturnsAsync(user);
    userManager.Setup(x => x.UpdateAsync(user)).ReturnsAsync(IdentityResult.Success);

    // The summarizer answers the structured-extraction instruction with non-JSON text.
    var aiService = new Mock<ISummarizerService>();
    aiService
        .Setup(x => x.SummarizeSectionAsync(It.Is<string>(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), source, 3200, 900))
        .ReturnsAsync("not-json");

    // Every block handed to the classifier comes back labelled as an education entry.
    var classifier = new Mock<ICvAiClassifier>();
    classifier
        .Setup(x => x.ClassifyBlockAsync(It.IsAny<string>(), It.IsAny<CancellationToken>()))
        .ReturnsAsync(new CvBlockClassificationResult("Education", 0.87, "education block", "BSc Computer Science", "University of Oslo", "Oslo", "2016", "2019", new List<string> { "Graduated with focus on distributed systems." }));

    await using var db = CreateDb();
    var paths = CreatePaths();
    var controller = CreateController(userManager.Object, aiService.Object, db, paths, classifier.Object);

    // Act
    var result = await controller.Parse(new ProfileCvController.ParseCvRequest(source));

    // Assert: the response payload carries the classified values.
    var ok = Assert.IsType<OkObjectResult>(result.Result);
    var json = JsonSerializer.Serialize(ok.Value);
    Assert.Contains("BSc Computer Science", json);
    Assert.Contains("University of Oslo", json);

    // Assert: the structure persisted on the user holds one classifier-sourced education entry.
    var structured = StructuredCvProfileJson.Deserialize(user.ProfileCvStructureJson);
    Assert.Single(structured.Education);
    Assert.Equal("BSc Computer Science", structured.Education[0].Qualification);
    Assert.Equal("University of Oslo", structured.Education[0].Institution);
    Assert.Equal("classifier", structured.Metadata.Fields["education[0].qualification"].Method);

    // Same interaction check as the sibling work-experience test: the classifier must have been consulted.
    classifier.Verify(x => x.ClassifyBlockAsync(It.IsAny<string>(), It.IsAny<CancellationToken>()), Times.AtLeastOnce());
}
|
||||
|
||||
[Fact]
|
||||
public async Task Parse_keeps_general_fallback_when_classifier_returns_nothing()
|
||||
{
|
||||
|
||||
@@ -78,6 +78,7 @@ public sealed class ProfileCvController : ControllerBase
|
||||
/// <summary>Request object accepted by the controller's <c>Parse</c> action.</summary>
/// <param name="Text">CV text to parse; nullable.</param>
public sealed record ParseCvRequest(
    string? Text);
|
||||
|
||||
/// <summary>
/// Groups a raw text, a normalized text and a structured CV profile into one value.
/// NOTE(review): presumably the outcome of one extraction run — usage is not visible here; confirm.
/// </summary>
private sealed record ExtractionPipelineResult(
    string RawText,
    string NormalizedText,
    StructuredCvProfile StructuredCv);
|
||||
/// <summary>One text block of a CV together with the section it was assigned to.</summary>
/// <param name="Index">Position of the block; used to form the <c>block-{Index}</c> source id in field metadata.</param>
/// <param name="OriginalBlock">The block text as it was passed to the classifier.</param>
/// <param name="SectionName">Section name the block was assigned to.</param>
/// <param name="Content">Section content derived for the block.</param>
/// <param name="Classification">Classifier result for the block, or <c>null</c> when none is available.</param>
private sealed record ClassifiedCvBlock(
    int Index,
    string OriginalBlock,
    string SectionName,
    string Content,
    CvBlockClassificationResult? Classification);
|
||||
public sealed record CvExtractionRunListItem(
|
||||
int Id,
|
||||
string Trigger,
|
||||
@@ -340,7 +341,30 @@ public sealed class ProfileCvController : ControllerBase
|
||||
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
|
||||
{
|
||||
var parseSource = NormalizeTextForStructuredParsing(text);
|
||||
var fallbackSections = await BuildFallbackSectionsAsync(parseSource, cancellationToken);
|
||||
var parsedSections = ParseSections(parseSource)
|
||||
.Select(section => new StructuredCvSection
|
||||
{
|
||||
Name = section.Name,
|
||||
Content = section.Content,
|
||||
WordCount = CountWords(section.Content),
|
||||
})
|
||||
.ToList();
|
||||
var hasRealSections = parsedSections.Any(section => !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
List<ClassifiedCvBlock> classifiedBlocks = new();
|
||||
List<StructuredCvSection> fallbackSections = parsedSections;
|
||||
StructuredCvProfile? classifierFallback = null;
|
||||
|
||||
if (!hasRealSections)
|
||||
{
|
||||
classifiedBlocks = await ClassifyBlocksAsync(parseSource, cancellationToken);
|
||||
var hasMeaningfulClassifierStructure = classifiedBlocks.Any(block => !string.Equals(block.SectionName, "General", StringComparison.OrdinalIgnoreCase));
|
||||
if (hasMeaningfulClassifierStructure)
|
||||
{
|
||||
fallbackSections = BuildSectionsFromClassifiedBlocks(classifiedBlocks);
|
||||
classifierFallback = BuildStructuredCvFromClassifiedBlocks(classifiedBlocks);
|
||||
}
|
||||
}
|
||||
|
||||
var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
|
||||
AnnotateStructuredCv(sectionFallback, "repair", 0.56);
|
||||
@@ -348,6 +372,10 @@ public sealed class ProfileCvController : ControllerBase
|
||||
AnnotateStructuredCv(heuristicFallback, "deterministic", 0.68);
|
||||
heuristicFallback.Sections = new List<StructuredCvSection>();
|
||||
var fallback = StructuredCvProfileJson.Merge(heuristicFallback, sectionFallback);
|
||||
if (classifierFallback is not null)
|
||||
{
|
||||
fallback = StructuredCvProfileJson.Merge(classifierFallback, fallback);
|
||||
}
|
||||
fallback.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email);
|
||||
var extracted = await TryExtractStructuredCvAsync(parseSource, cancellationToken);
|
||||
var merged = StructuredCvProfileJson.Merge(extracted, fallback);
|
||||
@@ -874,36 +902,202 @@ public sealed class ProfileCvController : ControllerBase
|
||||
.ToList();
|
||||
}
|
||||
|
||||
private async Task<List<StructuredCvSection>> BuildFallbackSectionsAsync(string parseSource, CancellationToken cancellationToken)
|
||||
private static List<StructuredCvSection> BuildSectionsFromClassifiedBlocks(List<ClassifiedCvBlock> classifiedBlocks)
|
||||
{
|
||||
var parsed = ParseSections(parseSource)
|
||||
.Select(section => new StructuredCvSection
|
||||
var sectionBuckets = new List<StructuredCvSection>();
|
||||
foreach (var block in classifiedBlocks)
|
||||
{
|
||||
Name = section.Name,
|
||||
Content = section.Content,
|
||||
WordCount = CountWords(section.Content),
|
||||
})
|
||||
.ToList();
|
||||
|
||||
var hasRealSections = parsed.Any(section => !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase));
|
||||
if (hasRealSections) return parsed;
|
||||
|
||||
var aiSections = await ClassifyBlocksIntoSectionsAsync(parseSource, cancellationToken);
|
||||
return aiSections.Count > 0 ? aiSections : parsed;
|
||||
var existing = sectionBuckets.FirstOrDefault(section => section.Name == block.SectionName);
|
||||
if (existing is null)
|
||||
{
|
||||
sectionBuckets.Add(new StructuredCvSection { Name = block.SectionName, Content = block.Content, WordCount = CountWords(block.Content) });
|
||||
}
|
||||
else
|
||||
{
|
||||
existing.Content = $"{existing.Content}\n\n{block.Content}".Trim();
|
||||
existing.WordCount = CountWords(existing.Content);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<List<StructuredCvSection>> ClassifyBlocksIntoSectionsAsync(string parseSource, CancellationToken cancellationToken)
|
||||
return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
|
||||
}
|
||||
|
||||
/// <summary>
/// Assembles a structured CV profile from classifier-labelled blocks.
/// Summary and skills blocks are accumulated, work-experience and education blocks become
/// job/education entries, and any other block with content is kept as an "other" section.
/// Field metadata is recorded per block, and the profile is annotated with the mean
/// classifier confidence (0.74 when no block carries a confidence) before normalization.
/// </summary>
/// <param name="classifiedBlocks">Blocks in document order, each with its assigned section.</param>
/// <returns>The normalized profile built from the blocks.</returns>
private static StructuredCvProfile BuildStructuredCvFromClassifiedBlocks(List<ClassifiedCvBlock> classifiedBlocks)
{
    var result = new StructuredCvProfile();
    var timestamp = DateTimeOffset.UtcNow;
    var summaryLines = new List<string>();
    var skillSet = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    foreach (var current in classifiedBlocks)
    {
        var sectionName = current.SectionName;

        if (sectionName == "Professional Summary")
        {
            summaryLines.AddRange(SplitClassifierContent(current.Content, 5));
            ApplyClassifierFieldMetadata(result, "summary", summaryLines.FirstOrDefault(), current, timestamp);
        }
        else if (sectionName == "Skills")
        {
            skillSet.UnionWith(SplitClassifierSkills(current.Content));
            ApplyClassifierFieldMetadata(result, "skills", skillSet.FirstOrDefault(), current, timestamp);
        }
        else if (sectionName == "Work Experience")
        {
            var job = BuildJobFromClassifiedBlock(current);
            if (job is null)
            {
                continue;
            }

            // The metadata keys address the entry by its position in the list.
            var position = result.Jobs.Count;
            result.Jobs.Add(job);
            ApplyClassifierFieldMetadata(result, $"jobs[{position}].title", job.Title, current, timestamp);
            ApplyClassifierFieldMetadata(result, $"jobs[{position}].company", job.Company, current, timestamp);
            ApplyClassifierFieldMetadata(result, $"jobs[{position}].location", job.Location, current, timestamp);
        }
        else if (sectionName == "Education")
        {
            var education = BuildEducationFromClassifiedBlock(current);
            if (education is null)
            {
                continue;
            }

            var position = result.Education.Count;
            result.Education.Add(education);
            ApplyClassifierFieldMetadata(result, $"education[{position}].qualification", education.Qualification, current, timestamp);
            ApplyClassifierFieldMetadata(result, $"education[{position}].institution", education.Institution, current, timestamp);
        }
        else if (!string.IsNullOrWhiteSpace(current.Content))
        {
            result.OtherSections.Add(new StructuredCvOtherSection
            {
                Title = sectionName,
                Items = SplitClassifierContent(current.Content, 6)
            });
        }
    }

    result.Summary = summaryLines.Distinct(StringComparer.OrdinalIgnoreCase).ToList();
    result.Skills = skillSet.ToList();
    result.Sections = BuildSectionsFromClassifiedBlocks(classifiedBlocks);

    // Only blocks that actually carry a classification contribute to the mean.
    var confidences = new List<double>();
    foreach (var current in classifiedBlocks)
    {
        if (current.Classification?.Confidence is double confidence)
        {
            confidences.Add(confidence);
        }
    }

    var averageConfidence = confidences.Count > 0 ? confidences.Average() : 0.74;
    AnnotateStructuredCv(result, "classifier", averageConfidence);
    return StructuredCvProfileJson.Normalize(result);
}
|
||||
|
||||
/// <summary>
/// Converts a classified block into a job entry.
/// Bullets come from the classification when it has any; otherwise the original block
/// text is split into at most six items. Skills are derived from the original block text.
/// </summary>
/// <param name="block">The classified block to convert.</param>
/// <returns>
/// The first job of the normalized single-job profile, or <c>null</c> when the block has
/// no classification or normalization leaves no job.
/// </returns>
private static StructuredCvJob? BuildJobFromClassifiedBlock(ClassifiedCvBlock block)
{
    if (block.Classification is not { } details)
    {
        return null;
    }

    List<string> bulletPoints;
    if (details.Bullets is null || details.Bullets.Count == 0)
    {
        bulletPoints = SplitClassifierContent(block.OriginalBlock, 6);
    }
    else
    {
        bulletPoints = details.Bullets
            .Where(entry => !string.IsNullOrWhiteSpace(entry))
            .Select(entry => entry.Trim())
            .ToList();
    }

    // An end value of "Present" or "Current" (any casing) marks the job as ongoing.
    var endsNow = string.Equals(details.End, "Present", StringComparison.OrdinalIgnoreCase)
        || string.Equals(details.End, "Current", StringComparison.OrdinalIgnoreCase);

    var candidate = new StructuredCvJob
    {
        Title = NullIfWhitespace(details.Title),
        Company = NullIfWhitespace(details.Company),
        Location = NullIfWhitespace(details.Location),
        Start = NullIfWhitespace(details.Start),
        End = NullIfWhitespace(details.End),
        IsCurrent = endsNow,
        Bullets = bulletPoints,
        Skills = SplitClassifierSkills(block.OriginalBlock)
    };

    // Run the entry through profile normalization by wrapping it in a one-job profile.
    var wrapper = new StructuredCvProfile { Jobs = new List<StructuredCvJob> { candidate } };
    return StructuredCvProfileJson.Normalize(wrapper).Jobs.FirstOrDefault();
}
|
||||
|
||||
/// <summary>
/// Converts a classified block into an education entry.
/// The classification's title and company fields supply the qualification and institution.
/// Details come from the classification bullets when it has any; otherwise the original
/// block text is split into at most five items.
/// </summary>
/// <param name="block">The classified block to convert.</param>
/// <returns>
/// The first education entry of the normalized single-entry profile, or <c>null</c> when
/// the block has no classification or normalization leaves no entry.
/// </returns>
private static StructuredCvEducation? BuildEducationFromClassifiedBlock(ClassifiedCvBlock block)
{
    if (block.Classification is not { } details)
    {
        return null;
    }

    List<string> detailLines;
    if (details.Bullets is null || details.Bullets.Count == 0)
    {
        detailLines = SplitClassifierContent(block.OriginalBlock, 5);
    }
    else
    {
        detailLines = details.Bullets
            .Where(entry => !string.IsNullOrWhiteSpace(entry))
            .Select(entry => entry.Trim())
            .ToList();
    }

    var candidate = new StructuredCvEducation
    {
        Qualification = NullIfWhitespace(details.Title),
        Institution = NullIfWhitespace(details.Company),
        Location = NullIfWhitespace(details.Location),
        Start = NullIfWhitespace(details.Start),
        End = NullIfWhitespace(details.End),
        Details = detailLines
    };

    // Run the entry through profile normalization by wrapping it in a one-entry profile.
    var wrapper = new StructuredCvProfile { Education = new List<StructuredCvEducation> { candidate } };
    return StructuredCvProfileJson.Normalize(wrapper).Education.FirstOrDefault();
}
|
||||
|
||||
/// <summary>
/// Splits block text into short items: first on line breaks and bullet characters, then,
/// for lines containing ". ", on sentence boundaries. Leading list markers are stripped,
/// items of two characters or fewer are dropped, and duplicates (case-insensitive) are removed.
/// </summary>
/// <param name="content">Text to split; null or blank yields an empty list.</param>
/// <param name="limit">Maximum number of unique items to return; zero or less yields an empty list.</param>
/// <returns>Up to <paramref name="limit"/> unique items in their original order.</returns>
private static List<string> SplitClassifierContent(string content, int limit)
{
    if (string.IsNullOrWhiteSpace(content) || limit <= 0)
    {
        return new List<string>();
    }

    return content
        .Replace("\r\n", "\n")
        .Split(new[] { '\n', '•' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
        .SelectMany(line => line.Contains(". ", StringComparison.Ordinal)
            ? Regex.Split(line, @"(?<=[.!?])\s+")
            : new[] { line })
        .Select(item => item.Trim().TrimStart('-', '•', '*', '+', ' '))
        .Where(item => item.Length > 2)
        // De-duplicate before applying the limit so repeated items do not use up
        // slots and shrink the result below the requested number of unique items.
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .Take(limit)
        .ToList();
}
|
||||
|
||||
/// <summary>
/// Extracts skill-like tokens from block text. The text is split on line breaks, commas,
/// semicolons and bullet characters; leading list markers are stripped; tokens are kept
/// when they are 2–48 characters long, are not date-like and contain no '@'.
/// Duplicates are removed case-insensitively, keeping the first occurrence.
/// </summary>
/// <param name="content">Text to scan for skills.</param>
/// <returns>The accepted tokens in their original order.</returns>
private static List<string> SplitClassifierSkills(string content)
{
    var separators = new[] { '\n', ',', ';', '•' };
    var leadingMarkers = new[] { '-', '•', '*', '+', ' ' };

    var fragments = content
        .Replace("\r\n", "\n")
        .Split(separators, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var accepted = new List<string>();

    foreach (var fragment in fragments)
    {
        var candidate = fragment.Trim().TrimStart(leadingMarkers);

        if (candidate.Length <= 1 || candidate.Length > 48)
        {
            continue;
        }

        if (LooksLikeDateLikeValue(candidate) || candidate.Contains('@'))
        {
            continue;
        }

        // HashSet.Add returns false for a repeat, so only first occurrences are kept.
        if (seen.Add(candidate))
        {
            accepted.Add(candidate);
        }
    }

    return accepted;
}
|
||||
|
||||
/// <summary>
/// Tells whether a value is nothing but a date point or a date range.
/// A date point is a four-digit year, a month name followed by a four-digit year, or the
/// words "Present"/"Current"; a range is two date points joined by a hyphen or en dash.
/// Matching is case-insensitive and culture-invariant.
/// </summary>
/// <param name="value">The candidate text; null or blank is never date-like.</param>
/// <returns><c>true</c> when the whole value is a date point or range.</returns>
private static bool LooksLikeDateLikeValue(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    const string Month = "(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)";
    const string DatePoint = @"(?:\d{4}|" + Month + @"\s+\d{4}|Present|Current)";
    const string Pattern = "^" + DatePoint + @"(?:\s*[-–]\s*" + DatePoint + ")?$";

    // CultureInvariant keeps IgnoreCase from following the current culture's casing rules
    // (e.g. under tr-TR an upper-case 'I' would not match the 'i' in "April").
    return Regex.IsMatch(value, Pattern, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
}
|
||||
|
||||
/// <summary>
/// Records classifier provenance for one profile field. Does nothing when the field value
/// is null or blank; otherwise stores (or overwrites) the metadata entry under <paramref name="key"/>.
/// </summary>
/// <param name="profile">Profile whose metadata field map is updated.</param>
/// <param name="key">Field path used as the metadata key, e.g. <c>jobs[0].title</c>.</param>
/// <param name="value">The field's value; only checked for being non-blank.</param>
/// <param name="block">The block the value was taken from.</param>
/// <param name="now">Timestamp stored as the entry's last-updated time.</param>
private static void ApplyClassifierFieldMetadata(StructuredCvProfile profile, string key, string? value, ClassifiedCvBlock block, DateTimeOffset now)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return;
    }

    // Keep at most the first 180 characters of the source block as the snippet.
    var sourceText = block.OriginalBlock;
    var snippet = sourceText;
    if (sourceText.Length > 180)
    {
        snippet = sourceText[..180];
    }

    // Blocks in the "General" section are flagged for review; others are plain suggestions.
    var isGeneral = string.Equals(block.SectionName, "General", StringComparison.OrdinalIgnoreCase);

    var entry = new StructuredCvFieldMetadata
    {
        Confidence = block.Classification?.Confidence ?? 0.74,
        Method = "classifier",
        SourceSnippet = snippet,
        SourceBlockId = $"block-{block.Index}",
        ReviewState = isGeneral ? "needs-review" : "suggested",
        LastUpdatedAtUtc = now,
    };

    profile.Metadata.Fields[key] = entry;
}
|
||||
|
||||
private async Task<List<ClassifiedCvBlock>> ClassifyBlocksAsync(string parseSource, CancellationToken cancellationToken)
|
||||
{
|
||||
var blocks = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n")
|
||||
.Select(block => block.Trim())
|
||||
.Where(block => block.Length >= 24)
|
||||
.ToList();
|
||||
|
||||
if (blocks.Count == 0) return new List<StructuredCvSection>();
|
||||
if (blocks.Count == 0) return new List<ClassifiedCvBlock>();
|
||||
|
||||
var sectionBuckets = new List<StructuredCvSection>();
|
||||
foreach (var block in blocks)
|
||||
var results = new List<ClassifiedCvBlock>();
|
||||
for (var index = 0; index < blocks.Count; index++)
|
||||
{
|
||||
var block = blocks[index];
|
||||
var classification = await _cvAiClassifier.ClassifyBlockAsync(block, cancellationToken);
|
||||
var sectionName = classification?.Section;
|
||||
if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical))
|
||||
@@ -931,20 +1125,33 @@ public sealed class ProfileCvController : ControllerBase
|
||||
}
|
||||
if (lines.Count > 0) content = string.Join("\n", lines);
|
||||
}
|
||||
|
||||
var existing = sectionBuckets.FirstOrDefault(section => section.Name == sectionName);
|
||||
if (existing is null)
|
||||
else if (string.Equals(sectionName, "Education", StringComparison.OrdinalIgnoreCase) && classification is not null)
|
||||
{
|
||||
sectionBuckets.Add(new StructuredCvSection { Name = sectionName, Content = content, WordCount = CountWords(content) });
|
||||
}
|
||||
else
|
||||
var lines = new List<string>();
|
||||
if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}");
|
||||
var dateRange = FormatDateRangeForSection(classification.Start, classification.End, false);
|
||||
var meta = string.Join(" | ", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value)));
|
||||
if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta);
|
||||
if (classification.Bullets is not null)
|
||||
{
|
||||
existing.Content = $"{existing.Content}\n\n{content}".Trim();
|
||||
existing.WordCount = CountWords(existing.Content);
|
||||
lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}"));
|
||||
}
|
||||
if (lines.Count > 0) content = string.Join("\n", lines);
|
||||
}
|
||||
else if (string.Equals(sectionName, "Skills", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
var items = SplitClassifierSkills(block);
|
||||
if (items.Count > 0) content = string.Join("\n", items);
|
||||
}
|
||||
else if (string.Equals(sectionName, "Professional Summary", StringComparison.OrdinalIgnoreCase) && classification?.Bullets is { Count: > 0 })
|
||||
{
|
||||
content = string.Join("\n", classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}"));
|
||||
}
|
||||
|
||||
return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
|
||||
results.Add(new ClassifiedCvBlock(index + 1, block, sectionName, content, classification));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent)
|
||||
|
||||
Reference in New Issue
Block a user