Improve CV rewrite flow and parser accuracy

This commit is contained in:
2026-04-01 11:30:37 +02:00
parent f402213526
commit f22c6791a7
9 changed files with 581 additions and 55 deletions
+139
View File
@@ -0,0 +1,139 @@
using System.Reflection;
using System.Text.Json;
using JobTrackerApi.Controllers;
using JobTrackerApi.Models;
using JobTrackerApi.Services;
using JobTrackerApi.Tests.TestSupport;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Identity;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Hosting;
using Moq;
using Xunit;
namespace JobTrackerApi.Tests;
/// <summary>
/// Opt-in harness that runs the CV text extraction and structuring code of
/// <see cref="ProfileCvController"/> over a local directory of CV files and writes a
/// JSON report of selected parsed fields to the system temp directory.
/// </summary>
/// <remarks>
/// The corpus directory defaults to <c>/home/pi/cvs</c> and can be overridden with the
/// <c>JOBTRACKER_CV_CORPUS_ROOT</c> environment variable. When the directory does not
/// exist, or contains no supported files, the test returns without asserting anything.
/// </remarks>
public sealed class CvCorpusHarnessTests
{
    /// <summary>Environment variable that overrides the corpus directory.</summary>
    private const string CorpusRootEnvironmentVariable = "JOBTRACKER_CV_CORPUS_ROOT";

    /// <summary>Corpus directory used when no override is configured.</summary>
    private const string DefaultCorpusRoot = "/home/pi/cvs";

    /// <summary>Upper bound on the number of corpus files processed per run.</summary>
    private const int MaxCorpusFiles = 8;

    /// <summary>File extensions (matched case-insensitively) that the harness picks up.</summary>
    private static readonly string[] SupportedExtensions = { ".pdf", ".docx", ".txt", ".md" };

    private static readonly string CorpusRoot = ResolveCorpusRoot();

    /// <summary>
    /// Extracts and structures up to <see cref="MaxCorpusFiles"/> CV files from the corpus
    /// directory, asserts that each yields non-blank text and a non-null structured profile,
    /// and writes the collected report as indented JSON to a timestamped temp file.
    /// </summary>
    [Fact]
    public async Task Local_cv_corpus_harness_produces_repeatable_parse_report_when_available()
    {
        if (!Directory.Exists(CorpusRoot))
        {
            return;
        }

        var files = Directory.EnumerateFiles(CorpusRoot, "*.*", SearchOption.TopDirectoryOnly)
            .Where(HasSupportedExtension)
            .OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
            .Take(MaxCorpusFiles)
            .ToList();
        if (files.Count == 0)
        {
            return;
        }

        var user = new ApplicationUser { Id = "user-1", ProfileCvText = "seed" };
        var userManager = TestHostFactory.CreateUserManager();
        userManager.Setup(x => x.GetUserAsync(It.IsAny<System.Security.Claims.ClaimsPrincipal>())).ReturnsAsync(user);
        userManager.Setup(x => x.UpdateAsync(It.IsAny<ApplicationUser>())).ReturnsAsync(IdentityResult.Success);

        // The structured-extraction prompt gets an empty reply and the PDF-reconstruction
        // prompt echoes its input text back unchanged.
        var aiService = new Mock<ISummarizerService>();
        aiService.Setup(x => x.SummarizeSectionAsync(It.Is<string>(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), It.IsAny<string>(), 3200, 900)).ReturnsAsync(string.Empty);
        aiService.Setup(x => x.SummarizeSectionAsync(It.Is<string>(instruction => instruction.Contains("Reconstruct this CV text extracted from a PDF", StringComparison.Ordinal)), It.IsAny<string>(), 2800, 900)).ReturnsAsync((string _, string text, int _, int __) => text);

        // The methods under test are non-public, so they are reached through reflection.
        // Resolve them before any temp directory is created so a lookup failure leaks nothing.
        var extractMethod = typeof(ProfileCvController).GetMethod("ExtractTextAsync", BindingFlags.NonPublic | BindingFlags.Static);
        var buildMethod = typeof(ProfileCvController).GetMethod("BuildStructuredCvAsync", BindingFlags.NonPublic | BindingFlags.Instance);
        Assert.NotNull(extractMethod);
        Assert.NotNull(buildMethod);

        var paths = CreatePaths(out var tempRoot);
        try
        {
            await using var db = TestHostFactory.CreateInMemoryDb();
            var controller = new ProfileCvController(userManager.Object, aiService.Object, db, paths, NoOpCvAiClassifier.Instance)
            {
                ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() }
            };

            var report = new List<object>();
            foreach (var path in files)
            {
                await using var stream = File.OpenRead(path);
                var fileName = Path.GetFileName(path);
                var formFile = new FormFile(stream, 0, stream.Length, "file", fileName)
                {
                    Headers = new HeaderDictionary(),
                    ContentType = GuessContentType(path)
                };
                var extension = Path.GetExtension(path);

                var extractTask = (Task<string>)extractMethod!.Invoke(null, new object[] { formFile, extension })!;
                var text = await extractTask;
                Assert.False(string.IsNullOrWhiteSpace(text));

                var buildTask = (Task<StructuredCvProfile>)buildMethod!.Invoke(controller, new object[] { text, CancellationToken.None })!;
                var structured = await buildTask;
                Assert.NotNull(structured);

                // Look up the leading entries once instead of re-querying per reported field.
                var leadingJob = structured.Jobs.FirstOrDefault();
                var leadingEducation = structured.Education.FirstOrDefault();

                report.Add(new
                {
                    file = fileName,
                    characters = text.Length,
                    contactLocation = structured.Contact.Location,
                    firstJob = leadingJob?.Title,
                    firstJobLocation = leadingJob?.Location,
                    firstEducation = leadingEducation?.Qualification,
                    firstEducationLocation = leadingEducation?.Location,
                    suspiciousLocations = structured.Jobs.Select(job => job.Location)
                        .Concat(structured.Education.Select(education => education.Location))
                        .Append(structured.Contact.Location)
                        .Where(value => !string.IsNullOrWhiteSpace(value))
                        .Where(LooksSuspiciousLocation)
                        .ToList()
                });
            }

            var reportPath = Path.Combine(Path.GetTempPath(), $"jobtracker-cv-corpus-{DateTime.UtcNow:yyyyMMddHHmmss}.json");
            await File.WriteAllTextAsync(reportPath, JsonSerializer.Serialize(report, new JsonSerializerOptions { WriteIndented = true }));
            Assert.NotEmpty(report);
        }
        finally
        {
            // The per-run data root is scratch space; remove it so repeated runs do not
            // accumulate directories under the system temp path. The report file lives
            // directly in the temp path and is intentionally kept.
            TryDeleteDirectory(tempRoot);
        }
    }

    /// <summary>
    /// Returns the corpus directory: the value of <c>JOBTRACKER_CV_CORPUS_ROOT</c> when it is
    /// set to a non-blank string, otherwise <see cref="DefaultCorpusRoot"/>.
    /// </summary>
    private static string ResolveCorpusRoot()
    {
        var configured = Environment.GetEnvironmentVariable(CorpusRootEnvironmentVariable);
        return string.IsNullOrWhiteSpace(configured) ? DefaultCorpusRoot : configured;
    }

    /// <summary>Returns true when <paramref name="path"/> ends with one of the supported extensions.</summary>
    private static bool HasSupportedExtension(string path)
    {
        return SupportedExtensions.Any(extension => path.EndsWith(extension, StringComparison.OrdinalIgnoreCase));
    }

    /// <summary>
    /// Flags a location value that contains "Python", "Ruby", the letter-spaced text
    /// " S A L E S " (all case-insensitive), or any digit. Blank values are never flagged.
    /// </summary>
    private static bool LooksSuspiciousLocation(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return false;
        }

        return value.Contains("Python", StringComparison.OrdinalIgnoreCase)
            || value.Contains("Ruby", StringComparison.OrdinalIgnoreCase)
            || value.Contains(" S A L E S ", StringComparison.OrdinalIgnoreCase)
            || value.Any(char.IsDigit);
    }

    /// <summary>
    /// Maps the file extension of <paramref name="path"/> to a MIME type; anything other than
    /// .pdf, .docx or .md is reported as <c>text/plain</c>.
    /// </summary>
    private static string GuessContentType(string path)
    {
        return Path.GetExtension(path).ToLowerInvariant() switch
        {
            ".pdf" => "application/pdf",
            ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".md" => "text/markdown",
            _ => "text/plain"
        };
    }

    /// <summary>
    /// Creates a fresh GUID-named directory under the system temp path and builds an
    /// <see cref="AppPaths"/> whose configuration and content root point at it.
    /// </summary>
    /// <param name="tempRoot">Receives the created directory so the caller can delete it afterwards.</param>
    private static AppPaths CreatePaths(out string tempRoot)
    {
        tempRoot = Path.Combine(Path.GetTempPath(), $"jobtracker-cv-corpus-{Guid.NewGuid():N}");
        Directory.CreateDirectory(tempRoot);

        var config = new ConfigurationBuilder()
            .AddInMemoryCollection(new Dictionary<string, string?>
            {
                ["Data:Root"] = tempRoot,
                ["Data:CvArtifactsRoot"] = Path.Combine(tempRoot, "CvArtifacts")
            })
            .Build();

        var env = new Mock<IHostEnvironment>();
        env.SetupGet(x => x.ContentRootPath).Returns(tempRoot);
        return new AppPaths(config, env.Object);
    }

    /// <summary>
    /// Best-effort recursive delete of <paramref name="directory"/>. I/O and permission
    /// failures are deliberately ignored so that cleanup never masks the test outcome.
    /// </summary>
    private static void TryDeleteDirectory(string directory)
    {
        try
        {
            if (Directory.Exists(directory))
            {
                Directory.Delete(directory, recursive: true);
            }
        }
        catch (IOException)
        {
            // Something still holds a file open; leave the directory for the OS temp cleanup.
        }
        catch (UnauthorizedAccessException)
        {
            // Not permitted to delete; nothing useful the test can do about it.
        }
    }
}
@@ -431,7 +431,7 @@ public sealed class ProfileCvControllerTests
{
"version": "1",
"contact": {
"location": "Tønsberg, Norway",
"location": "Python,Ruby",
"website": "https://cesnimda.co.uk/about",
"linkedin": "linkedin.com/in/demo-user?trk=foo"
},
@@ -456,9 +456,28 @@ public sealed class ProfileCvControllerTests
"isCurrent": false,
"bullets": ["Kept services running"],
"skills": []
},
{
"title": "Developer",
"company": "Demo Co",
"location": "Warwickshire College, UK S A L E S R E P R E S E N T A T I V E",
"start": "2021",
"end": "2022",
"isCurrent": false,
"bullets": ["Managed account handovers"],
"skills": []
}
],
"education": [
{
"qualification": "Warwickshire College",
"institution": "ICT Level 3",
"location": "Warwickshire College, UK S A L E S R E P R E S E N T A T I V E",
"start": "2012",
"end": "2015",
"details": []
}
],
"education": [],
"skills": [],
"languages": [],
"interests": [],
@@ -466,7 +485,7 @@ public sealed class ProfileCvControllerTests
}
""");
Assert.Equal("Tønsberg, Norway", structured.Contact.Location);
Assert.Null(structured.Contact.Location);
Assert.Equal("cesnimda.co.uk", structured.Contact.Website);
Assert.Equal("https://www.linkedin.com/in/demo-user", structured.Contact.LinkedIn);
Assert.Equal("Warwickshire, England, UK", structured.Jobs[0].Location);
@@ -475,6 +494,37 @@ public sealed class ProfileCvControllerTests
Assert.Null(structured.Jobs[1].Location);
Assert.Null(structured.Jobs[1].Start);
Assert.Null(structured.Jobs[1].End);
Assert.Equal("Warwickshire College, UK", structured.Jobs[2].Location);
Assert.Equal("ICT Level 3", structured.Education[0].Qualification);
Assert.Equal("Warwickshire College", structured.Education[0].Institution);
Assert.Equal("Warwickshire College, UK", structured.Education[0].Location);
}
/// <summary>
/// Sends a rewrite request built with "harvard" and the id of a stored job application (42)
/// and checks that the OK payload carries the summarizer's reply, "harvard" and "42".
/// The summarizer mock only answers when the instruction mentions both "Harvard template"
/// and the stored job title, and the text contains "Professional Summary".
/// </summary>
[Fact]
public async Task Rewrite_section_can_target_saved_job_context_and_whole_cv()
{
    // Arrange: the signed-in user and their stored CV text.
    var currentUser = new ApplicationUser
    {
        Id = "user-1",
        ProfileCvText = "Professional Summary\nBuilt backend systems."
    };
    var userManager = CreateUserManager();
    userManager
        .Setup(manager => manager.GetUserAsync(It.IsAny<ClaimsPrincipal>()))
        .ReturnsAsync(currentUser);

    // Arrange: summarizer that replies only to the expected instruction/text combination.
    var summarizer = new Mock<ISummarizerService>();
    summarizer
        .Setup(service => service.SummarizeSectionAsync(
            It.Is<string>(prompt =>
                prompt.Contains("Harvard template", StringComparison.Ordinal)
                && prompt.Contains("Senior Backend Engineer", StringComparison.Ordinal)),
            It.Is<string>(cvText => cvText.Contains("Professional Summary", StringComparison.Ordinal)),
            1800,
            400))
        .ReturnsAsync("Professional Summary\nSharper backend platform positioning.");

    // Arrange: one company and one job application owned by the same user.
    await using var db = CreateDb();
    var employer = new Company
    {
        Id = 7,
        Name = "Acme Systems",
        OwnerUserId = "user-1"
    };
    var savedJob = new JobApplication
    {
        Id = 42,
        JobTitle = "Senior Backend Engineer",
        Description = "Build API integrations and platform workflows.",
        OwnerUserId = "user-1",
        CompanyId = 7,
        Status = "Waiting",
        DateApplied = DateTime.UtcNow
    };
    db.Companies.Add(employer);
    db.JobApplications.Add(savedJob);
    await db.SaveChangesAsync();

    var controller = CreateController(userManager.Object, summarizer.Object, db, CreatePaths());

    // Act
    var response = await controller.RewriteSection(
        new ProfileCvController.RewriteSectionRequest(null, "harvard", null, 42, "harvard"));

    // Assert: inspect the serialized payload rather than a typed response object.
    var okResult = Assert.IsType<OkObjectResult>(response);
    var payload = JsonSerializer.Serialize(okResult.Value);
    Assert.Contains("Sharper backend platform positioning", payload);
    Assert.Contains("harvard", payload, StringComparison.OrdinalIgnoreCase);
    Assert.Contains("42", payload, StringComparison.OrdinalIgnoreCase);
}
[Fact]