Improve CV rewrite flow and parser accuracy
This commit is contained in:
@@ -0,0 +1,139 @@
|
||||
using System.Reflection;
|
||||
using System.Text.Json;
|
||||
using JobTrackerApi.Controllers;
|
||||
using JobTrackerApi.Models;
|
||||
using JobTrackerApi.Services;
|
||||
using JobTrackerApi.Tests.TestSupport;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.AspNetCore.Identity;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Moq;
|
||||
using Xunit;
|
||||
|
||||
namespace JobTrackerApi.Tests;
|
||||
|
||||
public sealed class CvCorpusHarnessTests
|
||||
{
|
||||
private static readonly string CorpusRoot = "/home/pi/cvs";
|
||||
|
||||
/// <summary>
/// Runs a small, deterministically chosen sample of CV files from a local corpus directory
/// through <c>ProfileCvController</c>'s private <c>ExtractTextAsync</c> and
/// <c>BuildStructuredCvAsync</c> methods (invoked via reflection) and writes a JSON summary
/// of the results to the system temp directory.
/// </summary>
/// <remarks>
/// The corpus directory defaults to <c>CorpusRoot</c> and can be overridden per machine with
/// the <c>JOBTRACKER_CV_CORPUS_ROOT</c> environment variable. When the directory does not
/// exist, or contains no .pdf/.docx/.txt/.md files, the test returns early and therefore
/// passes without checking anything.
/// </remarks>
[Fact]
public async Task Local_cv_corpus_harness_produces_repeatable_parse_report_when_available()
{
    // Upper bound on how many corpus files are parsed in a single run.
    const int MaxCorpusFiles = 8;

    // Optional per-machine override; an unset or blank value keeps the original default path.
    var corpusOverride = Environment.GetEnvironmentVariable("JOBTRACKER_CV_CORPUS_ROOT");
    var corpusRoot = string.IsNullOrWhiteSpace(corpusOverride) ? CorpusRoot : corpusOverride;

    // No corpus on this machine: nothing to verify.
    if (!Directory.Exists(corpusRoot))
    {
        return;
    }

    var files = Directory.EnumerateFiles(corpusRoot, "*.*", SearchOption.TopDirectoryOnly)
        .Where(path => path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase)
            || path.EndsWith(".docx", StringComparison.OrdinalIgnoreCase)
            || path.EndsWith(".txt", StringComparison.OrdinalIgnoreCase)
            || path.EndsWith(".md", StringComparison.OrdinalIgnoreCase))
        .OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
        // Tie-break names that differ only by case so the selected sample does not depend
        // on the order in which the file system happens to enumerate entries.
        .ThenBy(path => path, StringComparer.Ordinal)
        .Take(MaxCorpusFiles)
        .ToList();

    if (files.Count == 0)
    {
        return;
    }

    // Mocked identity layer: every principal resolves to the same seeded user and updates succeed.
    var user = new ApplicationUser { Id = "user-1", ProfileCvText = "seed" };
    var userManager = TestHostFactory.CreateUserManager();
    userManager.Setup(x => x.GetUserAsync(It.IsAny<System.Security.Claims.ClaimsPrincipal>())).ReturnsAsync(user);
    userManager.Setup(x => x.UpdateAsync(It.IsAny<ApplicationUser>())).ReturnsAsync(IdentityResult.Success);

    // Stubbed AI calls: the structured-JSON extraction prompt yields an empty string and the
    // PDF reconstruction prompt echoes its input text back unchanged.
    // NOTE(review): presumably this makes the controller fall back to its non-AI parsing path — confirm.
    var aiService = new Mock<ISummarizerService>();
    aiService.Setup(x => x.SummarizeSectionAsync(It.Is<string>(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), It.IsAny<string>(), 3200, 900)).ReturnsAsync(string.Empty);
    aiService.Setup(x => x.SummarizeSectionAsync(It.Is<string>(instruction => instruction.Contains("Reconstruct this CV text extracted from a PDF", StringComparison.Ordinal)), It.IsAny<string>(), 2800, 900)).ReturnsAsync((string _, string text, int _, int __) => text);

    await using var db = TestHostFactory.CreateInMemoryDb();
    var paths = CreatePaths();
    var controller = new ProfileCvController(userManager.Object, aiService.Object, db, paths, NoOpCvAiClassifier.Instance)
    {
        ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() }
    };

    // Both pipeline steps are private on the controller, so they are reached through reflection.
    var extractMethod = typeof(ProfileCvController).GetMethod("ExtractTextAsync", BindingFlags.NonPublic | BindingFlags.Static);
    var buildMethod = typeof(ProfileCvController).GetMethod("BuildStructuredCvAsync", BindingFlags.NonPublic | BindingFlags.Instance);
    Assert.NotNull(extractMethod);
    Assert.NotNull(buildMethod);

    var report = new List<object>();
    foreach (var path in files)
    {
        await using var stream = File.OpenRead(path);
        var fileName = Path.GetFileName(path);
        var formFile = new FormFile(stream, 0, stream.Length, "file", fileName)
        {
            Headers = new HeaderDictionary(),
            ContentType = GuessContentType(path)
        };

        var extension = Path.GetExtension(path);
        var extractTask = (Task<string>)extractMethod!.Invoke(null, new object[] { formFile, extension })!;
        var text = await extractTask;
        // Name the offending file so a failure can be traced back to a specific CV.
        Assert.False(string.IsNullOrWhiteSpace(text), $"No text was extracted from '{fileName}'.");

        var buildTask = (Task<StructuredCvProfile>)buildMethod!.Invoke(controller, new object[] { text, CancellationToken.None })!;
        var structured = await buildTask;
        Assert.NotNull(structured);

        // Look up the leading entries once instead of re-enumerating for every reported field.
        var leadingJob = structured.Jobs.FirstOrDefault();
        var leadingEducation = structured.Education.FirstOrDefault();

        report.Add(new
        {
            file = fileName,
            characters = text.Length,
            contactLocation = structured.Contact.Location,
            firstJob = leadingJob?.Title,
            firstJobLocation = leadingJob?.Location,
            firstEducation = leadingEducation?.Qualification,
            firstEducationLocation = leadingEducation?.Location,
            suspiciousLocations = structured.Jobs.Select(job => job.Location)
                .Concat(structured.Education.Select(education => education.Location))
                .Append(structured.Contact.Location)
                .Where(value => !string.IsNullOrWhiteSpace(value))
                .Where(LooksSuspiciousLocation)
                .ToList()
        });
    }

    // Format the timestamp with the invariant culture so the file name does not change
    // shape on machines whose current culture uses a non-Gregorian calendar.
    var timestamp = DateTime.UtcNow.ToString("yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture);
    var reportPath = Path.Combine(Path.GetTempPath(), $"jobtracker-cv-corpus-{timestamp}.json");
    await File.WriteAllTextAsync(reportPath, JsonSerializer.Serialize(report, new JsonSerializerOptions { WriteIndented = true }));

    // Every selected file must have contributed exactly one report entry.
    Assert.Equal(files.Count, report.Count);
}
|
||||
|
||||
/// <summary>
/// Heuristic check for location values that look wrong.
/// </summary>
/// <param name="value">Candidate location text; may be null.</param>
/// <returns>
/// <c>false</c> for null or whitespace-only input; otherwise <c>true</c> when the text contains
/// "Python", "Ruby" or " S A L E S " (compared case-insensitively) or any digit character.
/// </returns>
private static bool LooksSuspiciousLocation(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    // Substrings that should never show up inside a location.
    string[] markers = { "Python", "Ruby", " S A L E S " };
    foreach (var marker in markers)
    {
        if (value.Contains(marker, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    // Any digit at all also counts as suspicious.
    foreach (var character in value)
    {
        if (char.IsDigit(character))
        {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Maps a file path to a MIME content type based on its extension.
/// </summary>
/// <param name="path">File path or name whose extension is inspected.</param>
/// <returns>
/// The PDF, DOCX or Markdown content type for ".pdf", ".docx" and ".md" (matched after
/// invariant lower-casing); "text/plain" for every other extension, including none.
/// </returns>
private static string GuessContentType(string path)
{
    var normalizedExtension = Path.GetExtension(path).ToLowerInvariant();

    switch (normalizedExtension)
    {
        case ".pdf":
            return "application/pdf";
        case ".docx":
            return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        case ".md":
            return "text/markdown";
        default:
            // Covers ".txt" as well as anything unrecognised.
            return "text/plain";
    }
}
|
||||
|
||||
/// <summary>
/// Builds an <c>AppPaths</c> instance rooted in a freshly created, uniquely named directory
/// under the system temp path.
/// </summary>
/// <returns>
/// An <c>AppPaths</c> constructed from in-memory configuration ("Data:Root" and
/// "Data:CvArtifactsRoot") and a mocked host environment whose content root is the same directory.
/// </returns>
private static AppPaths CreatePaths()
{
    // Each call gets its own scratch directory. It is created here and not removed by this helper.
    var scratchRoot = Path.Combine(Path.GetTempPath(), $"jobtracker-cv-corpus-{Guid.NewGuid():N}");
    Directory.CreateDirectory(scratchRoot);

    var settings = new Dictionary<string, string?>
    {
        { "Data:Root", scratchRoot },
        { "Data:CvArtifactsRoot", Path.Combine(scratchRoot, "CvArtifacts") }
    };
    var configuration = new ConfigurationBuilder()
        .AddInMemoryCollection(settings)
        .Build();

    var hostEnvironment = new Mock<IHostEnvironment>();
    hostEnvironment.SetupGet(environment => environment.ContentRootPath).Returns(scratchRoot);

    return new AppPaths(configuration, hostEnvironment.Object);
}
|
||||
}
|
||||
Reference in New Issue
Block a user