331 lines
15 KiB
C#
331 lines
15 KiB
C#
using System.Reflection;
|
|
using System.Security.Cryptography;
|
|
using System.Text.Json;
|
|
using JobTrackerApi.Controllers;
|
|
using JobTrackerApi.Models;
|
|
using JobTrackerApi.Services;
|
|
using JobTrackerApi.Tests.TestSupport;
|
|
using Microsoft.AspNetCore.Http;
|
|
using Microsoft.AspNetCore.Identity;
|
|
using Microsoft.AspNetCore.Mvc;
|
|
using Microsoft.Extensions.Configuration;
|
|
using Microsoft.Extensions.Hosting;
|
|
using Moq;
|
|
using Xunit;
|
|
|
|
namespace JobTrackerApi.Tests;
|
|
|
|
public sealed class CvCorpusHarnessTests
|
|
{
|
|
// Local directory scanned (top level only) for sample CVs; the harness test returns early when it does not exist.
private static readonly string CorpusRoot = "/home/pi/cvs";
|
|
|
|
/// <summary>
/// Runs every supported CV found directly under <see cref="CorpusRoot"/> through the controller's
/// private <c>ExtractTextAsync</c> and <c>BuildStructuredCvAsync</c> steps (invoked via reflection),
/// writes one normalized JSON output per file plus an <c>index.json</c> / <c>report.md</c> summary,
/// and compares each output against an approved fixture when one exists.
/// The test returns without asserting anything when the corpus directory is missing or
/// contains no <c>.pdf</c>, <c>.docx</c>, <c>.txt</c> or <c>.md</c> files.
/// </summary>
[Fact]
public async Task Local_cv_corpus_harness_produces_repeatable_parse_report_when_available()
{
    // The corpus is a machine-local directory; without it there is nothing to benchmark.
    if (!Directory.Exists(CorpusRoot))
    {
        return;
    }

    string[] supportedSuffixes = { ".pdf", ".docx", ".txt", ".md" };
    var corpusFiles = Directory.EnumerateFiles(CorpusRoot, "*.*", SearchOption.TopDirectoryOnly)
        .Where(candidate => supportedSuffixes.Any(suffix => candidate.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        .OrderBy(candidate => candidate, StringComparer.OrdinalIgnoreCase)
        .ToList();

    if (corpusFiles.Count == 0)
    {
        return;
    }

    // Resolve and create the output layout: <root>/outputs, <root>/candidate-fixtures and the approved-fixtures directory.
    var outputRoot = ResolveOutputRoot();
    var outputsDir = Path.Combine(outputRoot, "outputs");
    var candidateFixturesDir = Path.Combine(outputRoot, "candidate-fixtures");
    var approvedFixturesDir = ResolveApprovedFixturesRoot(outputRoot);
    foreach (var directory in new[] { outputRoot, outputsDir, candidateFixturesDir, approvedFixturesDir })
    {
        Directory.CreateDirectory(directory);
    }

    // Identity is mocked: any principal resolves to the same seeded user and updates always succeed.
    var signedInUser = new ApplicationUser { Id = "user-1", ProfileCvText = "seed" };
    var userManager = TestHostFactory.CreateUserManager();
    userManager
        .Setup(manager => manager.GetUserAsync(It.IsAny<System.Security.Claims.ClaimsPrincipal>()))
        .ReturnsAsync(signedInUser);
    userManager
        .Setup(manager => manager.UpdateAsync(It.IsAny<ApplicationUser>()))
        .ReturnsAsync(IdentityResult.Success);

    // The summarizer is mocked: the structured-extraction prompt yields an empty string and the
    // PDF-reconstruction prompt echoes its input text back unchanged.
    var summarizer = new Mock<ISummarizerService>();
    summarizer
        .Setup(service => service.SummarizeSectionAsync(
            It.Is<string>(prompt => prompt.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)),
            It.IsAny<string>(),
            3200,
            900))
        .ReturnsAsync(string.Empty);
    summarizer
        .Setup(service => service.SummarizeSectionAsync(
            It.Is<string>(prompt => prompt.Contains("Reconstruct this CV text extracted from a PDF", StringComparison.Ordinal)),
            It.IsAny<string>(),
            2800,
            900))
        .ReturnsAsync((string _, string text, int _, int __) => text);

    await using var db = TestHostFactory.CreateInMemoryDb();
    var appPaths = CreatePaths(outputRoot);
    var controller = new ProfileCvController(userManager.Object, summarizer.Object, db, appPaths, null, NoOpCvAiClassifier.Instance)
    {
        ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() }
    };

    // Both pipeline steps are non-public, so they are located by name through reflection.
    var controllerType = typeof(ProfileCvController);
    var extractMethod = controllerType.GetMethod("ExtractTextAsync", BindingFlags.NonPublic | BindingFlags.Static);
    var buildMethod = controllerType.GetMethod("BuildStructuredCvAsync", BindingFlags.NonPublic | BindingFlags.Instance);
    Assert.NotNull(extractMethod);
    Assert.NotNull(buildMethod);

    var entries = new List<CvBenchmarkEntry>();
    foreach (var cvPath in corpusFiles)
    {
        // Wrap the file on disk as an uploaded form file, the input type the extractor is invoked with.
        await using var cvStream = File.OpenRead(cvPath);
        var fileName = Path.GetFileName(cvPath);
        var upload = new FormFile(cvStream, 0, cvStream.Length, "file", fileName)
        {
            Headers = new HeaderDictionary(),
            ContentType = GuessContentType(cvPath)
        };

        var extension = Path.GetExtension(cvPath);
        var extraction = (Task<string>)extractMethod!.Invoke(null, new object[] { upload, extension })!;
        var extractedText = await extraction;
        Assert.False(string.IsNullOrWhiteSpace(extractedText));

        var structuring = (Task<StructuredCvProfile>)buildMethod!.Invoke(controller, new object[] { extractedText, CancellationToken.None })!;
        var structured = StructuredCvProfileJson.Normalize(await structuring);
        Assert.NotNull(structured);

        // Always record the latest parser output for this file.
        var slug = Slugify(fileName);
        var normalizedJson = StructuredCvProfileJson.Serialize(structured);
        var prettyJson = PrettyJson(normalizedJson);
        var outputPath = Path.Combine(outputsDir, $"{slug}.json");
        await File.WriteAllTextAsync(outputPath, prettyJson);

        // Compare with the approved fixture if present; otherwise emit a candidate for review.
        var fixtureFileName = $"{slug}.json";
        var approvedPath = Path.Combine(approvedFixturesDir, fixtureFileName);
        var candidateFixturePath = Path.Combine(candidateFixturesDir, fixtureFileName);
        var approvedExists = File.Exists(approvedPath);
        string diffSummary;
        if (approvedExists)
        {
            var approvedJson = await File.ReadAllTextAsync(approvedPath);
            diffSummary = SummarizeDiff(approvedJson, normalizedJson);
        }
        else
        {
            await File.WriteAllTextAsync(candidateFixturePath, prettyJson);
            diffSummary = "No approved fixture yet — candidate fixture written.";
        }

        var firstJob = structured.Jobs.FirstOrDefault();
        var firstEducation = structured.Education.FirstOrDefault();

        var qualificationLevels = structured.Education
            .Select(education => education.QualificationLevel)
            .Where(level => !string.IsNullOrWhiteSpace(level))
            .Cast<string>()
            .ToList();

        // Every non-blank location (jobs, education, then contact) that the heuristic flags.
        var suspiciousLocations = structured.Jobs.Select(job => job.Location)
            .Concat(structured.Education.Select(education => education.Location))
            .Append(structured.Contact.Location)
            .Where(location => !string.IsNullOrWhiteSpace(location))
            .Cast<string>()
            .Where(LooksSuspiciousLocation)
            .ToList();

        var entry = new CvBenchmarkEntry(
            FileName: fileName,
            Slug: slug,
            Extension: extension,
            Characters: extractedText.Length,
            OutputPath: outputPath,
            ApprovedFixturePath: approvedExists ? approvedPath : null,
            CandidateFixturePath: approvedExists ? null : candidateFixturePath,
            ContactLocation: structured.Contact.Location,
            FirstJob: firstJob?.Title,
            FirstJobLocation: firstJob?.Location,
            FirstEducation: firstEducation?.Qualification,
            FirstEducationLocation: firstEducation?.Location,
            QualificationLevels: qualificationLevels,
            SuspiciousLocations: suspiciousLocations,
            CoverageScore: ComputeCoverageScore(structured),
            ConfidenceScore: ComputeConfidenceScore(structured),
            ConsistencyScore: ComputeConsistencyScore(structured),
            DiffSummary: diffSummary);
        entries.Add(entry);
    }

    var summary = new CvBenchmarkSummary(
        CorpusRoot: CorpusRoot,
        OutputRoot: outputRoot,
        GeneratedAtUtc: DateTimeOffset.UtcNow,
        TotalFiles: entries.Count,
        AverageCoverage: Math.Round(entries.Average(entry => entry.CoverageScore), 3),
        AverageConfidence: Math.Round(entries.Average(entry => entry.ConfidenceScore), 3),
        AverageConsistency: Math.Round(entries.Average(entry => entry.ConsistencyScore), 3),
        FilesWithSuspiciousLocations: entries.Count(entry => entry.SuspiciousLocations.Count > 0),
        MissingApprovedFixtures: entries.Count(entry => entry.ApprovedFixturePath is null),
        Entries: entries);

    // Persist the machine-readable index and the human-readable markdown report side by side.
    var indexJson = JsonSerializer.Serialize(summary, new JsonSerializerOptions { WriteIndented = true });
    await File.WriteAllTextAsync(Path.Combine(outputRoot, "index.json"), indexJson);
    await File.WriteAllTextAsync(Path.Combine(outputRoot, "report.md"), RenderMarkdownReport(summary));

    Assert.NotEmpty(entries);
}
|
|
|
|
/// <summary>Benchmark result for a single CV file from the corpus.</summary>
/// <param name="FileName">File name of the CV, without its directory.</param>
/// <param name="Slug">Slugified file name used to name the output and fixture files.</param>
/// <param name="Extension">File extension of the CV as returned by <c>Path.GetExtension</c>.</param>
/// <param name="Characters">Length of the extracted text.</param>
/// <param name="OutputPath">Path of the JSON output written for this file.</param>
/// <param name="ApprovedFixturePath">Path of the approved fixture, or <c>null</c> when none exists.</param>
/// <param name="CandidateFixturePath">Path of the candidate fixture written when no approved fixture exists; otherwise <c>null</c>.</param>
/// <param name="ContactLocation">Location from the parsed contact section.</param>
/// <param name="FirstJob">Title of the first parsed job, if any.</param>
/// <param name="FirstJobLocation">Location of the first parsed job, if any.</param>
/// <param name="FirstEducation">Qualification of the first parsed education entry, if any.</param>
/// <param name="FirstEducationLocation">Location of the first parsed education entry, if any.</param>
/// <param name="QualificationLevels">Non-blank qualification levels of the parsed education entries.</param>
/// <param name="SuspiciousLocations">Parsed locations flagged by <c>LooksSuspiciousLocation</c>.</param>
/// <param name="CoverageScore">Result of <c>ComputeCoverageScore</c> for this file.</param>
/// <param name="ConfidenceScore">Result of <c>ComputeConfidenceScore</c> for this file.</param>
/// <param name="ConsistencyScore">Result of <c>ComputeConsistencyScore</c> for this file.</param>
/// <param name="DiffSummary">Human-readable outcome of the fixture comparison.</param>
private sealed record CvBenchmarkEntry(
    // Identity of the source file.
    string FileName, string Slug, string Extension, int Characters,
    // Artifacts written or read for this file.
    string OutputPath, string? ApprovedFixturePath, string? CandidateFixturePath,
    // Spot-check values lifted from the structured profile.
    string? ContactLocation, string? FirstJob, string? FirstJobLocation,
    string? FirstEducation, string? FirstEducationLocation,
    List<string> QualificationLevels, List<string> SuspiciousLocations,
    // Scores and fixture comparison outcome.
    double CoverageScore, double ConfidenceScore, double ConsistencyScore,
    string? DiffSummary);
|
|
|
|
/// <summary>Aggregate result of one benchmark run; serialized to <c>index.json</c> and rendered into <c>report.md</c>.</summary>
/// <param name="CorpusRoot">Directory the CVs were read from.</param>
/// <param name="OutputRoot">Directory the run's artifacts were written to.</param>
/// <param name="GeneratedAtUtc">Timestamp taken when the summary was built.</param>
/// <param name="TotalFiles">Number of entries in the run.</param>
/// <param name="AverageCoverage">Mean coverage score across entries, rounded to three decimals by the caller.</param>
/// <param name="AverageConfidence">Mean confidence score across entries, rounded to three decimals by the caller.</param>
/// <param name="AverageConsistency">Mean consistency score across entries, rounded to three decimals by the caller.</param>
/// <param name="FilesWithSuspiciousLocations">Number of entries with at least one suspicious location.</param>
/// <param name="MissingApprovedFixtures">Number of entries that have no approved fixture.</param>
/// <param name="Entries">Per-file results.</param>
private sealed record CvBenchmarkSummary(
    // Where the run read from and wrote to, and when.
    string CorpusRoot, string OutputRoot, DateTimeOffset GeneratedAtUtc,
    // Aggregates over all entries.
    int TotalFiles, double AverageCoverage, double AverageConfidence, double AverageConsistency,
    int FilesWithSuspiciousLocations, int MissingApprovedFixtures,
    // The per-file detail behind the aggregates.
    List<CvBenchmarkEntry> Entries);
|
|
|
|
/// <summary>
/// Chooses the directory the benchmark writes its artifacts to.
/// </summary>
/// <returns>
/// The trimmed value of the <c>CV_BENCHMARK_OUTPUT_DIR</c> environment variable when it is set to a
/// non-blank value; otherwise a per-run directory under the system temp path named
/// <c>jobtracker-cv-benchmark/&lt;UTC timestamp as yyyyMMddHHmmss&gt;</c>.
/// </returns>
private static string ResolveOutputRoot()
{
    var configured = Environment.GetEnvironmentVariable("CV_BENCHMARK_OUTPUT_DIR");
    if (!string.IsNullOrWhiteSpace(configured))
    {
        return configured.Trim();
    }

    // Format with the invariant culture: the culture-sensitive overload uses the current culture's
    // calendar, so e.g. th-TH or ar-SA would produce a non-Gregorian year in the directory name.
    var runStamp = DateTime.UtcNow.ToString("yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture);
    return Path.Combine(Path.GetTempPath(), "jobtracker-cv-benchmark", runStamp);
}
|
|
|
|
/// <summary>
/// Chooses the directory that holds reviewed ("approved") fixtures.
/// </summary>
/// <param name="outputRoot">Benchmark output directory used for the fallback location.</param>
/// <returns>
/// The trimmed <c>CV_BENCHMARK_APPROVED_DIR</c> environment variable when it is non-blank;
/// otherwise <c>approved-fixtures</c> beneath <paramref name="outputRoot"/>.
/// </returns>
private static string ResolveApprovedFixturesRoot(string outputRoot)
{
    var overrideDir = Environment.GetEnvironmentVariable("CV_BENCHMARK_APPROVED_DIR");
    return string.IsNullOrWhiteSpace(overrideDir)
        ? Path.Combine(outputRoot, "approved-fixtures")
        : overrideDir.Trim();
}
|
|
|
|
/// <summary>Re-serializes a JSON document with indentation.</summary>
/// <param name="normalizedJson">JSON text to reformat; must be parseable by <see cref="JsonDocument"/>.</param>
/// <returns>The same JSON content written with <c>WriteIndented = true</c>.</returns>
private static string PrettyJson(string normalizedJson)
{
    var indentedOptions = new JsonSerializerOptions { WriteIndented = true };
    using (var parsed = JsonDocument.Parse(normalizedJson))
    {
        return JsonSerializer.Serialize(parsed.RootElement, indentedOptions);
    }
}
|
|
|
|
/// <summary>
/// Compares an approved fixture with the current parser output and describes the result.
/// </summary>
/// <remarks>
/// Both inputs are parsed and re-serialized compactly before comparison, so differences that are
/// purely whitespace/indentation (for example an indented fixture on disk versus compact output)
/// do not count as a mismatch. Property order is not normalized and therefore still matters.
/// The short hashes in the "differs" message are computed over the same compact form.
/// </remarks>
/// <param name="approvedJson">JSON text of the approved fixture.</param>
/// <param name="actualJson">JSON text produced by the current run.</param>
/// <returns>
/// <c>"Matches approved fixture."</c> when the compact forms are equal; otherwise a message
/// containing the first eight hex characters of each side's SHA-256 hash.
/// </returns>
private static string SummarizeDiff(string approvedJson, string actualJson)
{
    var approvedCanonical = Canonicalize(approvedJson);
    var actualCanonical = Canonicalize(actualJson);

    if (string.Equals(approvedCanonical, actualCanonical, StringComparison.Ordinal))
    {
        return "Matches approved fixture.";
    }

    var approvedHash = Hash(approvedCanonical);
    var actualHash = Hash(actualCanonical);
    return $"Fixture differs (approved {approvedHash[..8]}, actual {actualHash[..8]}).";

    // JsonElement.ToString() returns the raw source text for objects and arrays, so it cannot be
    // used for a formatting-insensitive comparison; re-serializing drops insignificant whitespace.
    // The document is disposed as soon as the compact text has been produced.
    static string Canonicalize(string json)
    {
        using var document = JsonDocument.Parse(json);
        return JsonSerializer.Serialize(document.RootElement);
    }
}

/// <summary>Returns the lowercase hexadecimal SHA-256 hash of the UTF-8 bytes of <paramref name="value"/>.</summary>
private static string Hash(string value) => Convert.ToHexString(SHA256.HashData(System.Text.Encoding.UTF8.GetBytes(value))).ToLowerInvariant();
|
|
|
|
/// <summary>
/// Scores how many of eight expected parts of a structured CV are populated.
/// </summary>
/// <param name="structured">Structured profile to inspect.</param>
/// <returns>The fraction (0 to 1) of checks that passed.</returns>
private static double ComputeCoverageScore(StructuredCvProfile structured)
{
    var contact = structured.Contact;

    // Certifications, projects and other sections are pooled into a single check.
    var hasSupplementarySection = structured.Certifications.Count > 0
        || structured.Projects.Count > 0
        || structured.OtherSections.Count > 0;

    bool[] checks =
    {
        !string.IsNullOrWhiteSpace(contact.FullName),
        !string.IsNullOrWhiteSpace(contact.Email),
        !string.IsNullOrWhiteSpace(contact.Location),
        structured.Summary.Count > 0,
        structured.Skills.Count > 0,
        structured.Jobs.Count > 0,
        structured.Education.Count > 0,
        hasSupplementarySection,
    };

    var passed = checks.Count(check => check);
    return (double)passed / checks.Length;
}
|
|
|
|
/// <summary>
/// Averages the per-field confidence values recorded in the profile's metadata.
/// </summary>
/// <param name="structured">Structured profile whose metadata is inspected.</param>
/// <returns>
/// The mean of all present confidence values clamped to the range 0 to 1, or 0.55 when no field
/// carries a confidence value.
/// </returns>
private static double ComputeConfidenceScore(StructuredCvProfile structured)
{
    var knownConfidences =
        (from field in structured.Metadata.Fields.Values
         let confidence = field.Confidence
         where confidence.HasValue
         select confidence.Value).ToList();

    if (knownConfidences.Count == 0)
    {
        // Fallback used when the metadata provides nothing to average.
        return 0.55;
    }

    return Math.Clamp(knownConfidences.Average(), 0, 1);
}
|
|
|
|
/// <summary>
/// Scores a structured CV by deducting 0.12 for every detected inconsistency.
/// </summary>
/// <remarks>
/// One penalty is counted per job, education entry or contact whose location looks suspicious,
/// and one per education entry that has a qualification but no qualification level.
/// </remarks>
/// <param name="structured">Structured profile to inspect.</param>
/// <returns>A value between 0 and 1; 1 means no penalties.</returns>
private static double ComputeConsistencyScore(StructuredCvProfile structured)
{
    var suspiciousJobLocations = structured.Jobs.Count(job => LooksSuspiciousLocation(job.Location));
    var suspiciousEducationLocations = structured.Education.Count(education => LooksSuspiciousLocation(education.Location));
    var suspiciousContactLocation = LooksSuspiciousLocation(structured.Contact.Location) ? 1 : 0;
    var unclassifiedQualifications = structured.Education.Count(education =>
        !string.IsNullOrWhiteSpace(education.Qualification) && string.IsNullOrWhiteSpace(education.QualificationLevel));

    var totalPenalties = suspiciousJobLocations
        + suspiciousEducationLocations
        + suspiciousContactLocation
        + unclassifiedQualifications;

    // Floor at zero so a heavily penalized profile never scores negative.
    return Math.Max(0, 1 - (totalPenalties * 0.12));
}
|
|
|
|
/// <summary>
/// Renders the benchmark summary as a markdown document: a bullet list of run-level figures,
/// a table with one row per entry, and a fixed "Notes" section.
/// </summary>
/// <param name="summary">Summary to render.</param>
/// <returns>The markdown text, with lines joined by <see cref="Environment.NewLine"/> and no trailing newline.</returns>
private static string RenderMarkdownReport(CvBenchmarkSummary summary)
{
    // Title, run-level figures and the table header.
    string[] heading =
    {
        "# CV benchmark report",
        string.Empty,
        $"- Generated: {summary.GeneratedAtUtc:O}",
        $"- Corpus root: `{summary.CorpusRoot}`",
        $"- Output root: `{summary.OutputRoot}`",
        $"- Files: {summary.TotalFiles}",
        $"- Average coverage: {summary.AverageCoverage:P0}",
        $"- Average confidence: {summary.AverageConfidence:P0}",
        $"- Average consistency: {summary.AverageConsistency:P0}",
        $"- Files with suspicious locations: {summary.FilesWithSuspiciousLocations}",
        $"- Missing approved fixtures: {summary.MissingApprovedFixtures}",
        string.Empty,
        "| File | Coverage | Confidence | Consistency | Suspicious locations | Fixture |",
        "|---|---:|---:|---:|---:|---|",
    };

    // One table row per benchmarked file.
    var tableRows = summary.Entries.Select(row =>
        $"| {row.FileName} | {row.CoverageScore:P0} | {row.ConfidenceScore:P0} | {row.ConsistencyScore:P0} | {row.SuspiciousLocations.Count} | {row.DiffSummary} |");

    // Static guidance on how to read the output directory.
    string[] notes =
    {
        string.Empty,
        "## Notes",
        "- `outputs/*.json` contains the latest normalized parser output for each CV.",
        "- `candidate-fixtures/*.json` is created when no approved fixture exists yet.",
        "- To build a regression baseline, review a candidate fixture and copy it into the approved-fixtures directory used by the runner.",
    };

    return string.Join(Environment.NewLine, heading.Concat(tableRows).Concat(notes));
}
|
|
|
|
/// <summary>
/// Turns arbitrary text into a lowercase, dash-separated slug.
/// </summary>
/// <param name="value">Text to convert; <c>null</c> is treated as empty.</param>
/// <returns>
/// The lower-cased input with every character that is not a letter or digit acting as a separator,
/// runs of separators collapsed to a single <c>-</c>, and no leading or trailing <c>-</c>.
/// </returns>
private static string Slugify(string value)
{
    var lowered = (value ?? string.Empty).ToLowerInvariant();

    var mapped = new char[lowered.Length];
    for (var index = 0; index < lowered.Length; index++)
    {
        var current = lowered[index];
        mapped[index] = char.IsLetterOrDigit(current) ? current : '-';
    }

    // Dropping empty segments collapses repeated dashes and trims them from both ends in one step.
    var segments = new string(mapped).Split('-', StringSplitOptions.RemoveEmptyEntries);
    return string.Join("-", segments);
}
|
|
|
|
/// <summary>
/// Heuristic check for a "location" value that probably is not a location.
/// </summary>
/// <param name="value">Parsed location text; may be <c>null</c>.</param>
/// <returns>
/// <c>false</c> for null or blank input; otherwise <c>true</c> when the text contains "Python",
/// "Ruby" or " S A L E S " (case-insensitive) or any digit.
/// </returns>
private static bool LooksSuspiciousLocation(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    string[] nonLocationMarkers = { "Python", "Ruby", " S A L E S " };
    foreach (var marker in nonLocationMarkers)
    {
        if (value.Contains(marker, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    foreach (var character in value)
    {
        if (char.IsDigit(character))
        {
            return true;
        }
    }

    return false;
}
|
|
|
|
/// <summary>Maps a file path to a MIME content type based on its extension.</summary>
/// <param name="path">Path or file name whose extension is inspected (case-insensitively).</param>
/// <returns>
/// The content type for <c>.pdf</c>, <c>.docx</c> or <c>.md</c>; <c>text/plain</c> for anything else.
/// </returns>
private static string GuessContentType(string path)
{
    var loweredExtension = Path.GetExtension(path).ToLowerInvariant();
    switch (loweredExtension)
    {
        case ".pdf":
            return "application/pdf";
        case ".docx":
            return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        case ".md":
            return "text/markdown";
        default:
            return "text/plain";
    }
}
|
|
|
|
/// <summary>
/// Builds an <see cref="AppPaths"/> instance backed by in-memory configuration and a mocked host environment.
/// </summary>
/// <remarks>
/// A fresh, uniquely named directory is created under the system temp path and used as
/// <c>Data:Root</c>, as the parent of <c>Data:CvArtifactsRoot</c>, and as the content root.
/// </remarks>
/// <param name="outputRoot">Directory supplied as <c>Data:CvBenchmarksRoot</c>.</param>
/// <returns>The configured <see cref="AppPaths"/>.</returns>
private static AppPaths CreatePaths(string outputRoot)
{
    var scratchRoot = Path.Combine(Path.GetTempPath(), $"jobtracker-cv-corpus-{Guid.NewGuid():N}");
    Directory.CreateDirectory(scratchRoot);

    var settings = new Dictionary<string, string?>
    {
        ["Data:Root"] = scratchRoot,
        ["Data:CvArtifactsRoot"] = Path.Combine(scratchRoot, "CvArtifacts"),
        ["Data:CvBenchmarksRoot"] = outputRoot,
    };
    var configuration = new ConfigurationBuilder()
        .AddInMemoryCollection(settings)
        .Build();

    var hostEnvironment = new Mock<IHostEnvironment>();
    hostEnvironment.SetupGet(host => host.ContentRootPath).Returns(scratchRoot);

    return new AppPaths(configuration, hostEnvironment.Object);
}
|
|
}
|