using System.IO.Enumeration;
using System.Reflection;
using System.Security.Cryptography;
using System.Text.Json;
using JobTrackerApi.Controllers;
using JobTrackerApi.Models;
using JobTrackerApi.Services;
using JobTrackerApi.Tests.TestSupport;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Identity;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Hosting;
using Moq;
using Xunit;

namespace JobTrackerApi.Tests;

/// <summary>
/// Opt-in harness that runs every CV found in a local corpus directory through the private
/// extraction / reconstruction / structuring steps of <see cref="ProfileCvController"/> and
/// writes per-file JSON outputs plus an aggregate <c>index.json</c> and <c>report.md</c>.
/// </summary>
/// <remarks>
/// Environment variables read by this harness:
/// <c>CV_BENCHMARK_OUTPUT_DIR</c>, <c>CV_BENCHMARK_APPROVED_DIR</c>, <c>CV_BENCHMARK_IGNORE</c>
/// and <c>CV_AI_BASE_URL</c>.
/// </remarks>
// NOTE(review): in this copy of the file several generic type arguments appear to be missing
// (for example on Mock, It.IsAny / It.Is, List, Dictionary, Cast, GetRequiredService and the
// Task casts on the reflected calls). They are left exactly as found here because the intended
// project types cannot be determined from this file alone - confirm against the repository.
public sealed class CvCorpusHarnessTests
{
    // Local corpus location; the test is a silent no-op when this directory does not exist.
    private static readonly string CorpusRoot = "/home/pi/cvs";

    /// <summary>
    /// Parses every supported file (.pdf, .docx, .txt, .md) directly under the corpus root and
    /// records scores and fixture comparisons. Returns without asserting anything when the
    /// corpus directory is missing or contains no eligible files.
    /// </summary>
    [Fact]
    public async Task Local_cv_corpus_harness_produces_repeatable_parse_report_when_available()
    {
        if (!Directory.Exists(CorpusRoot))
        {
            return;
        }

        var ignoredPatterns = ResolveIgnoredPatterns();
        var files = Directory.EnumerateFiles(CorpusRoot, "*.*", SearchOption.TopDirectoryOnly)
            .Where(path => path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase)
                || path.EndsWith(".docx", StringComparison.OrdinalIgnoreCase)
                || path.EndsWith(".txt", StringComparison.OrdinalIgnoreCase)
                || path.EndsWith(".md", StringComparison.OrdinalIgnoreCase))
            .Where(path => !IsIgnoredFile(path, ignoredPatterns))
            .OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
            .ToList();
        if (files.Count == 0)
        {
            return;
        }

        var outputRoot = ResolveOutputRoot();
        var outputsDir = Path.Combine(outputRoot, "outputs");
        var candidateFixturesDir = Path.Combine(outputRoot, "candidate-fixtures");
        var approvedFixturesDir = ResolveApprovedFixturesRoot(outputRoot);
        Directory.CreateDirectory(outputRoot);
        Directory.CreateDirectory(outputsDir);
        Directory.CreateDirectory(candidateFixturesDir);
        Directory.CreateDirectory(approvedFixturesDir);

        var user = new ApplicationUser { Id = "user-1", ProfileCvText = "seed" };
        var userManager = TestHostFactory.CreateUserManager();
        userManager.Setup(x => x.GetUserAsync(It.IsAny())).ReturnsAsync(user);
        userManager.Setup(x => x.UpdateAsync(It.IsAny())).ReturnsAsync(IdentityResult.Success);

        // The mocked AI service returns an empty string for the "extract to structured JSON"
        // instruction and echoes the input text back for the "reconstruct PDF text" instruction.
        var aiService = new Mock();
        aiService.Setup(x => x.SummarizeSectionAsync(It.Is(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), It.IsAny(), 3200, 900)).ReturnsAsync(string.Empty);
        aiService.Setup(x => x.SummarizeSectionAsync(It.Is(instruction => instruction.Contains("Reconstruct this CV text extracted from a PDF", StringComparison.Ordinal)), It.IsAny(), 2800, 900)).ReturnsAsync((string _, string text, int _, int __) => text);

        var cvAiNormalizer = CreateCvAiNormalizerFromEnvironment();
        await using var db = TestHostFactory.CreateInMemoryDb();
        var paths = CreatePaths(outputRoot);
        var controller = new ProfileCvController(userManager.Object, aiService.Object, db, paths, null, NoOpCvAiClassifier.Instance, cvAiNormalizer)
        {
            ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() }
        };

        // The pipeline steps are non-public members of the controller, so they are reached via reflection.
        var extractMethod = typeof(ProfileCvController).GetMethod("ExtractTextAsync", BindingFlags.NonPublic | BindingFlags.Static);
        var reconstructMethod = typeof(ProfileCvController).GetMethod("MaybeReconstructStructuredCvAsync", BindingFlags.NonPublic | BindingFlags.Instance);
        var buildMethod = typeof(ProfileCvController).GetMethod("BuildStructuredCvAsync", BindingFlags.NonPublic | BindingFlags.Instance);
        Assert.NotNull(extractMethod);
        Assert.NotNull(reconstructMethod);
        Assert.NotNull(buildMethod);

        var entries = new List();
        foreach (var path in files)
        {
            await using var stream = File.OpenRead(path);
            var fileName = Path.GetFileName(path);
            var formFile = new FormFile(stream, 0, stream.Length, "file", fileName)
            {
                Headers = new HeaderDictionary(),
                ContentType = GuessContentType(path)
            };
            var extension = Path.GetExtension(path);

            var extractTask = (Task)extractMethod!.Invoke(null, new object[] { formFile, extension })!;
            var text = await extractTask;
            Assert.False(string.IsNullOrWhiteSpace(text));

            var reconstructTask = (Task)reconstructMethod!.Invoke(controller, new object[] { text, CancellationToken.None })!;
            var normalizedText = await reconstructTask;
            Assert.False(string.IsNullOrWhiteSpace(normalizedText));

            var buildTask = (Task)buildMethod!.Invoke(controller, new object[] { normalizedText, CancellationToken.None })!;
            var structured = StructuredCvProfileJson.Normalize(await buildTask);
            Assert.NotNull(structured);

            var slug = Slugify(fileName);
            var normalizedJson = StructuredCvProfileJson.Serialize(structured);
            var outputPath = Path.Combine(outputsDir, $"{slug}.json");
            await File.WriteAllTextAsync(outputPath, PrettyJson(normalizedJson));

            var approvedPath = Path.Combine(approvedFixturesDir, $"{slug}.json");
            var candidateFixturePath = Path.Combine(candidateFixturesDir, $"{slug}.json");
            string? diffSummary = null;
            var approvedExists = File.Exists(approvedPath);
            if (approvedExists)
            {
                var approvedJson = await File.ReadAllTextAsync(approvedPath);
                diffSummary = SummarizeDiff(approvedJson, normalizedJson);
            }
            else
            {
                // No baseline yet: emit a candidate the reviewer can promote to an approved fixture.
                await File.WriteAllTextAsync(candidateFixturePath, PrettyJson(normalizedJson));
                diffSummary = "No approved fixture yet — candidate fixture written.";
            }

            entries.Add(new CvBenchmarkEntry(
                FileName: fileName,
                Slug: slug,
                Extension: extension,
                Characters: text.Length,
                OutputPath: outputPath,
                ApprovedFixturePath: approvedExists ? approvedPath : null,
                CandidateFixturePath: approvedExists ? null : candidateFixturePath,
                ContactLocation: structured.Contact.Location,
                FirstJob: structured.Jobs.FirstOrDefault()?.Title,
                FirstJobLocation: structured.Jobs.FirstOrDefault()?.Location,
                FirstEducation: structured.Education.FirstOrDefault()?.Qualification,
                FirstEducationLocation: structured.Education.FirstOrDefault()?.Location,
                QualificationLevels: structured.Education.Select(x => x.QualificationLevel).Where(x => !string.IsNullOrWhiteSpace(x)).Cast().ToList(),
                SuspiciousLocations: structured.Jobs.Select(job => job.Location)
                    .Concat(structured.Education.Select(education => education.Location))
                    .Append(structured.Contact.Location)
                    .Where(value => !string.IsNullOrWhiteSpace(value))
                    .Cast()
                    .Where(LooksSuspiciousLocation)
                    .ToList(),
                CoverageScore: ComputeCoverageScore(structured),
                ConfidenceScore: ComputeConfidenceScore(structured),
                ConsistencyScore: ComputeConsistencyScore(structured),
                DiffSummary: diffSummary
            ));
        }

        // files.Count > 0 was checked above, so the Average calls below never see an empty list.
        var summary = new CvBenchmarkSummary(
            CorpusRoot,
            outputRoot,
            DateTimeOffset.UtcNow,
            entries.Count,
            Math.Round(entries.Average(x => x.CoverageScore), 3),
            Math.Round(entries.Average(x => x.ConfidenceScore), 3),
            Math.Round(entries.Average(x => x.ConsistencyScore), 3),
            entries.Count(x => x.SuspiciousLocations.Count > 0),
            entries.Count(x => x.ApprovedFixturePath is null),
            entries
        );

        var indexPath = Path.Combine(outputRoot, "index.json");
        var reportPath = Path.Combine(outputRoot, "report.md");
        await File.WriteAllTextAsync(indexPath, JsonSerializer.Serialize(summary, new JsonSerializerOptions { WriteIndented = true }));
        await File.WriteAllTextAsync(reportPath, RenderMarkdownReport(summary));
        Assert.True(entries.Count > 0);
    }

    /// <summary>Per-file result row written to <c>index.json</c> and rendered in <c>report.md</c>.</summary>
    private sealed record CvBenchmarkEntry(
        string FileName,
        string Slug,
        string Extension,
        int Characters,
        string OutputPath,
        string? ApprovedFixturePath,
        string? CandidateFixturePath,
        string? ContactLocation,
        string? FirstJob,
        string? FirstJobLocation,
        string? FirstEducation,
        string? FirstEducationLocation,
        List QualificationLevels,
        List SuspiciousLocations,
        double CoverageScore,
        double ConfidenceScore,
        double ConsistencyScore,
        string? DiffSummary);

    /// <summary>Aggregate of one harness run: averages, counts and the individual entries.</summary>
    private sealed record CvBenchmarkSummary(
        string CorpusRoot,
        string OutputRoot,
        DateTimeOffset GeneratedAtUtc,
        int TotalFiles,
        double AverageCoverage,
        double AverageConfidence,
        double AverageConsistency,
        int FilesWithSuspiciousLocations,
        int MissingApprovedFixtures,
        List Entries);

    /// <summary>
    /// Returns the trimmed <c>CV_BENCHMARK_OUTPUT_DIR</c> value when set; otherwise a
    /// UTC-timestamped folder under the system temp path.
    /// </summary>
    private static string ResolveOutputRoot()
    {
        var configured = Environment.GetEnvironmentVariable("CV_BENCHMARK_OUTPUT_DIR");
        if (!string.IsNullOrWhiteSpace(configured))
        {
            return configured.Trim();
        }

        return Path.Combine(Path.GetTempPath(), "jobtracker-cv-benchmark", DateTime.UtcNow.ToString("yyyyMMddHHmmss"));
    }

    /// <summary>
    /// Returns the trimmed <c>CV_BENCHMARK_APPROVED_DIR</c> value when set; otherwise
    /// <c>approved-fixtures</c> under <paramref name="outputRoot"/>.
    /// </summary>
    private static string ResolveApprovedFixturesRoot(string outputRoot)
    {
        var configured = Environment.GetEnvironmentVariable("CV_BENCHMARK_APPROVED_DIR");
        if (!string.IsNullOrWhiteSpace(configured))
        {
            return configured.Trim();
        }

        return Path.Combine(outputRoot, "approved-fixtures");
    }

    /// <summary>
    /// Reads <c>CV_BENCHMARK_IGNORE</c> as a comma-separated list of file-name patterns.
    /// Returns an empty list when the variable is unset or blank.
    /// </summary>
    private static List ResolveIgnoredPatterns()
    {
        var configured = Environment.GetEnvironmentVariable("CV_BENCHMARK_IGNORE");
        if (string.IsNullOrWhiteSpace(configured))
        {
            return new List();
        }

        // RemoveEmptyEntries combined with TrimEntries already drops whitespace-only segments,
        // so no further blank filtering is required.
        return configured
            .Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            .ToList();
    }

    /// <summary>
    /// Returns true when the file name of <paramref name="path"/> matches any of the supplied
    /// simple wildcard patterns (case-insensitive).
    /// </summary>
    private static bool IsIgnoredFile(string path, List ignoredPatterns)
    {
        if (ignoredPatterns.Count == 0)
        {
            return false;
        }

        var fileName = Path.GetFileName(path);
        foreach (var pattern in ignoredPatterns)
        {
            if (FileSystemName.MatchesSimpleExpression(pattern, fileName, ignoreCase: true))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>Re-serializes <paramref name="normalizedJson"/> with indentation.</summary>
    private static string PrettyJson(string normalizedJson)
    {
        using var doc = JsonDocument.Parse(normalizedJson);
        return JsonSerializer.Serialize(doc.RootElement, new JsonSerializerOptions { WriteIndented = true });
    }

    /// <summary>
    /// Compares an approved fixture with the current parser output and returns a one-line
    /// description: either a match message or the first eight hex characters of each side's hash.
    /// </summary>
    /// <param name="approvedJson">JSON text read from the approved fixture file.</param>
    /// <param name="actualJson">JSON text produced for the current run.</param>
    /// <exception cref="JsonException">Either argument is not valid JSON.</exception>
    private static string SummarizeDiff(string approvedJson, string actualJson)
    {
        // Compare canonical (compact) forms rather than raw text: candidate fixtures are written
        // indented via PrettyJson, so a raw-text comparison would report a difference that is
        // only whitespace. Property order is preserved, so reordering still counts as a change.
        var approvedCanonical = CanonicalJson(approvedJson);
        var actualCanonical = CanonicalJson(actualJson);
        if (string.Equals(approvedCanonical, actualCanonical, StringComparison.Ordinal))
        {
            return "Matches approved fixture.";
        }

        // Hash the canonical forms so the reported digests are likewise formatting-independent.
        var approvedHash = Hash(approvedCanonical);
        var actualHash = Hash(actualCanonical);
        return $"Fixture differs (approved {approvedHash[..8]}, actual {actualHash[..8]}).";
    }

    /// <summary>
    /// Parses <paramref name="json"/> and re-serializes it without indentation, disposing the
    /// intermediate document.
    /// </summary>
    private static string CanonicalJson(string json)
    {
        using var doc = JsonDocument.Parse(json);
        return JsonSerializer.Serialize(doc.RootElement);
    }

    /// <summary>Returns the lowercase hex SHA-256 digest of the UTF-8 bytes of <paramref name="value"/>.</summary>
    private static string Hash(string value) =>
        Convert.ToHexString(SHA256.HashData(System.Text.Encoding.UTF8.GetBytes(value))).ToLowerInvariant();

    /// <summary>
    /// Fraction (0..1) of eight presence signals that are satisfied: full name, email, location,
    /// summary, skills, jobs, education, and any of certifications / projects / other sections.
    /// </summary>
    private static double ComputeCoverageScore(StructuredCvProfile structured)
    {
        var signals = new[]
        {
            !string.IsNullOrWhiteSpace(structured.Contact.FullName),
            !string.IsNullOrWhiteSpace(structured.Contact.Email),
            !string.IsNullOrWhiteSpace(structured.Contact.Location),
            structured.Summary.Count > 0,
            structured.Skills.Count > 0,
            structured.Jobs.Count > 0,
            structured.Education.Count > 0,
            structured.Certifications.Count > 0 || structured.Projects.Count > 0 || structured.OtherSections.Count > 0,
        };
        return signals.Count(x => x) / (double)signals.Length;
    }

    /// <summary>
    /// Average of the field confidences present in the profile metadata, clamped to 0..1.
    /// Falls back to 0.55 when no field carries a confidence value.
    /// </summary>
    private static double ComputeConfidenceScore(StructuredCvProfile structured)
    {
        var confidences = structured.Metadata.Fields.Values
            .Select(x => x.Confidence)
            .Where(x => x.HasValue)
            .Select(x => x!.Value)
            .ToList();
        return confidences.Count == 0 ? 0.55 : Math.Clamp(confidences.Average(), 0, 1);
    }

    /// <summary>
    /// Starts at 1 and subtracts 0.12 per penalty, never going below 0. A penalty is counted for
    /// each suspicious job, education or contact location, and for each education entry that has
    /// a qualification but no qualification level.
    /// </summary>
    private static double ComputeConsistencyScore(StructuredCvProfile structured)
    {
        var penalties = 0;
        penalties += structured.Jobs.Count(job => LooksSuspiciousLocation(job.Location));
        penalties += structured.Education.Count(education => LooksSuspiciousLocation(education.Location));
        penalties += LooksSuspiciousLocation(structured.Contact.Location) ? 1 : 0;
        penalties += structured.Education.Count(education => string.IsNullOrWhiteSpace(education.QualificationLevel) && !string.IsNullOrWhiteSpace(education.Qualification));
        return Math.Max(0, 1 - (penalties * 0.12));
    }

    /// <summary>Renders the run summary as a Markdown document with one table row per file.</summary>
    private static string RenderMarkdownReport(CvBenchmarkSummary summary)
    {
        // A literal '|' inside a cell would otherwise be read as a column separator.
        static string EscapeCell(string? value) =>
            (value ?? string.Empty).Replace("|", "\\|", StringComparison.Ordinal);

        var lines = new List
        {
            "# CV benchmark report",
            string.Empty,
            $"- Generated: {summary.GeneratedAtUtc:O}",
            $"- Corpus root: `{summary.CorpusRoot}`",
            $"- Output root: `{summary.OutputRoot}`",
            $"- Files: {summary.TotalFiles}",
            $"- Average coverage: {summary.AverageCoverage:P0}",
            $"- Average confidence: {summary.AverageConfidence:P0}",
            $"- Average consistency: {summary.AverageConsistency:P0}",
            $"- Files with suspicious locations: {summary.FilesWithSuspiciousLocations}",
            $"- Missing approved fixtures: {summary.MissingApprovedFixtures}",
            string.Empty,
            "| File | Coverage | Confidence | Consistency | Suspicious locations | Fixture |",
            "|---|---:|---:|---:|---:|---|",
        };
        lines.AddRange(summary.Entries.Select(entry =>
            $"| {EscapeCell(entry.FileName)} | {entry.CoverageScore:P0} | {entry.ConfidenceScore:P0} | {entry.ConsistencyScore:P0} | {entry.SuspiciousLocations.Count} | {EscapeCell(entry.DiffSummary)} |"));
        lines.Add(string.Empty);
        lines.Add("## Notes");
        lines.Add("- `outputs/*.json` contains the latest normalized parser output for each CV.");
        lines.Add("- `candidate-fixtures/*.json` is created when no approved fixture exists yet.");
        lines.Add("- To build a regression baseline, review a candidate fixture and copy it into the approved-fixtures directory used by the runner.");
        return string.Join(Environment.NewLine, lines);
    }

    /// <summary>
    /// Lower-cases <paramref name="value"/>, replaces every character that is not a letter or
    /// digit with '-', collapses runs of '-' and trims '-' from both ends.
    /// </summary>
    private static string Slugify(string value)
    {
        var cleaned = new string((value ?? string.Empty).ToLowerInvariant().Select(ch => char.IsLetterOrDigit(ch) ? ch : '-').ToArray());
        while (cleaned.Contains("--", StringComparison.Ordinal))
        {
            cleaned = cleaned.Replace("--", "-", StringComparison.Ordinal);
        }

        return cleaned.Trim('-');
    }

    /// <summary>
    /// Heuristic: a non-blank location is suspicious when it contains "Python", "Ruby" or
    /// " S A L E S " (case-insensitive), or any digit.
    /// </summary>
    private static bool LooksSuspiciousLocation(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return false;
        }

        return value.Contains("Python", StringComparison.OrdinalIgnoreCase)
            || value.Contains("Ruby", StringComparison.OrdinalIgnoreCase)
            || value.Contains(" S A L E S ", StringComparison.OrdinalIgnoreCase)
            || value.Any(char.IsDigit);
    }

    /// <summary>Maps the file extension of <paramref name="path"/> to a MIME type; defaults to text/plain.</summary>
    private static string GuessContentType(string path)
    {
        return Path.GetExtension(path).ToLowerInvariant() switch
        {
            ".pdf" => "application/pdf",
            ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".md" => "text/markdown",
            _ => "text/plain"
        };
    }

    /// <summary>
    /// Builds an <see cref="AppPaths"/> rooted in a freshly created, uniquely named temp
    /// directory, with the benchmarks root pointed at <paramref name="outputRoot"/>.
    /// </summary>
    // NOTE(review): the temp directory created here is not removed by this harness.
    private static AppPaths CreatePaths(string outputRoot)
    {
        var tempRoot = Path.Combine(Path.GetTempPath(), $"jobtracker-cv-corpus-{Guid.NewGuid():N}");
        Directory.CreateDirectory(tempRoot);
        var config = new ConfigurationBuilder()
            .AddInMemoryCollection(new Dictionary
            {
                ["Data:Root"] = tempRoot,
                ["Data:CvArtifactsRoot"] = Path.Combine(tempRoot, "CvArtifacts"),
                ["Data:CvBenchmarksRoot"] = outputRoot,
            })
            .Build();
        var env = new Mock();
        env.SetupGet(x => x.ContentRootPath).Returns(tempRoot);
        return new AppPaths(config, env.Object);
    }

    /// <summary>
    /// Returns the no-op normalizer unless <c>CV_AI_BASE_URL</c> is set, in which case a
    /// <see cref="CvAiNormalizer"/> is created over an "ai-service" HTTP client that uses that
    /// base address and a 180-second timeout.
    /// </summary>
    private static ICvAiNormalizer CreateCvAiNormalizerFromEnvironment()
    {
        var baseUrl = Environment.GetEnvironmentVariable("CV_AI_BASE_URL");
        if (string.IsNullOrWhiteSpace(baseUrl))
        {
            return NoOpCvAiNormalizer.Instance;
        }

        var services = new Microsoft.Extensions.DependencyInjection.ServiceCollection();
        services.AddHttpClient("ai-service", client =>
        {
            client.BaseAddress = new Uri(baseUrl.Trim());
            client.Timeout = TimeSpan.FromSeconds(180);
        });
        // The provider is intentionally left undisposed here: the returned normalizer keeps
        // using the factory resolved from it for the duration of the test.
        var provider = services.BuildServiceProvider();
        var factory = provider.GetRequiredService();
        return new CvAiNormalizer(factory);
    }
}