refactor, security updates, cv extraction upgrades

This commit is contained in:
2026-04-11 01:34:32 +02:00
parent 806b200ac5
commit 27fd70a2d7
59 changed files with 6817 additions and 1561 deletions
+55 -2
View File
@@ -1,3 +1,4 @@
using System.IO.Enumeration;
using System.Reflection;
using System.Security.Cryptography;
using System.Text.Json;
@@ -24,11 +25,13 @@ public sealed class CvCorpusHarnessTests
{
if (!Directory.Exists(CorpusRoot)) return;
var ignoredPatterns = ResolveIgnoredPatterns();
var files = Directory.EnumerateFiles(CorpusRoot, "*.*", SearchOption.TopDirectoryOnly)
.Where(path => path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase)
|| path.EndsWith(".docx", StringComparison.OrdinalIgnoreCase)
|| path.EndsWith(".txt", StringComparison.OrdinalIgnoreCase)
|| path.EndsWith(".md", StringComparison.OrdinalIgnoreCase))
.Where(path => !IsIgnoredFile(path, ignoredPatterns))
.OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
.ToList();
@@ -51,17 +54,20 @@ public sealed class CvCorpusHarnessTests
var aiService = new Mock<ISummarizerService>();
aiService.Setup(x => x.SummarizeSectionAsync(It.Is<string>(instruction => instruction.Contains("Extract this CV into structured JSON", StringComparison.Ordinal)), It.IsAny<string>(), 3200, 900)).ReturnsAsync(string.Empty);
aiService.Setup(x => x.SummarizeSectionAsync(It.Is<string>(instruction => instruction.Contains("Reconstruct this CV text extracted from a PDF", StringComparison.Ordinal)), It.IsAny<string>(), 2800, 900)).ReturnsAsync((string _, string text, int _, int __) => text);
var cvAiNormalizer = CreateCvAiNormalizerFromEnvironment();
await using var db = TestHostFactory.CreateInMemoryDb();
var paths = CreatePaths(outputRoot);
var controller = new ProfileCvController(userManager.Object, aiService.Object, db, paths, null, NoOpCvAiClassifier.Instance)
var controller = new ProfileCvController(userManager.Object, aiService.Object, db, paths, null, NoOpCvAiClassifier.Instance, cvAiNormalizer)
{
ControllerContext = new ControllerContext { HttpContext = new DefaultHttpContext() }
};
var extractMethod = typeof(ProfileCvController).GetMethod("ExtractTextAsync", BindingFlags.NonPublic | BindingFlags.Static);
var reconstructMethod = typeof(ProfileCvController).GetMethod("MaybeReconstructStructuredCvAsync", BindingFlags.NonPublic | BindingFlags.Instance);
var buildMethod = typeof(ProfileCvController).GetMethod("BuildStructuredCvAsync", BindingFlags.NonPublic | BindingFlags.Instance);
Assert.NotNull(extractMethod);
Assert.NotNull(reconstructMethod);
Assert.NotNull(buildMethod);
var entries = new List<CvBenchmarkEntry>();
@@ -80,7 +86,11 @@ public sealed class CvCorpusHarnessTests
var text = await extractTask;
Assert.False(string.IsNullOrWhiteSpace(text));
var buildTask = (Task<StructuredCvProfile>)buildMethod!.Invoke(controller, new object[] { text, CancellationToken.None })!;
var reconstructTask = (Task<string>)reconstructMethod!.Invoke(controller, new object[] { text, CancellationToken.None })!;
var normalizedText = await reconstructTask;
Assert.False(string.IsNullOrWhiteSpace(normalizedText));
var buildTask = (Task<StructuredCvProfile>)buildMethod!.Invoke(controller, new object[] { normalizedText, CancellationToken.None })!;
var structured = StructuredCvProfileJson.Normalize(await buildTask);
Assert.NotNull(structured);
@@ -199,6 +209,33 @@ public sealed class CvCorpusHarnessTests
return Path.Combine(outputRoot, "approved-fixtures");
}
/// <summary>
/// Reads the comma-separated <c>CV_BENCHMARK_IGNORE</c> environment variable and
/// returns its trimmed, non-blank entries in the order they were written.
/// </summary>
/// <returns>
/// The configured patterns, or an empty list when the variable is unset or blank.
/// </returns>
private static List<string> ResolveIgnoredPatterns()
{
    var patterns = new List<string>();

    var rawValue = Environment.GetEnvironmentVariable("CV_BENCHMARK_IGNORE");
    if (string.IsNullOrWhiteSpace(rawValue))
    {
        return patterns;
    }

    const StringSplitOptions splitOptions =
        StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries;

    foreach (var entry in rawValue.Split(',', splitOptions))
    {
        // Entries are already trimmed by the split; blank ones are never kept.
        if (!string.IsNullOrWhiteSpace(entry))
        {
            patterns.Add(entry);
        }
    }

    return patterns;
}
/// <summary>
/// Decides whether a file should be left out, based on its file name only
/// (the directory part of <paramref name="path"/> is not matched).
/// </summary>
/// <param name="path">Path of the candidate file.</param>
/// <param name="ignoredPatterns">
/// Simple expressions passed to <see cref="FileSystemName.MatchesSimpleExpression"/>;
/// matching ignores case.
/// </param>
/// <returns>
/// <c>true</c> when at least one pattern matches the file name; <c>false</c>
/// otherwise, including when no patterns are supplied.
/// </returns>
private static bool IsIgnoredFile(string path, List<string> ignoredPatterns)
{
    if (ignoredPatterns.Count == 0)
    {
        return false;
    }

    var candidateName = Path.GetFileName(path);
    return ignoredPatterns.Exists(
        expression => FileSystemName.MatchesSimpleExpression(expression, candidateName, ignoreCase: true));
}
private static string PrettyJson(string normalizedJson)
{
using var doc = JsonDocument.Parse(normalizedJson);
@@ -327,4 +364,20 @@ public sealed class CvCorpusHarnessTests
env.SetupGet(x => x.ContentRootPath).Returns(tempRoot);
return new AppPaths(config, env.Object);
}
/// <summary>
/// Chooses the CV normalizer for the harness from the <c>CV_AI_BASE_URL</c>
/// environment variable.
/// </summary>
/// <returns>
/// <c>NoOpCvAiNormalizer.Instance</c> when the variable is unset or blank;
/// otherwise a <c>CvAiNormalizer</c> constructed with an HTTP client factory whose
/// "ai-service" client is configured with the trimmed URL as base address and a
/// 180-second timeout.
/// </returns>
private static ICvAiNormalizer CreateCvAiNormalizerFromEnvironment()
{
    var configuredUrl = Environment.GetEnvironmentVariable("CV_AI_BASE_URL");
    if (string.IsNullOrWhiteSpace(configuredUrl))
    {
        return NoOpCvAiNormalizer.Instance;
    }

    var registrations = new Microsoft.Extensions.DependencyInjection.ServiceCollection();
    registrations.AddHttpClient("ai-service", httpClient =>
    {
        httpClient.BaseAddress = new Uri(configuredUrl.Trim());
        httpClient.Timeout = TimeSpan.FromSeconds(180);
    });

    // The service provider built here is not disposed by this method; the returned
    // normalizer keeps using the factory resolved from it.
    var clientFactory = registrations
        .BuildServiceProvider()
        .GetRequiredService<System.Net.Http.IHttpClientFactory>();

    return new CvAiNormalizer(clientFactory);
}
}