Improve CV parsing and profile editor flow

This commit is contained in:
2026-03-29 14:29:18 +02:00
parent 99fc94bc18
commit 44000f96f2
18 changed files with 1028 additions and 44 deletions
@@ -124,6 +124,10 @@ public sealed class AdminSystemController : ControllerBase
GpuName: null,
OcrAvailable: false,
OcrLanguages: null,
OllamaConfigured: null,
OllamaReachable: null,
OllamaModel: null,
OllamaModelAvailable: null,
HealthLatencyMs: null,
ProbeLatencyMs: null,
LastProbeAt: null,
@@ -61,13 +61,15 @@ public sealed class ProfileCvController : ControllerBase
private readonly UserManager<ApplicationUser> _users;
private readonly ISummarizerService _aiService;
private readonly ICvAiClassifier _cvAiClassifier;
private readonly JobTrackerContext _db;
private readonly AppPaths _paths;
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths)
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths, ICvAiClassifier? cvAiClassifier = null)
{
_users = users;
_aiService = aiService;
_cvAiClassifier = cvAiClassifier ?? NoOpCvAiClassifier.Instance;
_db = db;
_paths = paths;
}
@@ -338,14 +340,7 @@ public sealed class ProfileCvController : ControllerBase
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{
var parseSource = NormalizeTextForStructuredParsing(text);
var fallbackSections = ParseSections(parseSource)
.Select(section => new StructuredCvSection
{
Name = section.Name,
Content = section.Content,
WordCount = CountWords(section.Content),
})
.ToList();
var fallbackSections = await BuildFallbackSectionsAsync(parseSource, cancellationToken);
var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
AnnotateStructuredCv(sectionFallback, "repair", 0.56);
@@ -729,12 +724,19 @@ public sealed class ProfileCvController : ControllerBase
private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
{
var languages = new List<StructuredCvLanguage>();
foreach (Match match in Regex.Matches(content, @"\b(English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b(?:[^\n.,;:]*?\b(Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2|Native speaker)\b)?", RegexOptions.IgnoreCase))
var candidates = Regex.Split(content.Replace("\r\n", "\n"), @"[\n,;]+|(?<=[.!?])\s+")
.Select(item => item.Trim())
.Where(item => item.Length > 1);
foreach (var candidate in candidates)
{
var name = NullIfWhitespace(match.Groups[1].Value);
var level = NullIfWhitespace(match.Groups[2].Value);
if (name is null) continue;
languages.Add(new StructuredCvLanguage { Name = name, Level = level });
var level = HumanLanguageCatalog.ExtractLevel(candidate);
if (level is null) continue;
foreach (var name in HumanLanguageCatalog.ExtractLanguageNames(candidate))
{
languages.Add(new StructuredCvLanguage { Name = name, Level = level });
}
}
return languages
@@ -872,6 +874,86 @@ public sealed class ProfileCvController : ControllerBase
.ToList();
}
/// <summary>
/// Produces the fallback section list for a CV: the heuristic <c>ParseSections</c> result when it
/// found at least one section other than "General", otherwise the AI block classification
/// (falling back to the heuristic result again when the classifier yields nothing).
/// </summary>
/// <param name="parseSource">CV text to split into sections.</param>
/// <param name="cancellationToken">Token forwarded to the AI classification step.</param>
/// <returns>The section list chosen as described above.</returns>
private async Task<List<StructuredCvSection>> BuildFallbackSectionsAsync(string parseSource, CancellationToken cancellationToken)
{
    var heuristicSections = new List<StructuredCvSection>();
    foreach (var raw in ParseSections(parseSource))
    {
        heuristicSections.Add(new StructuredCvSection
        {
            Name = raw.Name,
            Content = raw.Content,
            WordCount = CountWords(raw.Content),
        });
    }

    // "Every section is General" (vacuously true for an empty list) means the heuristic
    // parser recognised no real headings.
    var onlyGeneral = heuristicSections.All(candidate => string.Equals(candidate.Name, "General", StringComparison.OrdinalIgnoreCase));
    if (!onlyGeneral)
    {
        return heuristicSections;
    }

    var classified = await ClassifyBlocksIntoSectionsAsync(parseSource, cancellationToken);
    if (classified.Count == 0)
    {
        return heuristicSections;
    }

    return classified;
}
/// <summary>
/// Splits the CV text into blank-line-separated blocks, asks the AI classifier to label each
/// block, and merges blocks that share a section name into a single section.
/// </summary>
/// <remarks>
/// Blocks shorter than 24 characters (after trimming) are ignored. A missing label or the label
/// "Other" is filed under "General". Section names are matched case-insensitively so that
/// differently-cased labels land in the same bucket; the first-seen spelling is kept.
/// Sections are returned in the order their first block appeared.
/// </remarks>
/// <param name="parseSource">CV text to classify.</param>
/// <param name="cancellationToken">Checked before every classifier call and forwarded to it.</param>
/// <returns>The merged sections with non-blank content; empty when no block qualifies.</returns>
/// <exception cref="OperationCanceledException">The token was cancelled between blocks.</exception>
private async Task<List<StructuredCvSection>> ClassifyBlocksIntoSectionsAsync(string parseSource, CancellationToken cancellationToken)
{
    var blocks = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n")
        .Select(block => block.Trim())
        .Where(block => block.Length >= 24)
        .ToList();
    if (blocks.Count == 0) return new List<StructuredCvSection>();

    // The list preserves first-seen order; the dictionary gives case-insensitive O(1) lookup.
    var sectionBuckets = new List<StructuredCvSection>();
    var bucketsByName = new Dictionary<string, StructuredCvSection>(StringComparer.OrdinalIgnoreCase);

    foreach (var block in blocks)
    {
        // One remote call per block: stop early instead of issuing calls nobody is waiting for.
        cancellationToken.ThrowIfCancellationRequested();

        var classification = await _cvAiClassifier.ClassifyBlockAsync(block, cancellationToken);

        // Trim so that stray whitespace in the label does not defeat the alias lookup.
        var sectionName = classification?.Section?.Trim();
        if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical))
        {
            sectionName = canonical;
        }
        if (string.IsNullOrWhiteSpace(sectionName) || string.Equals(sectionName, "Other", StringComparison.OrdinalIgnoreCase))
        {
            sectionName = "General";
        }

        var content = block;
        if (string.Equals(sectionName, "Work Experience", StringComparison.OrdinalIgnoreCase) && classification is not null)
        {
            // Re-render the structured fields as "### title", a "company | location | dates"
            // line and "- " bullets; keep the raw block when no structured field is present.
            var lines = new List<string>();
            if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}");
            var endIsCurrent = string.Equals(classification.End, "Present", StringComparison.OrdinalIgnoreCase) || string.Equals(classification.End, "Current", StringComparison.OrdinalIgnoreCase);
            var dateRange = FormatDateRangeForSection(classification.Start, classification.End, endIsCurrent);
            var meta = string.Join(" | ", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value)));
            if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta);
            if (classification.Bullets is not null)
            {
                lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}"));
            }
            if (lines.Count > 0) content = string.Join("\n", lines);
        }

        if (bucketsByName.TryGetValue(sectionName, out var existing))
        {
            existing.Content = $"{existing.Content}\n\n{content}".Trim();
            existing.WordCount = CountWords(existing.Content);
        }
        else
        {
            var created = new StructuredCvSection { Name = sectionName, Content = content, WordCount = CountWords(content) };
            bucketsByName[sectionName] = created;
            sectionBuckets.Add(created);
        }
    }

    return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
}
/// <summary>
/// Formats a start/end pair as a single display string for a section's metadata line.
/// </summary>
/// <param name="start">Start date text; null or blank means unknown.</param>
/// <param name="end">End date text; null or blank means no end date was given.</param>
/// <param name="isCurrent">When true, the end is rendered as "Present" regardless of <paramref name="end"/>.</param>
/// <returns>
/// <c>null</c> when both values are blank; <paramref name="end"/> alone when only the start is
/// blank; otherwise "<c>start - end</c>", with "Present" substituted for a blank or current end.
/// </returns>
private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent)
{
    var hasStart = !string.IsNullOrWhiteSpace(start);
    var hasEnd = !string.IsNullOrWhiteSpace(end);

    if (!hasStart && !hasEnd) return null;
    if (!hasStart) return end;

    // Treat an empty/whitespace end exactly like a null one; otherwise the result would be
    // a dangling "start - " even though the guards above already consider blank as missing.
    var endLabel = (isCurrent || !hasEnd) ? "Present" : end;
    return $"{start} - {endLabel}";
}
private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
{
var normalized = text.Trim();
+1
View File
@@ -132,6 +132,7 @@ builder.Services.AddHttpClient("ai-service", client =>
builder.Services.AddMemoryCache();
builder.Services.AddSingleton<ISummarizerService, SummarizerService>();
builder.Services.AddSingleton<ICvAiClassifier, CvAiClassifier>();
builder.Services.AddSingleton<IGoogleTokenValidator, GoogleTokenValidator>();
builder.Services.AddScoped<IGmailOAuthService, GmailOAuthService>();
+65
View File
@@ -0,0 +1,65 @@
using System.Net.Http;
using System.Text;
using System.Text.Json;
namespace JobTrackerApi.Services;
/// <summary>
/// Classification of a single CV text block, as deserialized from the AI service's
/// <c>/cv/classify-block</c> response. Every member is optional.
/// </summary>
/// <param name="Section">Section label assigned to the block, or <c>null</c> when none was returned.</param>
/// <param name="Confidence">NOTE(review): presumably the classifier's confidence in <paramref name="Section"/>; scale not visible here — confirm against the AI service.</param>
/// <param name="Reason">NOTE(review): presumably a free-text explanation of the decision — confirm against the AI service.</param>
/// <param name="Title">Title extracted from the block, if any (presumably a job title — confirm).</param>
/// <param name="Company">Company extracted from the block, if any.</param>
/// <param name="Location">Location extracted from the block, if any.</param>
/// <param name="Start">Start date text extracted from the block, if any; format not visible here.</param>
/// <param name="End">End date text extracted from the block, if any; format not visible here.</param>
/// <param name="Bullets">Bullet lines extracted from the block, if any.</param>
public sealed record CvBlockClassificationResult(
string? Section,
double? Confidence,
string? Reason,
string? Title,
string? Company,
string? Location,
string? Start,
string? End,
List<string>? Bullets);
/// <summary>
/// Classifies individual blocks of CV text.
/// </summary>
public interface ICvAiClassifier
{
/// <summary>
/// Classifies one block of CV text.
/// </summary>
/// <param name="block">The text block to classify.</param>
/// <param name="cancellationToken">Token for cancelling the operation; optional.</param>
/// <returns>The classification, or <c>null</c> when no classification could be produced.</returns>
Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default);
}
/// <summary>
/// <see cref="ICvAiClassifier"/> that posts each block to the <c>/cv/classify-block</c> endpoint
/// of the named <c>"ai-service"</c> HTTP client. Classification is best-effort: any failure
/// other than caller-requested cancellation yields <c>null</c>.
/// </summary>
public sealed class CvAiClassifier : ICvAiClassifier
{
    // Cached once: creating JsonSerializerOptions per call throws away the serializer's
    // metadata cache every time (analyzer rule CA1869). Settings match the previous inline options.
    private static readonly JsonSerializerOptions ResponseJsonOptions = new(JsonSerializerDefaults.Web)
    {
        PropertyNameCaseInsensitive = true
    };

    private readonly IHttpClientFactory _httpClientFactory;

    /// <summary>Creates the classifier.</summary>
    /// <param name="httpClientFactory">Factory used to obtain the <c>"ai-service"</c> client per call.</param>
    public CvAiClassifier(IHttpClientFactory httpClientFactory)
    {
        _httpClientFactory = httpClientFactory;
    }

    /// <summary>
    /// Sends <paramref name="block"/> to the AI service and returns the parsed classification.
    /// </summary>
    /// <param name="block">The text block to classify; blank input short-circuits to <c>null</c>.</param>
    /// <param name="cancellationToken">Token for cancelling the request.</param>
    /// <returns>
    /// The deserialized response, or <c>null</c> for blank input, a non-success status code,
    /// or any transport/deserialization failure.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled by the caller.
    /// </exception>
    public async Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrWhiteSpace(block)) return null;

        try
        {
            var client = _httpClientFactory.CreateClient("ai-service");
            var payload = JsonSerializer.Serialize(new { block });
            using var content = new StringContent(payload, Encoding.UTF8, "application/json");
            using var response = await client.PostAsync("/cv/classify-block", content, cancellationToken);
            if (!response.IsSuccessStatusCode) return null;

            await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
            return await JsonSerializer.DeserializeAsync<CvBlockClassificationResult>(stream, ResponseJsonOptions, cancellationToken);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // The caller gave up: let cancellation propagate instead of disguising it as
            // "no classification". HttpClient timeouts do not match this filter and fall
            // through to the best-effort handler below.
            throw;
        }
        catch (Exception)
        {
            // Deliberate best-effort: the AI service is optional, so any other failure
            // simply means "no classification".
            return null;
        }
    }
}
/// <summary>
/// <see cref="ICvAiClassifier"/> that never classifies anything: every call completes
/// immediately with <c>null</c>. Exposed only through the shared <see cref="Instance"/>.
/// </summary>
public sealed class NoOpCvAiClassifier : ICvAiClassifier
{
    private static readonly NoOpCvAiClassifier Shared = new NoOpCvAiClassifier();

    // Private so the shared instance is the only one.
    private NoOpCvAiClassifier()
    {
    }

    /// <summary>The single shared instance.</summary>
    public static NoOpCvAiClassifier Instance => Shared;

    /// <summary>Ignores its arguments and returns a completed task whose result is <c>null</c>.</summary>
    /// <param name="block">Ignored.</param>
    /// <param name="cancellationToken">Ignored.</param>
    /// <returns>A completed task with a <c>null</c> result.</returns>
    public Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default)
    {
        CvBlockClassificationResult? noResult = null;
        return Task.FromResult(noResult);
    }
}
@@ -21,6 +21,10 @@ namespace JobTrackerApi.Services
string? GpuName,
bool? OcrAvailable,
string? OcrLanguages,
bool? OllamaConfigured,
bool? OllamaReachable,
string? OllamaModel,
bool? OllamaModelAvailable,
double? HealthLatencyMs,
double? ProbeLatencyMs,
DateTimeOffset? LastProbeAt,
@@ -310,6 +314,10 @@ namespace JobTrackerApi.Services
string? gpuName = null;
bool? ocrAvailable = null;
string? ocrLanguages = null;
bool? ollamaConfigured = null;
bool? ollamaReachable = null;
string? ollamaModel = null;
bool? ollamaModelAvailable = null;
double? healthLatencyMs = null;
var healthy = false;
string? healthError = null;
@@ -332,6 +340,10 @@ namespace JobTrackerApi.Services
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString();
if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString();
if (doc.RootElement.TryGetProperty("ollama_configured", out var ollamaConfiguredEl) && ollamaConfiguredEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaConfigured = ollamaConfiguredEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ollama_reachable", out var ollamaReachableEl) && ollamaReachableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaReachable = ollamaReachableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ollama_model", out var ollamaModelEl)) ollamaModel = ollamaModelEl.GetString();
if (doc.RootElement.TryGetProperty("ollama_model_available", out var ollamaModelAvailableEl) && ollamaModelAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaModelAvailable = ollamaModelAvailableEl.GetBoolean();
}
else
{
@@ -390,6 +402,10 @@ namespace JobTrackerApi.Services
GpuName: gpuName,
OcrAvailable: ocrAvailable,
OcrLanguages: ocrLanguages,
OllamaConfigured: ollamaConfigured,
OllamaReachable: ollamaReachable,
OllamaModel: ollamaModel,
OllamaModelAvailable: ollamaModelAvailable,
HealthLatencyMs: healthLatencyMs,
ProbeLatencyMs: probeLatencyMs,
LastProbeAt: lastProbeAt,