From 44000f96f20845cc3bf29bc3dbfcba54231e2b0e Mon Sep 17 00:00:00 2001 From: cesnimda Date: Sun, 29 Mar 2026 14:29:18 +0200 Subject: [PATCH] Improve CV parsing and profile editor flow --- .env.example | 3 + .../ProfileCvControllerTests.cs | 157 ++++++++++++- .../Controllers/AdminSystemController.cs | 4 + .../Controllers/ProfileCvController.cs | 110 +++++++-- JobTrackerApi/Program.cs | 1 + JobTrackerApi/Services/CvAiClassifier.cs | 65 ++++++ JobTrackerApi/Services/SummarizerService.cs | 16 ++ Models/HumanLanguageCatalog.cs | 162 +++++++++++++ Models/StructuredCvProfileJson.cs | 217 +++++++++++++++++- README.md | 10 +- deploy/README.md | 6 +- deploy/deploy.sh | 5 + docker-compose.yml | 26 +++ job-tracker-ui/src/pages/ProfilePage.tsx | 47 ++-- job-tracker-ui/src/profile-page.test.tsx | 7 + scripts/start-ollama-cv.sh | 79 +++++++ tools/summarizer/README.md | 27 ++- tools/summarizer/app.py | 130 +++++++++++ 18 files changed, 1028 insertions(+), 44 deletions(-) create mode 100644 JobTrackerApi/Services/CvAiClassifier.cs create mode 100644 Models/HumanLanguageCatalog.cs create mode 100755 scripts/start-ollama-cv.sh diff --git a/.env.example b/.env.example index 66bfefa..6c6c2c9 100644 --- a/.env.example +++ b/.env.example @@ -9,6 +9,9 @@ GOOGLE_GMAIL_CLIENT_SECRET=CHANGE_ME_GOOGLE_OAUTH_CLIENT_SECRET # Optional. If omitted, the backend uses https:///api/gmail/oauth/callback GOOGLE_GMAIL_REDIRECT_URI= AI_SERVICE_BASE_URL=http://ai-service:8001 +# Optional: enables hybrid CV block classification in the local AI service. +OLLAMA_BASE_URL=http://ollama:11434 +OLLAMA_MODEL=qwen2.5:7b # Optional: only needed if you want the UI to call a non-default API base URL. # In production the UI defaults to `/api`. diff --git a/JobTrackerApi.Tests/ProfileCvControllerTests.cs b/JobTrackerApi.Tests/ProfileCvControllerTests.cs index 8b5c185..8ca17e8 100644 --- a/JobTrackerApi.Tests/ProfileCvControllerTests.cs +++ b/JobTrackerApi.Tests/ProfileCvControllerTests.cs @@ -280,7 +280,7 @@ public sealed class ProfileCvControllerTests [Fact] public async Task Upload_populates_structured_fields_from_flattened_cv_when_ai_json_is_invalid() { - var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1."; + var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1, C#, SQL, and public speaking."; var user = new ApplicationUser { Id = "user-1" }; var userManager = CreateUserManager(); @@ -320,9 +320,164 @@ public sealed class ProfileCvControllerTests Assert.Contains(structured.Interests, item => item.Contains("board games", StringComparison.OrdinalIgnoreCase) || item.Contains("cooking", StringComparison.OrdinalIgnoreCase)); Assert.Contains(structured.Languages, item => item.Name != null && item.Name.Equals("English", StringComparison.OrdinalIgnoreCase)); Assert.Contains(structured.Languages, item => item.Name != null && item.Name.StartsWith("Norwegian", StringComparison.OrdinalIgnoreCase)); + Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Equals("C#", StringComparison.OrdinalIgnoreCase)); + Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Equals("SQL", StringComparison.OrdinalIgnoreCase)); + Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Contains("public speaking", StringComparison.OrdinalIgnoreCase)); Assert.DoesNotContain(structured.Sections, section => section.Name == "General"); } + [Fact] + public void Structured_cv_normalization_keeps_human_languages_and_drops_skill_noise() + { + var structured = StructuredCvProfileJson.Deserialize(""" + { + "version": "1", + "contact": {}, + "summary": [], + "jobs": [], + "education": [], + "skills": [], + "languages": [ + { "name": "English", "level": "Native" }, + { "name": "Native Norwegian speaker", "level": null }, + { "name": "French", "level": null }, + { "name": "C#", "level": "Advanced" }, + { "name": "Leadership", "level": null } + ], + "interests": [], + "otherSections": [] + } + """); + + Assert.Collection( + structured.Languages.OrderBy(item => item.Name, StringComparer.OrdinalIgnoreCase), + first => + { + Assert.Equal("English", first.Name); + Assert.Equal("Native", first.Level); + }, + second => + { + Assert.Equal("Norwegian", second.Name); + Assert.Equal("Native", second.Level); + }); + } + + [Fact] + public void Structured_cv_normalization_separates_job_title_company_and_tasks() + { + var structured = StructuredCvProfileJson.Deserialize(""" + { + "version": "1", + "contact": {}, + "summary": [], + "jobs": [ + { + "title": "Acme Ltd", + "company": "Senior Backend Developer", + "location": "Oslo", + "start": "2022", + "end": "2024", + "isCurrent": false, + "bullets": [ + "Senior Backend Developer", + "Acme Ltd", + "2022 - 2024", + "Built API integrations for recruiter workflows and reduced manual follow-up churn." + ], + "skills": [".NET", "SQL"] + }, + { + "title": "Lead Engineer at Northwind Council", + "company": null, + "location": "Remote", + "start": "2020", + "end": "Present", + "isCurrent": true, + "bullets": [ + "Led platform delivery across case-management and reporting surfaces.", + "Skills: C#, SQL" + ], + "skills": ["C#", "SQL"] + } + ], + "education": [], + "skills": [], + "languages": [], + "interests": [], + "otherSections": [] + } + """); + + Assert.Collection( + structured.Jobs, + first => + { + Assert.Equal("Senior Backend Developer", first.Title); + Assert.Equal("Acme Ltd", first.Company); + Assert.Equal(new[] { "Built API integrations for recruiter workflows and reduced manual follow-up churn." }, first.Bullets); + }, + second => + { + Assert.Equal("Lead Engineer", second.Title); + Assert.Equal("Northwind Council", second.Company); + Assert.Equal(new[] { "Led platform delivery across case-management and reporting surfaces." }, second.Bullets); + }); + } + + [Fact] + public void Structured_cv_normalization_hardens_contact_links_locations_and_dates() + { + var structured = StructuredCvProfileJson.Deserialize(""" + { + "version": "1", + "contact": { + "location": "Tønsberg, Norway", + "website": "https://cesnimda.co.uk/about", + "linkedin": "linkedin.com/in/demo-user?trk=foo" + }, + "summary": [], + "jobs": [ + { + "title": "System Developer", + "company": "Warwickshire County Council", + "location": "Warwickshire, England, UK", + "start": "Sept 2023", + "end": "1/1/2024", + "isCurrent": false, + "bullets": ["Built APIs"], + "skills": [] + }, + { + "title": "Developer", + "company": "Demo Co", + "location": "Remote 123", + "start": "Spring 2024", + "end": "Later", + "isCurrent": false, + "bullets": ["Kept services running"], + "skills": [] + } + ], + "education": [], + "skills": [], + "languages": [], + "interests": [], + "otherSections": [] + } + """); + + Assert.Equal("Tønsberg, Norway", structured.Contact.Location); + Assert.Equal("cesnimda.co.uk", structured.Contact.Website); + Assert.Equal("https://www.linkedin.com/in/demo-user", structured.Contact.LinkedIn); + Assert.Equal("Warwickshire, England, UK", structured.Jobs[0].Location); + Assert.Equal("Sept 2023", structured.Jobs[0].Start); + Assert.Equal("1/1/2024", structured.Jobs[0].End); + Assert.Null(structured.Jobs[1].Location); + Assert.Null(structured.Jobs[1].Start); + Assert.Null(structured.Jobs[1].End); + } + [Fact] public async Task Parse_returns_structured_cv_and_persists_it() { diff --git a/JobTrackerApi/Controllers/AdminSystemController.cs b/JobTrackerApi/Controllers/AdminSystemController.cs index 4c81362..428ae50 100644 --- a/JobTrackerApi/Controllers/AdminSystemController.cs +++ b/JobTrackerApi/Controllers/AdminSystemController.cs @@ -124,6 +124,10 @@ public sealed class AdminSystemController : ControllerBase GpuName: null, OcrAvailable: false, OcrLanguages: null, + OllamaConfigured: null, + OllamaReachable: null, + OllamaModel: null, + OllamaModelAvailable: null, HealthLatencyMs: null, ProbeLatencyMs: null, LastProbeAt: null, diff --git a/JobTrackerApi/Controllers/ProfileCvController.cs b/JobTrackerApi/Controllers/ProfileCvController.cs index 0a74b51..e5259a0 100644 --- a/JobTrackerApi/Controllers/ProfileCvController.cs +++ b/JobTrackerApi/Controllers/ProfileCvController.cs @@ -61,13 +61,15 @@ public sealed class ProfileCvController : ControllerBase private readonly UserManager _users; private readonly ISummarizerService _aiService; + private readonly ICvAiClassifier _cvAiClassifier; private readonly JobTrackerContext _db; private readonly AppPaths _paths; - public ProfileCvController(UserManager users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths) + public ProfileCvController(UserManager users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths, ICvAiClassifier? cvAiClassifier = null) { _users = users; _aiService = aiService; + _cvAiClassifier = cvAiClassifier ?? NoOpCvAiClassifier.Instance; _db = db; _paths = paths; } @@ -338,14 +340,7 @@ public sealed class ProfileCvController : ControllerBase private async Task BuildStructuredCvAsync(string text, CancellationToken cancellationToken) { var parseSource = NormalizeTextForStructuredParsing(text); - var fallbackSections = ParseSections(parseSource) - .Select(section => new StructuredCvSection - { - Name = section.Name, - Content = section.Content, - WordCount = CountWords(section.Content), - }) - .ToList(); + var fallbackSections = await BuildFallbackSectionsAsync(parseSource, cancellationToken); var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections); AnnotateStructuredCv(sectionFallback, "repair", 0.56); @@ -729,12 +724,19 @@ public sealed class ProfileCvController : ControllerBase private static List ParseLanguagesHeuristically(string content) { var languages = new List(); - foreach (Match match in Regex.Matches(content, @"\b(English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b(?:[^\n.,;:]*?\b(Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2|Native speaker)\b)?", RegexOptions.IgnoreCase)) + var candidates = Regex.Split(content.Replace("\r\n", "\n"), @"[\n,;]+|(?<=[.!?])\s+") + .Select(item => item.Trim()) + .Where(item => item.Length > 1); + + foreach (var candidate in candidates) { - var name = NullIfWhitespace(match.Groups[1].Value); - var level = NullIfWhitespace(match.Groups[2].Value); - if (name is null) continue; - languages.Add(new StructuredCvLanguage { Name = name, Level = level }); + var level = HumanLanguageCatalog.ExtractLevel(candidate); + if (level is null) continue; + + foreach (var name in HumanLanguageCatalog.ExtractLanguageNames(candidate)) + { + languages.Add(new StructuredCvLanguage { Name = name, Level = level }); + } } return languages @@ -872,6 +874,86 @@ public sealed class ProfileCvController : ControllerBase .ToList(); } + private async Task> BuildFallbackSectionsAsync(string parseSource, CancellationToken cancellationToken) + { + var parsed = ParseSections(parseSource) + .Select(section => new StructuredCvSection + { + Name = section.Name, + Content = section.Content, + WordCount = CountWords(section.Content), + }) + .ToList(); + + var hasRealSections = parsed.Any(section => !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase)); + if (hasRealSections) return parsed; + + var aiSections = await ClassifyBlocksIntoSectionsAsync(parseSource, cancellationToken); + return aiSections.Count > 0 ? aiSections : parsed; + } + + private async Task> ClassifyBlocksIntoSectionsAsync(string parseSource, CancellationToken cancellationToken) + { + var blocks = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n") + .Select(block => block.Trim()) + .Where(block => block.Length >= 24) + .ToList(); + + if (blocks.Count == 0) return new List(); + + var sectionBuckets = new List(); + foreach (var block in blocks) + { + var classification = await _cvAiClassifier.ClassifyBlockAsync(block, cancellationToken); + var sectionName = classification?.Section; + if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical)) + { + sectionName = canonical; + } + + if (string.IsNullOrWhiteSpace(sectionName) || string.Equals(sectionName, "Other", StringComparison.OrdinalIgnoreCase)) + { + sectionName = "General"; + } + + var content = block; + if (string.Equals(sectionName, "Work Experience", StringComparison.OrdinalIgnoreCase) && classification is not null) + { + var lines = new List(); + if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}"); + var endIsCurrent = string.Equals(classification.End, "Present", StringComparison.OrdinalIgnoreCase) || string.Equals(classification.End, "Current", StringComparison.OrdinalIgnoreCase); + var dateRange = FormatDateRangeForSection(classification.Start, classification.End, endIsCurrent); + var meta = string.Join(" | ", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value))); + if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta); + if (classification.Bullets is not null) + { + lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}")); + } + if (lines.Count > 0) content = string.Join("\n", lines); + } + + var existing = sectionBuckets.FirstOrDefault(section => section.Name == sectionName); + if (existing is null) + { + sectionBuckets.Add(new StructuredCvSection { Name = sectionName, Content = content, WordCount = CountWords(content) }); + } + else + { + existing.Content = $"{existing.Content}\n\n{content}".Trim(); + existing.WordCount = CountWords(existing.Content); + } + } + + return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList(); + } + + private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent) + { + if (string.IsNullOrWhiteSpace(start) && string.IsNullOrWhiteSpace(end)) return null; + if (string.IsNullOrWhiteSpace(start)) return end; + return $"{start} - {(isCurrent ? "Present" : end ?? "Present")}"; + } + private async Task MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken) { var normalized = text.Trim(); diff --git a/JobTrackerApi/Program.cs b/JobTrackerApi/Program.cs index b18037c..377f254 100644 --- a/JobTrackerApi/Program.cs +++ b/JobTrackerApi/Program.cs @@ -132,6 +132,7 @@ builder.Services.AddHttpClient("ai-service", client => builder.Services.AddMemoryCache(); builder.Services.AddSingleton(); +builder.Services.AddSingleton(); builder.Services.AddSingleton(); builder.Services.AddScoped(); diff --git a/JobTrackerApi/Services/CvAiClassifier.cs b/JobTrackerApi/Services/CvAiClassifier.cs new file mode 100644 index 0000000..70aba34 --- /dev/null +++ b/JobTrackerApi/Services/CvAiClassifier.cs @@ -0,0 +1,65 @@ +using System.Net.Http; +using System.Text; +using System.Text.Json; + +namespace JobTrackerApi.Services; + +public sealed record CvBlockClassificationResult( + string? Section, + double? Confidence, + string? Reason, + string? Title, + string? Company, + string? Location, + string? Start, + string? End, + List? Bullets); + +public interface ICvAiClassifier +{ + Task ClassifyBlockAsync(string block, CancellationToken cancellationToken = default); +} + +public sealed class CvAiClassifier : ICvAiClassifier +{ + private readonly IHttpClientFactory _httpClientFactory; + + public CvAiClassifier(IHttpClientFactory httpClientFactory) + { + _httpClientFactory = httpClientFactory; + } + + public async Task ClassifyBlockAsync(string block, CancellationToken cancellationToken = default) + { + if (string.IsNullOrWhiteSpace(block)) return null; + + try + { + var client = _httpClientFactory.CreateClient("ai-service"); + var payload = JsonSerializer.Serialize(new { block }); + using var content = new StringContent(payload, Encoding.UTF8, "application/json"); + using var response = await client.PostAsync("/cv/classify-block", content, cancellationToken); + if (!response.IsSuccessStatusCode) return null; + + await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken); + var parsed = await JsonSerializer.DeserializeAsync(stream, new JsonSerializerOptions(JsonSerializerDefaults.Web) + { + PropertyNameCaseInsensitive = true + }, cancellationToken); + + return parsed; + } + catch + { + return null; + } + } +} + +public sealed class NoOpCvAiClassifier : ICvAiClassifier +{ + public static NoOpCvAiClassifier Instance { get; } = new(); + private NoOpCvAiClassifier() { } + public Task ClassifyBlockAsync(string block, CancellationToken cancellationToken = default) + => Task.FromResult(null); +} diff --git a/JobTrackerApi/Services/SummarizerService.cs b/JobTrackerApi/Services/SummarizerService.cs index e9294e5..6ebd044 100644 --- a/JobTrackerApi/Services/SummarizerService.cs +++ b/JobTrackerApi/Services/SummarizerService.cs @@ -21,6 +21,10 @@ namespace JobTrackerApi.Services string? GpuName, bool? OcrAvailable, string? OcrLanguages, + bool? OllamaConfigured, + bool? OllamaReachable, + string? OllamaModel, + bool? OllamaModelAvailable, double? HealthLatencyMs, double? ProbeLatencyMs, DateTimeOffset? LastProbeAt, @@ -310,6 +314,10 @@ namespace JobTrackerApi.Services string? gpuName = null; bool? ocrAvailable = null; string? ocrLanguages = null; + bool? ollamaConfigured = null; + bool? ollamaReachable = null; + string? ollamaModel = null; + bool? ollamaModelAvailable = null; double? healthLatencyMs = null; var healthy = false; string? healthError = null; @@ -332,6 +340,10 @@ namespace JobTrackerApi.Services if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString(); if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean(); if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString(); + if (doc.RootElement.TryGetProperty("ollama_configured", out var ollamaConfiguredEl) && ollamaConfiguredEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaConfigured = ollamaConfiguredEl.GetBoolean(); + if (doc.RootElement.TryGetProperty("ollama_reachable", out var ollamaReachableEl) && ollamaReachableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaReachable = ollamaReachableEl.GetBoolean(); + if (doc.RootElement.TryGetProperty("ollama_model", out var ollamaModelEl)) ollamaModel = ollamaModelEl.GetString(); + if (doc.RootElement.TryGetProperty("ollama_model_available", out var ollamaModelAvailableEl) && ollamaModelAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaModelAvailable = ollamaModelAvailableEl.GetBoolean(); } else { @@ -390,6 +402,10 @@ namespace JobTrackerApi.Services GpuName: gpuName, OcrAvailable: ocrAvailable, OcrLanguages: ocrLanguages, + OllamaConfigured: ollamaConfigured, + OllamaReachable: ollamaReachable, + OllamaModel: ollamaModel, + OllamaModelAvailable: ollamaModelAvailable, HealthLatencyMs: healthLatencyMs, ProbeLatencyMs: probeLatencyMs, LastProbeAt: lastProbeAt, diff --git a/Models/HumanLanguageCatalog.cs b/Models/HumanLanguageCatalog.cs new file mode 100644 index 0000000..6951aaa --- /dev/null +++ b/Models/HumanLanguageCatalog.cs @@ -0,0 +1,162 @@ +using System.Globalization; +using System.Text; +using System.Text.RegularExpressions; + +namespace JobTrackerApi.Models; + +public static class HumanLanguageCatalog +{ + private static readonly Dictionary LanguageLookup = BuildLanguageLookup(); + + private static readonly Regex WordRegex = new(@"\p{L}+", RegexOptions.Compiled); + + private static readonly Regex LevelRegex = new( + @"\b(native(?:\s+speaker)?|fluent|advanced|intermediate|beginner|basic|conversational|elementary|professional\s+working\s+proficiency|working\s+proficiency|limited\s+working\s+proficiency|full\s+professional\s+proficiency|a1|a2|b1|b2|c1|c2|a1\s*/\s*a2|a2\s*/\s*b1|b1\s*/\s*b2|b2\s*/\s*c1|c1\s*/\s*c2)\b", + RegexOptions.IgnoreCase | RegexOptions.Compiled); + + public static string? NormalizeLanguageName(string? raw) + { + var matches = ExtractLanguageNames(raw); + return matches.Count == 1 ? matches[0] : null; + } + + public static IReadOnlyList ExtractLanguageNames(string? raw) + { + if (string.IsNullOrWhiteSpace(raw)) return Array.Empty(); + + var words = WordRegex.Matches(raw) + .Select(match => match.Value) + .Where(value => !string.IsNullOrWhiteSpace(value)) + .ToList(); + + if (words.Count == 0) return Array.Empty(); + + var matches = new List<(int Start, int Size, string Canonical)>(); + for (var size = Math.Min(4, words.Count); size >= 1; size--) + { + for (var start = 0; start <= words.Count - size; start++) + { + var phrase = string.Join(" ", words.Skip(start).Take(size)); + if (!LanguageLookup.TryGetValue(NormalizeKey(phrase), out var canonical)) continue; + if (matches.Any(existing => RangesOverlap(existing.Start, existing.Size, start, size))) continue; + matches.Add((start, size, canonical)); + } + } + + return matches + .OrderBy(match => match.Start) + .Select(match => match.Canonical) + .Distinct(StringComparer.OrdinalIgnoreCase) + .ToList(); + } + + public static bool HasRecognizedLevel(string? raw) + { + return ExtractLevel(raw) is not null; + } + + public static string? ExtractLevel(string? raw) + { + if (string.IsNullOrWhiteSpace(raw)) return null; + + var match = LevelRegex.Match(raw); + if (!match.Success) return null; + + var value = match.Groups[1].Value.Trim(); + var compact = Regex.Replace(value, @"\s+", " "); + return compact.ToLowerInvariant() switch + { + "native speaker" => "Native", + "native" => "Native", + "fluent" => "Fluent", + "advanced" => "Advanced", + "intermediate" => "Intermediate", + "beginner" => "Beginner", + "basic" => "Basic", + "conversational" => "Conversational", + "elementary" => "Elementary", + "professional working proficiency" => "Professional working proficiency", + "working proficiency" => "Working proficiency", + "limited working proficiency" => "Limited working proficiency", + "full professional proficiency" => "Full professional proficiency", + _ when Regex.IsMatch(compact, @"^[ABC][12](?:\s*/\s*[ABC][12])?$", RegexOptions.IgnoreCase) => compact.ToUpperInvariant().Replace(" ", string.Empty), + _ => compact, + }; + } + + private static bool RangesOverlap(int startA, int sizeA, int startB, int sizeB) + { + var endA = startA + sizeA; + var endB = startB + sizeB; + return startA < endB && startB < endA; + } + + private static Dictionary BuildLanguageLookup() + { + var map = new Dictionary(StringComparer.OrdinalIgnoreCase); + + void Add(string? alias, string? canonical) + { + var normalizedAlias = NormalizeKey(alias); + var normalizedCanonical = NormalizeDisplayName(canonical); + if (string.IsNullOrWhiteSpace(normalizedAlias) || string.IsNullOrWhiteSpace(normalizedCanonical)) return; + map.TryAdd(normalizedAlias, normalizedCanonical); + } + + foreach (var culture in CultureInfo.GetCultures(CultureTypes.NeutralCultures | CultureTypes.SpecificCultures)) + { + var english = CleanCultureLanguageName(culture.EnglishName); + var native = CleanCultureLanguageName(culture.NativeName); + Add(english, english); + Add(native, english); + } + + Add("norsk", "Norwegian"); + Add("bokmal", "Norwegian"); + Add("bokmål", "Norwegian"); + Add("nynorsk", "Norwegian"); + Add("mandarin", "Chinese"); + Add("cantonese", "Chinese"); + Add("farsi", "Persian"); + Add("persian", "Persian"); + + return map; + } + + private static string? CleanCultureLanguageName(string? value) + { + if (string.IsNullOrWhiteSpace(value)) return null; + + var cleaned = value.Trim(); + var parenIndex = cleaned.IndexOf('('); + if (parenIndex > 0) cleaned = cleaned[..parenIndex].Trim(); + var commaIndex = cleaned.IndexOf(','); + if (commaIndex > 0) cleaned = cleaned[..commaIndex].Trim(); + return NormalizeDisplayName(cleaned); + } + + private static string? NormalizeDisplayName(string? value) + { + if (string.IsNullOrWhiteSpace(value)) return null; + var cleaned = Regex.Replace(value.Trim(), @"\s+", " "); + return string.Join(" ", cleaned.Split(' ', StringSplitOptions.RemoveEmptyEntries) + .Select(word => word.Length <= 3 && word.All(char.IsUpper) + ? word + : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant())); + } + + private static string NormalizeKey(string? value) + { + if (string.IsNullOrWhiteSpace(value)) return string.Empty; + + var decomposed = value.Trim().Normalize(NormalizationForm.FormD); + var builder = new StringBuilder(decomposed.Length); + foreach (var ch in decomposed) + { + if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark) continue; + builder.Append(char.ToLowerInvariant(ch)); + } + + return Regex.Replace(builder.ToString().Normalize(NormalizationForm.FormC), @"[^\p{L}]+", " ").Trim(); + } +} diff --git a/Models/StructuredCvProfileJson.cs b/Models/StructuredCvProfileJson.cs index ff43d3d..6903838 100644 --- a/Models/StructuredCvProfileJson.cs +++ b/Models/StructuredCvProfileJson.cs @@ -144,7 +144,7 @@ public static class StructuredCvProfileJson profile.Version = string.IsNullOrWhiteSpace(profile.Version) ? "1" : profile.Version.Trim(); profile.Metadata ??= new StructuredCvMetadata(); profile.Metadata.Fields ??= new Dictionary(); - profile.Contact ??= new StructuredCvContact(); + profile.Contact = NormalizeContact(profile.Contact); profile.Summary = CleanList(profile.Summary); profile.Jobs = (profile.Jobs ?? new List()) .Select(NormalizeJob) @@ -178,20 +178,206 @@ public static class StructuredCvProfileJson return profile; } + private static StructuredCvContact NormalizeContact(StructuredCvContact? contact) + { + contact ??= new StructuredCvContact(); + contact.FullName = TrimOrNull(contact.FullName); + contact.Headline = TrimOrNull(contact.Headline); + contact.Email = TrimOrNull(contact.Email); + contact.Phone = TrimOrNull(contact.Phone); + contact.Location = NormalizeLocationValue(contact.Location); + contact.Website = NormalizeWebsite(contact.Website); + contact.LinkedIn = NormalizeLinkedIn(contact.LinkedIn); + return contact; + } + private static StructuredCvJob NormalizeJob(StructuredCvJob? job) { job ??= new StructuredCvJob(); - job.Title = TrimOrNull(job.Title); - job.Company = TrimOrNull(job.Company); - job.Location = TrimOrNull(job.Location); - job.Start = TrimOrNull(job.Start); - job.End = TrimOrNull(job.End); - job.Bullets = CleanList(job.Bullets); + + var title = NormalizeJobTitle(job.Title); + var company = NormalizeCompanyName(job.Company); + var location = NormalizeLocationValue(job.Location); + + if (!string.IsNullOrWhiteSpace(title) && company is null) + { + var atSplit = Regex.Match(title, @"^(?.+?)\s+at\s+(?<company>.+)$", RegexOptions.IgnoreCase); + if (atSplit.Success) + { + title = NormalizeJobTitle(atSplit.Groups["title"].Value); + company = NormalizeCompanyName(atSplit.Groups["company"].Value); + } + } + + if (!string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(company)) + { + var titleLooksLikeCompany = LooksLikeCompanyName(title) && !LooksLikeJobTitle(title); + var companyLooksLikeTitle = LooksLikeJobTitle(company) && !LooksLikeCompanyName(company); + if (titleLooksLikeCompany && companyLooksLikeTitle) + { + (title, company) = (company, title); + } + } + + if (!string.IsNullOrWhiteSpace(title) && !LooksLikeJobTitle(title) && LooksLikeCompanyName(title)) + { + if (company is null) company = title; + title = null; + } + + if (!string.IsNullOrWhiteSpace(company) && !LooksLikeCompanyName(company) && LooksLikeJobTitle(company) && title is null) + { + title = company; + company = null; + } + + job.Title = title; + job.Company = company; + job.Location = location; + job.Start = NormalizeDateValue(job.Start); + job.End = NormalizeDateValue(job.End); + job.Bullets = CleanList(job.Bullets) + .Select(NormalizeBullet) + .Where(bullet => bullet is not null) + .Select(bullet => bullet!) + .Where(bullet => IsUsefulJobBullet(bullet, job.Title, job.Company)) + .ToList(); job.Skills = CleanList(job.Skills); job.IsCurrent = job.IsCurrent || string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase); return job; } + private static string? NormalizeBullet(string? value) + { + if (string.IsNullOrWhiteSpace(value)) return null; + return value.Trim().TrimStart('-', '•', '*', ' '); + } + + private static bool IsUsefulJobBullet(string? value, string? title, string? company) + { + var trimmed = TrimOrNull(value); + if (trimmed is null) return false; + if (LooksLikeDateRange(trimmed) || LooksLikeSectionHeading(trimmed) || trimmed.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase)) return false; + if (title is not null && trimmed.Equals(title, StringComparison.OrdinalIgnoreCase)) return false; + if (company is not null && trimmed.Equals(company, StringComparison.OrdinalIgnoreCase)) return false; + if (trimmed.Length < 12 && !trimmed.Contains(' ')) return false; + return true; + } + + private static string? NormalizeJobTitle(string? value) + { + var trimmed = TrimOrNull(value); + if (trimmed is null) return null; + if (LooksLikeDateRange(trimmed) || LooksLikeSectionHeading(trimmed) || LooksLikeUrlOrEmail(trimmed)) return null; + trimmed = Regex.Replace(trimmed, @"\s+", " ").Trim(' ', '|', ',', '-', ':'); + return string.IsNullOrWhiteSpace(trimmed) ? null : trimmed; + } + + private static string? NormalizeCompanyName(string? value) + { + var trimmed = TrimOrNull(value); + if (trimmed is null) return null; + if (LooksLikeDateRange(trimmed) || LooksLikeSectionHeading(trimmed) || LooksLikeUrlOrEmail(trimmed)) return null; + if (trimmed.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase)) return null; + if (trimmed.Contains('.') && trimmed.Contains(' ')) return null; + trimmed = Regex.Replace(trimmed, @"\s+", " ").Trim(' ', '|', ',', '-', ':'); + return string.IsNullOrWhiteSpace(trimmed) ? null : trimmed; + } + + private static string? NormalizeLocationValue(string? value) + { + var trimmed = TrimOrNull(value); + if (trimmed is null) return null; + if (LooksLikeDateRange(trimmed) || LooksLikeSectionHeading(trimmed) || LooksLikeUrlOrEmail(trimmed)) return null; + if (trimmed.Any(char.IsDigit) || trimmed.Length > 80) return null; + + var normalized = Regex.Replace(trimmed, @"\s+", " ").Trim(' ', '|', ';', ':'); + var parts = normalized.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries); + if (parts.Length == 0 || parts.Length > 4) return null; + if (parts.Any(part => !Regex.IsMatch(part, @"^[\p{L}][\p{L}'’\-. ]+$"))) return null; + + return string.Join(", ", parts); + } + + private static string? NormalizeWebsite(string? value) + { + var trimmed = TrimOrNull(value); + if (trimmed is null) return null; + if (trimmed.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase)) return null; + + var candidate = trimmed; + if (!candidate.Contains("://", StringComparison.Ordinal)) candidate = $"https://{candidate}"; + if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri)) return null; + var host = uri.Host.Trim().Trim('.').ToLowerInvariant(); + if (string.IsNullOrWhiteSpace(host) || !Regex.IsMatch(host, @"^(?:[a-z0-9-]+\.)+[a-z]{2,}$", RegexOptions.IgnoreCase)) return null; + return host; + } + + private static string? NormalizeLinkedIn(string? value) + { + var trimmed = TrimOrNull(value); + if (trimmed is null) return null; + + var candidate = trimmed; + if (!candidate.Contains("://", StringComparison.Ordinal)) candidate = $"https://{candidate}"; + if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri)) return null; + if (!uri.Host.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase)) return null; + + var path = uri.AbsolutePath.TrimEnd('/'); + if (!Regex.IsMatch(path, @"^/(in|pub)/[^/]+(?:/[^/]+){0,2}$", RegexOptions.IgnoreCase)) return null; + return $"https://www.linkedin.com{path}"; + } + + private static string? NormalizeDateValue(string? value) + { + var trimmed = TrimOrNull(value); + return trimmed is not null && LooksLikeDateRange(trimmed) ? trimmed : null; + } + + private static bool LooksLikeDateRange(string value) + { + return Regex.IsMatch(value, @"^(?:\d{1,2}/\d{1,2}/\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|\d{4}|Present|Current)(?:\s*[-–]\s*(?:\d{1,2}/\d{1,2}/\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|\d{4}|Present|Current))?$", RegexOptions.IgnoreCase); + } + + private static bool LooksLikeUrlOrEmail(string value) + { + return value.Contains('@') + || value.Contains("www.", StringComparison.OrdinalIgnoreCase) + || value.Contains("http://", StringComparison.OrdinalIgnoreCase) + || value.Contains("https://", StringComparison.OrdinalIgnoreCase); + } + + private static bool LooksLikeSectionHeading(string value) + { + return value.Equals("Work Experience", StringComparison.OrdinalIgnoreCase) + || value.Equals("Experience", StringComparison.OrdinalIgnoreCase) + || value.Equals("Employment History", StringComparison.OrdinalIgnoreCase) + || value.Equals("Education", StringComparison.OrdinalIgnoreCase) + || value.Equals("Skills", StringComparison.OrdinalIgnoreCase) + || value.Equals("Languages", StringComparison.OrdinalIgnoreCase) + || value.Equals("Interests", StringComparison.OrdinalIgnoreCase) + || value.Equals("Contact", StringComparison.OrdinalIgnoreCase) + || value.Equals("Professional Summary", StringComparison.OrdinalIgnoreCase) + || value.Equals("Summary", StringComparison.OrdinalIgnoreCase); + } + + private static bool LooksLikeJobTitle(string value) + { + if (string.IsNullOrWhiteSpace(value) || LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value)) return false; + + return Regex.IsMatch(value, @"\b(developer|engineer|manager|lead|architect|consultant|specialist|analyst|administrator|coordinator|director|designer|intern|officer|owner|founder|teacher|researcher|writer|editor|producer|assistant|technician|supervisor|head)\b", RegexOptions.IgnoreCase) + || (value.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length <= 6 && !LooksLikeCompanyName(value)); + } + + private static bool LooksLikeCompanyName(string value) + { + if (string.IsNullOrWhiteSpace(value) || LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value)) return false; + + return Regex.IsMatch(value, @"\b(inc|llc|ltd|limited|plc|corp|corporation|company|group|university|college|council|municipality|kommune|bank|studio|agency|institute|hospital|school|technologies|technology|systems|solutions|consulting|consultants|partners|foundation|ministry|government)\b", RegexOptions.IgnoreCase) + || value.Contains('&') + || Regex.IsMatch(value, @"\b[A-Z]{2,}\b"); + } + private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education) { education ??= new StructuredCvEducation(); @@ -207,8 +393,13 @@ public static class StructuredCvProfileJson private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language) { language ??= new StructuredCvLanguage(); - language.Name = TrimOrNull(language.Name); - language.Level = TrimOrNull(language.Level); + + var originalName = TrimOrNull(language.Name); + var normalizedName = HumanLanguageCatalog.NormalizeLanguageName(originalName); + var normalizedLevel = HumanLanguageCatalog.ExtractLevel(language.Level) ?? HumanLanguageCatalog.ExtractLevel(originalName); + + language.Name = normalizedName is not null && normalizedLevel is not null ? normalizedName : null; + language.Level = normalizedLevel; language.Notes = TrimOrNull(language.Notes); return language; } @@ -360,7 +551,13 @@ public static class StructuredCvProfileJson } } - return new StructuredCvLanguage { Name = name.NullIfWhitespace(), Level = level, Notes = notes }; + var normalizedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(item); + return new StructuredCvLanguage + { + Name = normalizedLevel is not null ? HumanLanguageCatalog.NormalizeLanguageName(name) : null, + Level = normalizedLevel, + Notes = notes, + }; }) .Where(language => !string.IsNullOrWhiteSpace(language.Name)) .ToList(); diff --git a/README.md b/README.md index 970b73e..aaf1baf 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re ## Quickstart (Docker) -This runs: frontend (nginx), backend API, and the AI service. +This runs: frontend (nginx), backend API, the local AI service, and an Ollama container for hybrid CV block classification. 1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`). @@ -108,9 +108,15 @@ The API calls a local FastAPI service to generate summaries. If it’s not runni With Docker (recommended): ```bash -docker compose up --build ai-service +# One command for local Ollama startup + pull + AI-service restart +OLLAMA_MODEL=qwen2.5:7b ./scripts/start-ollama-cv.sh + +# Then start the rest of the app if needed +docker compose up --build -d backend frontend ``` +The first Ollama startup is usually quick, but the first model pull and first generation can take a while. After the model is cached in the `ollama_data` volume, later restarts are much faster. + Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`). ## Configuration diff --git a/deploy/README.md b/deploy/README.md index 6fc78ba..f4ac278 100644 --- a/deploy/README.md +++ b/deploy/README.md @@ -52,6 +52,8 @@ AUTH_ADMIN_EMAIL=you@example.com AUTH_ADMIN_PASSWORD=replace_with_strong_password APP_PUBLIC_BASE_URL=https://your-domain.example AI_SERVICE_BASE_URL=http://ai-service:8001 +OLLAMA_BASE_URL=http://ollama:11434 +OLLAMA_MODEL=qwen2.5:7b EMAIL_FOLLOWUPREMINDERS_ENABLED=true EMAIL_FOLLOWUPREMINDERS_UPCOMINGDAYS=2 # Optional backward-compatible alias if older config still references the previous name: @@ -87,7 +89,8 @@ If this app is going to be a real production service on Ubuntu: 2. Gitea Actions runs tests 3. if green, workflow uploads repo to server 4. `deploy/deploy.sh` links `/opt/job-tracker/shared/.env` into the repo checkout, then runs `docker compose build && docker compose up -d` -5. workflow checks service status after deployment +5. if `OLLAMA_MODEL` is set, the deploy script waits for Ollama, pulls the configured model if missing, then restarts `ai-service` so hybrid CV classification can use it +6. workflow checks service status after deployment ## Post-deploy verification you should also do manually the first time - confirm reverse proxy routes to the frontend correctly @@ -96,3 +99,4 @@ If this app is going to be a real production service on Ubuntu: - confirm AI service container is reachable from backend - confirm reminder and admin/system pages load - verify follow-up reminder emails are enabled only when intended and that links open the correct job/tab +hat links open the correct job/tab diff --git a/deploy/deploy.sh b/deploy/deploy.sh index 2846236..495e440 100644 --- a/deploy/deploy.sh +++ b/deploy/deploy.sh @@ -45,6 +45,11 @@ build_with_recovery # Force recreation so updated port mappings, env vars, and container config always apply on deploy. compose up -d --force-recreate --remove-orphans +if [ -n "${OLLAMA_MODEL:-}" ]; then + echo "Post-deploy Ollama warmup enabled for model: ${OLLAMA_MODEL}" + ./scripts/start-ollama-cv.sh +fi + sleep 5 compose ps diff --git a/docker-compose.yml b/docker-compose.yml index 4336c73..9672408 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -71,8 +71,13 @@ services: build: context: ./tools/summarizer dockerfile: Dockerfile + environment: + - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-http://ollama:11434} + - OLLAMA_MODEL=${OLLAMA_MODEL:-qwen2.5:7b} ports: - "8001:8001" + depends_on: + - ollama networks: - default - shared_services @@ -83,8 +88,29 @@ services: timeout: 10s retries: 3 + ollama: + image: ollama/ollama:latest + ports: + - "11434:11434" + environment: + - OLLAMA_HOST=0.0.0.0:11434 + volumes: + - ollama_data:/root/.ollama + networks: + - default + - shared_services + restart: unless-stopped + gpus: all + healthcheck: + test: ["CMD", "ollama", "list"] + interval: 20s + timeout: 15s + retries: 10 + start_period: 20s + volumes: jobtracker_data: + ollama_data: networks: shared_services: diff --git a/job-tracker-ui/src/pages/ProfilePage.tsx b/job-tracker-ui/src/pages/ProfilePage.tsx index 0d755c2..d8c405f 100644 --- a/job-tracker-ui/src/pages/ProfilePage.tsx +++ b/job-tracker-ui/src/pages/ProfilePage.tsx @@ -1,8 +1,9 @@ import React, { useCallback, useEffect, useMemo, useRef, useState } from "react"; -import { Alert, Avatar, Box, Button, Chip, Divider, FormControl, InputLabel, LinearProgress, MenuItem, Paper, Select, TextField, Typography } from "@mui/material"; +import { Accordion, AccordionDetails, AccordionSummary, Alert, Avatar, Box, Button, Chip, Divider, FormControl, InputLabel, LinearProgress, MenuItem, Paper, Select, TextField, Typography } from "@mui/material"; import DeleteOutlineIcon from "@mui/icons-material/DeleteOutline"; +import ExpandMoreIcon from "@mui/icons-material/ExpandMore"; import PhotoCameraOutlinedIcon from "@mui/icons-material/PhotoCameraOutlined"; import { api } from "../api"; @@ -399,22 +400,40 @@ export default function ProfilePage() { > {reprocessingCv ? t("profileCvReprocessing") : t("profileCvReprocess")} </Button> - <Button variant="text" disabled={!profileCvText.trim()} onClick={() => navigator.clipboard.writeText(profileCvText)}> - {t("profileCopyCvText")} - </Button> </Box> </Box> {uploadingCv ? <LinearProgress sx={{ mb: 1.5 }} /> : null} - <TextField - label={t("profileCvTextLabel")} - value={profileCvText} - onChange={(e) => setProfileCvText(e.target.value)} - helperText={t("profileCvTextHelp")} - multiline - minRows={12} - disabled={!isLocal} - fullWidth - /> + <Alert severity="info" sx={{ mb: 2, borderRadius: 2.5 }}> + {t("profileCvStructuredDefaultHint")} + </Alert> + <Accordion disableGutters elevation={0} sx={{ mb: 2, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper", "&:before": { display: "none" } }}> + <AccordionSummary expandIcon={<ExpandMoreIcon />}> + <Box sx={{ display: "flex", justifyContent: "space-between", gap: 1.5, alignItems: "center", width: "100%", pr: 1 }}> + <Box> + <Typography variant="subtitle1" sx={{ fontWeight: 800 }}>{t("profileCvRawPanelTitle")}</Typography> + <Typography variant="body2" sx={{ color: "text.secondary" }}>{t("profileCvRawPanelHelp")}</Typography> + </Box> + <Chip size="small" label={t("profileCvSectionWordCount", { count: cvWordCount })} /> + </Box> + </AccordionSummary> + <AccordionDetails> + <TextField + label={t("profileCvTextLabel")} + value={profileCvText} + onChange={(e) => setProfileCvText(e.target.value)} + helperText={t("profileCvTextHelp")} + multiline + minRows={12} + disabled={!isLocal} + fullWidth + /> + <Box sx={{ mt: 1.5, display: "flex", justifyContent: "flex-end" }}> + <Button variant="text" disabled={!profileCvText.trim()} onClick={() => navigator.clipboard.writeText(profileCvText)}> + {t("profileCopyCvText")} + </Button> + </Box> + </AccordionDetails> + </Accordion> <Box sx={{ mt: 2, p: 1.5, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper" }}> <Box sx={{ display: "flex", justifyContent: "space-between", gap: 2, flexWrap: "wrap", alignItems: "center", mb: 1.5 }}> <Box> diff --git a/job-tracker-ui/src/profile-page.test.tsx b/job-tracker-ui/src/profile-page.test.tsx index 94f322c..1f85c71 100644 --- a/job-tracker-ui/src/profile-page.test.tsx +++ b/job-tracker-ui/src/profile-page.test.tsx @@ -147,10 +147,17 @@ test('profile page loads persisted structured cv and can re-parse it', async () expect(screen.getByText(/extraction history/i)).toBeInTheDocument(); expect(screen.getByText(/resume.pdf/i)).toBeInTheDocument(); expect(screen.getByText(/current run/i)).toBeInTheDocument(); + expect(screen.getAllByText(/original extraction/i).length).toBeGreaterThan(0); + const originalExtractionToggle = screen.getByRole('button', { name: /original extraction/i }); + expect(originalExtractionToggle).toHaveAttribute('aria-expanded', 'false'); expect(screen.getAllByText(/professional summary/i).length).toBeGreaterThan(0); expect(screen.getByLabelText(/full name/i)).toHaveValue('Demo User'); expect(screen.getByText(/high 92%/i)).toBeInTheDocument(); + fireEvent.click(originalExtractionToggle); + expect(originalExtractionToggle).toHaveAttribute('aria-expanded', 'true'); + expect(await screen.findByLabelText(/profile cv \/ master resume text/i)).toHaveValue('Professional Summary\nBuilt backend systems'); + const analyzeButton = screen.getByRole('button', { name: /analyze sections/i }); await waitFor(() => expect(analyzeButton).toBeEnabled()); fireEvent.click(analyzeButton); diff --git a/scripts/start-ollama-cv.sh b/scripts/start-ollama-cv.sh new file mode 100755 index 0000000..93e5b32 --- /dev/null +++ b/scripts/start-ollama-cv.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$(dirname "$0")/.." + +MODEL="${OLLAMA_MODEL:-qwen2.5:7b}" +OLLAMA_WAIT_SECONDS="${OLLAMA_WAIT_SECONDS:-180}" +PULL_WAIT_SECONDS="${OLLAMA_PULL_WAIT_SECONDS:-1800}" + +compose() { + docker compose "$@" +} + +wait_for_ollama() { + local deadline=$((SECONDS + OLLAMA_WAIT_SECONDS)) + while [ "$SECONDS" -lt "$deadline" ]; do + if compose exec -T ollama ollama list >/dev/null 2>&1; then + return 0 + fi + sleep 3 + done + return 1 +} + +model_present() { + compose exec -T ollama ollama list 2>/dev/null | awk 'NR>1 {print $1}' | grep -Fx "$MODEL" >/dev/null 2>&1 +} + +wait_for_model() { + local deadline=$((SECONDS + PULL_WAIT_SECONDS)) + while [ "$SECONDS" -lt "$deadline" ]; do + if model_present; then + return 0 + fi + sleep 5 + done + return 1 +} + +echo "Starting Ollama service..." +compose up -d ollama + +if ! wait_for_ollama; then + echo "Ollama did not become ready within ${OLLAMA_WAIT_SECONDS}s." + compose logs --tail=200 ollama || true + exit 1 +fi + +echo "Ollama is responding." + +if model_present; then + echo "Model already present: $MODEL" +else + echo "Pulling Ollama model: $MODEL" + compose exec -T ollama ollama pull "$MODEL" || { + echo "Model pull command failed." + compose logs --tail=200 ollama || true + exit 1 + } +fi + +if ! wait_for_model; then + echo "Model ${MODEL} did not appear within ${PULL_WAIT_SECONDS}s." + compose exec -T ollama ollama list || true + exit 1 +fi + +echo "Ollama model ready: $MODEL" + +echo "Restarting AI service so it can use the ready Ollama model." +compose up -d ai-service + +if ! compose ps ai-service --format '{{.State}}' 2>/dev/null | head -n 1 | tr '[:upper:]' '[:lower:]' | grep -qx 'running'; then + echo "AI service is not running after Ollama warmup." + compose logs --tail=200 ai-service || true + exit 1 +fi + +echo "Ollama warmup complete." diff --git a/tools/summarizer/README.md b/tools/summarizer/README.md index 8b26dc2..e4bf991 100644 --- a/tools/summarizer/README.md +++ b/tools/summarizer/README.md @@ -8,6 +8,7 @@ This service runs a local Hugging Face summarization model and also exposes docu - OCR fallback for scanned PDFs - OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`) - DOCX / TXT / MD extraction +- optional Ollama-backed CV block classification for harder sectioning ## Install @@ -36,8 +37,30 @@ The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can b - `GET /health` — health check and runtime capabilities - `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }` - `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata +- `POST /cv/classify-block` — JSON body `{ "block": "..." }`, uses Ollama when `OLLAMA_MODEL` is configured -## Notes -- Model weights are downloaded on first run. +## Ollama +Set these before starting the service if you want the hybrid CV classifier enabled: + +```bash +export OLLAMA_BASE_URL=http://ollama:11434 +export OLLAMA_MODEL=qwen2.5:7b +``` + +Choose the model by setting `OLLAMA_MODEL` and then warming it with the helper script: + +```bash +OLLAMA_MODEL=qwen2.5:7b ./scripts/start-ollama-cv.sh +``` + +Equivalent manual flow: + +```bash +docker compose up -d ollama +docker compose exec ollama ollama pull qwen2.5:7b +docker compose up -d ai-service +``` + +- Model weights are downloaded on first pull. - OCR quality depends on scan quality and language support. - Default OCR language is English (`eng`). diff --git a/tools/summarizer/app.py b/tools/summarizer/app.py index da0b264..a0447df 100644 --- a/tools/summarizer/app.py +++ b/tools/summarizer/app.py @@ -8,9 +8,13 @@ from docx import Document import fitz import hashlib import io +import json +import os import re import torch import pytesseract +from urllib import request as urllib_request +from urllib.error import URLError, HTTPError app = FastAPI(title="Local AI Service") @@ -20,6 +24,8 @@ MAX_CONTEXT_CHARS = 2200 MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024 OCR_LANGUAGES = "eng" IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"} +OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/") +OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "") def _load_runtime(): @@ -44,11 +50,47 @@ class SummarizeRequest(BaseModel): top_skills: int = Field(default=8, ge=3, le=12) +class CvClassifyBlockRequest(BaseModel): + block: str = Field(min_length=1, max_length=6000) + + def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str: h = hashlib.sha256(text.encode("utf-8")).hexdigest() return f"{h}:{max_length}:{min_length}:{top_skills}" +def _ollama_status(): + configured = bool(OLLAMA_MODEL) + if not configured: + return { + "ollama_configured": False, + "ollama_reachable": False, + "ollama_model": None, + "ollama_model_available": False, + } + + req = urllib_request.Request(f"{OLLAMA_BASE_URL}/api/tags", method="GET") + try: + with urllib_request.urlopen(req, timeout=5) as response: + body = json.loads(response.read().decode("utf-8")) + except Exception: + return { + "ollama_configured": True, + "ollama_reachable": False, + "ollama_model": OLLAMA_MODEL, + "ollama_model_available": False, + } + + models = body.get("models") or [] + names = {item.get("name") for item in models if isinstance(item, dict)} + return { + "ollama_configured": True, + "ollama_reachable": True, + "ollama_model": OLLAMA_MODEL, + "ollama_model_available": OLLAMA_MODEL in names, + } + + @app.get("/health") async def health(): return { @@ -59,6 +101,7 @@ async def health(): "gpu_name": GPU_NAME, "ocr_available": True, "ocr_languages": OCR_LANGUAGES, + **_ollama_status(), } @@ -272,6 +315,93 @@ def _model_summarize(text: str, max_length: int, min_length: int) -> str: return tokenizer.decode(outputs[0], skip_special_tokens=True).strip() +def _ollama_generate_json(prompt: str): + if not OLLAMA_MODEL: + raise HTTPException(status_code=503, detail="OLLAMA_MODEL is not configured.") + + payload = json.dumps({ + "model": OLLAMA_MODEL, + "prompt": prompt, + "stream": False, + "format": "json", + "options": {"temperature": 0.1} + }).encode("utf-8") + + req = urllib_request.Request( + f"{OLLAMA_BASE_URL}/api/generate", + data=payload, + headers={"Content-Type": "application/json"}, + method="POST", + ) + + try: + with urllib_request.urlopen(req, timeout=30) as response: + body = json.loads(response.read().decode("utf-8")) + except HTTPError as ex: + raise HTTPException(status_code=502, detail=f"Ollama request failed with {ex.code}.") + except URLError as ex: + raise HTTPException(status_code=503, detail=f"Ollama is unreachable: {ex.reason}.") + + raw = (body.get("response") or "").strip() + if not raw: + raise HTTPException(status_code=502, detail="Ollama returned an empty response.") + + try: + return json.loads(raw) + except json.JSONDecodeError: + start = raw.find("{") + end = raw.rfind("}") + if start >= 0 and end > start: + return json.loads(raw[start:end + 1]) + raise HTTPException(status_code=502, detail="Ollama did not return valid JSON.") + + +@app.post("/cv/classify-block") +async def classify_cv_block(req: CvClassifyBlockRequest): + prompt = f""" +You classify one CV text block into structured JSON. +Return ONLY valid JSON with this exact shape: +{{ + "section": "Contact|Professional Summary|Work Experience|Education|Skills|Languages|Interests|Other", + "confidence": 0.0, + "reason": "short reason", + "title": string|null, + "company": string|null, + "location": string|null, + "start": string|null, + "end": string|null, + "bullets": string[] +}} + +Rules: +- Preserve facts only. +- section must be one of the listed values. +- Use Work Experience only for job/employment blocks. +- For Contact blocks, keep title/company/start/end null and bullets empty. +- For non-work blocks, title/company/start/end should usually be null. +- location must look like a place, not a sentence. +- dates must be one of: year, month+year, dd/mm/yyyy, Present, Current. +- bullets should only be job tasks/achievements, not titles, companies, dates, or headings. +- If unsure, choose Other and keep fields null/empty. + +Block: +{req.block.strip()} +""".strip() + + parsed = _ollama_generate_json(prompt) + return { + "section": parsed.get("section") or "Other", + "confidence": parsed.get("confidence"), + "reason": parsed.get("reason"), + "title": parsed.get("title"), + "company": parsed.get("company"), + "location": parsed.get("location"), + "start": parsed.get("start"), + "end": parsed.get("end"), + "bullets": parsed.get("bullets") or [], + } + + @app.post("/summarize") async def summarize(req: SummarizeRequest): if req.min_length >= req.max_length: