Improve CV parsing and profile editor flow

2026-03-29 14:29:18 +02:00
parent 99fc94bc18
commit 44000f96f2
18 changed files with 1028 additions and 44 deletions
@@ -9,6 +9,9 @@ GOOGLE_GMAIL_CLIENT_SECRET=CHANGE_ME_GOOGLE_OAUTH_CLIENT_SECRET
 # Optional. If omitted, the backend uses https://<your-domain>/api/gmail/oauth/callback
 GOOGLE_GMAIL_REDIRECT_URI=
 AI_SERVICE_BASE_URL=http://ai-service:8001
 # Optional: enables hybrid CV block classification in the local AI service.
 OLLAMA_BASE_URL=http://ollama:11434
 OLLAMA_MODEL=qwen2.5:7b
 # Optional: only needed if you want the UI to call a non-default API base URL.
 # In production the UI defaults to `/api`.
@@ -280,7 +280,7 @@ public sealed class ProfileCvControllerTests
    [Fact]
    public async Task Upload_populates_structured_fields_from_flattened_cv_when_ai_json_is_invalid()
    {
-        var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1.";
+        var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1, C#, SQL, and public speaking.";
        var user = new ApplicationUser { Id = "user-1" };
        var userManager = CreateUserManager();
@@ -320,9 +320,164 @@ public sealed class ProfileCvControllerTests
        Assert.Contains(structured.Interests, item => item.Contains("board games", StringComparison.OrdinalIgnoreCase) || item.Contains("cooking", StringComparison.OrdinalIgnoreCase));
        Assert.Contains(structured.Languages, item => item.Name != null && item.Name.Equals("English", StringComparison.OrdinalIgnoreCase));
        Assert.Contains(structured.Languages, item => item.Name != null && item.Name.StartsWith("Norwegian", StringComparison.OrdinalIgnoreCase));
        Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Equals("C#", StringComparison.OrdinalIgnoreCase));
        Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Equals("SQL", StringComparison.OrdinalIgnoreCase));
        Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Contains("public speaking", StringComparison.OrdinalIgnoreCase));
        Assert.DoesNotContain(structured.Sections, section => section.Name == "General");
    }
    // Of the five language entries in the payload only English and Norwegian are expected to
    // survive normalization, both with the level "Native".
    [Fact]
    public void Structured_cv_normalization_keeps_human_languages_and_drops_skill_noise()
    {
        const string cvJson = """
        {
          "version": "1",
          "contact": {},
          "summary": [],
          "jobs": [],
          "education": [],
          "skills": [],
          "languages": [
            { "name": "English", "level": "Native" },
            { "name": "Native Norwegian speaker", "level": null },
            { "name": "French", "level": null },
            { "name": "C#", "level": "Advanced" },
            { "name": "Leadership", "level": null }
          ],
          "interests": [],
          "otherSections": []
        }
        """;

        var profile = StructuredCvProfileJson.Deserialize(cvJson);

        // Sort by name so the assertions do not depend on the order normalization emits.
        var keptLanguages = profile.Languages
            .OrderBy(language => language.Name, StringComparer.OrdinalIgnoreCase)
            .ToList();

        Assert.Equal(2, keptLanguages.Count);

        var english = keptLanguages[0];
        Assert.Equal("English", english.Name);
        Assert.Equal("Native", english.Level);

        var norwegian = keptLanguages[1];
        Assert.Equal("Norwegian", norwegian.Name);
        Assert.Equal("Native", norwegian.Level);
    }
    // Two malformed jobs go in: one with title and company swapped (and both repeated as bullets
    // next to a date-range bullet), one with the company embedded in the title via " at ".
    // Normalization is expected to untangle both and keep only the real task bullets.
    [Fact]
    public void Structured_cv_normalization_separates_job_title_company_and_tasks()
    {
        const string cvJson = """
        {
          "version": "1",
          "contact": {},
          "summary": [],
          "jobs": [
            {
              "title": "Acme Ltd",
              "company": "Senior Backend Developer",
              "location": "Oslo",
              "start": "2022",
              "end": "2024",
              "isCurrent": false,
              "bullets": [
                "Senior Backend Developer",
                "Acme Ltd",
                "2022 - 2024",
                "Built API integrations for recruiter workflows and reduced manual follow-up churn."
              ],
              "skills": [".NET", "SQL"]
            },
            {
              "title": "Lead Engineer at Northwind Council",
              "company": null,
              "location": "Remote",
              "start": "2020",
              "end": "Present",
              "isCurrent": true,
              "bullets": [
                "Led platform delivery across case-management and reporting surfaces.",
                "Skills: C#, SQL"
              ],
              "skills": ["C#", "SQL"]
            }
          ],
          "education": [],
          "skills": [],
          "languages": [],
          "interests": [],
          "otherSections": []
        }
        """;

        var profile = StructuredCvProfileJson.Deserialize(cvJson);

        Assert.Equal(2, profile.Jobs.Count);

        // Swapped title/company are put back in place; echo bullets are removed.
        var swappedJob = profile.Jobs[0];
        var swappedJobBullets = new[] { "Built API integrations for recruiter workflows and reduced manual follow-up churn." };
        Assert.Equal("Senior Backend Developer", swappedJob.Title);
        Assert.Equal("Acme Ltd", swappedJob.Company);
        Assert.Equal(swappedJobBullets, swappedJob.Bullets);

        // "<title> at <company>" is split; the "Skills: ..." bullet is removed.
        var embeddedCompanyJob = profile.Jobs[1];
        var embeddedCompanyJobBullets = new[] { "Led platform delivery across case-management and reporting surfaces." };
        Assert.Equal("Lead Engineer", embeddedCompanyJob.Title);
        Assert.Equal("Northwind Council", embeddedCompanyJob.Company);
        Assert.Equal(embeddedCompanyJobBullets, embeddedCompanyJob.Bullets);
    }
    // Pins how normalization treats contact links, locations and job dates: plausible values are
    // kept (or canonicalized), implausible ones ("Remote 123", "Spring 2024", "Later") become null.
    [Fact]
    public void Structured_cv_normalization_hardens_contact_links_locations_and_dates()
    {
        const string cvJson = """
        {
          "version": "1",
          "contact": {
            "location": "Tønsberg, Norway",
            "website": "https://cesnimda.co.uk/about",
            "linkedin": "linkedin.com/in/demo-user?trk=foo"
          },
          "summary": [],
          "jobs": [
            {
              "title": "System Developer",
              "company": "Warwickshire County Council",
              "location": "Warwickshire, England, UK",
              "start": "Sept 2023",
              "end": "1/1/2024",
              "isCurrent": false,
              "bullets": ["Built APIs"],
              "skills": []
            },
            {
              "title": "Developer",
              "company": "Demo Co",
              "location": "Remote 123",
              "start": "Spring 2024",
              "end": "Later",
              "isCurrent": false,
              "bullets": ["Kept services running"],
              "skills": []
            }
          ],
          "education": [],
          "skills": [],
          "languages": [],
          "interests": [],
          "otherSections": []
        }
        """;

        var profile = StructuredCvProfileJson.Deserialize(cvJson);

        // Contact: location untouched, website reduced to its host, LinkedIn canonicalized.
        var contact = profile.Contact;
        Assert.Equal("Tønsberg, Norway", contact.Location);
        Assert.Equal("cesnimda.co.uk", contact.Website);
        Assert.Equal("https://www.linkedin.com/in/demo-user", contact.LinkedIn);

        // First job: every value is plausible and is kept verbatim.
        var plausibleJob = profile.Jobs[0];
        Assert.Equal("Warwickshire, England, UK", plausibleJob.Location);
        Assert.Equal("Sept 2023", plausibleJob.Start);
        Assert.Equal("1/1/2024", plausibleJob.End);

        // Second job: location and both dates are rejected.
        var implausibleJob = profile.Jobs[1];
        Assert.Null(implausibleJob.Location);
        Assert.Null(implausibleJob.Start);
        Assert.Null(implausibleJob.End);
    }
    [Fact]
    public async Task Parse_returns_structured_cv_and_persists_it()
    {
@@ -124,6 +124,10 @@ public sealed class AdminSystemController : ControllerBase
                GpuName: null,
                OcrAvailable: false,
                OcrLanguages: null,
                OllamaConfigured: null,
                OllamaReachable: null,
                OllamaModel: null,
                OllamaModelAvailable: null,
                HealthLatencyMs: null,
                ProbeLatencyMs: null,
                LastProbeAt: null,
@@ -61,13 +61,15 @@ public sealed class ProfileCvController : ControllerBase
    private readonly UserManager<ApplicationUser> _users;
    private readonly ISummarizerService _aiService;
    private readonly ICvAiClassifier _cvAiClassifier;
    private readonly JobTrackerContext _db;
    private readonly AppPaths _paths;
-    public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths)
+    public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths, ICvAiClassifier? cvAiClassifier = null)
    {
        _users = users;
        _aiService = aiService;
        // The classifier is an optional trailing argument; when it is omitted (or null) the shared
        // no-op instance is stored instead, so _cvAiClassifier is never null.
        _cvAiClassifier = cvAiClassifier ?? NoOpCvAiClassifier.Instance;
        _db = db;
        _paths = paths;
    }
@@ -338,14 +340,7 @@ public sealed class ProfileCvController : ControllerBase
    private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
    {
        var parseSource = NormalizeTextForStructuredParsing(text);
-        var fallbackSections = ParseSections(parseSource)
+        var fallbackSections = await BuildFallbackSectionsAsync(parseSource, cancellationToken);
            .Select(section => new StructuredCvSection
            {
                Name = section.Name,
                Content = section.Content,
                WordCount = CountWords(section.Content),
            })
            .ToList();
        var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
        AnnotateStructuredCv(sectionFallback, "repair", 0.56);
@@ -729,12 +724,19 @@ public sealed class ProfileCvController : ControllerBase
    private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
    {
        var languages = new List<StructuredCvLanguage>();
-        foreach (Match match in Regex.Matches(content, @"\b(English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b(?:[^\n.,;:]*?\b(Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2|Native speaker)\b)?", RegexOptions.IgnoreCase))
+        var candidates = Regex.Split(content.Replace("\r\n", "\n"), @"[\n,;]+|(?<=[.!?])\s+")
            .Select(item => item.Trim())
            .Where(item => item.Length > 1);
        foreach (var candidate in candidates)
        {
-            var name = NullIfWhitespace(match.Groups[1].Value);
+            var level = HumanLanguageCatalog.ExtractLevel(candidate);
-            var level = NullIfWhitespace(match.Groups[2].Value);
+            if (level is null) continue;
-            if (name is null) continue;
+
-            languages.Add(new StructuredCvLanguage { Name = name, Level = level });
+            foreach (var name in HumanLanguageCatalog.ExtractLanguageNames(candidate))
            {
                languages.Add(new StructuredCvLanguage { Name = name, Level = level });
            }
        }
        return languages
@@ -872,6 +874,86 @@ public sealed class ProfileCvController : ControllerBase
            .ToList();
    }
    /// <summary>
    /// Builds the fallback section list for a CV. Heading-based parsing is tried first; block
    /// classification is only consulted when that parsing produced nothing but the catch-all
    /// "General" section.
    /// </summary>
    /// <param name="parseSource">CV text already normalized for structured parsing.</param>
    /// <param name="cancellationToken">Forwarded to the block classifier.</param>
    /// <returns>
    /// The parsed sections when they contain at least one named section, otherwise the classified
    /// sections when those contain at least one named section, otherwise the parsed sections.
    /// </returns>
    private async Task<List<StructuredCvSection>> BuildFallbackSectionsAsync(string parseSource, CancellationToken cancellationToken)
    {
        var parsed = ParseSections(parseSource)
            .Select(section => new StructuredCvSection
            {
                Name = section.Name,
                Content = section.Content,
                WordCount = CountWords(section.Content),
            })
            .ToList();
        if (parsed.Any(IsNamedSection)) return parsed;

        var aiSections = await ClassifyBlocksIntoSectionsAsync(parseSource, cancellationToken);

        // Block classification skips blocks shorter than 24 characters and files everything it
        // cannot place under "General". If it found no named section either, its output is just a
        // lossy copy of the parsed text, so the complete parsed result is the better fallback.
        return aiSections.Any(IsNamedSection) ? aiSections : parsed;

        static bool IsNamedSection(StructuredCvSection section) =>
            !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase);
    }
    /// <summary>
    /// Splits the CV text into blank-line separated blocks, asks the configured classifier for a
    /// section name per block, and merges blocks that share a section into one
    /// <see cref="StructuredCvSection"/>.
    /// </summary>
    /// <param name="parseSource">CV text already normalized for structured parsing.</param>
    /// <param name="cancellationToken">Checked before each block and forwarded to the classifier.</param>
    /// <returns>
    /// One section per distinct section name, in order of first appearance. Blocks shorter than
    /// 24 characters are ignored; blocks with no usable classification (or classified as "Other")
    /// are collected under "General". Empty when no block qualifies.
    /// </returns>
    /// <exception cref="OperationCanceledException">The token was cancelled between blocks.</exception>
    private async Task<List<StructuredCvSection>> ClassifyBlocksIntoSectionsAsync(string parseSource, CancellationToken cancellationToken)
    {
        var blocks = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n")
            .Select(block => block.Trim())
            .Where(block => block.Length >= 24)
            .ToList();
        if (blocks.Count == 0) return new List<StructuredCvSection>();

        var sectionBuckets = new List<StructuredCvSection>();
        foreach (var block in blocks)
        {
            // One classifier call per block: stop early once the caller has given up.
            cancellationToken.ThrowIfCancellationRequested();

            var classification = await _cvAiClassifier.ClassifyBlockAsync(block, cancellationToken);
            var sectionName = classification?.Section;
            if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical))
            {
                sectionName = canonical;
            }
            if (string.IsNullOrWhiteSpace(sectionName) || string.Equals(sectionName, "Other", StringComparison.OrdinalIgnoreCase))
            {
                sectionName = "General";
            }

            var content = block;
            if (string.Equals(sectionName, "Work Experience", StringComparison.OrdinalIgnoreCase) && classification is not null)
            {
                // Rebuild the block as "### title / meta line / bullets" from the structured
                // fields; the raw block text is kept when none of those fields are present.
                var lines = new List<string>();
                if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}");
                var endIsCurrent = string.Equals(classification.End, "Present", StringComparison.OrdinalIgnoreCase) || string.Equals(classification.End, "Current", StringComparison.OrdinalIgnoreCase);
                var dateRange = FormatDateRangeForSection(classification.Start, classification.End, endIsCurrent);
                var meta = string.Join(" | ", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value)));
                if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta);
                if (classification.Bullets is not null)
                {
                    lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}"));
                }
                if (lines.Count > 0) content = string.Join("\n", lines);
            }

            // Section names are compared case-insensitively everywhere else in this method, so the
            // merge must be too; otherwise "skills" and "Skills" would become separate sections.
            var existing = sectionBuckets.FirstOrDefault(section => string.Equals(section.Name, sectionName, StringComparison.OrdinalIgnoreCase));
            if (existing is null)
            {
                sectionBuckets.Add(new StructuredCvSection { Name = sectionName, Content = content, WordCount = CountWords(content) });
            }
            else
            {
                existing.Content = $"{existing.Content}\n\n{content}".Trim();
                existing.WordCount = CountWords(existing.Content);
            }
        }
        return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
    }
    /// <summary>
    /// Formats a job's date range for the section text.
    /// </summary>
    /// <param name="start">Start date text; may be null or blank.</param>
    /// <param name="end">End date text; may be null or blank.</param>
    /// <param name="isCurrent">When true the range always ends in "Present".</param>
    /// <returns>
    /// <c>null</c> when both dates are blank, <paramref name="end"/> alone when only the start is
    /// blank, otherwise "<c>start - end</c>" with "Present" standing in for a blank end.
    /// </returns>
    private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent)
    {
        var hasStart = !string.IsNullOrWhiteSpace(start);
        var hasEnd = !string.IsNullOrWhiteSpace(end);

        if (!hasStart && !hasEnd) return null;
        if (!hasStart) return end;

        // A blank end is treated like a missing one, so the result is never a dangling "2020 - ".
        var endLabel = isCurrent || !hasEnd ? "Present" : end;
        return $"{start} - {endLabel}";
    }
    private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
    {
        var normalized = text.Trim();
@@ -132,6 +132,7 @@ builder.Services.AddHttpClient("ai-service", client =>
 builder.Services.AddMemoryCache();
 builder.Services.AddSingleton<ISummarizerService, SummarizerService>();
 builder.Services.AddSingleton<ICvAiClassifier, CvAiClassifier>();
 builder.Services.AddSingleton<IGoogleTokenValidator, GoogleTokenValidator>();
 builder.Services.AddScoped<IGmailOAuthService, GmailOAuthService>();
@@ -0,0 +1,65 @@
 using System.Net.Http;
 using System.Text;
 using System.Text.Json;
 namespace JobTrackerApi.Services;
 /// <summary>
 /// Classification of a single CV text block, in the shape <see cref="CvAiClassifier"/>
 /// deserializes from the AI service's <c>/cv/classify-block</c> response. Every member is
 /// nullable, so any part of the response may be absent.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the exact meaning and value range of <c>Confidence</c> and <c>Reason</c> are
 /// defined by the AI service and are not visible here — confirm against its contract.
 /// </remarks>
 public sealed record CvBlockClassificationResult(
    string? Section,
    double? Confidence,
    string? Reason,
    string? Title,
    string? Company,
    string? Location,
    string? Start,
    string? End,
    List<string>? Bullets);
 /// <summary>
 /// Classifies one block of CV text into a section, optionally with structured details.
 /// </summary>
 public interface ICvAiClassifier
 {
    /// <summary>
    /// Classifies <paramref name="block"/>.
    /// </summary>
    /// <param name="block">The CV text block to classify.</param>
    /// <param name="cancellationToken">Token for cancelling the classification.</param>
    /// <returns>The classification, or <c>null</c> when none is available.</returns>
    Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default);
 }
 /// <summary>
 /// <see cref="ICvAiClassifier"/> that posts each block to the <c>/cv/classify-block</c> endpoint
 /// of the named "ai-service" HTTP client. Classification is best-effort: any failure yields
 /// <c>null</c> rather than an exception.
 /// </summary>
 public sealed class CvAiClassifier : ICvAiClassifier
 {
    // JsonSerializerOptions caches serialization metadata per instance, so it is created once and
    // shared instead of being rebuilt for every block.
    private static readonly JsonSerializerOptions ResponseJsonOptions = new(JsonSerializerDefaults.Web)
    {
        PropertyNameCaseInsensitive = true
    };

    private readonly IHttpClientFactory _httpClientFactory;

    public CvAiClassifier(IHttpClientFactory httpClientFactory)
    {
        _httpClientFactory = httpClientFactory;
    }

    /// <summary>
    /// Sends <paramref name="block"/> to the AI service and returns its classification.
    /// </summary>
    /// <param name="block">The CV text block to classify; blank input short-circuits to <c>null</c>.</param>
    /// <param name="cancellationToken">Token for cancelling the HTTP call and response parsing.</param>
    /// <returns>
    /// The deserialized classification, or <c>null</c> when the block is blank, the service
    /// answers with a non-success status, or the call or parsing fails for any other reason.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled by the caller.
    /// </exception>
    public async Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrWhiteSpace(block)) return null;
        try
        {
            var client = _httpClientFactory.CreateClient("ai-service");
            var payload = JsonSerializer.Serialize(new { block });
            using var content = new StringContent(payload, Encoding.UTF8, "application/json");
            using var response = await client.PostAsync("/cv/classify-block", content, cancellationToken);
            if (!response.IsSuccessStatusCode) return null;
            await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
            return await JsonSerializer.DeserializeAsync<CvBlockClassificationResult>(stream, ResponseJsonOptions, cancellationToken);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // The caller asked to stop: let that propagate instead of disguising it as "no result".
            // An HttpClient timeout also surfaces as OperationCanceledException, but without the
            // caller's token being cancelled, so it falls through to the best-effort handler.
            throw;
        }
        catch (Exception)
        {
            // Deliberately best-effort: an unreachable service or malformed response must not
            // break CV parsing, so every other failure is reported as "no classification".
            return null;
        }
    }
 }
 /// <summary>
 /// <see cref="ICvAiClassifier"/> that never classifies anything: every call completes
 /// immediately with <c>null</c>. Only reachable through <see cref="Instance"/>.
 /// </summary>
 public sealed class NoOpCvAiClassifier : ICvAiClassifier
 {
    private NoOpCvAiClassifier()
    {
    }

    /// <summary>The single shared instance.</summary>
    public static NoOpCvAiClassifier Instance { get; } = new NoOpCvAiClassifier();

    /// <summary>Ignores both arguments and returns a completed task whose result is <c>null</c>.</summary>
    public Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default)
    {
        CvBlockClassificationResult? noClassification = null;
        return Task.FromResult(noClassification);
    }
 }
@@ -21,6 +21,10 @@ namespace JobTrackerApi.Services
        string? GpuName,
        bool? OcrAvailable,
        string? OcrLanguages,
        bool? OllamaConfigured,
        bool? OllamaReachable,
        string? OllamaModel,
        bool? OllamaModelAvailable,
        double? HealthLatencyMs,
        double? ProbeLatencyMs,
        DateTimeOffset? LastProbeAt,
@@ -310,6 +314,10 @@ namespace JobTrackerApi.Services
            string? gpuName = null;
            bool? ocrAvailable = null;
            string? ocrLanguages = null;
            bool? ollamaConfigured = null;
            bool? ollamaReachable = null;
            string? ollamaModel = null;
            bool? ollamaModelAvailable = null;
            double? healthLatencyMs = null;
            var healthy = false;
            string? healthError = null;
@@ -332,6 +340,10 @@ namespace JobTrackerApi.Services
                    if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString();
                    if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean();
                    if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString();
                    if (doc.RootElement.TryGetProperty("ollama_configured", out var ollamaConfiguredEl) && ollamaConfiguredEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaConfigured = ollamaConfiguredEl.GetBoolean();
                    if (doc.RootElement.TryGetProperty("ollama_reachable", out var ollamaReachableEl) && ollamaReachableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaReachable = ollamaReachableEl.GetBoolean();
                    if (doc.RootElement.TryGetProperty("ollama_model", out var ollamaModelEl)) ollamaModel = ollamaModelEl.GetString();
                    if (doc.RootElement.TryGetProperty("ollama_model_available", out var ollamaModelAvailableEl) && ollamaModelAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaModelAvailable = ollamaModelAvailableEl.GetBoolean();
                }
                else
                {
@@ -390,6 +402,10 @@ namespace JobTrackerApi.Services
                GpuName: gpuName,
                OcrAvailable: ocrAvailable,
                OcrLanguages: ocrLanguages,
                OllamaConfigured: ollamaConfigured,
                OllamaReachable: ollamaReachable,
                OllamaModel: ollamaModel,
                OllamaModelAvailable: ollamaModelAvailable,
                HealthLatencyMs: healthLatencyMs,
                ProbeLatencyMs: probeLatencyMs,
                LastProbeAt: lastProbeAt,
@@ -0,0 +1,162 @@
 using System.Globalization;
 using System.Text;
 using System.Text.RegularExpressions;
 namespace JobTrackerApi.Models;
 /// <summary>
 /// Recognizes human (spoken) language names and proficiency levels in free text. The name lookup
 /// is built once from the English and native names of the cultures .NET knows, plus a few
 /// hand-added aliases.
 /// </summary>
 public static class HumanLanguageCatalog
 {
    private static readonly Dictionary<string, string> LanguageLookup = BuildLanguageLookup();
    private static readonly Regex WordRegex = new(@"\p{L}+", RegexOptions.Compiled);

    // Alternation is ordered, so the CEFR ranges ("a2/b1") must be listed before the single levels
    // ("a2"); otherwise the single level matches first and the range is never reached.
    private static readonly Regex LevelRegex = new(
        @"\b(native(?:\s+speaker)?|fluent|advanced|intermediate|beginner|basic|conversational|elementary|professional\s+working\s+proficiency|working\s+proficiency|limited\s+working\s+proficiency|full\s+professional\s+proficiency|a1\s*/\s*a2|a2\s*/\s*b1|b1\s*/\s*b2|b2\s*/\s*c1|c1\s*/\s*c2|a1|a2|b1|b2|c1|c2)\b",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Resolves <paramref name="raw"/> to a canonical language name.
    /// </summary>
    /// <returns>
    /// The canonical name when exactly one language is recognized in the text; <c>null</c> when
    /// none or more than one is.
    /// </returns>
    public static string? NormalizeLanguageName(string? raw)
    {
        var matches = ExtractLanguageNames(raw);
        return matches.Count == 1 ? matches[0] : null;
    }

    /// <summary>
    /// Finds every known language mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <returns>
    /// Canonical language names in order of first appearance, de-duplicated case-insensitively.
    /// Empty when the input is blank or nothing is recognized.
    /// </returns>
    public static IReadOnlyList<string> ExtractLanguageNames(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return Array.Empty<string>();
        var words = WordRegex.Matches(raw)
            .Select(match => match.Value)
            .Where(value => !string.IsNullOrWhiteSpace(value))
            .ToList();
        if (words.Count == 0) return Array.Empty<string>();

        // Try the longest phrases first (up to four words) so a multi-word name claims its words
        // before any shorter name contained in it can.
        var matches = new List<(int Start, int Size, string Canonical)>();
        for (var size = Math.Min(4, words.Count); size >= 1; size--)
        {
            for (var start = 0; start <= words.Count - size; start++)
            {
                var phrase = string.Join(" ", words.Skip(start).Take(size));
                if (!LanguageLookup.TryGetValue(NormalizeKey(phrase), out var canonical)) continue;
                if (matches.Any(existing => RangesOverlap(existing.Start, existing.Size, start, size))) continue;
                matches.Add((start, size, canonical));
            }
        }
        return matches
            .OrderBy(match => match.Start)
            .Select(match => match.Canonical)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>Returns true when <see cref="ExtractLevel"/> finds a level in <paramref name="raw"/>.</summary>
    public static bool HasRecognizedLevel(string? raw)
    {
        return ExtractLevel(raw) is not null;
    }

    /// <summary>
    /// Extracts the first proficiency level mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <returns>
    /// The level in canonical casing ("Native", "Fluent", "B1", "A2/B1", ...), or <c>null</c> when
    /// the input is blank or mentions no recognized level.
    /// </returns>
    public static string? ExtractLevel(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return null;
        var match = LevelRegex.Match(raw);
        if (!match.Success) return null;
        var value = match.Groups[1].Value.Trim();
        var compact = Regex.Replace(value, @"\s+", " ");
        return compact.ToLowerInvariant() switch
        {
            "native speaker" => "Native",
            "native" => "Native",
            "fluent" => "Fluent",
            "advanced" => "Advanced",
            "intermediate" => "Intermediate",
            "beginner" => "Beginner",
            "basic" => "Basic",
            "conversational" => "Conversational",
            "elementary" => "Elementary",
            "professional working proficiency" => "Professional working proficiency",
            "working proficiency" => "Working proficiency",
            "limited working proficiency" => "Limited working proficiency",
            "full professional proficiency" => "Full professional proficiency",
            // CEFR level or range: upper-case it and drop spaces, e.g. "b1 / b2" -> "B1/B2".
            _ when Regex.IsMatch(compact, @"^[ABC][12](?:\s*/\s*[ABC][12])?$", RegexOptions.IgnoreCase) => compact.ToUpperInvariant().Replace(" ", string.Empty),
            _ => compact,
        };
    }

    // True when the half-open word ranges [startA, startA + sizeA) and [startB, startB + sizeB) intersect.
    private static bool RangesOverlap(int startA, int sizeA, int startB, int sizeB)
    {
        var endA = startA + sizeA;
        var endB = startB + sizeB;
        return startA < endB && startB < endA;
    }

    // Builds the alias -> canonical-name map. Keys are NormalizeKey forms; the first registration
    // of a key wins (TryAdd), so culture-derived names take precedence over the aliases below.
    private static Dictionary<string, string> BuildLanguageLookup()
    {
        var map = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
        void Add(string? alias, string? canonical)
        {
            var normalizedAlias = NormalizeKey(alias);
            var normalizedCanonical = NormalizeDisplayName(canonical);
            if (string.IsNullOrWhiteSpace(normalizedAlias) || string.IsNullOrWhiteSpace(normalizedCanonical)) return;
            map.TryAdd(normalizedAlias, normalizedCanonical);
        }
        foreach (var culture in CultureInfo.GetCultures(CultureTypes.NeutralCultures | CultureTypes.SpecificCultures))
        {
            var english = CleanCultureLanguageName(culture.EnglishName);
            var native = CleanCultureLanguageName(culture.NativeName);
            Add(english, english);
            Add(native, english);
        }
        Add("norsk", "Norwegian");
        Add("bokmal", "Norwegian");
        Add("bokmål", "Norwegian");
        Add("nynorsk", "Norwegian");
        Add("mandarin", "Chinese");
        Add("cantonese", "Chinese");
        Add("farsi", "Persian");
        Add("persian", "Persian");
        return map;
    }

    // Reduces a culture name such as "English (United Kingdom)" to its language part by cutting
    // at the first '(' and then at the first ','.
    private static string? CleanCultureLanguageName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;
        var cleaned = value.Trim();
        var parenIndex = cleaned.IndexOf('(');
        if (parenIndex > 0) cleaned = cleaned[..parenIndex].Trim();
        var commaIndex = cleaned.IndexOf(',');
        if (commaIndex > 0) cleaned = cleaned[..commaIndex].Trim();
        return NormalizeDisplayName(cleaned);
    }

    // Collapses whitespace and title-cases each word; all-uppercase words of up to three
    // characters are left untouched.
    private static string? NormalizeDisplayName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;
        var cleaned = Regex.Replace(value.Trim(), @"\s+", " ");
        return string.Join(" ", cleaned.Split(' ', StringSplitOptions.RemoveEmptyEntries)
            .Select(word => word.Length <= 3 && word.All(char.IsUpper)
                ? word
                : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant()));
    }

    // Lookup key: lower-cased, combining marks removed ("bokmål" -> "bokmal"), and every run of
    // non-letters collapsed to a single space.
    private static string NormalizeKey(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return string.Empty;
        var decomposed = value.Trim().Normalize(NormalizationForm.FormD);
        var builder = new StringBuilder(decomposed.Length);
        foreach (var ch in decomposed)
        {
            if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark) continue;
            builder.Append(char.ToLowerInvariant(ch));
        }
        return Regex.Replace(builder.ToString().Normalize(NormalizationForm.FormC), @"[^\p{L}]+", " ").Trim();
    }
 }
@@ -144,7 +144,7 @@ public static class StructuredCvProfileJson
        profile.Version = string.IsNullOrWhiteSpace(profile.Version) ? "1" : profile.Version.Trim();
        profile.Metadata ??= new StructuredCvMetadata();
        profile.Metadata.Fields ??= new Dictionary<string, StructuredCvFieldMetadata>();
-        profile.Contact ??= new StructuredCvContact();
+        profile.Contact = NormalizeContact(profile.Contact);
        profile.Summary = CleanList(profile.Summary);
        profile.Jobs = (profile.Jobs ?? new List<StructuredCvJob>())
            .Select(NormalizeJob)
@@ -178,20 +178,206 @@ public static class StructuredCvProfileJson
        return profile;
    }
    /// <summary>
    /// Cleans every field of the contact block in place, creating an empty contact when none was supplied.
    /// </summary>
    /// <param name="contact">Parsed contact block; may be null.</param>
    /// <returns>The same instance (or a new one when <paramref name="contact"/> was null) with cleaned fields.</returns>
    private static StructuredCvContact NormalizeContact(StructuredCvContact? contact)
    {
        var result = contact ?? new StructuredCvContact();

        // Free-text fields go through TrimOrNull only.
        result.FullName = TrimOrNull(result.FullName);
        result.Headline = TrimOrNull(result.Headline);
        result.Email = TrimOrNull(result.Email);
        result.Phone = TrimOrNull(result.Phone);

        // Structured fields are validated by their dedicated normalizers, which may reject them (null).
        result.Location = NormalizeLocationValue(result.Location);
        result.Website = NormalizeWebsite(result.Website);
        result.LinkedIn = NormalizeLinkedIn(result.LinkedIn);

        return result;
    }
    /// <summary>
    /// Normalizes one parsed job entry in place: cleans title, company and location, repairs
    /// title/company mix-ups, keeps only date-like start/end values, filters bullets, and derives
    /// <c>IsCurrent</c>.
    /// </summary>
    /// <param name="job">Parsed job; a null value is replaced by a new empty job.</param>
    /// <returns>The normalized job instance.</returns>
    private static StructuredCvJob NormalizeJob(StructuredCvJob? job)
    {
        job ??= new StructuredCvJob();
-        job.Title = TrimOrNull(job.Title);
+
-        job.Company = TrimOrNull(job.Company);
+        var title = NormalizeJobTitle(job.Title);
-        job.Location = TrimOrNull(job.Location);
+        var company = NormalizeCompanyName(job.Company);
-        job.Start = TrimOrNull(job.Start);
+        var location = NormalizeLocationValue(job.Location);
-        job.End = TrimOrNull(job.End);
+
-        job.Bullets = CleanList(job.Bullets);
+        if (!string.IsNullOrWhiteSpace(title) && company is null)
        {
            // "Title at Company" packed into the title field: split it into its two parts.
            var atSplit = Regex.Match(title, @"^(?<title>.+?)\s+at\s+(?<company>.+)$", RegexOptions.IgnoreCase);
            if (atSplit.Success)
            {
                title = NormalizeJobTitle(atSplit.Groups["title"].Value);
                company = NormalizeCompanyName(atSplit.Groups["company"].Value);
            }
        }
        if (!string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(company))
        {
            // Swap only when both heuristics agree that the two fields are reversed.
            var titleLooksLikeCompany = LooksLikeCompanyName(title) && !LooksLikeJobTitle(title);
            var companyLooksLikeTitle = LooksLikeJobTitle(company) && !LooksLikeCompanyName(company);
            if (titleLooksLikeCompany && companyLooksLikeTitle)
            {
                (title, company) = (company, title);
            }
        }
        // A title that reads as a company name fills an empty company slot; the title is cleared
        // in either case, so it is discarded when a company is already present.
        if (!string.IsNullOrWhiteSpace(title) && !LooksLikeJobTitle(title) && LooksLikeCompanyName(title))
        {
            if (company is null) company = title;
            title = null;
        }
        // Mirror case: a company that reads as a job title moves into an empty title slot.
        if (!string.IsNullOrWhiteSpace(company) && !LooksLikeCompanyName(company) && LooksLikeJobTitle(company) && title is null)
        {
            title = company;
            company = null;
        }
        job.Title = title;
        job.Company = company;
        job.Location = location;
        // NormalizeDateValue returns null for anything LooksLikeDateRange does not accept.
        job.Start = NormalizeDateValue(job.Start);
        job.End = NormalizeDateValue(job.End);
        // Strip bullet markers, then drop bullets that IsUsefulJobBullet rejects
        // (dates, headings, repeats of the title/company, short single tokens).
        job.Bullets = CleanList(job.Bullets)
            .Select(NormalizeBullet)
            .Where(bullet => bullet is not null)
            .Select(bullet => bullet!)
            .Where(bullet => IsUsefulJobBullet(bullet, job.Title, job.Company))
            .ToList();
        job.Skills = CleanList(job.Skills);
        // An existing IsCurrent flag is kept; otherwise an end value of "present"/"current" sets it.
        job.IsCurrent = job.IsCurrent || string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
        return job;
    }
    /// <summary>
    /// Trims a bullet line and removes leading list markers ('-', '•', '*') together with the
    /// spaces that follow them.
    /// </summary>
    /// <param name="value">Raw bullet text; may be null or blank.</param>
    /// <returns>
    /// The bullet text without its marker, or null when the input is null, blank, or consists only
    /// of marker characters.
    /// </returns>
    private static string? NormalizeBullet(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;
        var stripped = value.Trim().TrimStart('-', '•', '*', ' ');
        // A marker-only line ("-", "* ") leaves nothing behind; report it as null so callers that
        // filter on "is not null" never see an empty bullet.
        return stripped.Length == 0 ? null : stripped;
    }
    /// <summary>
    /// Decides whether a bullet carries job content rather than parsing residue.
    /// </summary>
    /// <param name="value">Bullet text.</param>
    /// <param name="title">Normalized job title, if any.</param>
    /// <param name="company">Normalized company name, if any.</param>
    /// <returns>
    /// False for empty text, date ranges, section headings, "Skills:" lines, repeats of the title or
    /// company, and single tokens shorter than 12 characters; true otherwise.
    /// </returns>
    private static bool IsUsefulJobBullet(string? value, string? title, string? company)
    {
        if (TrimOrNull(value) is not { } text)
        {
            return false;
        }

        var isStructuralResidue = LooksLikeDateRange(text)
            || LooksLikeSectionHeading(text)
            || text.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase);
        if (isStructuralResidue)
        {
            return false;
        }

        var repeatsTitle = title is not null && text.Equals(title, StringComparison.OrdinalIgnoreCase);
        var repeatsCompany = company is not null && text.Equals(company, StringComparison.OrdinalIgnoreCase);
        if (repeatsTitle || repeatsCompany)
        {
            return false;
        }

        // A lone short token is not a sentence describing work.
        return text.Length >= 12 || text.Contains(' ');
    }
    /// <summary>
    /// Cleans a job title: rejects values that are date ranges, section headings, or URLs/e-mails,
    /// then collapses whitespace and strips surrounding separator characters.
    /// </summary>
    /// <param name="value">Raw title; may be null.</param>
    /// <returns>The cleaned title, or null when nothing usable remains.</returns>
    private static string? NormalizeJobTitle(string? value)
    {
        var candidate = TrimOrNull(value);
        if (candidate is null)
        {
            return null;
        }

        var isNotATitle = LooksLikeDateRange(candidate)
            || LooksLikeSectionHeading(candidate)
            || LooksLikeUrlOrEmail(candidate);
        if (isNotATitle)
        {
            return null;
        }

        var collapsed = Regex.Replace(candidate, @"\s+", " ");
        var cleaned = collapsed.Trim(' ', '|', ',', '-', ':');
        return string.IsNullOrWhiteSpace(cleaned) ? null : cleaned;
    }
    /// <summary>
    /// Cleans a company name: rejects date ranges, section headings, URLs/e-mails, "Skills:" lines,
    /// and values containing both a period and a space; then collapses whitespace and strips
    /// surrounding separator characters.
    /// </summary>
    /// <param name="value">Raw company text; may be null.</param>
    /// <returns>The cleaned company name, or null when the value is rejected or empty.</returns>
    private static string? NormalizeCompanyName(string? value)
    {
        var candidate = TrimOrNull(value);
        if (candidate is null)
        {
            return null;
        }

        var isNotACompany = LooksLikeDateRange(candidate)
            || LooksLikeSectionHeading(candidate)
            || LooksLikeUrlOrEmail(candidate);
        if (isNotACompany)
        {
            return null;
        }

        if (candidate.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        // Period plus space is treated as sentence-like text rather than a name.
        var hasPeriodAndSpace = candidate.Contains('.') && candidate.Contains(' ');
        if (hasPeriodAndSpace)
        {
            return null;
        }

        var collapsed = Regex.Replace(candidate, @"\s+", " ");
        var cleaned = collapsed.Trim(' ', '|', ',', '-', ':');
        return string.IsNullOrWhiteSpace(cleaned) ? null : cleaned;
    }
    /// <summary>
    /// Validates and formats a location such as "City, Region, Country".
    /// </summary>
    /// <param name="value">Raw location text; may be null.</param>
    /// <returns>
    /// The comma-separated parts re-joined with ", ", or null when the value is a date range,
    /// heading, URL/e-mail, contains a digit, exceeds 80 characters, has no parts or more than four,
    /// or has a part that is not made of letters, apostrophes, hyphens, periods and spaces.
    /// </returns>
    private static string? NormalizeLocationValue(string? value)
    {
        var candidate = TrimOrNull(value);
        if (candidate is null)
        {
            return null;
        }

        if (LooksLikeDateRange(candidate) || LooksLikeSectionHeading(candidate) || LooksLikeUrlOrEmail(candidate))
        {
            return null;
        }

        if (candidate.Length > 80 || candidate.Any(char.IsDigit))
        {
            return null;
        }

        var collapsed = Regex.Replace(candidate, @"\s+", " ").Trim(' ', '|', ';', ':');
        var segments = collapsed.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        if (segments.Length is 0 or > 4)
        {
            return null;
        }

        foreach (var segment in segments)
        {
            // Each part must start with a letter and be at least two characters long.
            if (!Regex.IsMatch(segment, @"^[\p{L}][\p{L}'’\-. ]+$"))
            {
                return null;
            }
        }

        return string.Join(", ", segments);
    }
    /// <summary>
    /// Reduces a website value to its lower-cased host name.
    /// </summary>
    /// <param name="value">Raw website text, with or without a scheme; may be null.</param>
    /// <returns>
    /// The host (for example "example.co.uk"), or null when the value is empty, mentions
    /// linkedin.com, cannot be parsed as an absolute URI, or its host is not a dotted domain name.
    /// </returns>
    private static string? NormalizeWebsite(string? value)
    {
        var raw = TrimOrNull(value);
        if (raw is null || raw.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        // Bare domains need a scheme before Uri can parse them as absolute.
        var withScheme = raw.Contains("://", StringComparison.Ordinal) ? raw : $"https://{raw}";
        if (!Uri.TryCreate(withScheme, UriKind.Absolute, out var parsed))
        {
            return null;
        }

        var host = parsed.Host.Trim().Trim('.').ToLowerInvariant();
        var isDomainName = !string.IsNullOrWhiteSpace(host)
            && Regex.IsMatch(host, @"^(?:[a-z0-9-]+\.)+[a-z]{2,}$", RegexOptions.IgnoreCase);
        return isDomainName ? host : null;
    }
    /// <summary>
    /// Canonicalizes a LinkedIn profile link to <c>https://www.linkedin.com/in/...</c> (or <c>/pub/...</c>).
    /// </summary>
    /// <param name="value">Raw link text, with or without a scheme; may be null.</param>
    /// <returns>
    /// The canonical profile URL, or null when the value is empty, is not a parseable absolute URI,
    /// is not hosted on linkedin.com (or one of its subdomains), or its path is not an
    /// <c>/in/</c> or <c>/pub/</c> profile path.
    /// </returns>
    private static string? NormalizeLinkedIn(string? value)
    {
        var trimmed = TrimOrNull(value);
        if (trimmed is null) return null;
        var candidate = trimmed;
        if (!candidate.Contains("://", StringComparison.Ordinal)) candidate = $"https://{candidate}";
        if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri)) return null;
        // Compare the host exactly or as a subdomain. A substring test would also accept hosts such
        // as "notlinkedin.com" or "linkedin.com.example.org" and rewrite them into linkedin.com URLs.
        var host = uri.Host;
        var isLinkedInHost = host.Equals("linkedin.com", StringComparison.OrdinalIgnoreCase)
            || host.EndsWith(".linkedin.com", StringComparison.OrdinalIgnoreCase);
        if (!isLinkedInHost) return null;
        var path = uri.AbsolutePath.TrimEnd('/');
        // Accept /in/<id> or /pub/<id> followed by at most two further path segments.
        if (!Regex.IsMatch(path, @"^/(in|pub)/[^/]+(?:/[^/]+){0,2}$", RegexOptions.IgnoreCase)) return null;
        return $"https://www.linkedin.com{path}";
    }
    /// <summary>
    /// Keeps a start/end value only when it is recognised by <c>LooksLikeDateRange</c>.
    /// </summary>
    /// <param name="value">Raw date text; may be null.</param>
    /// <returns>The trimmed value when it is date-like; otherwise null.</returns>
    private static string? NormalizeDateValue(string? value)
    {
        var candidate = TrimOrNull(value);
        if (candidate is null)
        {
            return null;
        }

        return LooksLikeDateRange(candidate) ? candidate : null;
    }
    /// <summary>
    /// Tests whether the whole value is a single date or a "from - to" range. Each side may be
    /// d/m/yyyy, an English month name or abbreviation followed by a year, a four-digit year,
    /// "Present" or "Current"; the two sides are separated by a hyphen or en dash. Case-insensitive.
    /// </summary>
    /// <param name="value">Text to test.</param>
    /// <returns>True when the entire value matches the date/range pattern.</returns>
    private static bool LooksLikeDateRange(string value)
    {
        const string dateOrRangePattern = @"^(?:\d{1,2}/\d{1,2}/\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|\d{4}|Present|Current)(?:\s*[-–]\s*(?:\d{1,2}/\d{1,2}/\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|\d{4}|Present|Current))?$";

        var match = Regex.Match(value, dateOrRangePattern, RegexOptions.IgnoreCase);
        return match.Success;
    }
    /// <summary>
    /// Cheap check for web or e-mail content: an '@' anywhere, or one of "www.", "http://",
    /// "https://" (case-insensitive).
    /// </summary>
    /// <param name="value">Text to test.</param>
    /// <returns>True when any of the markers occurs in <paramref name="value"/>.</returns>
    private static bool LooksLikeUrlOrEmail(string value)
    {
        if (value.Contains('@'))
        {
            return true;
        }

        string[] urlMarkers = { "www.", "http://", "https://" };
        foreach (var marker in urlMarkers)
        {
            if (value.Contains(marker, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }

        return false;
    }
    /// <summary>
    /// Tests whether the value is exactly one of the known CV section headings (case-insensitive).
    /// </summary>
    /// <param name="value">Text to test.</param>
    /// <returns>True when <paramref name="value"/> equals a known heading.</returns>
    private static bool LooksLikeSectionHeading(string value)
    {
        string[] knownHeadings =
        {
            "Work Experience",
            "Experience",
            "Employment History",
            "Education",
            "Skills",
            "Languages",
            "Interests",
            "Contact",
            "Professional Summary",
            "Summary",
        };

        foreach (var heading in knownHeadings)
        {
            if (value.Equals(heading, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }

        return false;
    }
    /// <summary>
    /// Heuristic for job titles: true when the value contains a known role word (developer,
    /// manager, ...), or when it has at most six space-separated words and does not look like a
    /// company name.
    /// </summary>
    /// <param name="value">Text to test.</param>
    /// <returns>False for blank text, date ranges and URLs/e-mails; otherwise the heuristic result.</returns>
    private static bool LooksLikeJobTitle(string value)
    {
        if (string.IsNullOrWhiteSpace(value) || LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
        {
            return false;
        }

        var hasRoleWord = Regex.IsMatch(value, @"\b(developer|engineer|manager|lead|architect|consultant|specialist|analyst|administrator|coordinator|director|designer|intern|officer|owner|founder|teacher|researcher|writer|editor|producer|assistant|technician|supervisor|head)\b", RegexOptions.IgnoreCase);
        if (hasRoleWord)
        {
            return true;
        }

        // Fallback: a short phrase that is not recognisably a company is assumed to be a title.
        var wordCount = value.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length;
        return wordCount <= 6 && !LooksLikeCompanyName(value);
    }
    /// <summary>
    /// Heuristic for organisation names: true when the value contains a known organisation word
    /// (ltd, council, university, ...), an ampersand, or a standalone run of two or more upper-case
    /// ASCII letters.
    /// </summary>
    /// <param name="value">Text to test.</param>
    /// <returns>False for blank text, date ranges and URLs/e-mails; otherwise the heuristic result.</returns>
    private static bool LooksLikeCompanyName(string value)
    {
        if (string.IsNullOrWhiteSpace(value) || LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
        {
            return false;
        }

        if (Regex.IsMatch(value, @"\b(inc|llc|ltd|limited|plc|corp|corporation|company|group|university|college|council|municipality|kommune|bank|studio|agency|institute|hospital|school|technologies|technology|systems|solutions|consulting|consultants|partners|foundation|ministry|government)\b", RegexOptions.IgnoreCase))
        {
            return true;
        }

        if (value.Contains('&'))
        {
            return true;
        }

        // Case-sensitive on purpose: matches acronym-style tokens such as "IBM".
        return Regex.IsMatch(value, @"\b[A-Z]{2,}\b");
    }
    private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
    {
        education ??= new StructuredCvEducation();
@@ -207,8 +393,13 @@ public static class StructuredCvProfileJson
    /// <summary>
    /// Normalizes one language entry in place using <c>HumanLanguageCatalog</c>.
    /// The name is kept only when both a normalized name and a level were obtained; otherwise it
    /// is set to null.
    /// </summary>
    /// <param name="language">Parsed language entry; a null value is replaced by a new empty entry.</param>
    /// <returns>The normalized language instance.</returns>
    private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language)
    {
        language ??= new StructuredCvLanguage();
-        language.Name = TrimOrNull(language.Name);
+
-        language.Level = TrimOrNull(language.Level);
+        var originalName = TrimOrNull(language.Name);
        var normalizedName = HumanLanguageCatalog.NormalizeLanguageName(originalName);
        // Prefer the explicit level field; fall back to a level embedded in the name text.
        var normalizedLevel = HumanLanguageCatalog.ExtractLevel(language.Level) ?? HumanLanguageCatalog.ExtractLevel(originalName);
        // Both pieces are required, so an entry without a level ends up with a null name.
        // NOTE(review): presumably this filters non-language items (e.g. skills) out of the list — confirm against callers.
        language.Name = normalizedName is not null && normalizedLevel is not null ? normalizedName : null;
        language.Level = normalizedLevel;
        language.Notes = TrimOrNull(language.Notes);
        return language;
    }
@@ -360,7 +551,13 @@ public static class StructuredCvProfileJson
                    }
                }
-                return new StructuredCvLanguage { Name = name.NullIfWhitespace(), Level = level, Notes = notes };
+                var normalizedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(item);
                return new StructuredCvLanguage
                {
                    Name = normalizedLevel is not null ? HumanLanguageCatalog.NormalizeLanguageName(name) : null,
                    Level = normalizedLevel,
                    Notes = notes,
                };
            })
            .Where(language => !string.IsNullOrWhiteSpace(language.Name))
            .ToList();
@@ -25,7 +25,7 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re
 ## Quickstart (Docker)
-This runs: frontend (nginx), backend API, and the AI service.
+This runs: frontend (nginx), backend API, the local AI service, and an Ollama container for hybrid CV block classification.
 1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`).
@@ -108,9 +108,15 @@ The API calls a local FastAPI service to generate summaries. If it’s not runni
 With Docker (recommended):
 ```bash
-docker compose up --build ai-service
+# One command for local Ollama startup + pull + AI-service restart
 OLLAMA_MODEL=qwen2.5:7b ./scripts/start-ollama-cv.sh
 # Then start the rest of the app if needed
 docker compose up --build -d backend frontend
 ```
 The first Ollama startup is usually quick, but the first model pull and first generation can take a while. After the model is cached in the `ollama_data` volume, later restarts are much faster.
 Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`).
 ## Configuration
@@ -52,6 +52,8 @@ AUTH_ADMIN_EMAIL=you@example.com
 AUTH_ADMIN_PASSWORD=replace_with_strong_password
 APP_PUBLIC_BASE_URL=https://your-domain.example
 AI_SERVICE_BASE_URL=http://ai-service:8001
 OLLAMA_BASE_URL=http://ollama:11434
 OLLAMA_MODEL=qwen2.5:7b
 EMAIL_FOLLOWUPREMINDERS_ENABLED=true
 EMAIL_FOLLOWUPREMINDERS_UPCOMINGDAYS=2
 # Optional backward-compatible alias if older config still references the previous name:
@@ -87,7 +89,8 @@ If this app is going to be a real production service on Ubuntu:
 2. Gitea Actions runs tests
 3. if green, workflow uploads repo to server
 4. `deploy/deploy.sh` links `/opt/job-tracker/shared/.env` into the repo checkout, then runs `docker compose build && docker compose up -d`
-5. workflow checks service status after deployment
+5. if `OLLAMA_MODEL` is set, the deploy script waits for Ollama, pulls the configured model if missing, then restarts `ai-service` so hybrid CV classification can use it
 6. workflow checks service status after deployment
 ## Post-deploy verification you should also do manually the first time
 - confirm reverse proxy routes to the frontend correctly
@@ -96,3 +99,4 @@ If this app is going to be a real production service on Ubuntu:
 - confirm AI service container is reachable from backend
 - confirm reminder and admin/system pages load
 - verify follow-up reminder emails are enabled only when intended and that links open the correct job/tab
@@ -45,6 +45,11 @@ build_with_recovery
 # Force recreation so updated port mappings, env vars, and container config always apply on deploy.
 compose up -d --force-recreate --remove-orphans
 if [ -n "${OLLAMA_MODEL:-}" ]; then
  echo "Post-deploy Ollama warmup enabled for model: ${OLLAMA_MODEL}"
  ./scripts/start-ollama-cv.sh
 fi
 sleep 5
 compose ps
@@ -71,8 +71,13 @@ services:
    build:
      context: ./tools/summarizer
      dockerfile: Dockerfile
    environment:
      - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-http://ollama:11434}
      - OLLAMA_MODEL=${OLLAMA_MODEL:-qwen2.5:7b}
    ports:
      - "8001:8001"
    depends_on:
      - ollama
    networks:
      - default
      - shared_services
@@ -83,8 +88,29 @@ services:
      timeout: 10s
      retries: 3
  ollama:
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    environment:
      - OLLAMA_HOST=0.0.0.0:11434
    volumes:
      - ollama_data:/root/.ollama
    networks:
      - default
      - shared_services
    restart: unless-stopped
    gpus: all
    healthcheck:
      test: ["CMD", "ollama", "list"]
      interval: 20s
      timeout: 15s
      retries: 10
      start_period: 20s
 volumes:
  jobtracker_data:
  ollama_data:
 networks:
  shared_services:
@@ -1,8 +1,9 @@
 import React, { useCallback, useEffect, useMemo, useRef, useState } from "react";
-import { Alert, Avatar, Box, Button, Chip, Divider, FormControl, InputLabel, LinearProgress, MenuItem, Paper, Select, TextField, Typography } from "@mui/material";
+import { Accordion, AccordionDetails, AccordionSummary, Alert, Avatar, Box, Button, Chip, Divider, FormControl, InputLabel, LinearProgress, MenuItem, Paper, Select, TextField, Typography } from "@mui/material";
 import DeleteOutlineIcon from "@mui/icons-material/DeleteOutline";
 import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
 import PhotoCameraOutlinedIcon from "@mui/icons-material/PhotoCameraOutlined";
 import { api } from "../api";
@@ -399,22 +400,40 @@ export default function ProfilePage() {
              >
                {reprocessingCv ? t("profileCvReprocessing") : t("profileCvReprocess")}
              </Button>
              <Button variant="text" disabled={!profileCvText.trim()} onClick={() => navigator.clipboard.writeText(profileCvText)}>
                {t("profileCopyCvText")}
              </Button>
            </Box>
          </Box>
          {uploadingCv ? <LinearProgress sx={{ mb: 1.5 }} /> : null}
-          <TextField
+          <Alert severity="info" sx={{ mb: 2, borderRadius: 2.5 }}>
-            label={t("profileCvTextLabel")}
+            {t("profileCvStructuredDefaultHint")}
-            value={profileCvText}
+          </Alert>
-            onChange={(e) => setProfileCvText(e.target.value)}
+          <Accordion disableGutters elevation={0} sx={{ mb: 2, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper", "&:before": { display: "none" } }}>
-            helperText={t("profileCvTextHelp")}
+            <AccordionSummary expandIcon={<ExpandMoreIcon />}>
-            multiline
+              <Box sx={{ display: "flex", justifyContent: "space-between", gap: 1.5, alignItems: "center", width: "100%", pr: 1 }}>
-            minRows={12}
+                <Box>
-            disabled={!isLocal}
+                  <Typography variant="subtitle1" sx={{ fontWeight: 800 }}>{t("profileCvRawPanelTitle")}</Typography>
-            fullWidth
+                  <Typography variant="body2" sx={{ color: "text.secondary" }}>{t("profileCvRawPanelHelp")}</Typography>
-          />
+                </Box>
                <Chip size="small" label={t("profileCvSectionWordCount", { count: cvWordCount })} />
              </Box>
            </AccordionSummary>
            <AccordionDetails>
              <TextField
                label={t("profileCvTextLabel")}
                value={profileCvText}
                onChange={(e) => setProfileCvText(e.target.value)}
                helperText={t("profileCvTextHelp")}
                multiline
                minRows={12}
                disabled={!isLocal}
                fullWidth
              />
              <Box sx={{ mt: 1.5, display: "flex", justifyContent: "flex-end" }}>
                <Button variant="text" disabled={!profileCvText.trim()} onClick={() => navigator.clipboard.writeText(profileCvText)}>
                  {t("profileCopyCvText")}
                </Button>
              </Box>
            </AccordionDetails>
          </Accordion>
          <Box sx={{ mt: 2, p: 1.5, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper" }}>
            <Box sx={{ display: "flex", justifyContent: "space-between", gap: 2, flexWrap: "wrap", alignItems: "center", mb: 1.5 }}>
              <Box>
@@ -147,10 +147,17 @@ test('profile page loads persisted structured cv and can re-parse it', async ()
  expect(screen.getByText(/extraction history/i)).toBeInTheDocument();
  expect(screen.getByText(/resume.pdf/i)).toBeInTheDocument();
  expect(screen.getByText(/current run/i)).toBeInTheDocument();
  expect(screen.getAllByText(/original extraction/i).length).toBeGreaterThan(0);
  const originalExtractionToggle = screen.getByRole('button', { name: /original extraction/i });
  expect(originalExtractionToggle).toHaveAttribute('aria-expanded', 'false');
  expect(screen.getAllByText(/professional summary/i).length).toBeGreaterThan(0);
  expect(screen.getByLabelText(/full name/i)).toHaveValue('Demo User');
  expect(screen.getByText(/high 92%/i)).toBeInTheDocument();
  fireEvent.click(originalExtractionToggle);
  expect(originalExtractionToggle).toHaveAttribute('aria-expanded', 'true');
  expect(await screen.findByLabelText(/profile cv \/ master resume text/i)).toHaveValue('Professional Summary\nBuilt backend systems');
  const analyzeButton = screen.getByRole('button', { name: /analyze sections/i });
  await waitFor(() => expect(analyzeButton).toBeEnabled());
  fireEvent.click(analyzeButton);
@@ -0,0 +1,79 @@
 #!/usr/bin/env bash
 # Starts the Ollama compose service, makes sure the configured model is available, then brings
 # up ai-service. Exits non-zero if any step fails or times out.
 set -euo pipefail
 # Run docker compose from the parent of this script's directory.
 cd "$(dirname "$0")/.."
 # Settings, each overridable through the environment:
 #   OLLAMA_MODEL              model to pull/verify (default qwen2.5:7b)
 #   OLLAMA_WAIT_SECONDS       how long to wait for Ollama to answer (default 180)
 #   OLLAMA_PULL_WAIT_SECONDS  how long to wait for the model to be listed (default 1800)
 MODEL="${OLLAMA_MODEL:-qwen2.5:7b}"
 OLLAMA_WAIT_SECONDS="${OLLAMA_WAIT_SECONDS:-180}"
 PULL_WAIT_SECONDS="${OLLAMA_PULL_WAIT_SECONDS:-1800}"
 # Thin wrapper so every call goes through the same `docker compose` invocation.
 compose() {
  docker compose "$@"
 }
 # Polls `ollama list` inside the ollama container every 3 seconds.
 # Returns 0 as soon as the command succeeds, 1 once OLLAMA_WAIT_SECONDS have elapsed.
 wait_for_ollama() {
  local give_up_at=$((SECONDS + OLLAMA_WAIT_SECONDS))
  while (( SECONDS < give_up_at )); do
    compose exec -T ollama ollama list >/dev/null 2>&1 && return 0
    sleep 3
  done
  return 1
 }
 # Succeeds when the configured model appears in the NAME column of `ollama list`.
 # `ollama list` always prints a tag, so a model configured without one (e.g. "llama3") is
 # compared as "<name>:latest"; otherwise it would never match and the caller would wait for the
 # whole pull timeout.
 model_present() {
  local wanted="$MODEL"
  # Only the last path segment can carry a tag; a ':' earlier belongs to a registry host:port.
  case "${MODEL##*/}" in
    *:*) ;;
    *) wanted="${MODEL}:latest" ;;
  esac
  compose exec -T ollama ollama list 2>/dev/null | awk 'NR>1 {print $1}' | grep -Fx -- "$wanted" >/dev/null 2>&1
 }
 # Polls model_present every 5 seconds.
 # Returns 0 once the model is listed, 1 once PULL_WAIT_SECONDS have elapsed.
 wait_for_model() {
  local give_up_at=$((SECONDS + PULL_WAIT_SECONDS))
  while (( SECONDS < give_up_at )); do
    model_present && return 0
    sleep 5
  done
  return 1
 }
 # Step 1: start the ollama container and wait until it answers `ollama list`.
 echo "Starting Ollama service..."
 compose up -d ollama
 if ! wait_for_ollama; then
  echo "Ollama did not become ready within ${OLLAMA_WAIT_SECONDS}s."
  compose logs --tail=200 ollama || true
  exit 1
 fi
 echo "Ollama is responding."
 # Step 2: pull the model only when it is not already listed.
 if model_present; then
  echo "Model already present: $MODEL"
 else
  echo "Pulling Ollama model: $MODEL"
  compose exec -T ollama ollama pull "$MODEL" || {
    echo "Model pull command failed."
    compose logs --tail=200 ollama || true
    exit 1
  }
 fi
 # Step 3: confirm the model shows up in `ollama list` before continuing.
 if ! wait_for_model; then
  echo "Model ${MODEL} did not appear within ${PULL_WAIT_SECONDS}s."
  compose exec -T ollama ollama list || true
  exit 1
 fi
 echo "Ollama model ready: $MODEL"
 # Step 4: bring up ai-service and verify that compose reports it as running.
 # NOTE(review): `up -d` typically leaves an unchanged, already-running container as is, so this
 # may not be an actual restart — confirm whether `restart` or `--force-recreate` is intended.
 echo "Restarting AI service so it can use the ready Ollama model."
 compose up -d ai-service
 if ! compose ps ai-service --format '{{.State}}' 2>/dev/null | head -n 1 | tr '[:upper:]' '[:lower:]' | grep -qx 'running'; then
  echo "AI service is not running after Ollama warmup."
  compose logs --tail=200 ai-service || true
  exit 1
 fi
 echo "Ollama warmup complete."
@@ -8,6 +8,7 @@ This service runs a local Hugging Face summarization model and also exposes docu
 - OCR fallback for scanned PDFs
 - OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`)
 - DOCX / TXT / MD extraction
 - optional Ollama-backed CV block classification for harder sectioning
 ## Install
@@ -36,8 +37,30 @@ The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can b
 - `GET /health` — health check and runtime capabilities
 - `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`
 - `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata
 - `POST /cv/classify-block` — JSON body `{ "block": "..." }`, uses Ollama when `OLLAMA_MODEL` is configured
-## Notes
+## Ollama
- Model weights are downloaded on first run.
+Set these before starting the service if you want the hybrid CV classifier enabled:
 ```bash
 export OLLAMA_BASE_URL=http://ollama:11434
 export OLLAMA_MODEL=qwen2.5:7b
 ```
 Choose the model by setting `OLLAMA_MODEL` and then warming it with the helper script:
 ```bash
 OLLAMA_MODEL=qwen2.5:7b ./scripts/start-ollama-cv.sh
 ```
 Equivalent manual flow:
 ```bash
 docker compose up -d ollama
 docker compose exec ollama ollama pull qwen2.5:7b
 docker compose up -d ai-service
 ```
 - Model weights are downloaded on first pull.
 - OCR quality depends on scan quality and language support.
 - Default OCR language is English (`eng`).
@@ -8,9 +8,13 @@ from docx import Document
 import fitz
 import hashlib
 import io
 import json
 import os
 import re
 import torch
 import pytesseract
 from urllib import request as urllib_request
 from urllib.error import URLError, HTTPError
 app = FastAPI(title="Local AI Service")
@@ -20,6 +24,8 @@ MAX_CONTEXT_CHARS = 2200
 MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
 OCR_LANGUAGES = "eng"
 IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
 OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/")
 OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "")
 def _load_runtime():
@@ -44,11 +50,47 @@ class SummarizeRequest(BaseModel):
    top_skills: int = Field(default=8, ge=3, le=12)
 # Request body for POST /cv/classify-block.
 class CvClassifyBlockRequest(BaseModel):
    # Raw text of one CV block; must contain between 1 and 6000 characters.
    block: str = Field(min_length=1, max_length=6000)
 def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
    h = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return f"{h}:{max_length}:{min_length}:{top_skills}"
 def _ollama_status():
    """Report Ollama configuration and availability for the health endpoint.

    Returns a dict with the keys ``ollama_configured``, ``ollama_reachable``,
    ``ollama_model`` and ``ollama_model_available``. Never raises: any failure
    while querying ``{OLLAMA_BASE_URL}/api/tags`` is reported as unreachable.
    """
    if not OLLAMA_MODEL:
        return {
            "ollama_configured": False,
            "ollama_reachable": False,
            "ollama_model": None,
            "ollama_model_available": False,
        }

    req = urllib_request.Request(f"{OLLAMA_BASE_URL}/api/tags", method="GET")
    try:
        with urllib_request.urlopen(req, timeout=5) as response:
            body = json.loads(response.read().decode("utf-8"))
    except Exception:
        # Deliberately broad: a health probe must degrade to "unreachable", not fail the endpoint.
        return {
            "ollama_configured": True,
            "ollama_reachable": False,
            "ollama_model": OLLAMA_MODEL,
            "ollama_model_available": False,
        }

    # Tolerate unexpected payload shapes instead of raising inside /health.
    models = body.get("models") if isinstance(body, dict) else None
    if not isinstance(models, list):
        models = []
    names = {item.get("name") for item in models if isinstance(item, dict)}

    # Listed names always carry a tag; a model configured without one resolves to ":latest".
    # Only the last path segment can hold the tag (an earlier ':' is a registry port).
    has_tag = ":" in OLLAMA_MODEL.rsplit("/", 1)[-1]
    candidates = {OLLAMA_MODEL} if has_tag else {OLLAMA_MODEL, f"{OLLAMA_MODEL}:latest"}

    return {
        "ollama_configured": True,
        "ollama_reachable": True,
        "ollama_model": OLLAMA_MODEL,
        "ollama_model_available": bool(candidates & names),
    }
@app.get("/health")
 async def health():
    return {
@@ -59,6 +101,7 @@ async def health():
        "gpu_name": GPU_NAME,
        "ocr_available": True,
        "ocr_languages": OCR_LANGUAGES,
        **_ollama_status(),
    }
@@ -272,6 +315,93 @@ def _model_summarize(text: str, max_length: int, min_length: int) -> str:
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
 def _ollama_generate_json(prompt: str):
    """Send ``prompt`` to Ollama's ``/api/generate`` endpoint and return the parsed JSON reply.

    The request is non-streaming, asks for JSON output and uses temperature 0.1.

    Raises:
        HTTPException 503: ``OLLAMA_MODEL`` is not configured, or Ollama cannot be reached.
        HTTPException 504: the request to Ollama timed out or failed at the socket level.
        HTTPException 502: Ollama answered with an HTTP error, an unreadable body,
            an empty response, or text that is not valid JSON.
    """
    if not OLLAMA_MODEL:
        raise HTTPException(status_code=503, detail="OLLAMA_MODEL is not configured.")
    payload = json.dumps({
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False,
        "format": "json",
        "options": {"temperature": 0.1}
    }).encode("utf-8")
    req = urllib_request.Request(
        f"{OLLAMA_BASE_URL}/api/generate",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib_request.urlopen(req, timeout=30) as response:
            body = json.loads(response.read().decode("utf-8"))
    except HTTPError as ex:
        raise HTTPException(status_code=502, detail=f"Ollama request failed with {ex.code}.")
    except URLError as ex:
        raise HTTPException(status_code=503, detail=f"Ollama is unreachable: {ex.reason}.")
    except OSError as ex:
        # Read timeouts and connection resets surface as plain OSError subclasses, not URLError.
        raise HTTPException(status_code=504, detail=f"Ollama request timed out or was interrupted: {ex}.")
    except ValueError:
        # Covers both undecodable bytes and a body that is not JSON.
        raise HTTPException(status_code=502, detail="Ollama returned an unreadable response.")

    raw = body.get("response") if isinstance(body, dict) else None
    raw = raw.strip() if isinstance(raw, str) else ""
    if not raw:
        raise HTTPException(status_code=502, detail="Ollama returned an empty response.")
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    # Fallback: the model may wrap the object in extra text; try the outermost {...} span.
    start = raw.find("{")
    end = raw.rfind("}")
    if start >= 0 and end > start:
        try:
            return json.loads(raw[start:end + 1])
        except json.JSONDecodeError:
            pass
    raise HTTPException(status_code=502, detail="Ollama did not return valid JSON.")
@app.post("/cv/classify-block")
 async def classify_cv_block(req: CvClassifyBlockRequest):
    """Classify one CV text block with Ollama and return its section plus extracted job fields.

    The response always contains the keys ``section``, ``confidence``, ``reason``, ``title``,
    ``company``, ``location``, ``start``, ``end`` and ``bullets``. ``section`` falls back to
    ``"Other"`` and ``bullets`` is always a list.

    Raises:
        HTTPException 502: the model reply is not a JSON object.
        HTTPException: whatever ``_ollama_generate_json`` raises.
    """
    prompt = f"""
 You classify one CV text block into structured JSON.
 Return ONLY valid JSON with this exact shape:
 {{
  "section": "Contact|Professional Summary|Work Experience|Education|Skills|Languages|Interests|Other",
  "confidence": 0.0,
  "reason": "short reason",
  "title": string|null,
  "company": string|null,
  "location": string|null,
  "start": string|null,
  "end": string|null,
  "bullets": string[]
 }}
 Rules:
 - Preserve facts only.
 - section must be one of the listed values.
 - Use Work Experience only for job/employment blocks.
 - For Contact blocks, keep title/company/start/end null and bullets empty.
 - For non-work blocks, title/company/start/end should usually be null.
 - location must look like a place, not a sentence.
 - dates must be one of: year, month+year, dd/mm/yyyy, Present, Current.
 - bullets should only be job tasks/achievements, not titles, companies, dates, or headings.
 - If unsure, choose Other and keep fields null/empty.
 Block:
 {req.block.strip()}
 """.strip()
    parsed = _ollama_generate_json(prompt)
    # Valid JSON is not necessarily an object; a list or scalar has no fields to read.
    if not isinstance(parsed, dict):
        raise HTTPException(status_code=502, detail="Ollama did not return a JSON object.")

    # Keep the documented string[] shape even when the model returns a single string or junk.
    bullets = parsed.get("bullets")
    if isinstance(bullets, str):
        bullets = [bullets] if bullets.strip() else []
    elif not isinstance(bullets, list):
        bullets = []

    return {
        "section": parsed.get("section") or "Other",
        "confidence": parsed.get("confidence"),
        "reason": parsed.get("reason"),
        "title": parsed.get("title"),
        "company": parsed.get("company"),
        "location": parsed.get("location"),
        "start": parsed.get("start"),
        "end": parsed.get("end"),
        "bullets": bullets,
    }
@app.post("/summarize")
 async def summarize(req: SummarizeRequest):
    if req.min_length >= req.max_length: