Improve CV parsing and profile editor flow

This commit is contained in:
2026-03-29 14:29:18 +02:00
parent 99fc94bc18
commit 44000f96f2
18 changed files with 1028 additions and 44 deletions
+3
View File
@@ -9,6 +9,9 @@ GOOGLE_GMAIL_CLIENT_SECRET=CHANGE_ME_GOOGLE_OAUTH_CLIENT_SECRET
# Optional. If omitted, the backend uses https://<your-domain>/api/gmail/oauth/callback # Optional. If omitted, the backend uses https://<your-domain>/api/gmail/oauth/callback
GOOGLE_GMAIL_REDIRECT_URI= GOOGLE_GMAIL_REDIRECT_URI=
AI_SERVICE_BASE_URL=http://ai-service:8001 AI_SERVICE_BASE_URL=http://ai-service:8001
# Optional: enables hybrid CV block classification in the local AI service.
OLLAMA_BASE_URL=http://ollama:11434
OLLAMA_MODEL=qwen2.5:7b
# Optional: only needed if you want the UI to call a non-default API base URL. # Optional: only needed if you want the UI to call a non-default API base URL.
# In production the UI defaults to `/api`. # In production the UI defaults to `/api`.
+156 -1
View File
@@ -280,7 +280,7 @@ public sealed class ProfileCvControllerTests
[Fact] [Fact]
public async Task Upload_populates_structured_fields_from_flattened_cv_when_ai_json_is_invalid() public async Task Upload_populates_structured_fields_from_flattened_cv_when_ai_json_is_invalid()
{ {
var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1."; var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1, C#, SQL, and public speaking.";
var user = new ApplicationUser { Id = "user-1" }; var user = new ApplicationUser { Id = "user-1" };
var userManager = CreateUserManager(); var userManager = CreateUserManager();
@@ -320,9 +320,164 @@ public sealed class ProfileCvControllerTests
Assert.Contains(structured.Interests, item => item.Contains("board games", StringComparison.OrdinalIgnoreCase) || item.Contains("cooking", StringComparison.OrdinalIgnoreCase)); Assert.Contains(structured.Interests, item => item.Contains("board games", StringComparison.OrdinalIgnoreCase) || item.Contains("cooking", StringComparison.OrdinalIgnoreCase));
Assert.Contains(structured.Languages, item => item.Name != null && item.Name.Equals("English", StringComparison.OrdinalIgnoreCase)); Assert.Contains(structured.Languages, item => item.Name != null && item.Name.Equals("English", StringComparison.OrdinalIgnoreCase));
Assert.Contains(structured.Languages, item => item.Name != null && item.Name.StartsWith("Norwegian", StringComparison.OrdinalIgnoreCase)); Assert.Contains(structured.Languages, item => item.Name != null && item.Name.StartsWith("Norwegian", StringComparison.OrdinalIgnoreCase));
Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Equals("C#", StringComparison.OrdinalIgnoreCase));
Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Equals("SQL", StringComparison.OrdinalIgnoreCase));
Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Contains("public speaking", StringComparison.OrdinalIgnoreCase));
Assert.DoesNotContain(structured.Sections, section => section.Name == "General"); Assert.DoesNotContain(structured.Sections, section => section.Name == "General");
} }
// Verifies language normalization performed by StructuredCvProfileJson.Deserialize.
// The fixture mixes a plain language with a level ("English"/"Native"), a phrase that
// embeds both name and level ("Native Norwegian speaker"), a language with no level
// ("French"), and two non-language entries ("C#", "Leadership").
// The assertions expect exactly two entries to survive: English/Native and
// Norwegian/Native. French is therefore expected to be dropped as well as the
// non-language entries.
[Fact]
public void Structured_cv_normalization_keeps_human_languages_and_drops_skill_noise()
{
var structured = StructuredCvProfileJson.Deserialize("""
{
"version": "1",
"contact": {},
"summary": [],
"jobs": [],
"education": [],
"skills": [],
"languages": [
{ "name": "English", "level": "Native" },
{ "name": "Native Norwegian speaker", "level": null },
{ "name": "French", "level": null },
{ "name": "C#", "level": "Advanced" },
{ "name": "Leadership", "level": null }
],
"interests": [],
"otherSections": []
}
""");
// Sort by name so the assertions do not depend on the order the normalizer emits.
Assert.Collection(
structured.Languages.OrderBy(item => item.Name, StringComparer.OrdinalIgnoreCase),
first =>
{
Assert.Equal("English", first.Name);
Assert.Equal("Native", first.Level);
},
second =>
{
Assert.Equal("Norwegian", second.Name);
Assert.Equal("Native", second.Level);
});
}
// Verifies job normalization performed by StructuredCvProfileJson.Deserialize.
// Job 1: title and company arrive swapped, and the bullets repeat the title, the
//        company and the date range alongside one real task description.
// Job 2: the title has the form "<title> at <company>" with a null company, and one
//        bullet is a "Skills: ..." line.
// Expected: title/company end up in the right fields and only the real task bullet
// remains for each job.
[Fact]
public void Structured_cv_normalization_separates_job_title_company_and_tasks()
{
var structured = StructuredCvProfileJson.Deserialize("""
{
"version": "1",
"contact": {},
"summary": [],
"jobs": [
{
"title": "Acme Ltd",
"company": "Senior Backend Developer",
"location": "Oslo",
"start": "2022",
"end": "2024",
"isCurrent": false,
"bullets": [
"Senior Backend Developer",
"Acme Ltd",
"2022 - 2024",
"Built API integrations for recruiter workflows and reduced manual follow-up churn."
],
"skills": [".NET", "SQL"]
},
{
"title": "Lead Engineer at Northwind Council",
"company": null,
"location": "Remote",
"start": "2020",
"end": "Present",
"isCurrent": true,
"bullets": [
"Led platform delivery across case-management and reporting surfaces.",
"Skills: C#, SQL"
],
"skills": ["C#", "SQL"]
}
],
"education": [],
"skills": [],
"languages": [],
"interests": [],
"otherSections": []
}
""");
// Assert.Collection also pins the job count (two) and the job order.
Assert.Collection(
structured.Jobs,
first =>
{
Assert.Equal("Senior Backend Developer", first.Title);
Assert.Equal("Acme Ltd", first.Company);
Assert.Equal(new[] { "Built API integrations for recruiter workflows and reduced manual follow-up churn." }, first.Bullets);
},
second =>
{
Assert.Equal("Lead Engineer", second.Title);
Assert.Equal("Northwind Council", second.Company);
Assert.Equal(new[] { "Led platform delivery across case-management and reporting surfaces." }, second.Bullets);
});
}
// Verifies contact, location and date normalization performed by
// StructuredCvProfileJson.Deserialize:
//  - a website URL with a path is expected to be reduced to its host;
//  - a scheme-less LinkedIn URL with a tracking query is expected to become a
//    canonical https://www.linkedin.com/... URL without the query;
//  - plausible locations and dates ("Sept 2023", "1/1/2024") are kept unchanged;
//  - a location containing digits ("Remote 123") and date values that are not
//    recognised ("Spring 2024", "Later") are expected to be nulled.
[Fact]
public void Structured_cv_normalization_hardens_contact_links_locations_and_dates()
{
var structured = StructuredCvProfileJson.Deserialize("""
{
"version": "1",
"contact": {
"location": "Tønsberg, Norway",
"website": "https://cesnimda.co.uk/about",
"linkedin": "linkedin.com/in/demo-user?trk=foo"
},
"summary": [],
"jobs": [
{
"title": "System Developer",
"company": "Warwickshire County Council",
"location": "Warwickshire, England, UK",
"start": "Sept 2023",
"end": "1/1/2024",
"isCurrent": false,
"bullets": ["Built APIs"],
"skills": []
},
{
"title": "Developer",
"company": "Demo Co",
"location": "Remote 123",
"start": "Spring 2024",
"end": "Later",
"isCurrent": false,
"bullets": ["Kept services running"],
"skills": []
}
],
"education": [],
"skills": [],
"languages": [],
"interests": [],
"otherSections": []
}
""");
// Contact block.
Assert.Equal("Tønsberg, Norway", structured.Contact.Location);
Assert.Equal("cesnimda.co.uk", structured.Contact.Website);
Assert.Equal("https://www.linkedin.com/in/demo-user", structured.Contact.LinkedIn);
// First job: well-formed values pass through untouched.
Assert.Equal("Warwickshire, England, UK", structured.Jobs[0].Location);
Assert.Equal("Sept 2023", structured.Jobs[0].Start);
Assert.Equal("1/1/2024", structured.Jobs[0].End);
// Second job: implausible values are discarded rather than kept verbatim.
Assert.Null(structured.Jobs[1].Location);
Assert.Null(structured.Jobs[1].Start);
Assert.Null(structured.Jobs[1].End);
}
[Fact] [Fact]
public async Task Parse_returns_structured_cv_and_persists_it() public async Task Parse_returns_structured_cv_and_persists_it()
{ {
@@ -124,6 +124,10 @@ public sealed class AdminSystemController : ControllerBase
GpuName: null, GpuName: null,
OcrAvailable: false, OcrAvailable: false,
OcrLanguages: null, OcrLanguages: null,
OllamaConfigured: null,
OllamaReachable: null,
OllamaModel: null,
OllamaModelAvailable: null,
HealthLatencyMs: null, HealthLatencyMs: null,
ProbeLatencyMs: null, ProbeLatencyMs: null,
LastProbeAt: null, LastProbeAt: null,
@@ -61,13 +61,15 @@ public sealed class ProfileCvController : ControllerBase
private readonly UserManager<ApplicationUser> _users; private readonly UserManager<ApplicationUser> _users;
private readonly ISummarizerService _aiService; private readonly ISummarizerService _aiService;
private readonly ICvAiClassifier _cvAiClassifier;
private readonly JobTrackerContext _db; private readonly JobTrackerContext _db;
private readonly AppPaths _paths; private readonly AppPaths _paths;
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths) public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths, ICvAiClassifier? cvAiClassifier = null)
{ {
_users = users; _users = users;
_aiService = aiService; _aiService = aiService;
_cvAiClassifier = cvAiClassifier ?? NoOpCvAiClassifier.Instance;
_db = db; _db = db;
_paths = paths; _paths = paths;
} }
@@ -338,14 +340,7 @@ public sealed class ProfileCvController : ControllerBase
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken) private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{ {
var parseSource = NormalizeTextForStructuredParsing(text); var parseSource = NormalizeTextForStructuredParsing(text);
var fallbackSections = ParseSections(parseSource) var fallbackSections = await BuildFallbackSectionsAsync(parseSource, cancellationToken);
.Select(section => new StructuredCvSection
{
Name = section.Name,
Content = section.Content,
WordCount = CountWords(section.Content),
})
.ToList();
var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections); var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
AnnotateStructuredCv(sectionFallback, "repair", 0.56); AnnotateStructuredCv(sectionFallback, "repair", 0.56);
@@ -729,12 +724,19 @@ public sealed class ProfileCvController : ControllerBase
private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content) private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
{ {
var languages = new List<StructuredCvLanguage>(); var languages = new List<StructuredCvLanguage>();
foreach (Match match in Regex.Matches(content, @"\b(English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b(?:[^\n.,;:]*?\b(Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2|Native speaker)\b)?", RegexOptions.IgnoreCase)) var candidates = Regex.Split(content.Replace("\r\n", "\n"), @"[\n,;]+|(?<=[.!?])\s+")
.Select(item => item.Trim())
.Where(item => item.Length > 1);
foreach (var candidate in candidates)
{ {
var name = NullIfWhitespace(match.Groups[1].Value); var level = HumanLanguageCatalog.ExtractLevel(candidate);
var level = NullIfWhitespace(match.Groups[2].Value); if (level is null) continue;
if (name is null) continue;
languages.Add(new StructuredCvLanguage { Name = name, Level = level }); foreach (var name in HumanLanguageCatalog.ExtractLanguageNames(candidate))
{
languages.Add(new StructuredCvLanguage { Name = name, Level = level });
}
} }
return languages return languages
@@ -872,6 +874,86 @@ public sealed class ProfileCvController : ControllerBase
.ToList(); .ToList();
} }
/// <summary>
/// Builds the section list used as the non-AI-JSON fallback for structured CV parsing.
/// The heading-based parser is tried first; block classification is only consulted when
/// the parser found nothing but a catch-all "General" section.
/// </summary>
/// <param name="parseSource">CV text already normalized for structured parsing.</param>
/// <param name="cancellationToken">Token forwarded to the block classifier.</param>
/// <returns>
/// The parser's sections when they contain at least one named section, otherwise the
/// classifier's sections when those add named sections (or the parser produced nothing),
/// otherwise the parser's sections.
/// </returns>
private async Task<List<StructuredCvSection>> BuildFallbackSectionsAsync(string parseSource, CancellationToken cancellationToken)
{
    static bool IsNamedSection(StructuredCvSection section) =>
        !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase);

    var parsed = ParseSections(parseSource)
        .Select(section => new StructuredCvSection
        {
            Name = section.Name,
            Content = section.Content,
            WordCount = CountWords(section.Content),
        })
        .ToList();

    if (parsed.Any(IsNamedSection)) return parsed;

    var aiSections = await ClassifyBlocksIntoSectionsAsync(parseSource, cancellationToken);
    if (aiSections.Count == 0) return parsed;

    // The classifier path discards blocks shorter than 24 characters and files every
    // unclassified block under "General". If it produced no named section it carries no
    // extra information, so keep the parser's (complete) output instead of a lossy copy.
    if (parsed.Count == 0 || aiSections.Any(IsNamedSection)) return aiSections;
    return parsed;
}
/// <summary>
/// Splits the CV text into blank-line separated blocks, asks the block classifier for a
/// section label per block, and groups the blocks into one section per label.
/// Blocks shorter than 24 characters are ignored. Blocks without a usable label (or
/// labelled "Other") are grouped under "General". Work-experience blocks are re-rendered
/// from the classifier's structured fields when it supplied any.
/// </summary>
/// <param name="parseSource">CV text already normalized for structured parsing.</param>
/// <param name="cancellationToken">Token forwarded to each classifier call.</param>
/// <returns>Sections in first-seen order, excluding any with blank content.</returns>
private async Task<List<StructuredCvSection>> ClassifyBlocksIntoSectionsAsync(string parseSource, CancellationToken cancellationToken)
{
    var unixText = parseSource.Replace("\r\n", "\n");
    var candidateBlocks = new List<string>();
    foreach (var rawBlock in Regex.Split(unixText, @"\n\s*\n"))
    {
        var trimmedBlock = rawBlock.Trim();
        if (trimmedBlock.Length >= 24) candidateBlocks.Add(trimmedBlock);
    }

    var buckets = new List<StructuredCvSection>();
    if (candidateBlocks.Count == 0) return buckets;

    foreach (var blockText in candidateBlocks)
    {
        var result = await _cvAiClassifier.ClassifyBlockAsync(blockText, cancellationToken);

        // Resolve the label: alias -> canonical name, then fall back to "General".
        var label = result?.Section;
        if (!string.IsNullOrWhiteSpace(label) && SectionAliases.TryGetValue(label, out var canonicalLabel))
        {
            label = canonicalLabel;
        }

        if (string.IsNullOrWhiteSpace(label) || string.Equals(label, "Other", StringComparison.OrdinalIgnoreCase))
        {
            label = "General";
        }

        var body = blockText;
        if (result is not null && string.Equals(label, "Work Experience", StringComparison.OrdinalIgnoreCase))
        {
            // Render "### title", a "company | location | dates" line, then "- bullet" lines.
            var rendered = new List<string>();
            if (!string.IsNullOrWhiteSpace(result.Title)) rendered.Add($"### {result.Title.Trim()}");

            var isOngoing = string.Equals(result.End, "Present", StringComparison.OrdinalIgnoreCase)
                || string.Equals(result.End, "Current", StringComparison.OrdinalIgnoreCase);
            var period = FormatDateRangeForSection(result.Start, result.End, isOngoing);
            var metaParts = new[] { result.Company, result.Location, period }
                .Where(part => !string.IsNullOrWhiteSpace(part));
            var metaLine = string.Join(" | ", metaParts);
            if (!string.IsNullOrWhiteSpace(metaLine)) rendered.Add(metaLine);

            if (result.Bullets is not null)
            {
                foreach (var bullet in result.Bullets)
                {
                    if (!string.IsNullOrWhiteSpace(bullet)) rendered.Add($"- {bullet.Trim()}");
                }
            }

            // Keep the raw block when the classifier returned no structured fields at all.
            if (rendered.Count > 0) body = string.Join("\n", rendered);
        }

        var bucket = buckets.Find(section => section.Name == label);
        if (bucket is not null)
        {
            bucket.Content = $"{bucket.Content}\n\n{body}".Trim();
            bucket.WordCount = CountWords(bucket.Content);
            continue;
        }

        buckets.Add(new StructuredCvSection { Name = label, Content = body, WordCount = CountWords(body) });
    }

    return buckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
}
/// <summary>
/// Formats a job's start/end values as a single "start - end" string for section text.
/// </summary>
/// <param name="start">Start value as supplied by the classifier; may be null or blank.</param>
/// <param name="end">End value as supplied by the classifier; may be null or blank.</param>
/// <param name="isCurrent">When true the end is always rendered as "Present".</param>
/// <returns>
/// Null when both values are blank; <paramref name="end"/> alone when only the start is
/// blank; otherwise "start - end", with a blank end rendered as "Present".
/// </returns>
private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent)
{
    var hasStart = !string.IsNullOrWhiteSpace(start);
    var hasEnd = !string.IsNullOrWhiteSpace(end);

    if (!hasStart && !hasEnd) return null;
    if (!hasStart) return end;

    // Treat an empty or whitespace end like a missing one; "??" alone only caught null
    // and produced a dangling "2020 - ".
    var endLabel = isCurrent || !hasEnd ? "Present" : end;
    return $"{start} - {endLabel}";
}
private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken) private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
{ {
var normalized = text.Trim(); var normalized = text.Trim();
+1
View File
@@ -132,6 +132,7 @@ builder.Services.AddHttpClient("ai-service", client =>
builder.Services.AddMemoryCache(); builder.Services.AddMemoryCache();
builder.Services.AddSingleton<ISummarizerService, SummarizerService>(); builder.Services.AddSingleton<ISummarizerService, SummarizerService>();
builder.Services.AddSingleton<ICvAiClassifier, CvAiClassifier>();
builder.Services.AddSingleton<IGoogleTokenValidator, GoogleTokenValidator>(); builder.Services.AddSingleton<IGoogleTokenValidator, GoogleTokenValidator>();
builder.Services.AddScoped<IGmailOAuthService, GmailOAuthService>(); builder.Services.AddScoped<IGmailOAuthService, GmailOAuthService>();
+65
View File
@@ -0,0 +1,65 @@
using System.Net.Http;
using System.Text;
using System.Text.Json;
namespace JobTrackerApi.Services;
/// <summary>
/// Classification of a single CV text block, as deserialized from the AI service's
/// <c>/cv/classify-block</c> JSON response. Every member is optional.
/// </summary>
/// <param name="Section">Section label the block was assigned to.</param>
/// <param name="Confidence">Classifier confidence. NOTE(review): scale not visible here — confirm against the AI service.</param>
/// <param name="Reason">Free-text explanation from the classifier, if any.</param>
/// <param name="Title">Job title extracted from the block, if any.</param>
/// <param name="Company">Employer extracted from the block, if any.</param>
/// <param name="Location">Location extracted from the block, if any.</param>
/// <param name="Start">Start date text extracted from the block, if any.</param>
/// <param name="End">End date text extracted from the block, if any.</param>
/// <param name="Bullets">Task/achievement lines extracted from the block, if any.</param>
public sealed record CvBlockClassificationResult(
string? Section,
double? Confidence,
string? Reason,
string? Title,
string? Company,
string? Location,
string? Start,
string? End,
List<string>? Bullets);
/// <summary>
/// Classifies one block of CV text into a section, optionally with structured fields.
/// </summary>
public interface ICvAiClassifier
{
/// <summary>Classifies <paramref name="block"/>.</summary>
/// <param name="block">Text of a single CV block.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The classification, or null when none is available.</returns>
Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default);
}
/// <summary>
/// <see cref="ICvAiClassifier"/> that posts each block to the "ai-service" HTTP client's
/// <c>/cv/classify-block</c> endpoint. Classification is best effort: any failure other
/// than caller-requested cancellation yields null.
/// </summary>
public sealed class CvAiClassifier : ICvAiClassifier
{
    // Shared instance: System.Text.Json caches serialization metadata per options object,
    // so allocating a new one for every call throws that cache away each time.
    private static readonly JsonSerializerOptions ResponseJsonOptions = new(JsonSerializerDefaults.Web)
    {
        PropertyNameCaseInsensitive = true
    };

    private readonly IHttpClientFactory _httpClientFactory;

    /// <summary>Creates the classifier.</summary>
    /// <param name="httpClientFactory">Factory used to obtain the named "ai-service" client.</param>
    public CvAiClassifier(IHttpClientFactory httpClientFactory)
    {
        _httpClientFactory = httpClientFactory;
    }

    /// <summary>Asks the AI service to classify one CV block.</summary>
    /// <param name="block">Text of the block; blank input short-circuits to null.</param>
    /// <param name="cancellationToken">Token used to cancel the HTTP call and response parsing.</param>
    /// <returns>
    /// The deserialized classification, or null when the input is blank, the service
    /// returns a non-success status, or the call or parsing fails.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="cancellationToken"/> was cancelled by the caller.
    /// </exception>
    public async Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrWhiteSpace(block)) return null;

        try
        {
            var client = _httpClientFactory.CreateClient("ai-service");
            var payload = JsonSerializer.Serialize(new { block });
            using var content = new StringContent(payload, Encoding.UTF8, "application/json");
            using var response = await client.PostAsync("/cv/classify-block", content, cancellationToken);
            if (!response.IsSuccessStatusCode) return null;

            await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
            return await JsonSerializer.DeserializeAsync<CvBlockClassificationResult>(stream, ResponseJsonOptions, cancellationToken);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // The caller asked to stop: propagate instead of reporting "no classification",
            // otherwise callers looping over blocks keep issuing doomed requests.
            // HttpClient timeouts do not set the caller's token, so they still fall through.
            throw;
        }
        catch (Exception)
        {
            // Best effort: network, HTTP and JSON failures all mean "no classification".
            return null;
        }
    }
}
/// <summary>
/// Null-object <see cref="ICvAiClassifier"/>: never classifies anything. Exposed only
/// through the shared <see cref="Instance"/>.
/// </summary>
public sealed class NoOpCvAiClassifier : ICvAiClassifier
{
    private static readonly NoOpCvAiClassifier Shared = new NoOpCvAiClassifier();

    // Private so the singleton is the only instance.
    private NoOpCvAiClassifier()
    {
    }

    /// <summary>The single shared instance.</summary>
    public static NoOpCvAiClassifier Instance => Shared;

    /// <summary>Always completes synchronously with a null classification.</summary>
    public Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default)
    {
        CvBlockClassificationResult? nothing = null;
        return Task.FromResult(nothing);
    }
}
@@ -21,6 +21,10 @@ namespace JobTrackerApi.Services
string? GpuName, string? GpuName,
bool? OcrAvailable, bool? OcrAvailable,
string? OcrLanguages, string? OcrLanguages,
bool? OllamaConfigured,
bool? OllamaReachable,
string? OllamaModel,
bool? OllamaModelAvailable,
double? HealthLatencyMs, double? HealthLatencyMs,
double? ProbeLatencyMs, double? ProbeLatencyMs,
DateTimeOffset? LastProbeAt, DateTimeOffset? LastProbeAt,
@@ -310,6 +314,10 @@ namespace JobTrackerApi.Services
string? gpuName = null; string? gpuName = null;
bool? ocrAvailable = null; bool? ocrAvailable = null;
string? ocrLanguages = null; string? ocrLanguages = null;
bool? ollamaConfigured = null;
bool? ollamaReachable = null;
string? ollamaModel = null;
bool? ollamaModelAvailable = null;
double? healthLatencyMs = null; double? healthLatencyMs = null;
var healthy = false; var healthy = false;
string? healthError = null; string? healthError = null;
@@ -332,6 +340,10 @@ namespace JobTrackerApi.Services
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString(); if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString();
if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean(); if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString(); if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString();
if (doc.RootElement.TryGetProperty("ollama_configured", out var ollamaConfiguredEl) && ollamaConfiguredEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaConfigured = ollamaConfiguredEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ollama_reachable", out var ollamaReachableEl) && ollamaReachableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaReachable = ollamaReachableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ollama_model", out var ollamaModelEl)) ollamaModel = ollamaModelEl.GetString();
if (doc.RootElement.TryGetProperty("ollama_model_available", out var ollamaModelAvailableEl) && ollamaModelAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaModelAvailable = ollamaModelAvailableEl.GetBoolean();
} }
else else
{ {
@@ -390,6 +402,10 @@ namespace JobTrackerApi.Services
GpuName: gpuName, GpuName: gpuName,
OcrAvailable: ocrAvailable, OcrAvailable: ocrAvailable,
OcrLanguages: ocrLanguages, OcrLanguages: ocrLanguages,
OllamaConfigured: ollamaConfigured,
OllamaReachable: ollamaReachable,
OllamaModel: ollamaModel,
OllamaModelAvailable: ollamaModelAvailable,
HealthLatencyMs: healthLatencyMs, HealthLatencyMs: healthLatencyMs,
ProbeLatencyMs: probeLatencyMs, ProbeLatencyMs: probeLatencyMs,
LastProbeAt: lastProbeAt, LastProbeAt: lastProbeAt,
+162
View File
@@ -0,0 +1,162 @@
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
namespace JobTrackerApi.Models;
/// <summary>
/// Recognizes human (spoken) language names and proficiency levels in free text.
/// Language names come from the runtime's culture list plus a few hand-added aliases;
/// proficiency levels come from a fixed vocabulary (descriptive words and CEFR codes).
/// </summary>
public static class HumanLanguageCatalog
{
    // NOTE: static field initializers run in textual order. WhitespaceRunRegex and
    // NonLetterRunRegex are used while BuildLanguageLookup() runs (through NormalizeKey and
    // NormalizeDisplayName), so they MUST stay declared above LanguageLookup.
    private static readonly Regex WhitespaceRunRegex = new(@"\s+", RegexOptions.Compiled);
    private static readonly Regex NonLetterRunRegex = new(@"[^\p{L}]+", RegexOptions.Compiled);
    private static readonly Regex CefrLevelRegex = new(
        @"^[ABC][12](?:\s*/\s*[ABC][12])?$",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Dictionary<string, string> LanguageLookup = BuildLanguageLookup();
    private static readonly Regex WordRegex = new(@"\p{L}+", RegexOptions.Compiled);

    // Alternation is first-match-wins, so longer phrases must precede the phrases they
    // contain, and a CEFR code must be allowed to extend into a range ("A2/B1") in the same
    // alternative. Listing the single codes before the ranges made the ranges unreachable.
    private static readonly Regex LevelRegex = new(
        @"\b(native(?:\s+speaker)?|fluent|advanced|intermediate|beginner|basic|conversational|elementary|professional\s+working\s+proficiency|limited\s+working\s+proficiency|full\s+professional\s+proficiency|working\s+proficiency|[abc][12](?:\s*/\s*[abc][12])?)\b",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Returns the canonical language name when <paramref name="raw"/> mentions exactly one
    /// known language; null when it mentions none or more than one.
    /// </summary>
    public static string? NormalizeLanguageName(string? raw)
    {
        var matches = ExtractLanguageNames(raw);
        return matches.Count == 1 ? matches[0] : null;
    }

    /// <summary>
    /// Finds every known language mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <returns>
    /// Canonical names in order of first appearance, without duplicates; empty when
    /// <paramref name="raw"/> is blank or mentions no known language.
    /// </returns>
    public static IReadOnlyList<string> ExtractLanguageNames(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return Array.Empty<string>();

        var words = WordRegex.Matches(raw)
            .Select(match => match.Value)
            .Where(value => !string.IsNullOrWhiteSpace(value))
            .ToList();
        if (words.Count == 0) return Array.Empty<string>();

        // Try the longest phrases first (up to four words) so a multi-word name claims its
        // words before any single word inside it can match on its own.
        var matches = new List<(int Start, int Size, string Canonical)>();
        for (var size = Math.Min(4, words.Count); size >= 1; size--)
        {
            for (var start = 0; start <= words.Count - size; start++)
            {
                var phrase = string.Join(" ", words.Skip(start).Take(size));
                if (!LanguageLookup.TryGetValue(NormalizeKey(phrase), out var canonical)) continue;
                if (matches.Any(existing => RangesOverlap(existing.Start, existing.Size, start, size))) continue;
                matches.Add((start, size, canonical));
            }
        }

        return matches
            .OrderBy(match => match.Start)
            .Select(match => match.Canonical)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>Returns true when <see cref="ExtractLevel"/> finds a level in <paramref name="raw"/>.</summary>
    public static bool HasRecognizedLevel(string? raw)
    {
        return ExtractLevel(raw) is not null;
    }

    /// <summary>
    /// Extracts the first proficiency level mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <returns>
    /// The level in display form ("Native", "Fluent", "B2", "A2/B1", ...), or null when
    /// <paramref name="raw"/> is blank or contains no recognized level.
    /// </returns>
    public static string? ExtractLevel(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return null;

        var match = LevelRegex.Match(raw);
        if (!match.Success) return null;

        var value = match.Groups[1].Value.Trim();
        var compact = WhitespaceRunRegex.Replace(value, " ");
        return compact.ToLowerInvariant() switch
        {
            "native speaker" => "Native",
            "native" => "Native",
            "fluent" => "Fluent",
            "advanced" => "Advanced",
            "intermediate" => "Intermediate",
            "beginner" => "Beginner",
            "basic" => "Basic",
            "conversational" => "Conversational",
            "elementary" => "Elementary",
            "professional working proficiency" => "Professional working proficiency",
            "working proficiency" => "Working proficiency",
            "limited working proficiency" => "Limited working proficiency",
            "full professional proficiency" => "Full professional proficiency",
            // CEFR code or range: upper-case and drop spaces around the slash.
            _ when CefrLevelRegex.IsMatch(compact) => compact.ToUpperInvariant().Replace(" ", string.Empty),
            _ => compact,
        };
    }

    // True when the half-open word ranges [startA, startA+sizeA) and [startB, startB+sizeB) intersect.
    private static bool RangesOverlap(int startA, int sizeA, int startB, int sizeB)
    {
        var endA = startA + sizeA;
        var endB = startB + sizeB;
        return startA < endB && startB < endA;
    }

    // Builds the alias -> canonical-name map from the runtime's cultures plus manual aliases.
    // TryAdd keeps the first mapping for a key, so culture-derived entries win over the
    // manual aliases added afterwards.
    private static Dictionary<string, string> BuildLanguageLookup()
    {
        var map = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        void Add(string? alias, string? canonical)
        {
            var normalizedAlias = NormalizeKey(alias);
            var normalizedCanonical = NormalizeDisplayName(canonical);
            if (string.IsNullOrWhiteSpace(normalizedAlias) || string.IsNullOrWhiteSpace(normalizedCanonical)) return;
            map.TryAdd(normalizedAlias, normalizedCanonical);
        }

        foreach (var culture in CultureInfo.GetCultures(CultureTypes.NeutralCultures | CultureTypes.SpecificCultures))
        {
            // The invariant culture has an empty Name and is not a human language; without
            // this guard "Invariant Language" would be registered as one.
            if (string.IsNullOrEmpty(culture.Name)) continue;

            var english = CleanCultureLanguageName(culture.EnglishName);
            var native = CleanCultureLanguageName(culture.NativeName);
            Add(english, english);
            Add(native, english);
        }

        Add("norsk", "Norwegian");
        Add("bokmal", "Norwegian");
        Add("bokmål", "Norwegian");
        Add("nynorsk", "Norwegian");
        Add("mandarin", "Chinese");
        Add("cantonese", "Chinese");
        Add("farsi", "Persian");
        Add("persian", "Persian");
        return map;
    }

    // Reduces a culture display name such as "English (United Kingdom)" to its language part
    // by cutting at the first "(" or ",".
    private static string? CleanCultureLanguageName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;

        var cleaned = value.Trim();
        var parenIndex = cleaned.IndexOf('(');
        if (parenIndex > 0) cleaned = cleaned[..parenIndex].Trim();
        var commaIndex = cleaned.IndexOf(',');
        if (commaIndex > 0) cleaned = cleaned[..commaIndex].Trim();
        return NormalizeDisplayName(cleaned);
    }

    // Collapses whitespace and title-cases each word; all-upper words of up to three
    // characters are kept as they are.
    private static string? NormalizeDisplayName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;

        var cleaned = WhitespaceRunRegex.Replace(value.Trim(), " ");
        return string.Join(" ", cleaned.Split(' ', StringSplitOptions.RemoveEmptyEntries)
            .Select(word => word.Length <= 3 && word.All(char.IsUpper)
                ? word
                : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant()));
    }

    // Lookup key: lower-cased, diacritics removed, every run of non-letters turned into a
    // single space (so "Bokmål" and "bokmal" share a key).
    private static string NormalizeKey(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return string.Empty;

        var decomposed = value.Trim().Normalize(NormalizationForm.FormD);
        var builder = new StringBuilder(decomposed.Length);
        foreach (var ch in decomposed)
        {
            if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark) continue;
            builder.Append(char.ToLowerInvariant(ch));
        }

        var recomposed = builder.ToString().Normalize(NormalizationForm.FormC);
        return NonLetterRunRegex.Replace(recomposed, " ").Trim();
    }
}
+207 -10
View File
@@ -144,7 +144,7 @@ public static class StructuredCvProfileJson
profile.Version = string.IsNullOrWhiteSpace(profile.Version) ? "1" : profile.Version.Trim(); profile.Version = string.IsNullOrWhiteSpace(profile.Version) ? "1" : profile.Version.Trim();
profile.Metadata ??= new StructuredCvMetadata(); profile.Metadata ??= new StructuredCvMetadata();
profile.Metadata.Fields ??= new Dictionary<string, StructuredCvFieldMetadata>(); profile.Metadata.Fields ??= new Dictionary<string, StructuredCvFieldMetadata>();
profile.Contact ??= new StructuredCvContact(); profile.Contact = NormalizeContact(profile.Contact);
profile.Summary = CleanList(profile.Summary); profile.Summary = CleanList(profile.Summary);
profile.Jobs = (profile.Jobs ?? new List<StructuredCvJob>()) profile.Jobs = (profile.Jobs ?? new List<StructuredCvJob>())
.Select(NormalizeJob) .Select(NormalizeJob)
@@ -178,20 +178,206 @@ public static class StructuredCvProfileJson
return profile; return profile;
} }
// Normalizes every contact field in place and returns the instance; a null contact is
// replaced by a new empty one first.
private static StructuredCvContact NormalizeContact(StructuredCvContact? contact)
{
    var normalized = contact ?? new StructuredCvContact();

    // Free-text fields: trim only.
    normalized.FullName = TrimOrNull(normalized.FullName);
    normalized.Headline = TrimOrNull(normalized.Headline);
    normalized.Email = TrimOrNull(normalized.Email);
    normalized.Phone = TrimOrNull(normalized.Phone);

    // Structured fields: go through their dedicated normalizers.
    normalized.Location = NormalizeLocationValue(normalized.Location);
    normalized.Website = NormalizeWebsite(normalized.Website);
    normalized.LinkedIn = NormalizeLinkedIn(normalized.LinkedIn);

    return normalized;
}
private static StructuredCvJob NormalizeJob(StructuredCvJob? job) private static StructuredCvJob NormalizeJob(StructuredCvJob? job)
{ {
job ??= new StructuredCvJob(); job ??= new StructuredCvJob();
job.Title = TrimOrNull(job.Title);
job.Company = TrimOrNull(job.Company); var title = NormalizeJobTitle(job.Title);
job.Location = TrimOrNull(job.Location); var company = NormalizeCompanyName(job.Company);
job.Start = TrimOrNull(job.Start); var location = NormalizeLocationValue(job.Location);
job.End = TrimOrNull(job.End);
job.Bullets = CleanList(job.Bullets); if (!string.IsNullOrWhiteSpace(title) && company is null)
{
var atSplit = Regex.Match(title, @"^(?<title>.+?)\s+at\s+(?<company>.+)$", RegexOptions.IgnoreCase);
if (atSplit.Success)
{
title = NormalizeJobTitle(atSplit.Groups["title"].Value);
company = NormalizeCompanyName(atSplit.Groups["company"].Value);
}
}
if (!string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(company))
{
var titleLooksLikeCompany = LooksLikeCompanyName(title) && !LooksLikeJobTitle(title);
var companyLooksLikeTitle = LooksLikeJobTitle(company) && !LooksLikeCompanyName(company);
if (titleLooksLikeCompany && companyLooksLikeTitle)
{
(title, company) = (company, title);
}
}
if (!string.IsNullOrWhiteSpace(title) && !LooksLikeJobTitle(title) && LooksLikeCompanyName(title))
{
if (company is null) company = title;
title = null;
}
if (!string.IsNullOrWhiteSpace(company) && !LooksLikeCompanyName(company) && LooksLikeJobTitle(company) && title is null)
{
title = company;
company = null;
}
job.Title = title;
job.Company = company;
job.Location = location;
job.Start = NormalizeDateValue(job.Start);
job.End = NormalizeDateValue(job.End);
job.Bullets = CleanList(job.Bullets)
.Select(NormalizeBullet)
.Where(bullet => bullet is not null)
.Select(bullet => bullet!)
.Where(bullet => IsUsefulJobBullet(bullet, job.Title, job.Company))
.ToList();
job.Skills = CleanList(job.Skills); job.Skills = CleanList(job.Skills);
job.IsCurrent = job.IsCurrent || string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase); job.IsCurrent = job.IsCurrent || string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
return job; return job;
} }
// Strips surrounding whitespace and leading list markers ('-', '•', '*') from a bullet.
// Returns null when the input is blank or consists of markers only, so callers never
// receive an empty bullet.
private static string? NormalizeBullet(string? value)
{
    if (string.IsNullOrWhiteSpace(value)) return null;

    var stripped = value.Trim().TrimStart('-', '•', '*', ' ');
    return stripped.Length == 0 ? null : stripped;
}
// Decides whether a bullet is worth keeping on a job. Rejected: blank text, date ranges,
// section headings, "Skills:" lines, exact repeats of the job's title or company, and
// single tokens shorter than 12 characters.
private static bool IsUsefulJobBullet(string? value, string? title, string? company)
{
    var text = TrimOrNull(value);
    if (text is null) return false;

    var isStructuralNoise = LooksLikeDateRange(text)
        || LooksLikeSectionHeading(text)
        || text.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase);
    if (isStructuralNoise) return false;

    var repeatsTitle = title is not null && text.Equals(title, StringComparison.OrdinalIgnoreCase);
    var repeatsCompany = company is not null && text.Equals(company, StringComparison.OrdinalIgnoreCase);
    if (repeatsTitle || repeatsCompany) return false;

    var isLoneShortToken = text.Length < 12 && !text.Contains(' ');
    return !isLoneShortToken;
}
// Cleans a job title: rejects values that look like a date range, a section heading, or a
// URL/e-mail; otherwise collapses whitespace and strips separator characters from both
// ends. Returns null when nothing usable remains.
private static string? NormalizeJobTitle(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null
        || LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate))
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ',', '-', ':');
    if (string.IsNullOrWhiteSpace(cleaned)) return null;
    return cleaned;
}
// Cleans a company name: rejects values that look like a date range, a section heading, a
// URL/e-mail, a "Skills:" line, or text containing both a period and a space; otherwise
// collapses whitespace and strips separator characters from both ends. Returns null when
// nothing usable remains.
private static string? NormalizeCompanyName(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null
        || LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate))
    {
        return null;
    }

    var isSkillsLine = candidate.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase);
    if (isSkillsLine) return null;

    var hasPeriodAndSpace = candidate.Contains('.') && candidate.Contains(' ');
    if (hasPeriodAndSpace) return null;

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ',', '-', ':');
    if (string.IsNullOrWhiteSpace(cleaned)) return null;
    return cleaned;
}
// Validates a location string. Accepts one to four comma-separated parts, each starting
// with a letter and made only of letters, apostrophes, hyphens, periods and spaces, and
// returns them joined with ", ". Anything containing digits, longer than 80 characters, or
// resembling a date range, section heading or URL/e-mail yields null.
private static string? NormalizeLocationValue(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null) return null;

    if (LooksLikeDateRange(candidate) || LooksLikeSectionHeading(candidate) || LooksLikeUrlOrEmail(candidate))
    {
        return null;
    }

    if (candidate.Length > 80 || candidate.Any(char.IsDigit)) return null;

    var collapsed = Regex.Replace(candidate, @"\s+", " ").Trim(' ', '|', ';', ':');
    var segments = collapsed.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    if (segments.Length is 0 or > 4) return null;

    foreach (var segment in segments)
    {
        if (!Regex.IsMatch(segment, @"^[\p{L}][\p{L}'\-. ]+$")) return null;
    }

    return string.Join(", ", segments);
}
/// <summary>
/// Reduces a personal website value to its lower-cased host name.
/// </summary>
/// <param name="value">Raw website text, with or without a scheme; may be null.</param>
/// <returns>
/// The host (for example "example.co.uk"), or null when the value is blank, mentions
/// linkedin.com, cannot be parsed as an absolute URI, or has a host that is not a dotted
/// domain name ending in an alphabetic label of at least two characters.
/// </returns>
private static string? NormalizeWebsite(string? value)
{
    var input = TrimOrNull(value);
    if (input is null || input.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    // Bare domains are given a scheme so that Uri can parse them.
    var withScheme = input.Contains("://", StringComparison.Ordinal) ? input : $"https://{input}";
    if (!Uri.TryCreate(withScheme, UriKind.Absolute, out var parsed))
    {
        return null;
    }

    var host = parsed.Host.Trim().Trim('.').ToLowerInvariant();
    if (string.IsNullOrWhiteSpace(host))
    {
        return null;
    }

    var isDomainName = Regex.IsMatch(host, @"^(?:[a-z0-9-]+\.)+[a-z]{2,}$", RegexOptions.IgnoreCase);
    return isDomainName ? host : null;
}
/// <summary>
/// Canonicalizes a LinkedIn profile link to "https://www.linkedin.com/in/..." (or "/pub/...").
/// </summary>
/// <param name="value">Raw link text, with or without a scheme; may be null.</param>
/// <returns>
/// The canonical profile URL, or null when the value is blank, is not a parseable absolute URI,
/// is not hosted on linkedin.com or one of its subdomains, or does not have a profile-style path.
/// </returns>
private static string? NormalizeLinkedIn(string? value)
{
    var trimmed = TrimOrNull(value);
    if (trimmed is null) return null;

    // Bare "linkedin.com/in/name" values are given a scheme so that Uri can parse them.
    var candidate = trimmed.Contains("://", StringComparison.Ordinal) ? trimmed : $"https://{trimmed}";
    if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri)) return null;

    // Match the registrable domain exactly (or any subdomain such as "www." / "no.").
    // A plain substring test would also accept look-alike hosts like "notlinkedin.com"
    // or "linkedin.com.example.org" and rewrite them into a linkedin.com URL.
    var host = uri.Host.TrimEnd('.');
    var isLinkedInHost = host.Equals("linkedin.com", StringComparison.OrdinalIgnoreCase)
        || host.EndsWith(".linkedin.com", StringComparison.OrdinalIgnoreCase);
    if (!isLinkedInHost) return null;

    // Only profile paths are accepted: /in/<slug> or /pub/<slug> with up to two extra segments.
    var path = uri.AbsolutePath.TrimEnd('/');
    if (!Regex.IsMatch(path, @"^/(in|pub)/[^/]+(?:/[^/]+){0,2}$", RegexOptions.IgnoreCase)) return null;

    return $"https://www.linkedin.com{path}";
}
/// <summary>
/// Returns the trimmed value when it is recognised as a date or date range, otherwise null.
/// </summary>
/// <param name="value">Raw date text; may be null.</param>
private static string? NormalizeDateValue(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    return LooksLikeDateRange(candidate) ? candidate : null;
}
/// <summary>
/// Tests whether the whole string is a single date token or a "start - end" range of two tokens.
/// </summary>
/// <remarks>
/// A token is one of: d/m/yyyy, a month name or abbreviation followed by a four-digit year,
/// a bare four-digit year, "Present" or "Current" (case-insensitive). The range separator may
/// be an ASCII hyphen, an en dash (U+2013) or an em dash (U+2014), with optional whitespace
/// around it, because typeset CVs commonly use the typographic dashes.
/// </remarks>
/// <param name="value">Text to test; expected to be already trimmed.</param>
/// <returns>True when the entire value is a date or date range.</returns>
private static bool LooksLikeDateRange(string value)
{
    // The token is defined once and reused for both sides of the range so they cannot drift apart.
    const string Month = @"(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)";
    const string Token = @"(?:\d{1,2}/\d{1,2}/\d{4}|" + Month + @"\s+\d{4}|\d{4}|Present|Current)";
    const string Pattern = "^" + Token + @"(?:\s*[-\u2013\u2014]\s*" + Token + ")?$";

    return Regex.IsMatch(value, Pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Cheap heuristic for contact-style text: true when the value contains an "@" sign or one of
/// the markers "www.", "http://" or "https://" (markers are matched case-insensitively).
/// </summary>
/// <param name="value">Text to inspect.</param>
private static bool LooksLikeUrlOrEmail(string value)
{
    if (value.Contains('@'))
    {
        return true;
    }

    foreach (var marker in new[] { "www.", "http://", "https://" })
    {
        if (value.Contains(marker, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Tests whether the value is exactly one of the known CV section headings
/// (compared case-insensitively, with no trimming).
/// </summary>
/// <param name="value">Text to compare against the heading list.</param>
private static bool LooksLikeSectionHeading(string value)
{
    var headings = new[]
    {
        "Work Experience",
        "Experience",
        "Employment History",
        "Education",
        "Skills",
        "Languages",
        "Interests",
        "Contact",
        "Professional Summary",
        "Summary",
    };

    return headings.Any(heading => value.Equals(heading, StringComparison.OrdinalIgnoreCase));
}
/// <summary>
/// Heuristic for job-title lines. A value qualifies when it contains a known role word
/// (developer, engineer, manager, ...), or when it has at most six space-separated words and
/// does not look like a company name. Blank values, date ranges and URL/e-mail text never qualify.
/// </summary>
/// <param name="value">Text to inspect.</param>
private static bool LooksLikeJobTitle(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
    {
        return false;
    }

    var hasRoleWord = Regex.IsMatch(value, @"\b(developer|engineer|manager|lead|architect|consultant|specialist|analyst|administrator|coordinator|director|designer|intern|officer|owner|founder|teacher|researcher|writer|editor|producer|assistant|technician|supervisor|head)\b", RegexOptions.IgnoreCase);
    if (hasRoleWord)
    {
        return true;
    }

    // Fallback: short phrases are assumed to be titles unless they read like an organisation.
    var wordCount = value.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length;
    return wordCount <= 6 && !LooksLikeCompanyName(value);
}
/// <summary>
/// Heuristic for organisation names. A value qualifies when it contains an organisation keyword
/// (ltd, council, university, ...), an ampersand, or a standalone run of two or more capital
/// letters. Blank values, date ranges and URL/e-mail text never qualify.
/// </summary>
/// <param name="value">Text to inspect.</param>
private static bool LooksLikeCompanyName(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
    {
        return false;
    }

    if (Regex.IsMatch(value, @"\b(inc|llc|ltd|limited|plc|corp|corporation|company|group|university|college|council|municipality|kommune|bank|studio|agency|institute|hospital|school|technologies|technology|systems|solutions|consulting|consultants|partners|foundation|ministry|government)\b", RegexOptions.IgnoreCase))
    {
        return true;
    }

    if (value.Contains('&'))
    {
        return true;
    }

    // Case-sensitive on purpose: only an upper-case acronym counts here.
    return Regex.IsMatch(value, @"\b[A-Z]{2,}\b");
}
private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education) private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
{ {
education ??= new StructuredCvEducation(); education ??= new StructuredCvEducation();
@@ -207,8 +393,13 @@ public static class StructuredCvProfileJson
private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language) private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language)
{ {
language ??= new StructuredCvLanguage(); language ??= new StructuredCvLanguage();
language.Name = TrimOrNull(language.Name);
language.Level = TrimOrNull(language.Level); var originalName = TrimOrNull(language.Name);
var normalizedName = HumanLanguageCatalog.NormalizeLanguageName(originalName);
var normalizedLevel = HumanLanguageCatalog.ExtractLevel(language.Level) ?? HumanLanguageCatalog.ExtractLevel(originalName);
language.Name = normalizedName is not null && normalizedLevel is not null ? normalizedName : null;
language.Level = normalizedLevel;
language.Notes = TrimOrNull(language.Notes); language.Notes = TrimOrNull(language.Notes);
return language; return language;
} }
@@ -360,7 +551,13 @@ public static class StructuredCvProfileJson
} }
} }
return new StructuredCvLanguage { Name = name.NullIfWhitespace(), Level = level, Notes = notes }; var normalizedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(item);
return new StructuredCvLanguage
{
Name = normalizedLevel is not null ? HumanLanguageCatalog.NormalizeLanguageName(name) : null,
Level = normalizedLevel,
Notes = notes,
};
}) })
.Where(language => !string.IsNullOrWhiteSpace(language.Name)) .Where(language => !string.IsNullOrWhiteSpace(language.Name))
.ToList(); .ToList();
+8 -2
View File
@@ -25,7 +25,7 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re
## Quickstart (Docker) ## Quickstart (Docker)
This runs: frontend (nginx), backend API, and the AI service. This runs: frontend (nginx), backend API, the local AI service, and an Ollama container for hybrid CV block classification.
1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`). 1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`).
@@ -108,9 +108,15 @@ The API calls a local FastAPI service to generate summaries. If its not runni
With Docker (recommended): With Docker (recommended):
```bash ```bash
docker compose up --build ai-service # One command for local Ollama startup + pull + AI-service restart
OLLAMA_MODEL=qwen2.5:7b ./scripts/start-ollama-cv.sh
# Then start the rest of the app if needed
docker compose up --build -d backend frontend
``` ```
The first Ollama startup is usually quick, but the first model pull and first generation can take a while. After the model is cached in the `ollama_data` volume, later restarts are much faster.
Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`). Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`).
## Configuration ## Configuration
+5 -1
View File
@@ -52,6 +52,8 @@ AUTH_ADMIN_EMAIL=you@example.com
AUTH_ADMIN_PASSWORD=replace_with_strong_password AUTH_ADMIN_PASSWORD=replace_with_strong_password
APP_PUBLIC_BASE_URL=https://your-domain.example APP_PUBLIC_BASE_URL=https://your-domain.example
AI_SERVICE_BASE_URL=http://ai-service:8001 AI_SERVICE_BASE_URL=http://ai-service:8001
OLLAMA_BASE_URL=http://ollama:11434
OLLAMA_MODEL=qwen2.5:7b
EMAIL_FOLLOWUPREMINDERS_ENABLED=true EMAIL_FOLLOWUPREMINDERS_ENABLED=true
EMAIL_FOLLOWUPREMINDERS_UPCOMINGDAYS=2 EMAIL_FOLLOWUPREMINDERS_UPCOMINGDAYS=2
# Optional backward-compatible alias if older config still references the previous name: # Optional backward-compatible alias if older config still references the previous name:
@@ -87,7 +89,8 @@ If this app is going to be a real production service on Ubuntu:
2. Gitea Actions runs tests 2. Gitea Actions runs tests
3. if green, workflow uploads repo to server 3. if green, workflow uploads repo to server
4. `deploy/deploy.sh` links `/opt/job-tracker/shared/.env` into the repo checkout, then runs `docker compose build && docker compose up -d` 4. `deploy/deploy.sh` links `/opt/job-tracker/shared/.env` into the repo checkout, then runs `docker compose build && docker compose up -d`
5. workflow checks service status after deployment 5. if `OLLAMA_MODEL` is set, the deploy script waits for Ollama, pulls the configured model if missing, then restarts `ai-service` so hybrid CV classification can use it
6. workflow checks service status after deployment
## Post-deploy verification you should also do manually the first time ## Post-deploy verification you should also do manually the first time
- confirm reverse proxy routes to the frontend correctly - confirm reverse proxy routes to the frontend correctly
@@ -96,3 +99,4 @@ If this app is going to be a real production service on Ubuntu:
- confirm AI service container is reachable from backend - confirm AI service container is reachable from backend
- confirm reminder and admin/system pages load - confirm reminder and admin/system pages load
- verify follow-up reminder emails are enabled only when intended and that links open the correct job/tab - verify follow-up reminder emails are enabled only when intended and that links open the correct job/tab
hat links open the correct job/tab
+5
View File
@@ -45,6 +45,11 @@ build_with_recovery
# Force recreation so updated port mappings, env vars, and container config always apply on deploy. # Force recreation so updated port mappings, env vars, and container config always apply on deploy.
compose up -d --force-recreate --remove-orphans compose up -d --force-recreate --remove-orphans
if [ -n "${OLLAMA_MODEL:-}" ]; then
echo "Post-deploy Ollama warmup enabled for model: ${OLLAMA_MODEL}"
./scripts/start-ollama-cv.sh
fi
sleep 5 sleep 5
compose ps compose ps
+26
View File
@@ -71,8 +71,13 @@ services:
build: build:
context: ./tools/summarizer context: ./tools/summarizer
dockerfile: Dockerfile dockerfile: Dockerfile
environment:
- OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-http://ollama:11434}
- OLLAMA_MODEL=${OLLAMA_MODEL:-qwen2.5:7b}
ports: ports:
- "8001:8001" - "8001:8001"
depends_on:
- ollama
networks: networks:
- default - default
- shared_services - shared_services
@@ -83,8 +88,29 @@ services:
timeout: 10s timeout: 10s
retries: 3 retries: 3
ollama:
image: ollama/ollama:latest
ports:
- "11434:11434"
environment:
- OLLAMA_HOST=0.0.0.0:11434
volumes:
- ollama_data:/root/.ollama
networks:
- default
- shared_services
restart: unless-stopped
gpus: all
healthcheck:
test: ["CMD", "ollama", "list"]
interval: 20s
timeout: 15s
retries: 10
start_period: 20s
volumes: volumes:
jobtracker_data: jobtracker_data:
ollama_data:
networks: networks:
shared_services: shared_services:
+33 -14
View File
@@ -1,8 +1,9 @@
import React, { useCallback, useEffect, useMemo, useRef, useState } from "react"; import React, { useCallback, useEffect, useMemo, useRef, useState } from "react";
import { Alert, Avatar, Box, Button, Chip, Divider, FormControl, InputLabel, LinearProgress, MenuItem, Paper, Select, TextField, Typography } from "@mui/material"; import { Accordion, AccordionDetails, AccordionSummary, Alert, Avatar, Box, Button, Chip, Divider, FormControl, InputLabel, LinearProgress, MenuItem, Paper, Select, TextField, Typography } from "@mui/material";
import DeleteOutlineIcon from "@mui/icons-material/DeleteOutline"; import DeleteOutlineIcon from "@mui/icons-material/DeleteOutline";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import PhotoCameraOutlinedIcon from "@mui/icons-material/PhotoCameraOutlined"; import PhotoCameraOutlinedIcon from "@mui/icons-material/PhotoCameraOutlined";
import { api } from "../api"; import { api } from "../api";
@@ -399,22 +400,40 @@ export default function ProfilePage() {
> >
{reprocessingCv ? t("profileCvReprocessing") : t("profileCvReprocess")} {reprocessingCv ? t("profileCvReprocessing") : t("profileCvReprocess")}
</Button> </Button>
<Button variant="text" disabled={!profileCvText.trim()} onClick={() => navigator.clipboard.writeText(profileCvText)}>
{t("profileCopyCvText")}
</Button>
</Box> </Box>
</Box> </Box>
{uploadingCv ? <LinearProgress sx={{ mb: 1.5 }} /> : null} {uploadingCv ? <LinearProgress sx={{ mb: 1.5 }} /> : null}
<TextField <Alert severity="info" sx={{ mb: 2, borderRadius: 2.5 }}>
label={t("profileCvTextLabel")} {t("profileCvStructuredDefaultHint")}
value={profileCvText} </Alert>
onChange={(e) => setProfileCvText(e.target.value)} <Accordion disableGutters elevation={0} sx={{ mb: 2, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper", "&:before": { display: "none" } }}>
helperText={t("profileCvTextHelp")} <AccordionSummary expandIcon={<ExpandMoreIcon />}>
multiline <Box sx={{ display: "flex", justifyContent: "space-between", gap: 1.5, alignItems: "center", width: "100%", pr: 1 }}>
minRows={12} <Box>
disabled={!isLocal} <Typography variant="subtitle1" sx={{ fontWeight: 800 }}>{t("profileCvRawPanelTitle")}</Typography>
fullWidth <Typography variant="body2" sx={{ color: "text.secondary" }}>{t("profileCvRawPanelHelp")}</Typography>
/> </Box>
<Chip size="small" label={t("profileCvSectionWordCount", { count: cvWordCount })} />
</Box>
</AccordionSummary>
<AccordionDetails>
<TextField
label={t("profileCvTextLabel")}
value={profileCvText}
onChange={(e) => setProfileCvText(e.target.value)}
helperText={t("profileCvTextHelp")}
multiline
minRows={12}
disabled={!isLocal}
fullWidth
/>
<Box sx={{ mt: 1.5, display: "flex", justifyContent: "flex-end" }}>
<Button variant="text" disabled={!profileCvText.trim()} onClick={() => navigator.clipboard.writeText(profileCvText)}>
{t("profileCopyCvText")}
</Button>
</Box>
</AccordionDetails>
</Accordion>
<Box sx={{ mt: 2, p: 1.5, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper" }}> <Box sx={{ mt: 2, p: 1.5, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper" }}>
<Box sx={{ display: "flex", justifyContent: "space-between", gap: 2, flexWrap: "wrap", alignItems: "center", mb: 1.5 }}> <Box sx={{ display: "flex", justifyContent: "space-between", gap: 2, flexWrap: "wrap", alignItems: "center", mb: 1.5 }}>
<Box> <Box>
+7
View File
@@ -147,10 +147,17 @@ test('profile page loads persisted structured cv and can re-parse it', async ()
expect(screen.getByText(/extraction history/i)).toBeInTheDocument(); expect(screen.getByText(/extraction history/i)).toBeInTheDocument();
expect(screen.getByText(/resume.pdf/i)).toBeInTheDocument(); expect(screen.getByText(/resume.pdf/i)).toBeInTheDocument();
expect(screen.getByText(/current run/i)).toBeInTheDocument(); expect(screen.getByText(/current run/i)).toBeInTheDocument();
expect(screen.getAllByText(/original extraction/i).length).toBeGreaterThan(0);
const originalExtractionToggle = screen.getByRole('button', { name: /original extraction/i });
expect(originalExtractionToggle).toHaveAttribute('aria-expanded', 'false');
expect(screen.getAllByText(/professional summary/i).length).toBeGreaterThan(0); expect(screen.getAllByText(/professional summary/i).length).toBeGreaterThan(0);
expect(screen.getByLabelText(/full name/i)).toHaveValue('Demo User'); expect(screen.getByLabelText(/full name/i)).toHaveValue('Demo User');
expect(screen.getByText(/high 92%/i)).toBeInTheDocument(); expect(screen.getByText(/high 92%/i)).toBeInTheDocument();
fireEvent.click(originalExtractionToggle);
expect(originalExtractionToggle).toHaveAttribute('aria-expanded', 'true');
expect(await screen.findByLabelText(/profile cv \/ master resume text/i)).toHaveValue('Professional Summary\nBuilt backend systems');
const analyzeButton = screen.getByRole('button', { name: /analyze sections/i }); const analyzeButton = screen.getByRole('button', { name: /analyze sections/i });
await waitFor(() => expect(analyzeButton).toBeEnabled()); await waitFor(() => expect(analyzeButton).toBeEnabled());
fireEvent.click(analyzeButton); fireEvent.click(analyzeButton);
+79
View File
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
# Brings up the Ollama container, makes sure the configured model is pulled,
# then brings up ai-service. Exits non-zero if any step fails or times out.
# Abort on errors, unset variables, and failures anywhere in a pipeline.
set -euo pipefail
# Work from the directory above this script so `docker compose` runs there.
cd "$(dirname "$0")/.."
# Model to make available; overridable through OLLAMA_MODEL.
MODEL="${OLLAMA_MODEL:-qwen2.5:7b}"
# Seconds to wait for the Ollama CLI inside the container to respond.
OLLAMA_WAIT_SECONDS="${OLLAMA_WAIT_SECONDS:-180}"
# Seconds to wait for the model to show up in `ollama list`; overridable through OLLAMA_PULL_WAIT_SECONDS.
PULL_WAIT_SECONDS="${OLLAMA_PULL_WAIT_SECONDS:-1800}"
# Thin wrapper so every call site uses the same `docker compose` invocation.
compose() { docker compose "$@"; }
# Polls every 3 seconds until `ollama list` succeeds inside the ollama container.
# Returns 0 once it responds, 1 when OLLAMA_WAIT_SECONDS elapse first.
wait_for_ollama() {
  local give_up_at=$((SECONDS + OLLAMA_WAIT_SECONDS))

  while (( SECONDS < give_up_at )); do
    # Output is discarded; only the exit status matters here.
    compose exec -T ollama ollama list >/dev/null 2>&1 && return 0
    sleep 3
  done

  return 1
}
# Succeeds when $MODEL is listed by `ollama list` inside the ollama container.
#
# `ollama list` prints every model as name:tag, so an untagged $MODEL such as
# "qwen2.5" is compared as "qwen2.5:latest"; otherwise it could never match and
# the caller would wait out the full pull timeout after a successful pull.
model_present() {
  local wanted="$MODEL"

  # Only the last path segment can carry a tag (a registry host may contain ":port").
  case "${wanted##*/}" in
    *:*) ;;
    *) wanted="${wanted}:latest" ;;
  esac

  # Skip the header row, take the NAME column, and require an exact whole-line match.
  compose exec -T ollama ollama list 2>/dev/null | awk 'NR>1 {print $1}' | grep -Fx "$wanted" >/dev/null 2>&1
}
# Polls every 5 seconds until model_present succeeds.
# Returns 0 once the model is listed, 1 when PULL_WAIT_SECONDS elapse first.
wait_for_model() {
  local give_up_at=$((SECONDS + PULL_WAIT_SECONDS))

  while (( SECONDS < give_up_at )); do
    model_present && return 0
    sleep 5
  done

  return 1
}
# --- 1. Start Ollama and wait until its CLI answers -------------------------
echo "Starting Ollama service..."
compose up -d ollama

wait_for_ollama || {
  echo "Ollama did not become ready within ${OLLAMA_WAIT_SECONDS}s."
  compose logs --tail=200 ollama || true
  exit 1
}
echo "Ollama is responding."

# --- 2. Pull the model unless it is already cached ---------------------------
if ! model_present; then
  echo "Pulling Ollama model: $MODEL"
  if ! compose exec -T ollama ollama pull "$MODEL"; then
    echo "Model pull command failed."
    compose logs --tail=200 ollama || true
    exit 1
  fi
else
  echo "Model already present: $MODEL"
fi

# --- 3. Confirm the model is actually listed ---------------------------------
wait_for_model || {
  echo "Model ${MODEL} did not appear within ${PULL_WAIT_SECONDS}s."
  compose exec -T ollama ollama list || true
  exit 1
}
echo "Ollama model ready: $MODEL"

# --- 4. Bring up ai-service and verify it is running -------------------------
echo "Restarting AI service so it can use the ready Ollama model."
compose up -d ai-service

# First reported state, lower-cased; a failed query counts as "not running".
ai_state="$(compose ps ai-service --format '{{.State}}' 2>/dev/null | head -n 1 | tr '[:upper:]' '[:lower:]')" || ai_state=""
if [ "$ai_state" != "running" ]; then
  echo "AI service is not running after Ollama warmup."
  compose logs --tail=200 ai-service || true
  exit 1
fi

echo "Ollama warmup complete."
+25 -2
View File
@@ -8,6 +8,7 @@ This service runs a local Hugging Face summarization model and also exposes docu
- OCR fallback for scanned PDFs - OCR fallback for scanned PDFs
- OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`) - OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`)
- DOCX / TXT / MD extraction - DOCX / TXT / MD extraction
- optional Ollama-backed CV block classification for harder sectioning
## Install ## Install
@@ -36,8 +37,30 @@ The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can b
- `GET /health` — health check and runtime capabilities - `GET /health` — health check and runtime capabilities
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }` - `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`
- `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata - `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata
- `POST /cv/classify-block` — JSON body `{ "block": "..." }`, uses Ollama when `OLLAMA_MODEL` is configured
## Notes ## Ollama
- Model weights are downloaded on first run. Set these before starting the service if you want the hybrid CV classifier enabled:
```bash
export OLLAMA_BASE_URL=http://ollama:11434
export OLLAMA_MODEL=qwen2.5:7b
```
Choose the model by setting `OLLAMA_MODEL` and then warming it with the helper script:
```bash
OLLAMA_MODEL=qwen2.5:7b ./scripts/start-ollama-cv.sh
```
Equivalent manual flow:
```bash
docker compose up -d ollama
docker compose exec ollama ollama pull qwen2.5:7b
docker compose up -d ai-service
```
- Model weights are downloaded on first pull.
- OCR quality depends on scan quality and language support. - OCR quality depends on scan quality and language support.
- Default OCR language is English (`eng`). - Default OCR language is English (`eng`).
+130
View File
@@ -8,9 +8,13 @@ from docx import Document
import fitz import fitz
import hashlib import hashlib
import io import io
import json
import os
import re import re
import torch import torch
import pytesseract import pytesseract
from urllib import request as urllib_request
from urllib.error import URLError, HTTPError
app = FastAPI(title="Local AI Service") app = FastAPI(title="Local AI Service")
@@ -20,6 +24,8 @@ MAX_CONTEXT_CHARS = 2200
MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024 MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
OCR_LANGUAGES = "eng" OCR_LANGUAGES = "eng"
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"} IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "")
def _load_runtime(): def _load_runtime():
@@ -44,11 +50,47 @@ class SummarizeRequest(BaseModel):
top_skills: int = Field(default=8, ge=3, le=12) top_skills: int = Field(default=8, ge=3, le=12)
class CvClassifyBlockRequest(BaseModel):
    """Request body for ``POST /cv/classify-block``."""

    # Raw text of the single CV block to classify; 1 to 6000 characters.
    block: str = Field(min_length=1, max_length=6000)
def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str: def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
h = hashlib.sha256(text.encode("utf-8")).hexdigest() h = hashlib.sha256(text.encode("utf-8")).hexdigest()
return f"{h}:{max_length}:{min_length}:{top_skills}" return f"{h}:{max_length}:{min_length}:{top_skills}"
def _ollama_model_listed(model, names):
    """Return True when *model* matches one of the tag-qualified *names*.

    Ollama reports installed models as ``name:tag``. A configured model
    without a tag (for example ``qwen2.5``) therefore has to be compared as
    ``qwen2.5:latest``, otherwise it would never be reported as available.
    """
    if not model:
        return False
    # Only the last path segment can carry a tag; a registry host may hold ":port".
    last_segment = model.rsplit("/", 1)[-1]
    wanted = model if ":" in last_segment else f"{model}:latest"
    return wanted in names


def _ollama_status():
    """Describe the Ollama integration for the health endpoint.

    Returns a dict with the keys ``ollama_configured``, ``ollama_reachable``,
    ``ollama_model`` and ``ollama_model_available``. When ``OLLAMA_MODEL`` is
    empty no request is made. Otherwise ``/api/tags`` is queried with a
    5-second timeout; any failure is reported as "not reachable" instead of
    being raised, so a broken Ollama never breaks the health check itself.
    """
    status = {
        "ollama_configured": bool(OLLAMA_MODEL),
        "ollama_reachable": False,
        "ollama_model": OLLAMA_MODEL or None,
        "ollama_model_available": False,
    }
    if not OLLAMA_MODEL:
        return status

    req = urllib_request.Request(f"{OLLAMA_BASE_URL}/api/tags", method="GET")
    try:
        with urllib_request.urlopen(req, timeout=5) as response:
            body = json.loads(response.read().decode("utf-8"))
    except Exception:
        # Deliberately broad: this is a best-effort probe and must not raise.
        return status

    status["ollama_reachable"] = True
    # Tolerate an unexpected payload shape rather than failing the health check.
    models = body.get("models") if isinstance(body, dict) else None
    names = {item.get("name") for item in models or [] if isinstance(item, dict)}
    status["ollama_model_available"] = _ollama_model_listed(OLLAMA_MODEL, names)
    return status
@app.get("/health") @app.get("/health")
async def health(): async def health():
return { return {
@@ -59,6 +101,7 @@ async def health():
"gpu_name": GPU_NAME, "gpu_name": GPU_NAME,
"ocr_available": True, "ocr_available": True,
"ocr_languages": OCR_LANGUAGES, "ocr_languages": OCR_LANGUAGES,
**_ollama_status(),
} }
@@ -272,6 +315,93 @@ def _model_summarize(text: str, max_length: int, min_length: int) -> str:
return tokenizer.decode(outputs[0], skip_special_tokens=True).strip() return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
def _extract_json_value(raw):
    """Parse *raw* as JSON, tolerating text around a single JSON object.

    Tries the whole string first, then the span from the first ``{`` to the
    last ``}``. Returns the decoded value, or ``None`` when neither attempt
    yields valid JSON (a literal JSON ``null`` is also reported as ``None``).
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    start = raw.find("{")
    end = raw.rfind("}")
    if start < 0 or end <= start:
        return None
    try:
        return json.loads(raw[start:end + 1])
    except json.JSONDecodeError:
        return None


def _ollama_generate_json(prompt: str):
    """Send *prompt* to Ollama's ``/api/generate`` and return the decoded JSON reply.

    The request is non-streaming, asks for JSON output, uses temperature 0.1
    and a 30-second timeout.

    Raises:
        HTTPException: 503 when ``OLLAMA_MODEL`` is not configured or Ollama
            cannot be reached (including timeouts and dropped connections);
            502 when Ollama answers with an HTTP error, a malformed envelope,
            an empty response, or text that is not valid JSON.
    """
    if not OLLAMA_MODEL:
        raise HTTPException(status_code=503, detail="OLLAMA_MODEL is not configured.")

    payload = json.dumps({
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False,
        "format": "json",
        "options": {"temperature": 0.1},
    }).encode("utf-8")
    req = urllib_request.Request(
        f"{OLLAMA_BASE_URL}/api/generate",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib_request.urlopen(req, timeout=30) as response:
            body = json.loads(response.read().decode("utf-8"))
    except HTTPError as ex:
        raise HTTPException(status_code=502, detail=f"Ollama request failed with {ex.code}.") from ex
    except URLError as ex:
        raise HTTPException(status_code=503, detail=f"Ollama is unreachable: {ex.reason}.") from ex
    except OSError as ex:
        # Timeouts or resets while reading the body are not wrapped in URLError.
        raise HTTPException(status_code=503, detail=f"Ollama is unreachable: {ex}.") from ex
    except ValueError as ex:
        # Covers both undecodable bytes and an envelope that is not JSON.
        raise HTTPException(status_code=502, detail="Ollama returned an invalid response.") from ex

    raw = body.get("response") if isinstance(body, dict) else None
    raw = raw.strip() if isinstance(raw, str) else ""
    if not raw:
        raise HTTPException(status_code=502, detail="Ollama returned an empty response.")

    parsed = _extract_json_value(raw)
    if parsed is None:
        raise HTTPException(status_code=502, detail="Ollama did not return valid JSON.")
    return parsed
def _build_cv_classify_prompt(block: str) -> str:
    """Return the classification prompt for one CV text block."""
    return f"""
You classify one CV text block into structured JSON.
Return ONLY valid JSON with this exact shape:
{{
"section": "Contact|Professional Summary|Work Experience|Education|Skills|Languages|Interests|Other",
"confidence": 0.0,
"reason": "short reason",
"title": string|null,
"company": string|null,
"location": string|null,
"start": string|null,
"end": string|null,
"bullets": string[]
}}
Rules:
- Preserve facts only.
- section must be one of the listed values.
- Use Work Experience only for job/employment blocks.
- For Contact blocks, keep title/company/start/end null and bullets empty.
- For non-work blocks, title/company/start/end should usually be null.
- location must look like a place, not a sentence.
- dates must be one of: year, month+year, dd/mm/yyyy, Present, Current.
- bullets should only be job tasks/achievements, not titles, companies, dates, or headings.
- If unsure, choose Other and keep fields null/empty.
Block:
{block.strip()}
""".strip()


def _normalize_cv_classification(parsed) -> dict:
    """Project the model's reply onto the fixed response shape.

    Raises:
        HTTPException: 502 when the reply is not a JSON object.
    """
    if not isinstance(parsed, dict):
        raise HTTPException(status_code=502, detail="Ollama did not return a JSON object.")

    # The response shape promises a list of strings; drop anything else.
    bullets = parsed.get("bullets")
    if not isinstance(bullets, list):
        bullets = []
    bullets = [item for item in bullets if isinstance(item, str)]

    return {
        "section": parsed.get("section") or "Other",
        "confidence": parsed.get("confidence"),
        "reason": parsed.get("reason"),
        "title": parsed.get("title"),
        "company": parsed.get("company"),
        "location": parsed.get("location"),
        "start": parsed.get("start"),
        "end": parsed.get("end"),
        "bullets": bullets,
    }


@app.post("/cv/classify-block")
async def classify_cv_block(req: CvClassifyBlockRequest):
    """Classify one CV text block with the configured Ollama model.

    Returns a dict with ``section`` (defaults to ``"Other"``), ``confidence``,
    ``reason``, ``title``, ``company``, ``location``, ``start``, ``end`` and
    ``bullets`` (always a list of strings).

    Raises:
        HTTPException: whatever ``_ollama_generate_json`` raises, plus 502 when
            the model's reply is not a JSON object.
    """
    import asyncio

    prompt = _build_cv_classify_prompt(req.block)

    # The Ollama call is blocking urllib I/O with a 30-second timeout; run it in the
    # default executor so this coroutine does not stall the event loop meanwhile.
    loop = asyncio.get_running_loop()
    parsed = await loop.run_in_executor(None, _ollama_generate_json, prompt)

    return _normalize_cv_classification(parsed)
@app.post("/summarize") @app.post("/summarize")
async def summarize(req: SummarizeRequest): async def summarize(req: SummarizeRequest):
if req.min_length >= req.max_length: if req.min_length >= req.max_length: