Improve CV parsing and profile editor flow
This commit is contained in:
@@ -9,6 +9,9 @@ GOOGLE_GMAIL_CLIENT_SECRET=CHANGE_ME_GOOGLE_OAUTH_CLIENT_SECRET
|
||||
# Optional. If omitted, the backend uses https://<your-domain>/api/gmail/oauth/callback
|
||||
GOOGLE_GMAIL_REDIRECT_URI=
|
||||
AI_SERVICE_BASE_URL=http://ai-service:8001
|
||||
# Optional: enables hybrid CV block classification in the local AI service.
|
||||
OLLAMA_BASE_URL=http://ollama:11434
|
||||
OLLAMA_MODEL=qwen2.5:7b
|
||||
|
||||
# Optional: only needed if you want the UI to call a non-default API base URL.
|
||||
# In production the UI defaults to `/api`.
|
||||
|
||||
@@ -280,7 +280,7 @@ public sealed class ProfileCvControllerTests
|
||||
[Fact]
|
||||
public async Task Upload_populates_structured_fields_from_flattened_cv_when_ai_json_is_invalid()
|
||||
{
|
||||
var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1.";
|
||||
var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1, C#, SQL, and public speaking.";
|
||||
|
||||
var user = new ApplicationUser { Id = "user-1" };
|
||||
var userManager = CreateUserManager();
|
||||
@@ -320,9 +320,164 @@ public sealed class ProfileCvControllerTests
|
||||
Assert.Contains(structured.Interests, item => item.Contains("board games", StringComparison.OrdinalIgnoreCase) || item.Contains("cooking", StringComparison.OrdinalIgnoreCase));
|
||||
Assert.Contains(structured.Languages, item => item.Name != null && item.Name.Equals("English", StringComparison.OrdinalIgnoreCase));
|
||||
Assert.Contains(structured.Languages, item => item.Name != null && item.Name.StartsWith("Norwegian", StringComparison.OrdinalIgnoreCase));
|
||||
Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Equals("C#", StringComparison.OrdinalIgnoreCase));
|
||||
Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Equals("SQL", StringComparison.OrdinalIgnoreCase));
|
||||
Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Contains("public speaking", StringComparison.OrdinalIgnoreCase));
|
||||
Assert.DoesNotContain(structured.Sections, section => section.Name == "General");
|
||||
}
|
||||
|
||||
[Fact]
public void Structured_cv_normalization_keeps_human_languages_and_drops_skill_noise()
{
    // The payload mixes spoken languages that carry a proficiency level with
    // entries the normaliser is expected to discard: a language without a
    // level ("French") and skills mislabelled as languages ("C#", "Leadership").
    var payload = """
        {
          "version": "1",
          "contact": {},
          "summary": [],
          "jobs": [],
          "education": [],
          "skills": [],
          "languages": [
            { "name": "English", "level": "Native" },
            { "name": "Native Norwegian speaker", "level": null },
            { "name": "French", "level": null },
            { "name": "C#", "level": "Advanced" },
            { "name": "Leadership", "level": null }
          ],
          "interests": [],
          "otherSections": []
        }
        """;

    var profile = StructuredCvProfileJson.Deserialize(payload);

    // Sort by name so the assertions do not depend on the normaliser's output order.
    var keptLanguages = profile.Languages
        .OrderBy(language => language.Name, StringComparer.OrdinalIgnoreCase)
        .ToList();

    Assert.Collection(
        keptLanguages,
        english =>
        {
            Assert.Equal("English", english.Name);
            Assert.Equal("Native", english.Level);
        },
        norwegian =>
        {
            // The level embedded in the name is expected to move into Level.
            Assert.Equal("Norwegian", norwegian.Name);
            Assert.Equal("Native", norwegian.Level);
        });
}
|
||||
|
||||
[Fact]
public void Structured_cv_normalization_separates_job_title_company_and_tasks()
{
    // Job 1 has title and company swapped and repeats header data as bullets.
    // Job 2 packs "title at company" into the title and carries a "Skills:" bullet.
    var payload = """
        {
          "version": "1",
          "contact": {},
          "summary": [],
          "jobs": [
            {
              "title": "Acme Ltd",
              "company": "Senior Backend Developer",
              "location": "Oslo",
              "start": "2022",
              "end": "2024",
              "isCurrent": false,
              "bullets": [
                "Senior Backend Developer",
                "Acme Ltd",
                "2022 - 2024",
                "Built API integrations for recruiter workflows and reduced manual follow-up churn."
              ],
              "skills": [".NET", "SQL"]
            },
            {
              "title": "Lead Engineer at Northwind Council",
              "company": null,
              "location": "Remote",
              "start": "2020",
              "end": "Present",
              "isCurrent": true,
              "bullets": [
                "Led platform delivery across case-management and reporting surfaces.",
                "Skills: C#, SQL"
              ],
              "skills": ["C#", "SQL"]
            }
          ],
          "education": [],
          "skills": [],
          "languages": [],
          "interests": [],
          "otherSections": []
        }
        """;

    var profile = StructuredCvProfileJson.Deserialize(payload);

    var expectedAcmeBullets = new[] { "Built API integrations for recruiter workflows and reduced manual follow-up churn." };
    var expectedNorthwindBullets = new[] { "Led platform delivery across case-management and reporting surfaces." };

    Assert.Collection(
        profile.Jobs,
        acme =>
        {
            // Swapped title/company are put back in the right fields.
            Assert.Equal("Senior Backend Developer", acme.Title);
            Assert.Equal("Acme Ltd", acme.Company);
            Assert.Equal(expectedAcmeBullets, acme.Bullets);
        },
        northwind =>
        {
            // "X at Y" is split into title and company.
            Assert.Equal("Lead Engineer", northwind.Title);
            Assert.Equal("Northwind Council", northwind.Company);
            Assert.Equal(expectedNorthwindBullets, northwind.Bullets);
        });
}
|
||||
|
||||
[Fact]
public void Structured_cv_normalization_hardens_contact_links_locations_and_dates()
{
    // Job 1 carries plausible location/date values that must survive;
    // job 2 carries junk ("Remote 123", "Spring 2024", "Later") that must be nulled.
    var payload = """
        {
          "version": "1",
          "contact": {
            "location": "Tønsberg, Norway",
            "website": "https://cesnimda.co.uk/about",
            "linkedin": "linkedin.com/in/demo-user?trk=foo"
          },
          "summary": [],
          "jobs": [
            {
              "title": "System Developer",
              "company": "Warwickshire County Council",
              "location": "Warwickshire, England, UK",
              "start": "Sept 2023",
              "end": "1/1/2024",
              "isCurrent": false,
              "bullets": ["Built APIs"],
              "skills": []
            },
            {
              "title": "Developer",
              "company": "Demo Co",
              "location": "Remote 123",
              "start": "Spring 2024",
              "end": "Later",
              "isCurrent": false,
              "bullets": ["Kept services running"],
              "skills": []
            }
          ],
          "education": [],
          "skills": [],
          "languages": [],
          "interests": [],
          "otherSections": []
        }
        """;

    var profile = StructuredCvProfileJson.Deserialize(payload);

    // Contact: location kept, website reduced to its host, LinkedIn canonicalised.
    var contact = profile.Contact;
    Assert.Equal("Tønsberg, Norway", contact.Location);
    Assert.Equal("cesnimda.co.uk", contact.Website);
    Assert.Equal("https://www.linkedin.com/in/demo-user", contact.LinkedIn);

    // Well-formed job values pass through untouched.
    var councilJob = profile.Jobs[0];
    Assert.Equal("Warwickshire, England, UK", councilJob.Location);
    Assert.Equal("Sept 2023", councilJob.Start);
    Assert.Equal("1/1/2024", councilJob.End);

    // Unparseable values are dropped rather than stored verbatim.
    var demoJob = profile.Jobs[1];
    Assert.Null(demoJob.Location);
    Assert.Null(demoJob.Start);
    Assert.Null(demoJob.End);
}
|
||||
|
||||
[Fact]
|
||||
public async Task Parse_returns_structured_cv_and_persists_it()
|
||||
{
|
||||
|
||||
@@ -124,6 +124,10 @@ public sealed class AdminSystemController : ControllerBase
|
||||
GpuName: null,
|
||||
OcrAvailable: false,
|
||||
OcrLanguages: null,
|
||||
OllamaConfigured: null,
|
||||
OllamaReachable: null,
|
||||
OllamaModel: null,
|
||||
OllamaModelAvailable: null,
|
||||
HealthLatencyMs: null,
|
||||
ProbeLatencyMs: null,
|
||||
LastProbeAt: null,
|
||||
|
||||
@@ -61,13 +61,15 @@ public sealed class ProfileCvController : ControllerBase
|
||||
|
||||
private readonly UserManager<ApplicationUser> _users;
|
||||
private readonly ISummarizerService _aiService;
|
||||
private readonly ICvAiClassifier _cvAiClassifier;
|
||||
private readonly JobTrackerContext _db;
|
||||
private readonly AppPaths _paths;
|
||||
|
||||
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths)
|
||||
/// <summary>
/// Creates the controller from its injected collaborators.
/// </summary>
/// <param name="users">Identity user manager.</param>
/// <param name="aiService">Summarizer service stored for later use by the controller.</param>
/// <param name="db">Job tracker database context.</param>
/// <param name="paths">Application path configuration.</param>
/// <param name="cvAiClassifier">
/// Optional block classifier; when omitted the no-op classifier is used.
/// </param>
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths, ICvAiClassifier? cvAiClassifier = null)
{
    (_users, _aiService, _db, _paths) = (users, aiService, db, paths);

    // Substitute the no-op singleton so the rest of the class never needs a null check.
    _cvAiClassifier = cvAiClassifier is null ? NoOpCvAiClassifier.Instance : cvAiClassifier;
}
|
||||
@@ -338,14 +340,7 @@ public sealed class ProfileCvController : ControllerBase
|
||||
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
|
||||
{
|
||||
var parseSource = NormalizeTextForStructuredParsing(text);
|
||||
var fallbackSections = ParseSections(parseSource)
|
||||
.Select(section => new StructuredCvSection
|
||||
{
|
||||
Name = section.Name,
|
||||
Content = section.Content,
|
||||
WordCount = CountWords(section.Content),
|
||||
})
|
||||
.ToList();
|
||||
var fallbackSections = await BuildFallbackSectionsAsync(parseSource, cancellationToken);
|
||||
|
||||
var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
|
||||
AnnotateStructuredCv(sectionFallback, "repair", 0.56);
|
||||
@@ -729,12 +724,19 @@ public sealed class ProfileCvController : ControllerBase
|
||||
private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
|
||||
{
|
||||
var languages = new List<StructuredCvLanguage>();
|
||||
foreach (Match match in Regex.Matches(content, @"\b(English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b(?:[^\n.,;:]*?\b(Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2|Native speaker)\b)?", RegexOptions.IgnoreCase))
|
||||
var candidates = Regex.Split(content.Replace("\r\n", "\n"), @"[\n,;]+|(?<=[.!?])\s+")
|
||||
.Select(item => item.Trim())
|
||||
.Where(item => item.Length > 1);
|
||||
|
||||
foreach (var candidate in candidates)
|
||||
{
|
||||
var name = NullIfWhitespace(match.Groups[1].Value);
|
||||
var level = NullIfWhitespace(match.Groups[2].Value);
|
||||
if (name is null) continue;
|
||||
languages.Add(new StructuredCvLanguage { Name = name, Level = level });
|
||||
var level = HumanLanguageCatalog.ExtractLevel(candidate);
|
||||
if (level is null) continue;
|
||||
|
||||
foreach (var name in HumanLanguageCatalog.ExtractLanguageNames(candidate))
|
||||
{
|
||||
languages.Add(new StructuredCvLanguage { Name = name, Level = level });
|
||||
}
|
||||
}
|
||||
|
||||
return languages
|
||||
@@ -872,6 +874,86 @@ public sealed class ProfileCvController : ControllerBase
|
||||
.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the section list used as the fallback structure for a CV.
/// The heuristic section parser runs first; the AI block classifier is only
/// consulted when that parser found nothing except "General" content.
/// </summary>
/// <param name="parseSource">CV text already normalised for structured parsing.</param>
/// <param name="cancellationToken">Token forwarded to the AI classifier calls.</param>
/// <returns>
/// The heuristic sections when they contain at least one named section, or
/// when the classifier could not identify a named section either; otherwise
/// the classifier's sections.
/// </returns>
private async Task<List<StructuredCvSection>> BuildFallbackSectionsAsync(string parseSource, CancellationToken cancellationToken)
{
    var parsed = ParseSections(parseSource)
        .Select(section => new StructuredCvSection
        {
            Name = section.Name,
            Content = section.Content,
            WordCount = CountWords(section.Content),
        })
        .ToList();

    if (parsed.Any(IsNamedSection)) return parsed;

    var aiSections = await ClassifyBlocksIntoSectionsAsync(parseSource, cancellationToken);

    // Only prefer the classifier output when it actually recognised a section.
    // The classifier path discards blocks shorter than 24 characters and files
    // every unclassified block under "General", so an all-"General" result
    // (e.g. classifier disabled or unreachable) carries less text than the
    // heuristic parse. It is still used when the heuristic parse is empty.
    var aiIsUseful = aiSections.Any(IsNamedSection) || (parsed.Count == 0 && aiSections.Count > 0);
    return aiIsUseful ? aiSections : parsed;

    static bool IsNamedSection(StructuredCvSection section) =>
        !string.Equals(section.Name, "General", StringComparison.OrdinalIgnoreCase);
}
|
||||
|
||||
/// <summary>
/// Splits the CV text into blank-line separated blocks, asks the AI classifier
/// for a section label per block, and merges blocks with the same label into
/// one section (in first-seen order).
/// </summary>
/// <param name="parseSource">CV text already normalised for structured parsing.</param>
/// <param name="cancellationToken">Token forwarded to each classifier call.</param>
/// <returns>
/// The non-empty sections that were built; an empty list when no block is at
/// least 24 characters long. Blocks the classifier cannot label, or labels as
/// "Other", are collected under "General".
/// </returns>
private async Task<List<StructuredCvSection>> ClassifyBlocksIntoSectionsAsync(string parseSource, CancellationToken cancellationToken)
{
    // Very short blocks are skipped entirely.
    var blocks = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n")
        .Select(block => block.Trim())
        .Where(block => block.Length >= 24)
        .ToList();

    if (blocks.Count == 0) return new List<StructuredCvSection>();

    var sectionBuckets = new List<StructuredCvSection>();
    foreach (var block in blocks)
    {
        var classification = await _cvAiClassifier.ClassifyBlockAsync(block, cancellationToken);

        // Trim before the alias lookup so a padded label still resolves.
        var sectionName = classification?.Section?.Trim();
        if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical))
        {
            sectionName = canonical;
        }

        if (string.IsNullOrWhiteSpace(sectionName) || string.Equals(sectionName, "Other", StringComparison.OrdinalIgnoreCase))
        {
            sectionName = "General";
        }

        var isWorkExperience = string.Equals(sectionName, "Work Experience", StringComparison.OrdinalIgnoreCase);
        var content = isWorkExperience && classification is not null
            ? BuildWorkExperienceContent(classification, block)
            : block;

        // Compare case-insensitively, like every other section-name check in
        // this method, so labels that differ only in case share one bucket.
        var existing = sectionBuckets.FirstOrDefault(section => string.Equals(section.Name, sectionName, StringComparison.OrdinalIgnoreCase));
        if (existing is null)
        {
            sectionBuckets.Add(new StructuredCvSection { Name = sectionName, Content = content, WordCount = CountWords(content) });
        }
        else
        {
            existing.Content = $"{existing.Content}\n\n{content}".Trim();
            existing.WordCount = CountWords(existing.Content);
        }
    }

    return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
}

/// <summary>
/// Renders a classified work-experience block as "### title", a
/// "company | location | dates" line and "- bullet" lines. Returns
/// <paramref name="fallback"/> when the classification has none of these fields.
/// </summary>
private static string BuildWorkExperienceContent(CvBlockClassificationResult classification, string fallback)
{
    var lines = new List<string>();
    if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}");

    var endIsCurrent = string.Equals(classification.End, "Present", StringComparison.OrdinalIgnoreCase)
        || string.Equals(classification.End, "Current", StringComparison.OrdinalIgnoreCase);
    var dateRange = FormatDateRangeForSection(classification.Start, classification.End, endIsCurrent);

    var meta = string.Join(" | ", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value)));
    if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta);

    if (classification.Bullets is not null)
    {
        lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}"));
    }

    return lines.Count > 0 ? string.Join("\n", lines) : fallback;
}
|
||||
|
||||
/// <summary>
/// Formats a start/end pair as the date text of a section's metadata line.
/// </summary>
/// <param name="start">Start date text; may be null or blank.</param>
/// <param name="end">End date text; may be null or blank.</param>
/// <param name="isCurrent">When true the range always ends in "Present".</param>
/// <returns>
/// <c>null</c> when both values are blank; the end value alone when only the
/// start is blank; otherwise "start - end", where a blank end is rendered as
/// "Present".
/// </returns>
private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent)
{
    // Treat blank strings like null: a bare `end ?? "Present"` would let an
    // empty or whitespace end through and produce a dangling "2020 - ".
    var from = string.IsNullOrWhiteSpace(start) ? null : start.Trim();
    var to = string.IsNullOrWhiteSpace(end) ? null : end.Trim();

    if (from is null) return to;

    return $"{from} - {(isCurrent ? "Present" : to ?? "Present")}";
}
|
||||
|
||||
private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
|
||||
{
|
||||
var normalized = text.Trim();
|
||||
|
||||
@@ -132,6 +132,7 @@ builder.Services.AddHttpClient("ai-service", client =>
|
||||
|
||||
builder.Services.AddMemoryCache();
|
||||
builder.Services.AddSingleton<ISummarizerService, SummarizerService>();
|
||||
builder.Services.AddSingleton<ICvAiClassifier, CvAiClassifier>();
|
||||
builder.Services.AddSingleton<IGoogleTokenValidator, GoogleTokenValidator>();
|
||||
builder.Services.AddScoped<IGmailOAuthService, GmailOAuthService>();
|
||||
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
using System.Net.Http;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
|
||||
namespace JobTrackerApi.Services;
|
||||
|
||||
/// <summary>
/// Classification of a single CV text block. <see cref="CvAiClassifier"/>
/// deserializes it from the JSON body returned by the AI service's
/// <c>/cv/classify-block</c> endpoint. Every member is optional.
/// </summary>
/// <param name="Section">Section label assigned to the block.</param>
/// <param name="Confidence">Not read by the code visible here; presumably the classifier's confidence score (TODO confirm range against the AI service).</param>
/// <param name="Reason">Not read by the code visible here; presumably the classifier's explanation (TODO confirm).</param>
/// <param name="Title">Job title extracted from the block, if any.</param>
/// <param name="Company">Employer extracted from the block, if any.</param>
/// <param name="Location">Location extracted from the block, if any.</param>
/// <param name="Start">Start date text, if any.</param>
/// <param name="End">End date text, if any.</param>
/// <param name="Bullets">Bullet lines extracted from the block, if any.</param>
public sealed record CvBlockClassificationResult(
|
||||
string? Section,
|
||||
double? Confidence,
|
||||
string? Reason,
|
||||
string? Title,
|
||||
string? Company,
|
||||
string? Location,
|
||||
string? Start,
|
||||
string? End,
|
||||
List<string>? Bullets);
|
||||
|
||||
/// <summary>
/// Classifies one block of CV text into a section with optional extracted fields.
/// </summary>
public interface ICvAiClassifier
|
||||
{
|
||||
/// <summary>Classifies a single block of CV text.</summary>
/// <param name="block">The text block to classify.</param>
/// <param name="cancellationToken">Token for cancelling the operation.</param>
/// <returns>The classification, or <c>null</c> when none is available.</returns>
Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
/// <see cref="ICvAiClassifier"/> backed by the "ai-service" named HTTP client.
/// Classification is best-effort: any transport, HTTP-status or JSON failure
/// yields <c>null</c> instead of an exception.
/// </summary>
public sealed class CvAiClassifier : ICvAiClassifier
{
    // Shared instance: JsonSerializerOptions caches serialization metadata per
    // instance, so allocating one per call throws that cache away every time.
    // The Web defaults already make property-name matching case-insensitive.
    private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web);

    private readonly IHttpClientFactory _httpClientFactory;

    /// <summary>Creates the classifier.</summary>
    /// <param name="httpClientFactory">Factory used to obtain the "ai-service" client.</param>
    public CvAiClassifier(IHttpClientFactory httpClientFactory)
    {
        _httpClientFactory = httpClientFactory;
    }

    /// <summary>
    /// Posts <paramref name="block"/> to <c>/cv/classify-block</c> and returns
    /// the deserialized response.
    /// </summary>
    /// <param name="block">Text block to classify; blank input is not sent.</param>
    /// <param name="cancellationToken">Token for cancelling the request.</param>
    /// <returns>
    /// The parsed classification, or <c>null</c> for blank input, a
    /// non-success status code, or any failure while calling the service.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="cancellationToken"/> itself was cancelled.
    /// </exception>
    public async Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrWhiteSpace(block)) return null;

        try
        {
            var client = _httpClientFactory.CreateClient("ai-service");
            var payload = JsonSerializer.Serialize(new { block });
            using var content = new StringContent(payload, Encoding.UTF8, "application/json");
            using var response = await client.PostAsync("/cv/classify-block", content, cancellationToken);
            if (!response.IsSuccessStatusCode) return null;

            await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
            return await JsonSerializer.DeserializeAsync<CvBlockClassificationResult>(stream, SerializerOptions, cancellationToken);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // The caller asked to stop. Swallowing this would make a loop over
            // many blocks keep issuing requests after the operation was aborted.
            // HttpClient timeouts are still treated as "no result" below,
            // because they do not cancel the caller's token.
            throw;
        }
        catch (Exception)
        {
            // Deliberate best-effort: classification is optional enrichment.
            return null;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Classifier that never classifies anything: every call completes
/// immediately with a <c>null</c> result. Exposed as a singleton.
/// </summary>
public sealed class NoOpCvAiClassifier : ICvAiClassifier
{
    private NoOpCvAiClassifier()
    {
    }

    /// <summary>The single shared instance.</summary>
    public static NoOpCvAiClassifier Instance { get; } = new NoOpCvAiClassifier();

    /// <summary>Returns a completed task whose result is <c>null</c>; both arguments are ignored.</summary>
    public Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default)
    {
        CvBlockClassificationResult? noResult = null;
        return Task.FromResult(noResult);
    }
}
|
||||
@@ -21,6 +21,10 @@ namespace JobTrackerApi.Services
|
||||
string? GpuName,
|
||||
bool? OcrAvailable,
|
||||
string? OcrLanguages,
|
||||
bool? OllamaConfigured,
|
||||
bool? OllamaReachable,
|
||||
string? OllamaModel,
|
||||
bool? OllamaModelAvailable,
|
||||
double? HealthLatencyMs,
|
||||
double? ProbeLatencyMs,
|
||||
DateTimeOffset? LastProbeAt,
|
||||
@@ -310,6 +314,10 @@ namespace JobTrackerApi.Services
|
||||
string? gpuName = null;
|
||||
bool? ocrAvailable = null;
|
||||
string? ocrLanguages = null;
|
||||
bool? ollamaConfigured = null;
|
||||
bool? ollamaReachable = null;
|
||||
string? ollamaModel = null;
|
||||
bool? ollamaModelAvailable = null;
|
||||
double? healthLatencyMs = null;
|
||||
var healthy = false;
|
||||
string? healthError = null;
|
||||
@@ -332,6 +340,10 @@ namespace JobTrackerApi.Services
|
||||
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString();
|
||||
if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean();
|
||||
if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString();
|
||||
if (doc.RootElement.TryGetProperty("ollama_configured", out var ollamaConfiguredEl) && ollamaConfiguredEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaConfigured = ollamaConfiguredEl.GetBoolean();
|
||||
if (doc.RootElement.TryGetProperty("ollama_reachable", out var ollamaReachableEl) && ollamaReachableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaReachable = ollamaReachableEl.GetBoolean();
|
||||
if (doc.RootElement.TryGetProperty("ollama_model", out var ollamaModelEl)) ollamaModel = ollamaModelEl.GetString();
|
||||
if (doc.RootElement.TryGetProperty("ollama_model_available", out var ollamaModelAvailableEl) && ollamaModelAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaModelAvailable = ollamaModelAvailableEl.GetBoolean();
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -390,6 +402,10 @@ namespace JobTrackerApi.Services
|
||||
GpuName: gpuName,
|
||||
OcrAvailable: ocrAvailable,
|
||||
OcrLanguages: ocrLanguages,
|
||||
OllamaConfigured: ollamaConfigured,
|
||||
OllamaReachable: ollamaReachable,
|
||||
OllamaModel: ollamaModel,
|
||||
OllamaModelAvailable: ollamaModelAvailable,
|
||||
HealthLatencyMs: healthLatencyMs,
|
||||
ProbeLatencyMs: probeLatencyMs,
|
||||
LastProbeAt: lastProbeAt,
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
using System.Globalization;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace JobTrackerApi.Models;
|
||||
|
||||
/// <summary>
/// Recognises human (spoken) language names and proficiency levels in free
/// text. The name table is built once from the runtime's culture list plus a
/// few hand-written aliases.
/// </summary>
public static class HumanLanguageCatalog
{
    // Key: alias normalised by NormalizeKey. Value: display name of the language.
    private static readonly Dictionary<string, string> LanguageLookup = BuildLanguageLookup();

    private static readonly Regex WordRegex = new(@"\p{L}+", RegexOptions.Compiled);

    // The CEFR alternative carries its optional "/X9" range suffix itself.
    // .NET tries alternatives left to right, so listing single codes ("a2")
    // ahead of separate range alternatives ("a2/b1") makes the ranges
    // unreachable: "A2/B1" would match as just "A2".
    private static readonly Regex LevelRegex = new(
        @"\b(native(?:\s+speaker)?|fluent|advanced|intermediate|beginner|basic|conversational|elementary|professional\s+working\s+proficiency|working\s+proficiency|limited\s+working\s+proficiency|full\s+professional\s+proficiency|[abc][12](?:\s*/\s*[abc][12])?)\b",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Resolves text that names exactly one language to its display name.
    /// </summary>
    /// <param name="raw">Free text such as "Native Norwegian speaker".</param>
    /// <returns>The display name, or <c>null</c> when zero or several languages are found.</returns>
    public static string? NormalizeLanguageName(string? raw)
    {
        var matches = ExtractLanguageNames(raw);
        return matches.Count == 1 ? matches[0] : null;
    }

    /// <summary>
    /// Finds every known language mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <param name="raw">Free text; null or blank yields an empty list.</param>
    /// <returns>Distinct display names in order of first appearance.</returns>
    public static IReadOnlyList<string> ExtractLanguageNames(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return Array.Empty<string>();

        var words = WordRegex.Matches(raw)
            .Select(match => match.Value)
            .Where(value => !string.IsNullOrWhiteSpace(value))
            .ToList();

        if (words.Count == 0) return Array.Empty<string>();

        // Try the longest phrases first (up to four words) so a multi-word
        // name wins over a shorter name contained in it; overlapping shorter
        // matches are then skipped.
        var matches = new List<(int Start, int Size, string Canonical)>();
        for (var size = Math.Min(4, words.Count); size >= 1; size--)
        {
            for (var start = 0; start <= words.Count - size; start++)
            {
                var phrase = string.Join(" ", words.Skip(start).Take(size));
                if (!LanguageLookup.TryGetValue(NormalizeKey(phrase), out var canonical)) continue;
                if (matches.Any(existing => RangesOverlap(existing.Start, existing.Size, start, size))) continue;
                matches.Add((start, size, canonical));
            }
        }

        return matches
            .OrderBy(match => match.Start)
            .Select(match => match.Canonical)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>Returns true when <see cref="ExtractLevel"/> finds a level in <paramref name="raw"/>.</summary>
    public static bool HasRecognizedLevel(string? raw)
    {
        return ExtractLevel(raw) is not null;
    }

    /// <summary>
    /// Extracts the first proficiency level mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <param name="raw">Free text such as "Norwegian level A2/B1".</param>
    /// <returns>
    /// A canonical label ("Native", "Fluent", ...), an upper-case CEFR code or
    /// range without spaces ("B2", "A2/B1"), or <c>null</c> when no level is present.
    /// </returns>
    public static string? ExtractLevel(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return null;

        var match = LevelRegex.Match(raw);
        if (!match.Success) return null;

        var value = match.Groups[1].Value.Trim();
        var compact = Regex.Replace(value, @"\s+", " ");
        return compact.ToLowerInvariant() switch
        {
            "native speaker" => "Native",
            "native" => "Native",
            "fluent" => "Fluent",
            "advanced" => "Advanced",
            "intermediate" => "Intermediate",
            "beginner" => "Beginner",
            "basic" => "Basic",
            "conversational" => "Conversational",
            "elementary" => "Elementary",
            "professional working proficiency" => "Professional working proficiency",
            "working proficiency" => "Working proficiency",
            "limited working proficiency" => "Limited working proficiency",
            "full professional proficiency" => "Full professional proficiency",
            _ when Regex.IsMatch(compact, @"^[ABC][12](?:\s*/\s*[ABC][12])?$", RegexOptions.IgnoreCase) => compact.ToUpperInvariant().Replace(" ", string.Empty),
            _ => compact,
        };
    }

    // True when the half-open word ranges [startA, startA+sizeA) and [startB, startB+sizeB) intersect.
    private static bool RangesOverlap(int startA, int sizeA, int startB, int sizeB)
    {
        var endA = startA + sizeA;
        var endB = startB + sizeB;
        return startA < endB && startB < endA;
    }

    // Builds the alias table from every neutral/specific culture's English and
    // native names, then adds manual aliases. TryAdd keeps the first mapping
    // seen for an alias, so the manual entries never override culture data.
    private static Dictionary<string, string> BuildLanguageLookup()
    {
        var map = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        void Add(string? alias, string? canonical)
        {
            var normalizedAlias = NormalizeKey(alias);
            var normalizedCanonical = NormalizeDisplayName(canonical);
            if (string.IsNullOrWhiteSpace(normalizedAlias) || string.IsNullOrWhiteSpace(normalizedCanonical)) return;
            map.TryAdd(normalizedAlias, normalizedCanonical);
        }

        foreach (var culture in CultureInfo.GetCultures(CultureTypes.NeutralCultures | CultureTypes.SpecificCultures))
        {
            var english = CleanCultureLanguageName(culture.EnglishName);
            var native = CleanCultureLanguageName(culture.NativeName);
            Add(english, english);
            Add(native, english);
        }

        Add("norsk", "Norwegian");
        Add("bokmal", "Norwegian");
        Add("bokmål", "Norwegian");
        Add("nynorsk", "Norwegian");
        Add("mandarin", "Chinese");
        Add("cantonese", "Chinese");
        Add("farsi", "Persian");
        Add("persian", "Persian");

        return map;
    }

    // Strips the region/script qualifier from a culture name by cutting at the
    // first '(' or ',' and returning what precedes it.
    private static string? CleanCultureLanguageName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;

        var cleaned = value.Trim();
        var parenIndex = cleaned.IndexOf('(');
        if (parenIndex > 0) cleaned = cleaned[..parenIndex].Trim();
        var commaIndex = cleaned.IndexOf(',');
        if (commaIndex > 0) cleaned = cleaned[..commaIndex].Trim();
        return NormalizeDisplayName(cleaned);
    }

    // Collapses whitespace and title-cases each word; words of up to three
    // characters that are entirely upper-case are left as they are.
    private static string? NormalizeDisplayName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;
        var cleaned = Regex.Replace(value.Trim(), @"\s+", " ");
        return string.Join(" ", cleaned.Split(' ', StringSplitOptions.RemoveEmptyEntries)
            .Select(word => word.Length <= 3 && word.All(char.IsUpper)
                ? word
                : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant()));
    }

    // Lookup key: lower-cased, combining marks removed (so "bokmål" and
    // "bokmal" share a key), and every run of non-letters turned into one space.
    private static string NormalizeKey(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return string.Empty;

        var decomposed = value.Trim().Normalize(NormalizationForm.FormD);
        var builder = new StringBuilder(decomposed.Length);
        foreach (var ch in decomposed)
        {
            if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark) continue;
            builder.Append(char.ToLowerInvariant(ch));
        }

        return Regex.Replace(builder.ToString().Normalize(NormalizationForm.FormC), @"[^\p{L}]+", " ").Trim();
    }
}
|
||||
@@ -144,7 +144,7 @@ public static class StructuredCvProfileJson
|
||||
profile.Version = string.IsNullOrWhiteSpace(profile.Version) ? "1" : profile.Version.Trim();
|
||||
profile.Metadata ??= new StructuredCvMetadata();
|
||||
profile.Metadata.Fields ??= new Dictionary<string, StructuredCvFieldMetadata>();
|
||||
profile.Contact ??= new StructuredCvContact();
|
||||
profile.Contact = NormalizeContact(profile.Contact);
|
||||
profile.Summary = CleanList(profile.Summary);
|
||||
profile.Jobs = (profile.Jobs ?? new List<StructuredCvJob>())
|
||||
.Select(NormalizeJob)
|
||||
@@ -178,20 +178,206 @@ public static class StructuredCvProfileJson
|
||||
return profile;
|
||||
}
|
||||
|
||||
/// <summary>
/// Cleans every contact field in place: plain text fields are trimmed to
/// null-or-value, while location, website and LinkedIn go through their
/// dedicated normalisers.
/// </summary>
/// <param name="contact">Contact to clean; a new empty contact is used when null.</param>
/// <returns>The same (or newly created) contact instance after cleaning.</returns>
private static StructuredCvContact NormalizeContact(StructuredCvContact? contact)
{
    var cleaned = contact ?? new StructuredCvContact();

    // Free-text fields: whitespace trimming only.
    cleaned.FullName = TrimOrNull(cleaned.FullName);
    cleaned.Headline = TrimOrNull(cleaned.Headline);
    cleaned.Email = TrimOrNull(cleaned.Email);
    cleaned.Phone = TrimOrNull(cleaned.Phone);

    // Structured fields: validated and normalised by their own helpers.
    cleaned.Location = NormalizeLocationValue(cleaned.Location);
    cleaned.Website = NormalizeWebsite(cleaned.Website);
    cleaned.LinkedIn = NormalizeLinkedIn(cleaned.LinkedIn);

    return cleaned;
}
|
||||
|
||||
/// <summary>
/// Cleans a job entry in place. Title and company are untangled (an
/// "X at Y" title is split, swapped values are swapped back, a value in the
/// wrong slot is moved), location and dates are normalised, and bullets that
/// only repeat header data are dropped.
/// </summary>
/// <param name="job">Job to clean; a new empty job is used when null.</param>
/// <returns>The same (or newly created) job instance after cleaning.</returns>
private static StructuredCvJob NormalizeJob(StructuredCvJob? job)
{
    job ??= new StructuredCvJob();

    // Pass 1: plain trimming of the raw values.
    job.Title = TrimOrNull(job.Title);
    job.Company = TrimOrNull(job.Company);
    job.Location = TrimOrNull(job.Location);
    job.Start = TrimOrNull(job.Start);
    job.End = TrimOrNull(job.End);
    job.Bullets = CleanList(job.Bullets);

    var role = NormalizeJobTitle(job.Title);
    var employer = NormalizeCompanyName(job.Company);
    var place = NormalizeLocationValue(job.Location);

    // "Lead Engineer at Northwind Council" with no company: split on " at ".
    if (!string.IsNullOrWhiteSpace(role) && employer is null)
    {
        var atSplit = Regex.Match(role, @"^(?<title>.+?)\s+at\s+(?<company>.+)$", RegexOptions.IgnoreCase);
        if (atSplit.Success)
        {
            role = NormalizeJobTitle(atSplit.Groups["title"].Value);
            employer = NormalizeCompanyName(atSplit.Groups["company"].Value);
        }
    }

    // Both present but each unambiguously looks like the other kind: swap them.
    if (!string.IsNullOrWhiteSpace(role) && !string.IsNullOrWhiteSpace(employer))
    {
        var titleLooksLikeCompany = LooksLikeCompanyName(role) && !LooksLikeJobTitle(role);
        var companyLooksLikeTitle = LooksLikeJobTitle(employer) && !LooksLikeCompanyName(employer);
        if (titleLooksLikeCompany && companyLooksLikeTitle)
        {
            var previousRole = role;
            role = employer;
            employer = previousRole;
        }
    }

    // Title that is really a company name: move it over (unless a company is already set).
    if (!string.IsNullOrWhiteSpace(role) && !LooksLikeJobTitle(role) && LooksLikeCompanyName(role))
    {
        employer ??= role;
        role = null;
    }

    // Company that is really a job title, with the title slot free: move it over.
    if (!string.IsNullOrWhiteSpace(employer) && !LooksLikeCompanyName(employer) && LooksLikeJobTitle(employer) && role is null)
    {
        role = employer;
        employer = null;
    }

    job.Title = role;
    job.Company = employer;
    job.Location = place;
    job.Start = NormalizeDateValue(job.Start);
    job.End = NormalizeDateValue(job.End);

    // Pass 2 over the bullets: strip list markers, then drop header echoes and noise.
    var usefulBullets = new List<string>();
    foreach (var rawBullet in CleanList(job.Bullets))
    {
        var bullet = NormalizeBullet(rawBullet);
        if (bullet is null) continue;
        if (IsUsefulJobBullet(bullet, job.Title, job.Company)) usefulBullets.Add(bullet);
    }

    job.Bullets = usefulBullets;
    job.Skills = CleanList(job.Skills);

    // An end value of "present"/"current" also marks the job as current.
    var endMarksCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase)
        || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
    job.IsCurrent = job.IsCurrent || endMarksCurrent;

    return job;
}
|
||||
|
||||
/// <summary>
/// Strips surrounding whitespace and leading list markers ('-', '•', '*')
/// from a bullet line.
/// </summary>
/// <param name="value">Raw bullet text; may be null or blank.</param>
/// <returns>
/// The bullet text without its marker, or <c>null</c> when nothing is left
/// (blank input, or input consisting only of markers such as "-" or "• ").
/// </returns>
private static string? NormalizeBullet(string? value)
{
    if (string.IsNullOrWhiteSpace(value)) return null;

    var stripped = value.Trim().TrimStart('-', '•', '*', ' ');

    // A marker-only line would otherwise come back as "" and slip past
    // callers that filter on null.
    return stripped.Length == 0 ? null : stripped;
}
|
||||
|
||||
/// <summary>
/// Decides whether a bullet carries real content for a job entry.
/// </summary>
/// <param name="value">Bullet text.</param>
/// <param name="title">The job's title, used to detect a bullet that merely repeats it.</param>
/// <param name="company">The job's company, used the same way.</param>
/// <returns>
/// False for blank text, date ranges, section headings, "Skills:" lines,
/// echoes of the title or company, and single tokens shorter than 12
/// characters; true otherwise.
/// </returns>
private static bool IsUsefulJobBullet(string? value, string? title, string? company)
{
    if (TrimOrNull(value) is not { } text) return false;

    var isStructuralNoise = LooksLikeDateRange(text)
        || LooksLikeSectionHeading(text)
        || text.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase);
    if (isStructuralNoise) return false;

    var repeatsTitle = title is not null && text.Equals(title, StringComparison.OrdinalIgnoreCase);
    var repeatsCompany = company is not null && text.Equals(company, StringComparison.OrdinalIgnoreCase);
    if (repeatsTitle || repeatsCompany) return false;

    // A lone short word is not a task description.
    return text.Length >= 12 || text.Contains(' ');
}
|
||||
|
||||
/// <summary>
/// Cleans a job-title candidate: rejects date ranges, section headings and URL/e-mail text,
/// collapses whitespace runs and strips surrounding separator characters.
/// </summary>
/// <param name="value">Raw title text; may be null.</param>
/// <returns>The cleaned title, or null when the value is unusable.</returns>
private static string? NormalizeJobTitle(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    var isNotATitle = LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate);
    if (isNotATitle)
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ',', '-', ':');
    return string.IsNullOrWhiteSpace(cleaned) ? null : cleaned;
}
|
||||
|
||||
/// <summary>
/// Cleans a company-name candidate: rejects date ranges, section headings, URL/e-mail text,
/// "Skills:" lines and text containing both a period and a space, then collapses whitespace
/// and strips surrounding separator characters.
/// </summary>
/// <param name="value">Raw company text; may be null.</param>
/// <returns>The cleaned company name, or null when the value is unusable.</returns>
private static string? NormalizeCompanyName(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    var isNotACompany = LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate);
    if (isNotACompany)
    {
        return null;
    }

    if (candidate.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    // A period together with a space is treated as sentence-like text, not a name.
    var hasPeriodAndSpace = candidate.Contains('.') && candidate.Contains(' ');
    if (hasPeriodAndSpace)
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ',', '-', ':');
    return string.IsNullOrWhiteSpace(cleaned) ? null : cleaned;
}
|
||||
|
||||
/// <summary>
/// Cleans a location candidate and re-joins its comma-separated parts with ", ".
/// </summary>
/// <param name="value">Raw location text; may be null.</param>
/// <returns>
/// The normalized location, or null when the value looks like a date range, heading, URL or
/// e-mail, contains a digit, exceeds 80 characters, has no parts or more than four, or has a
/// part that is not made of letters, apostrophes, hyphens, periods and spaces.
/// </returns>
private static string? NormalizeLocationValue(string? value)
{
    var text = TrimOrNull(value);
    if (text is null)
    {
        return null;
    }

    if (LooksLikeDateRange(text) || LooksLikeSectionHeading(text) || LooksLikeUrlOrEmail(text))
    {
        return null;
    }

    if (text.Length > 80 || text.Any(char.IsDigit))
    {
        return null;
    }

    var collapsed = Regex.Replace(text, @"\s+", " ").Trim(' ', '|', ';', ':');
    var segments = collapsed.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    if (segments.Length is 0 or > 4)
    {
        return null;
    }

    foreach (var segment in segments)
    {
        // Each part must start with a letter and continue with name-like characters.
        if (!Regex.IsMatch(segment, @"^[\p{L}][\p{L}'’\-. ]+$"))
        {
            return null;
        }
    }

    return string.Join(", ", segments);
}
|
||||
|
||||
/// <summary>
/// Reduces a website value to its lower-cased host name.
/// </summary>
/// <param name="value">Raw website text, with or without a scheme; may be null.</param>
/// <returns>
/// The host name, or null when the value is blank, mentions linkedin.com, cannot be parsed as
/// an absolute URI, or its host is not a dotted domain ending in a label of two or more letters.
/// </returns>
private static string? NormalizeWebsite(string? value)
{
    var text = TrimOrNull(value);
    if (text is null || text.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    // Bare domains are parsed as if they were https URLs.
    var absolute = text.Contains("://", StringComparison.Ordinal) ? text : $"https://{text}";
    if (!Uri.TryCreate(absolute, UriKind.Absolute, out var parsed))
    {
        return null;
    }

    var host = parsed.Host.Trim().Trim('.').ToLowerInvariant();
    var looksLikeDomain = !string.IsNullOrWhiteSpace(host)
        && Regex.IsMatch(host, @"^(?:[a-z0-9-]+\.)+[a-z]{2,}$", RegexOptions.IgnoreCase);
    return looksLikeDomain ? host : null;
}
|
||||
|
||||
/// <summary>
/// Normalizes a LinkedIn profile link to the form <c>https://www.linkedin.com/in/...</c>
/// (or <c>/pub/...</c>).
/// </summary>
/// <param name="value">Raw link text, with or without a scheme; may be null.</param>
/// <returns>
/// The canonical profile URL, or null when the value is blank, is not an absolute URI, is not
/// hosted on linkedin.com or one of its subdomains, or its path is not an /in/ or /pub/ profile
/// path with at most two extra segments.
/// </returns>
private static string? NormalizeLinkedIn(string? value)
{
    var trimmed = TrimOrNull(value);
    if (trimmed is null) return null;

    var candidate = trimmed;
    if (!candidate.Contains("://", StringComparison.Ordinal)) candidate = $"https://{candidate}";
    if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri)) return null;

    // Compare the host exactly. A substring test would also accept look-alike hosts such as
    // "notlinkedin.com" or "linkedin.com.example.org" and rewrite them into a trusted URL.
    var host = uri.Host.TrimEnd('.');
    var isLinkedInHost = host.Equals("linkedin.com", StringComparison.OrdinalIgnoreCase)
        || host.EndsWith(".linkedin.com", StringComparison.OrdinalIgnoreCase);
    if (!isLinkedInHost) return null;

    var path = uri.AbsolutePath.TrimEnd('/');
    if (!Regex.IsMatch(path, @"^/(in|pub)/[^/]+(?:/[^/]+){0,2}$", RegexOptions.IgnoreCase)) return null;

    return $"https://www.linkedin.com{path}";
}
|
||||
|
||||
/// <summary>
/// Keeps a date value only when it matches the accepted date / date-range formats.
/// </summary>
/// <param name="value">Raw date text; may be null.</param>
/// <returns>The trimmed value when <c>LooksLikeDateRange</c> accepts it, otherwise null.</returns>
private static string? NormalizeDateValue(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    return LooksLikeDateRange(candidate) ? candidate : null;
}
|
||||
|
||||
/// <summary>
/// Tests whether the whole value is a single date or a "start - end" range.
/// </summary>
/// <remarks>
/// A date is dd/mm/yyyy-style digits, a month name or abbreviation followed by a year, a bare
/// four-digit year, or the words Present / Current. A range joins two dates with '-' or '–'.
/// Matching is case-insensitive.
/// </remarks>
private static bool LooksLikeDateRange(string value)
{
    const string monthName = "(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)";
    const string singleDate = @"\d{1,2}/\d{1,2}/\d{4}|" + monthName + @"\s+\d{4}|\d{4}|Present|Current";

    // Same expression as writing the date alternatives out twice: one date, optionally
    // followed by a dash and a second date.
    const string pattern = "^(?:" + singleDate + @")(?:\s*[-–]\s*(?:" + singleDate + "))?$";

    return Regex.IsMatch(value, pattern, RegexOptions.IgnoreCase);
}
|
||||
|
||||
/// <summary>
/// Tests whether the value contains an '@' sign or one of the markers "www.", "http://" or
/// "https://" (markers are matched case-insensitively).
/// </summary>
private static bool LooksLikeUrlOrEmail(string value)
{
    if (value.Contains('@'))
    {
        return true;
    }

    foreach (var marker in new[] { "www.", "http://", "https://" })
    {
        if (value.Contains(marker, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Tests whether the value is exactly one of the known CV section headings, ignoring case.
/// </summary>
private static bool LooksLikeSectionHeading(string value)
{
    var headings = new[]
    {
        "Work Experience",
        "Experience",
        "Employment History",
        "Education",
        "Skills",
        "Languages",
        "Interests",
        "Contact",
        "Professional Summary",
        "Summary",
    };

    return Array.Exists(headings, heading => value.Equals(heading, StringComparison.OrdinalIgnoreCase));
}
|
||||
|
||||
/// <summary>
/// Heuristic test for job-title text.
/// </summary>
/// <returns>
/// False for blank values, date ranges and URL/e-mail text. Otherwise true when the value
/// contains a known role word, or when it has at most six space-separated words and does not
/// look like a company name.
/// </returns>
private static bool LooksLikeJobTitle(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
    {
        return false;
    }

    const string roleWords = @"\b(developer|engineer|manager|lead|architect|consultant|specialist|analyst|administrator|coordinator|director|designer|intern|officer|owner|founder|teacher|researcher|writer|editor|producer|assistant|technician|supervisor|head)\b";
    if (Regex.IsMatch(value, roleWords, RegexOptions.IgnoreCase))
    {
        return true;
    }

    // Fallback: short phrases are assumed to be titles unless they read like a company.
    var wordCount = value.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length;
    return wordCount <= 6 && !LooksLikeCompanyName(value);
}
|
||||
|
||||
/// <summary>
/// Heuristic test for company / organisation names.
/// </summary>
/// <returns>
/// False for blank values, date ranges and URL/e-mail text. Otherwise true when the value
/// contains a known organisation word (case-insensitive), an ampersand, or a word made of two
/// or more upper-case ASCII letters.
/// </returns>
private static bool LooksLikeCompanyName(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
    {
        return false;
    }

    const string organisationWords = @"\b(inc|llc|ltd|limited|plc|corp|corporation|company|group|university|college|council|municipality|kommune|bank|studio|agency|institute|hospital|school|technologies|technology|systems|solutions|consulting|consultants|partners|foundation|ministry|government)\b";
    if (Regex.IsMatch(value, organisationWords, RegexOptions.IgnoreCase))
    {
        return true;
    }

    if (value.Contains('&'))
    {
        return true;
    }

    // Case-sensitive on purpose: only an all-caps word (e.g. an acronym) counts here.
    return Regex.IsMatch(value, @"\b[A-Z]{2,}\b");
}
|
||||
|
||||
private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
|
||||
{
|
||||
education ??= new StructuredCvEducation();
|
||||
@@ -207,8 +393,13 @@ public static class StructuredCvProfileJson
|
||||
private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language)
|
||||
{
|
||||
language ??= new StructuredCvLanguage();
|
||||
language.Name = TrimOrNull(language.Name);
|
||||
language.Level = TrimOrNull(language.Level);
|
||||
|
||||
var originalName = TrimOrNull(language.Name);
|
||||
var normalizedName = HumanLanguageCatalog.NormalizeLanguageName(originalName);
|
||||
var normalizedLevel = HumanLanguageCatalog.ExtractLevel(language.Level) ?? HumanLanguageCatalog.ExtractLevel(originalName);
|
||||
|
||||
language.Name = normalizedName is not null && normalizedLevel is not null ? normalizedName : null;
|
||||
language.Level = normalizedLevel;
|
||||
language.Notes = TrimOrNull(language.Notes);
|
||||
return language;
|
||||
}
|
||||
@@ -360,7 +551,13 @@ public static class StructuredCvProfileJson
|
||||
}
|
||||
}
|
||||
|
||||
return new StructuredCvLanguage { Name = name.NullIfWhitespace(), Level = level, Notes = notes };
|
||||
var normalizedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(item);
|
||||
return new StructuredCvLanguage
|
||||
{
|
||||
Name = normalizedLevel is not null ? HumanLanguageCatalog.NormalizeLanguageName(name) : null,
|
||||
Level = normalizedLevel,
|
||||
Notes = notes,
|
||||
};
|
||||
})
|
||||
.Where(language => !string.IsNullOrWhiteSpace(language.Name))
|
||||
.ToList();
|
||||
|
||||
@@ -25,7 +25,7 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re
|
||||
|
||||
## Quickstart (Docker)
|
||||
|
||||
This runs: frontend (nginx), backend API, and the AI service.
|
||||
This runs: frontend (nginx), backend API, the local AI service, and an Ollama container for hybrid CV block classification.
|
||||
|
||||
1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`).
|
||||
|
||||
@@ -108,9 +108,15 @@ The API calls a local FastAPI service to generate summaries. If it’s not runni
|
||||
With Docker (recommended):
|
||||
|
||||
```bash
|
||||
docker compose up --build ai-service
|
||||
# One command for local Ollama startup + pull + AI-service restart
|
||||
OLLAMA_MODEL=qwen2.5:7b ./scripts/start-ollama-cv.sh
|
||||
|
||||
# Then start the rest of the app if needed
|
||||
docker compose up --build -d backend frontend
|
||||
```
|
||||
|
||||
The first Ollama startup is usually quick, but the first model pull and first generation can take a while. After the model is cached in the `ollama_data` volume, later restarts are much faster.
|
||||
|
||||
Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`).
|
||||
|
||||
## Configuration
|
||||
|
||||
+5
-1
@@ -52,6 +52,8 @@ AUTH_ADMIN_EMAIL=you@example.com
|
||||
AUTH_ADMIN_PASSWORD=replace_with_strong_password
|
||||
APP_PUBLIC_BASE_URL=https://your-domain.example
|
||||
AI_SERVICE_BASE_URL=http://ai-service:8001
|
||||
OLLAMA_BASE_URL=http://ollama:11434
|
||||
OLLAMA_MODEL=qwen2.5:7b
|
||||
EMAIL_FOLLOWUPREMINDERS_ENABLED=true
|
||||
EMAIL_FOLLOWUPREMINDERS_UPCOMINGDAYS=2
|
||||
# Optional backward-compatible alias if older config still references the previous name:
|
||||
@@ -87,7 +89,8 @@ If this app is going to be a real production service on Ubuntu:
|
||||
2. Gitea Actions runs tests
|
||||
3. if green, workflow uploads repo to server
|
||||
4. `deploy/deploy.sh` links `/opt/job-tracker/shared/.env` into the repo checkout, then runs `docker compose build && docker compose up -d`
|
||||
5. workflow checks service status after deployment
|
||||
5. if `OLLAMA_MODEL` is set, the deploy script waits for Ollama, pulls the configured model if missing, then restarts `ai-service` so hybrid CV classification can use it
|
||||
6. workflow checks service status after deployment
|
||||
|
||||
## Post-deploy verification you should also do manually the first time
|
||||
- confirm reverse proxy routes to the frontend correctly
|
||||
@@ -96,3 +99,4 @@ If this app is going to be a real production service on Ubuntu:
|
||||
- confirm AI service container is reachable from backend
|
||||
- confirm reminder and admin/system pages load
|
||||
- verify follow-up reminder emails are enabled only when intended and that links open the correct job/tab
|
||||
hat links open the correct job/tab
|
||||
|
||||
@@ -45,6 +45,11 @@ build_with_recovery
|
||||
# Force recreation so updated port mappings, env vars, and container config always apply on deploy.
|
||||
compose up -d --force-recreate --remove-orphans
|
||||
|
||||
if [ -n "${OLLAMA_MODEL:-}" ]; then
|
||||
echo "Post-deploy Ollama warmup enabled for model: ${OLLAMA_MODEL}"
|
||||
./scripts/start-ollama-cv.sh
|
||||
fi
|
||||
|
||||
sleep 5
|
||||
compose ps
|
||||
|
||||
|
||||
@@ -71,8 +71,13 @@ services:
|
||||
build:
|
||||
context: ./tools/summarizer
|
||||
dockerfile: Dockerfile
|
||||
environment:
|
||||
- OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-http://ollama:11434}
|
||||
- OLLAMA_MODEL=${OLLAMA_MODEL:-qwen2.5:7b}
|
||||
ports:
|
||||
- "8001:8001"
|
||||
depends_on:
|
||||
- ollama
|
||||
networks:
|
||||
- default
|
||||
- shared_services
|
||||
@@ -83,8 +88,29 @@ services:
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
ollama:
|
||||
image: ollama/ollama:latest
|
||||
ports:
|
||||
- "11434:11434"
|
||||
environment:
|
||||
- OLLAMA_HOST=0.0.0.0:11434
|
||||
volumes:
|
||||
- ollama_data:/root/.ollama
|
||||
networks:
|
||||
- default
|
||||
- shared_services
|
||||
restart: unless-stopped
|
||||
gpus: all
|
||||
healthcheck:
|
||||
test: ["CMD", "ollama", "list"]
|
||||
interval: 20s
|
||||
timeout: 15s
|
||||
retries: 10
|
||||
start_period: 20s
|
||||
|
||||
volumes:
|
||||
jobtracker_data:
|
||||
ollama_data:
|
||||
|
||||
networks:
|
||||
shared_services:
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import React, { useCallback, useEffect, useMemo, useRef, useState } from "react";
|
||||
|
||||
import { Alert, Avatar, Box, Button, Chip, Divider, FormControl, InputLabel, LinearProgress, MenuItem, Paper, Select, TextField, Typography } from "@mui/material";
|
||||
import { Accordion, AccordionDetails, AccordionSummary, Alert, Avatar, Box, Button, Chip, Divider, FormControl, InputLabel, LinearProgress, MenuItem, Paper, Select, TextField, Typography } from "@mui/material";
|
||||
|
||||
import DeleteOutlineIcon from "@mui/icons-material/DeleteOutline";
|
||||
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
|
||||
import PhotoCameraOutlinedIcon from "@mui/icons-material/PhotoCameraOutlined";
|
||||
|
||||
import { api } from "../api";
|
||||
@@ -399,22 +400,40 @@ export default function ProfilePage() {
|
||||
>
|
||||
{reprocessingCv ? t("profileCvReprocessing") : t("profileCvReprocess")}
|
||||
</Button>
|
||||
<Button variant="text" disabled={!profileCvText.trim()} onClick={() => navigator.clipboard.writeText(profileCvText)}>
|
||||
{t("profileCopyCvText")}
|
||||
</Button>
|
||||
</Box>
|
||||
</Box>
|
||||
{uploadingCv ? <LinearProgress sx={{ mb: 1.5 }} /> : null}
|
||||
<TextField
|
||||
label={t("profileCvTextLabel")}
|
||||
value={profileCvText}
|
||||
onChange={(e) => setProfileCvText(e.target.value)}
|
||||
helperText={t("profileCvTextHelp")}
|
||||
multiline
|
||||
minRows={12}
|
||||
disabled={!isLocal}
|
||||
fullWidth
|
||||
/>
|
||||
<Alert severity="info" sx={{ mb: 2, borderRadius: 2.5 }}>
|
||||
{t("profileCvStructuredDefaultHint")}
|
||||
</Alert>
|
||||
<Accordion disableGutters elevation={0} sx={{ mb: 2, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper", "&:before": { display: "none" } }}>
|
||||
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
|
||||
<Box sx={{ display: "flex", justifyContent: "space-between", gap: 1.5, alignItems: "center", width: "100%", pr: 1 }}>
|
||||
<Box>
|
||||
<Typography variant="subtitle1" sx={{ fontWeight: 800 }}>{t("profileCvRawPanelTitle")}</Typography>
|
||||
<Typography variant="body2" sx={{ color: "text.secondary" }}>{t("profileCvRawPanelHelp")}</Typography>
|
||||
</Box>
|
||||
<Chip size="small" label={t("profileCvSectionWordCount", { count: cvWordCount })} />
|
||||
</Box>
|
||||
</AccordionSummary>
|
||||
<AccordionDetails>
|
||||
<TextField
|
||||
label={t("profileCvTextLabel")}
|
||||
value={profileCvText}
|
||||
onChange={(e) => setProfileCvText(e.target.value)}
|
||||
helperText={t("profileCvTextHelp")}
|
||||
multiline
|
||||
minRows={12}
|
||||
disabled={!isLocal}
|
||||
fullWidth
|
||||
/>
|
||||
<Box sx={{ mt: 1.5, display: "flex", justifyContent: "flex-end" }}>
|
||||
<Button variant="text" disabled={!profileCvText.trim()} onClick={() => navigator.clipboard.writeText(profileCvText)}>
|
||||
{t("profileCopyCvText")}
|
||||
</Button>
|
||||
</Box>
|
||||
</AccordionDetails>
|
||||
</Accordion>
|
||||
<Box sx={{ mt: 2, p: 1.5, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper" }}>
|
||||
<Box sx={{ display: "flex", justifyContent: "space-between", gap: 2, flexWrap: "wrap", alignItems: "center", mb: 1.5 }}>
|
||||
<Box>
|
||||
|
||||
@@ -147,10 +147,17 @@ test('profile page loads persisted structured cv and can re-parse it', async ()
|
||||
expect(screen.getByText(/extraction history/i)).toBeInTheDocument();
|
||||
expect(screen.getByText(/resume.pdf/i)).toBeInTheDocument();
|
||||
expect(screen.getByText(/current run/i)).toBeInTheDocument();
|
||||
expect(screen.getAllByText(/original extraction/i).length).toBeGreaterThan(0);
|
||||
const originalExtractionToggle = screen.getByRole('button', { name: /original extraction/i });
|
||||
expect(originalExtractionToggle).toHaveAttribute('aria-expanded', 'false');
|
||||
expect(screen.getAllByText(/professional summary/i).length).toBeGreaterThan(0);
|
||||
expect(screen.getByLabelText(/full name/i)).toHaveValue('Demo User');
|
||||
expect(screen.getByText(/high 92%/i)).toBeInTheDocument();
|
||||
|
||||
fireEvent.click(originalExtractionToggle);
|
||||
expect(originalExtractionToggle).toHaveAttribute('aria-expanded', 'true');
|
||||
expect(await screen.findByLabelText(/profile cv \/ master resume text/i)).toHaveValue('Professional Summary\nBuilt backend systems');
|
||||
|
||||
const analyzeButton = screen.getByRole('button', { name: /analyze sections/i });
|
||||
await waitFor(() => expect(analyzeButton).toBeEnabled());
|
||||
fireEvent.click(analyzeButton);
|
||||
|
||||
Executable
+79
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
MODEL="${OLLAMA_MODEL:-qwen2.5:7b}"
|
||||
OLLAMA_WAIT_SECONDS="${OLLAMA_WAIT_SECONDS:-180}"
|
||||
PULL_WAIT_SECONDS="${OLLAMA_PULL_WAIT_SECONDS:-1800}"
|
||||
|
||||
# Single entry point for every Docker Compose invocation in this script.
compose() { docker compose "$@"; }
|
||||
|
||||
# Poll the ollama container every 3s until `ollama list` succeeds.
# Returns 0 once it answers, 1 after OLLAMA_WAIT_SECONDS without an answer.
wait_for_ollama() {
  local stop_at=$((SECONDS + OLLAMA_WAIT_SECONDS))
  while (( SECONDS < stop_at )); do
    compose exec -T ollama ollama list >/dev/null 2>&1 && return 0
    sleep 3
  done
  return 1
}
|
||||
|
||||
# Succeeds when $MODEL appears in the NAME column of `ollama list`.
model_present() {
  local wanted="$MODEL"

  # `ollama list` always prints a tag ("llama3:latest"), so an untagged MODEL would never
  # match exactly. Only the last path segment is inspected, so a registry port
  # ("host:5000/ns/model") is not mistaken for a tag.
  case "${wanted##*/}" in
    *:*) ;;
    *) wanted="${wanted}:latest" ;;
  esac

  # Output is redirected rather than using `grep -q`: an early grep exit would SIGPIPE awk
  # and, under `pipefail`, turn a successful match into a failure.
  compose exec -T ollama ollama list 2>/dev/null | awk 'NR>1 {print $1}' | grep -Fx -- "$wanted" >/dev/null 2>&1
}
|
||||
|
||||
# Poll every 5s until the configured model shows up in `ollama list`.
# Returns 0 once present, 1 after PULL_WAIT_SECONDS without it appearing.
wait_for_model() {
  local stop_at=$((SECONDS + PULL_WAIT_SECONDS))
  while (( SECONDS < stop_at )); do
    model_present && return 0
    sleep 5
  done
  return 1
}
|
||||
|
||||
# --- Run sequence: start Ollama, make sure the model exists, then bring up ai-service. ---

echo "Starting Ollama service..."
compose up -d ollama

# Give up (with recent container logs) if the daemon never answers.
wait_for_ollama || {
  echo "Ollama did not become ready within ${OLLAMA_WAIT_SECONDS}s."
  compose logs --tail=200 ollama || true
  exit 1
}

echo "Ollama is responding."

if ! model_present; then
  echo "Pulling Ollama model: $MODEL"
  if ! compose exec -T ollama ollama pull "$MODEL"; then
    echo "Model pull command failed."
    compose logs --tail=200 ollama || true
    exit 1
  fi
else
  echo "Model already present: $MODEL"
fi

# Confirm the model is actually listed before anything depends on it.
wait_for_model || {
  echo "Model ${MODEL} did not appear within ${PULL_WAIT_SECONDS}s."
  compose exec -T ollama ollama list || true
  exit 1
}

echo "Ollama model ready: $MODEL"

echo "Restarting AI service so it can use the ready Ollama model."
compose up -d ai-service

# Only the first reported state line is checked, compared case-insensitively.
if ! compose ps ai-service --format '{{.State}}' 2>/dev/null | head -n 1 | tr '[:upper:]' '[:lower:]' | grep -qx 'running'; then
  echo "AI service is not running after Ollama warmup."
  compose logs --tail=200 ai-service || true
  exit 1
fi

echo "Ollama warmup complete."
|
||||
@@ -8,6 +8,7 @@ This service runs a local Hugging Face summarization model and also exposes docu
|
||||
- OCR fallback for scanned PDFs
|
||||
- OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`)
|
||||
- DOCX / TXT / MD extraction
|
||||
- optional Ollama-backed CV block classification for harder sectioning
|
||||
|
||||
## Install
|
||||
|
||||
@@ -36,8 +37,30 @@ The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can b
|
||||
- `GET /health` — health check and runtime capabilities
|
||||
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`
|
||||
- `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata
|
||||
- `POST /cv/classify-block` — JSON body `{ "block": "..." }`; requires Ollama and returns `503` when `OLLAMA_MODEL` is not configured
|
||||
|
||||
## Notes
|
||||
- Model weights are downloaded on first run.
|
||||
## Ollama
|
||||
Set these before starting the service if you want the hybrid CV classifier enabled:
|
||||
|
||||
```bash
|
||||
export OLLAMA_BASE_URL=http://ollama:11434
|
||||
export OLLAMA_MODEL=qwen2.5:7b
|
||||
```
|
||||
|
||||
Choose the model by setting `OLLAMA_MODEL` and then warming it with the helper script:
|
||||
|
||||
```bash
|
||||
OLLAMA_MODEL=qwen2.5:7b ./scripts/start-ollama-cv.sh
|
||||
```
|
||||
|
||||
Equivalent manual flow:
|
||||
|
||||
```bash
|
||||
docker compose up -d ollama
|
||||
docker compose exec ollama ollama pull qwen2.5:7b
|
||||
docker compose up -d ai-service
|
||||
```
|
||||
|
||||
- Summarization model weights are downloaded on first run; Ollama model weights are downloaded on the first `ollama pull`.
|
||||
- OCR quality depends on scan quality and language support.
|
||||
- Default OCR language is English (`eng`).
|
||||
|
||||
@@ -8,9 +8,13 @@ from docx import Document
|
||||
import fitz
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import torch
|
||||
import pytesseract
|
||||
from urllib import request as urllib_request
|
||||
from urllib.error import URLError, HTTPError
|
||||
|
||||
app = FastAPI(title="Local AI Service")
|
||||
|
||||
@@ -20,6 +24,8 @@ MAX_CONTEXT_CHARS = 2200
|
||||
MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
|
||||
OCR_LANGUAGES = "eng"
|
||||
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
|
||||
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/")
|
||||
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "")
|
||||
|
||||
|
||||
def _load_runtime():
|
||||
@@ -44,11 +50,47 @@ class SummarizeRequest(BaseModel):
|
||||
top_skills: int = Field(default=8, ge=3, le=12)
|
||||
|
||||
|
||||
class CvClassifyBlockRequest(BaseModel):
    """Request body for ``POST /cv/classify-block``."""

    # Text of the single CV block to classify; between 1 and 6000 characters.
    block: str = Field(min_length=1, max_length=6000)
|
||||
|
||||
|
||||
def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
|
||||
h = hashlib.sha256(text.encode("utf-8")).hexdigest()
|
||||
return f"{h}:{max_length}:{min_length}:{top_skills}"
|
||||
|
||||
|
||||
def _ollama_status():
|
||||
configured = bool(OLLAMA_MODEL)
|
||||
if not configured:
|
||||
return {
|
||||
"ollama_configured": False,
|
||||
"ollama_reachable": False,
|
||||
"ollama_model": None,
|
||||
"ollama_model_available": False,
|
||||
}
|
||||
|
||||
req = urllib_request.Request(f"{OLLAMA_BASE_URL}/api/tags", method="GET")
|
||||
try:
|
||||
with urllib_request.urlopen(req, timeout=5) as response:
|
||||
body = json.loads(response.read().decode("utf-8"))
|
||||
except Exception:
|
||||
return {
|
||||
"ollama_configured": True,
|
||||
"ollama_reachable": False,
|
||||
"ollama_model": OLLAMA_MODEL,
|
||||
"ollama_model_available": False,
|
||||
}
|
||||
|
||||
models = body.get("models") or []
|
||||
names = {item.get("name") for item in models if isinstance(item, dict)}
|
||||
return {
|
||||
"ollama_configured": True,
|
||||
"ollama_reachable": True,
|
||||
"ollama_model": OLLAMA_MODEL,
|
||||
"ollama_model_available": OLLAMA_MODEL in names,
|
||||
}
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {
|
||||
@@ -59,6 +101,7 @@ async def health():
|
||||
"gpu_name": GPU_NAME,
|
||||
"ocr_available": True,
|
||||
"ocr_languages": OCR_LANGUAGES,
|
||||
**_ollama_status(),
|
||||
}
|
||||
|
||||
|
||||
@@ -272,6 +315,93 @@ def _model_summarize(text: str, max_length: int, min_length: int) -> str:
|
||||
return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
|
||||
|
||||
|
||||
def _ollama_generate_json(prompt: str, timeout: float = 30.0):
    """Send *prompt* to the configured Ollama model and return its reply parsed as JSON.

    Args:
        prompt: Full prompt text for ``/api/generate``.
        timeout: Seconds to wait for Ollama before giving up.

    Returns:
        The value decoded from the model's ``response`` text. If the text is not valid
        JSON as a whole, the span from the first ``{`` to the last ``}`` is tried.

    Raises:
        HTTPException: 503 when ``OLLAMA_MODEL`` is unset or Ollama cannot be reached
            (including timeouts); 502 when Ollama answers with an HTTP error, an
            unreadable body, an empty reply, or text that is not JSON.
    """
    if not OLLAMA_MODEL:
        raise HTTPException(status_code=503, detail="OLLAMA_MODEL is not configured.")

    payload = json.dumps({
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False,
        "format": "json",
        "options": {"temperature": 0.1},
    }).encode("utf-8")

    req = urllib_request.Request(
        f"{OLLAMA_BASE_URL}/api/generate",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib_request.urlopen(req, timeout=timeout) as response:
            raw_body = response.read()
    except HTTPError as ex:
        raise HTTPException(status_code=502, detail=f"Ollama request failed with {ex.code}.") from ex
    except URLError as ex:
        raise HTTPException(status_code=503, detail=f"Ollama is unreachable: {ex.reason}.") from ex
    except OSError as ex:
        # Read timeouts and connection resets during read() are not wrapped in URLError.
        raise HTTPException(status_code=503, detail=f"Ollama is unreachable: {ex}.") from ex

    try:
        body = json.loads(raw_body.decode("utf-8"))
    except ValueError as ex:
        # Covers both undecodable bytes and a non-JSON transport body.
        raise HTTPException(status_code=502, detail="Ollama returned an unreadable response.") from ex

    answer = body.get("response") if isinstance(body, dict) else None
    raw = answer.strip() if isinstance(answer, str) else ""
    if not raw:
        raise HTTPException(status_code=502, detail="Ollama returned an empty response.")

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    # Fallback: the model may wrap the object in extra text; try the outermost braces.
    start = raw.find("{")
    end = raw.rfind("}")
    if start >= 0 and end > start:
        try:
            return json.loads(raw[start:end + 1])
        except json.JSONDecodeError:
            pass

    raise HTTPException(status_code=502, detail="Ollama did not return valid JSON.")
|
||||
|
||||
|
||||
@app.post("/cv/classify-block")
async def classify_cv_block(req: CvClassifyBlockRequest):
    """Classify one CV text block with the configured Ollama model.

    Returns a dict with ``section``, ``confidence``, ``reason``, ``title``, ``company``,
    ``location``, ``start``, ``end`` and ``bullets``. ``section`` is always one of the
    values listed in the prompt (unknown values become ``"Other"``) and ``bullets`` is
    always a list of non-empty strings.

    Raises:
        HTTPException: 502 when the model's reply is not a JSON object, plus whatever
            ``_ollama_generate_json`` raises.
    """
    import asyncio  # local import: only needed to off-load the blocking Ollama call

    prompt = f"""
You classify one CV text block into structured JSON.
Return ONLY valid JSON with this exact shape:
{{
"section": "Contact|Professional Summary|Work Experience|Education|Skills|Languages|Interests|Other",
"confidence": 0.0,
"reason": "short reason",
"title": string|null,
"company": string|null,
"location": string|null,
"start": string|null,
"end": string|null,
"bullets": string[]
}}

Rules:
- Preserve facts only.
- section must be one of the listed values.
- Use Work Experience only for job/employment blocks.
- For Contact blocks, keep title/company/start/end null and bullets empty.
- For non-work blocks, title/company/start/end should usually be null.
- location must look like a place, not a sentence.
- dates must be one of: year, month+year, dd/mm/yyyy, Present, Current.
- bullets should only be job tasks/achievements, not titles, companies, dates, or headings.
- If unsure, choose Other and keep fields null/empty.

Block:
{req.block.strip()}
""".strip()

    # _ollama_generate_json blocks on urllib; run it in a worker thread so this coroutine
    # does not stall the event loop (and every other request) while the model generates.
    loop = asyncio.get_running_loop()
    parsed = await loop.run_in_executor(None, _ollama_generate_json, prompt)

    if not isinstance(parsed, dict):
        raise HTTPException(status_code=502, detail="Ollama did not return a JSON object.")

    # Model output is untrusted: map the section onto the allowed labels (case-insensitive)
    # and keep only string bullets.
    allowed_sections = (
        "Contact",
        "Professional Summary",
        "Work Experience",
        "Education",
        "Skills",
        "Languages",
        "Interests",
        "Other",
    )
    canonical = {name.lower(): name for name in allowed_sections}
    section = parsed.get("section")
    section = canonical.get(section.strip().lower(), "Other") if isinstance(section, str) else "Other"

    bullets = parsed.get("bullets")
    if isinstance(bullets, list):
        bullets = [item.strip() for item in bullets if isinstance(item, str) and item.strip()]
    else:
        bullets = []

    return {
        "section": section,
        "confidence": parsed.get("confidence"),
        "reason": parsed.get("reason"),
        "title": parsed.get("title"),
        "company": parsed.get("company"),
        "location": parsed.get("location"),
        "start": parsed.get("start"),
        "end": parsed.get("end"),
        "bullets": bullets,
    }
|
||||
|
||||
|
||||
@app.post("/summarize")
|
||||
async def summarize(req: SummarizeRequest):
|
||||
if req.min_length >= req.max_length:
|
||||
|
||||
Reference in New Issue
Block a user