Improve CV parsing and profile editor flow

This commit is contained in:
2026-03-29 14:29:18 +02:00
parent 99fc94bc18
commit 44000f96f2
18 changed files with 1028 additions and 44 deletions
+3
View File
@@ -9,6 +9,9 @@ GOOGLE_GMAIL_CLIENT_SECRET=CHANGE_ME_GOOGLE_OAUTH_CLIENT_SECRET
# Optional. If omitted, the backend uses https://<your-domain>/api/gmail/oauth/callback
GOOGLE_GMAIL_REDIRECT_URI=
AI_SERVICE_BASE_URL=http://ai-service:8001
# Optional: enables hybrid CV block classification in the local AI service.
OLLAMA_BASE_URL=http://ollama:11434
OLLAMA_MODEL=qwen2.5:7b
# Optional: only needed if you want the UI to call a non-default API base URL.
# In production the UI defaults to `/api`.
+156 -1
View File
@@ -280,7 +280,7 @@ public sealed class ProfileCvControllerTests
[Fact]
public async Task Upload_populates_structured_fields_from_flattened_cv_when_ai_json_is_invalid()
{
var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1.";
var rawExtraction = "connor.babbington@cesnimda.co.uk cesnimda.co.uk +47 41 33 44 70 E D U C A T I O N E X T E N D E D D I P L O M A N V Q L E V E L 3 I N I C T 2012 - 2015 F O L L O W A B O U T M E Mid-level system developer with eight years of experience in UK local government, with expertise in full-stack development, backend, frontend and server administration. I N T E R E S T S I am interested in PC and board games, as well as cooking and learning new skills. E X P E R I E N C E S Y S T E M D E V E L O P E R 2015 - 2023 Developed and maintained multiple full-stack applications using C#, Python, Ruby on Rails, SQL, and JavaScript. + Warwickshire County Council, UK C O N T A C T Native English speaker, Norwegian level A2/B1, C#, SQL, and public speaking.";
var user = new ApplicationUser { Id = "user-1" };
var userManager = CreateUserManager();
@@ -320,9 +320,164 @@ public sealed class ProfileCvControllerTests
Assert.Contains(structured.Interests, item => item.Contains("board games", StringComparison.OrdinalIgnoreCase) || item.Contains("cooking", StringComparison.OrdinalIgnoreCase));
Assert.Contains(structured.Languages, item => item.Name != null && item.Name.Equals("English", StringComparison.OrdinalIgnoreCase));
Assert.Contains(structured.Languages, item => item.Name != null && item.Name.StartsWith("Norwegian", StringComparison.OrdinalIgnoreCase));
Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Equals("C#", StringComparison.OrdinalIgnoreCase));
Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Equals("SQL", StringComparison.OrdinalIgnoreCase));
Assert.DoesNotContain(structured.Languages, item => item.Name != null && item.Name.Contains("public speaking", StringComparison.OrdinalIgnoreCase));
Assert.DoesNotContain(structured.Sections, section => section.Name == "General");
}
// Verifies that deserialization keeps only language entries that name a human language
// and carry a proficiency level, and that the level can be recovered from the name text.
[Fact]
public void Structured_cv_normalization_keeps_human_languages_and_drops_skill_noise()
{
// Fixture: one clean entry (English/Native), one entry whose level is embedded in the name
// ("Native Norwegian speaker"), one language without any level (French), and two
// non-language entries ("C#", "Leadership").
var structured = StructuredCvProfileJson.Deserialize("""
{
"version": "1",
"contact": {},
"summary": [],
"jobs": [],
"education": [],
"skills": [],
"languages": [
{ "name": "English", "level": "Native" },
{ "name": "Native Norwegian speaker", "level": null },
{ "name": "French", "level": null },
{ "name": "C#", "level": "Advanced" },
{ "name": "Leadership", "level": null }
],
"interests": [],
"otherSections": []
}
""");
// Two inspectors are supplied, so exactly two languages are expected to survive:
// French, C# and Leadership must all be gone. Results are sorted by name so the
// assertion does not depend on the order produced by normalization.
Assert.Collection(
structured.Languages.OrderBy(item => item.Name, StringComparer.OrdinalIgnoreCase),
first =>
{
Assert.Equal("English", first.Name);
Assert.Equal("Native", first.Level);
},
second =>
{
// "Native Norwegian speaker" is expected to be split into name + level.
Assert.Equal("Norwegian", second.Name);
Assert.Equal("Native", second.Level);
});
}
// Verifies job normalization: swapped title/company values are put back in the right fields,
// a "<title> at <company>" title is split, and bullets that only repeat metadata are removed.
[Fact]
public void Structured_cv_normalization_separates_job_title_company_and_tasks()
{
// Job 1 has title and company swapped, and its bullets repeat the title, the company and
// the date range before the one real task line.
// Job 2 has no company but a title of the form "Lead Engineer at Northwind Council", plus
// a "Skills: ..." bullet.
var structured = StructuredCvProfileJson.Deserialize("""
{
"version": "1",
"contact": {},
"summary": [],
"jobs": [
{
"title": "Acme Ltd",
"company": "Senior Backend Developer",
"location": "Oslo",
"start": "2022",
"end": "2024",
"isCurrent": false,
"bullets": [
"Senior Backend Developer",
"Acme Ltd",
"2022 - 2024",
"Built API integrations for recruiter workflows and reduced manual follow-up churn."
],
"skills": [".NET", "SQL"]
},
{
"title": "Lead Engineer at Northwind Council",
"company": null,
"location": "Remote",
"start": "2020",
"end": "Present",
"isCurrent": true,
"bullets": [
"Led platform delivery across case-management and reporting surfaces.",
"Skills: C#, SQL"
],
"skills": ["C#", "SQL"]
}
],
"education": [],
"skills": [],
"languages": [],
"interests": [],
"otherSections": []
}
""");
// Both jobs must be present, in input order, each with a single remaining task bullet.
Assert.Collection(
structured.Jobs,
first =>
{
// Title and company are expected to have been swapped back.
Assert.Equal("Senior Backend Developer", first.Title);
Assert.Equal("Acme Ltd", first.Company);
Assert.Equal(new[] { "Built API integrations for recruiter workflows and reduced manual follow-up churn." }, first.Bullets);
},
second =>
{
// The "at" phrase is expected to be split into title and company.
Assert.Equal("Lead Engineer", second.Title);
Assert.Equal("Northwind Council", second.Company);
Assert.Equal(new[] { "Led platform delivery across case-management and reporting surfaces." }, second.Bullets);
});
}
// Verifies normalization of contact links, locations and job dates: well-formed values are
// kept (links in canonical form), values that do not look like a location or date become null.
[Fact]
public void Structured_cv_normalization_hardens_contact_links_locations_and_dates()
{
// Job 1 carries acceptable location/date values; job 2 carries a location containing
// digits ("Remote 123") and dates that are not recognised formats ("Spring 2024", "Later").
var structured = StructuredCvProfileJson.Deserialize("""
{
"version": "1",
"contact": {
"location": "Tønsberg, Norway",
"website": "https://cesnimda.co.uk/about",
"linkedin": "linkedin.com/in/demo-user?trk=foo"
},
"summary": [],
"jobs": [
{
"title": "System Developer",
"company": "Warwickshire County Council",
"location": "Warwickshire, England, UK",
"start": "Sept 2023",
"end": "1/1/2024",
"isCurrent": false,
"bullets": ["Built APIs"],
"skills": []
},
{
"title": "Developer",
"company": "Demo Co",
"location": "Remote 123",
"start": "Spring 2024",
"end": "Later",
"isCurrent": false,
"bullets": ["Kept services running"],
"skills": []
}
],
"education": [],
"skills": [],
"languages": [],
"interests": [],
"otherSections": []
}
""");
// Contact: non-ASCII letters in the location survive; the website is reduced to its host;
// the LinkedIn link gains scheme + "www." and loses its query string.
Assert.Equal("Tønsberg, Norway", structured.Contact.Location);
Assert.Equal("cesnimda.co.uk", structured.Contact.Website);
Assert.Equal("https://www.linkedin.com/in/demo-user", structured.Contact.LinkedIn);
// Job 1: valid values pass through unchanged.
Assert.Equal("Warwickshire, England, UK", structured.Jobs[0].Location);
Assert.Equal("Sept 2023", structured.Jobs[0].Start);
Assert.Equal("1/1/2024", structured.Jobs[0].End);
// Job 2: unrecognised values are expected to be cleared rather than kept verbatim.
Assert.Null(structured.Jobs[1].Location);
Assert.Null(structured.Jobs[1].Start);
Assert.Null(structured.Jobs[1].End);
}
[Fact]
public async Task Parse_returns_structured_cv_and_persists_it()
{
@@ -124,6 +124,10 @@ public sealed class AdminSystemController : ControllerBase
GpuName: null,
OcrAvailable: false,
OcrLanguages: null,
OllamaConfigured: null,
OllamaReachable: null,
OllamaModel: null,
OllamaModelAvailable: null,
HealthLatencyMs: null,
ProbeLatencyMs: null,
LastProbeAt: null,
@@ -61,13 +61,15 @@ public sealed class ProfileCvController : ControllerBase
private readonly UserManager<ApplicationUser> _users;
private readonly ISummarizerService _aiService;
private readonly ICvAiClassifier _cvAiClassifier;
private readonly JobTrackerContext _db;
private readonly AppPaths _paths;
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths)
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths, ICvAiClassifier? cvAiClassifier = null)
{
_users = users;
_aiService = aiService;
_cvAiClassifier = cvAiClassifier ?? NoOpCvAiClassifier.Instance;
_db = db;
_paths = paths;
}
@@ -338,14 +340,7 @@ public sealed class ProfileCvController : ControllerBase
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{
var parseSource = NormalizeTextForStructuredParsing(text);
var fallbackSections = ParseSections(parseSource)
.Select(section => new StructuredCvSection
{
Name = section.Name,
Content = section.Content,
WordCount = CountWords(section.Content),
})
.ToList();
var fallbackSections = await BuildFallbackSectionsAsync(parseSource, cancellationToken);
var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
AnnotateStructuredCv(sectionFallback, "repair", 0.56);
@@ -729,12 +724,19 @@ public sealed class ProfileCvController : ControllerBase
private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
{
var languages = new List<StructuredCvLanguage>();
foreach (Match match in Regex.Matches(content, @"\b(English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b(?:[^\n.,;:]*?\b(Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2|Native speaker)\b)?", RegexOptions.IgnoreCase))
var candidates = Regex.Split(content.Replace("\r\n", "\n"), @"[\n,;]+|(?<=[.!?])\s+")
.Select(item => item.Trim())
.Where(item => item.Length > 1);
foreach (var candidate in candidates)
{
var name = NullIfWhitespace(match.Groups[1].Value);
var level = NullIfWhitespace(match.Groups[2].Value);
if (name is null) continue;
languages.Add(new StructuredCvLanguage { Name = name, Level = level });
var level = HumanLanguageCatalog.ExtractLevel(candidate);
if (level is null) continue;
foreach (var name in HumanLanguageCatalog.ExtractLanguageNames(candidate))
{
languages.Add(new StructuredCvLanguage { Name = name, Level = level });
}
}
return languages
@@ -872,6 +874,86 @@ public sealed class ProfileCvController : ControllerBase
.ToList();
}
/// <summary>
/// Builds the section list used as the fallback structured CV. Heading-based parsing is tried
/// first; the AI block classifier is consulted only when that parsing produced nothing but
/// "General" content.
/// </summary>
/// <param name="parseSource">CV text already normalised for structured parsing.</param>
/// <param name="cancellationToken">Token forwarded to the block classifier.</param>
/// <returns>
/// The heading-based sections when at least one is not "General"; otherwise the classified
/// sections if there are any; otherwise the heading-based sections unchanged.
/// </returns>
private async Task<List<StructuredCvSection>> BuildFallbackSectionsAsync(string parseSource, CancellationToken cancellationToken)
{
    var heuristicSections = new List<StructuredCvSection>();
    var foundNamedSection = false;

    foreach (var section in ParseSections(parseSource))
    {
        var entry = new StructuredCvSection
        {
            Name = section.Name,
            Content = section.Content,
            WordCount = CountWords(section.Content),
        };
        heuristicSections.Add(entry);

        if (!string.Equals(entry.Name, "General", StringComparison.OrdinalIgnoreCase))
        {
            foundNamedSection = true;
        }
    }

    if (foundNamedSection)
    {
        return heuristicSections;
    }

    // No real headings were detected, so let the classifier label each block instead.
    var classifiedSections = await ClassifyBlocksIntoSectionsAsync(parseSource, cancellationToken);
    if (classifiedSections.Count > 0)
    {
        return classifiedSections;
    }

    return heuristicSections;
}
/// <summary>
/// Splits the CV text into blank-line separated blocks, asks the AI classifier to label each
/// block, and groups the blocks into sections by label.
/// </summary>
/// <param name="parseSource">CV text already normalised for structured parsing.</param>
/// <param name="cancellationToken">Token forwarded to every classifier call.</param>
/// <returns>
/// One section per distinct label, in order of first appearance. Blocks shorter than 24
/// characters are ignored; blocks with no label or the label "Other" go to "General".
/// Returns an empty list when no block is long enough.
/// </returns>
private async Task<List<StructuredCvSection>> ClassifyBlocksIntoSectionsAsync(string parseSource, CancellationToken cancellationToken)
{
    var blocks = Regex.Split(parseSource.Replace("\r\n", "\n"), @"\n\s*\n")
        .Select(block => block.Trim())
        .Where(block => block.Length >= 24)
        .ToList();
    if (blocks.Count == 0) return new List<StructuredCvSection>();

    var sectionBuckets = new List<StructuredCvSection>();
    foreach (var block in blocks)
    {
        var classification = await _cvAiClassifier.ClassifyBlockAsync(block, cancellationToken);

        // Trim the label before alias lookup so stray whitespace does not defeat it.
        var sectionName = classification?.Section?.Trim();
        if (!string.IsNullOrWhiteSpace(sectionName) && SectionAliases.TryGetValue(sectionName, out var canonical))
        {
            sectionName = canonical;
        }

        if (string.IsNullOrWhiteSpace(sectionName) || string.Equals(sectionName, "Other", StringComparison.OrdinalIgnoreCase))
        {
            sectionName = "General";
        }

        var content = block;
        if (string.Equals(sectionName, "Work Experience", StringComparison.OrdinalIgnoreCase) && classification is not null)
        {
            // Rebuild the block as "### title / meta line / - bullets" from the classifier's fields.
            var lines = new List<string>();
            if (!string.IsNullOrWhiteSpace(classification.Title)) lines.Add($"### {classification.Title.Trim()}");

            var endIsCurrent = string.Equals(classification.End, "Present", StringComparison.OrdinalIgnoreCase)
                || string.Equals(classification.End, "Current", StringComparison.OrdinalIgnoreCase);
            var dateRange = FormatDateRangeForSection(classification.Start, classification.End, endIsCurrent);
            var meta = string.Join(" | ", new[] { classification.Company, classification.Location, dateRange }.Where(value => !string.IsNullOrWhiteSpace(value)));
            if (!string.IsNullOrWhiteSpace(meta)) lines.Add(meta);

            if (classification.Bullets is not null)
            {
                lines.AddRange(classification.Bullets.Where(bullet => !string.IsNullOrWhiteSpace(bullet)).Select(bullet => $"- {bullet.Trim()}"));
            }

            // Keep the raw block when the classifier supplied no usable fields.
            if (lines.Count > 0) content = string.Join("\n", lines);
        }

        // Match buckets case-insensitively, like every other section-name comparison here,
        // so labels such as "skills" and "Skills" merge instead of producing two sections.
        var existing = sectionBuckets.FirstOrDefault(section => string.Equals(section.Name, sectionName, StringComparison.OrdinalIgnoreCase));
        if (existing is null)
        {
            sectionBuckets.Add(new StructuredCvSection { Name = sectionName, Content = content, WordCount = CountWords(content) });
        }
        else
        {
            existing.Content = $"{existing.Content}\n\n{content}".Trim();
            existing.WordCount = CountWords(existing.Content);
        }
    }

    return sectionBuckets.Where(section => !string.IsNullOrWhiteSpace(section.Content)).ToList();
}
/// <summary>
/// Formats a job's start and end into the "start - end" text used in rebuilt section content.
/// </summary>
/// <param name="start">Start date text; may be null or blank.</param>
/// <param name="end">End date text; may be null or blank.</param>
/// <param name="isCurrent">When true the range always closes with "Present".</param>
/// <returns>
/// Null when both values are blank; <paramref name="end"/> alone when only the start is
/// blank; otherwise "start - end", with "Present" standing in for a blank end.
/// </returns>
private static string? FormatDateRangeForSection(string? start, string? end, bool isCurrent)
{
    var hasStart = !string.IsNullOrWhiteSpace(start);
    var hasEnd = !string.IsNullOrWhiteSpace(end);

    if (!hasStart && !hasEnd) return null;
    if (!hasStart) return end;

    // A blank (not only null) end means the role is open-ended; never emit a dangling "start - ".
    var closing = isCurrent || !hasEnd ? "Present" : end;
    return $"{start} - {closing}";
}
private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
{
var normalized = text.Trim();
+1
View File
@@ -132,6 +132,7 @@ builder.Services.AddHttpClient("ai-service", client =>
builder.Services.AddMemoryCache();
builder.Services.AddSingleton<ISummarizerService, SummarizerService>();
builder.Services.AddSingleton<ICvAiClassifier, CvAiClassifier>();
builder.Services.AddSingleton<IGoogleTokenValidator, GoogleTokenValidator>();
builder.Services.AddScoped<IGmailOAuthService, GmailOAuthService>();
+65
View File
@@ -0,0 +1,65 @@
using System.Net.Http;
using System.Text;
using System.Text.Json;
namespace JobTrackerApi.Services;
/// <summary>
/// Result of classifying one block of CV text, deserialized from the AI service's JSON response.
/// Every member is optional.
/// </summary>
/// <param name="Section">Section label assigned to the block.</param>
/// <param name="Confidence">Classifier confidence; scale not visible here (presumably 0..1 — TODO confirm).</param>
/// <param name="Reason">Free-text explanation from the classifier (presumably diagnostic only — TODO confirm).</param>
/// <param name="Title">Job title extracted from the block, if any.</param>
/// <param name="Company">Employer extracted from the block, if any.</param>
/// <param name="Location">Location extracted from the block, if any.</param>
/// <param name="Start">Start date text extracted from the block, if any.</param>
/// <param name="End">End date text extracted from the block, if any.</param>
/// <param name="Bullets">Task or achievement lines extracted from the block, if any.</param>
public sealed record CvBlockClassificationResult(
string? Section,
double? Confidence,
string? Reason,
string? Title,
string? Company,
string? Location,
string? Start,
string? End,
List<string>? Bullets);
/// <summary>
/// Classifies a single block of CV text into a section, optionally with extracted job fields.
/// </summary>
public interface ICvAiClassifier
{
/// <summary>Classifies one block of CV text.</summary>
/// <param name="block">The text block to classify.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The classification, or null when none is available.</returns>
Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default);
}
/// <summary>
/// <see cref="ICvAiClassifier"/> that posts each block to the "ai-service" HTTP client's
/// <c>/cv/classify-block</c> endpoint. Classification is best-effort: any failure yields null.
/// </summary>
public sealed class CvAiClassifier : ICvAiClassifier
{
    // Shared instance: JsonSerializerOptions caches serialization metadata, so building a new
    // one per request throws that cache away every time.
    private static readonly JsonSerializerOptions ResponseJsonOptions = new(JsonSerializerDefaults.Web)
    {
        PropertyNameCaseInsensitive = true
    };

    private readonly IHttpClientFactory _httpClientFactory;

    /// <summary>Creates the classifier.</summary>
    /// <param name="httpClientFactory">Factory used to obtain the named "ai-service" client.</param>
    public CvAiClassifier(IHttpClientFactory httpClientFactory)
    {
        _httpClientFactory = httpClientFactory;
    }

    /// <summary>Sends one block to the AI service and returns its classification.</summary>
    /// <param name="block">The text block to classify; blank input returns null without a request.</param>
    /// <param name="cancellationToken">Token used to cancel the request.</param>
    /// <returns>
    /// The deserialized classification, or null when the service answers with a non-success
    /// status, the call fails, or the response cannot be parsed.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="cancellationToken"/> is cancelled, so callers looping over
    /// many blocks stop instead of issuing further doomed requests.
    /// </exception>
    public async Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrWhiteSpace(block)) return null;

        try
        {
            var client = _httpClientFactory.CreateClient("ai-service");
            var payload = JsonSerializer.Serialize(new { block });
            using var content = new StringContent(payload, Encoding.UTF8, "application/json");
            using var response = await client.PostAsync("/cv/classify-block", content, cancellationToken);
            if (!response.IsSuccessStatusCode) return null;

            await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
            return await JsonSerializer.DeserializeAsync<CvBlockClassificationResult>(stream, ResponseJsonOptions, cancellationToken);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Caller-requested cancellation is not a classification failure; let it propagate.
            // HttpClient timeouts do not satisfy the filter and fall through to the handler below.
            throw;
        }
        catch
        {
            // Deliberate best-effort: network errors, timeouts and malformed JSON all mean
            // "no classification available".
            return null;
        }
    }
}
/// <summary>
/// <see cref="ICvAiClassifier"/> that never classifies anything. Exposed as a singleton.
/// </summary>
public sealed class NoOpCvAiClassifier : ICvAiClassifier
{
    private NoOpCvAiClassifier()
    {
    }

    /// <summary>The single shared instance.</summary>
    public static NoOpCvAiClassifier Instance { get; } = new NoOpCvAiClassifier();

    /// <summary>Always completes immediately with a null result.</summary>
    /// <param name="block">Ignored.</param>
    /// <param name="cancellationToken">Ignored.</param>
    /// <returns>A completed task whose result is null.</returns>
    public Task<CvBlockClassificationResult?> ClassifyBlockAsync(string block, CancellationToken cancellationToken = default)
    {
        CvBlockClassificationResult? noResult = null;
        return Task.FromResult(noResult);
    }
}
@@ -21,6 +21,10 @@ namespace JobTrackerApi.Services
string? GpuName,
bool? OcrAvailable,
string? OcrLanguages,
bool? OllamaConfigured,
bool? OllamaReachable,
string? OllamaModel,
bool? OllamaModelAvailable,
double? HealthLatencyMs,
double? ProbeLatencyMs,
DateTimeOffset? LastProbeAt,
@@ -310,6 +314,10 @@ namespace JobTrackerApi.Services
string? gpuName = null;
bool? ocrAvailable = null;
string? ocrLanguages = null;
bool? ollamaConfigured = null;
bool? ollamaReachable = null;
string? ollamaModel = null;
bool? ollamaModelAvailable = null;
double? healthLatencyMs = null;
var healthy = false;
string? healthError = null;
@@ -332,6 +340,10 @@ namespace JobTrackerApi.Services
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString();
if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString();
if (doc.RootElement.TryGetProperty("ollama_configured", out var ollamaConfiguredEl) && ollamaConfiguredEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaConfigured = ollamaConfiguredEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ollama_reachable", out var ollamaReachableEl) && ollamaReachableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaReachable = ollamaReachableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ollama_model", out var ollamaModelEl)) ollamaModel = ollamaModelEl.GetString();
if (doc.RootElement.TryGetProperty("ollama_model_available", out var ollamaModelAvailableEl) && ollamaModelAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ollamaModelAvailable = ollamaModelAvailableEl.GetBoolean();
}
else
{
@@ -390,6 +402,10 @@ namespace JobTrackerApi.Services
GpuName: gpuName,
OcrAvailable: ocrAvailable,
OcrLanguages: ocrLanguages,
OllamaConfigured: ollamaConfigured,
OllamaReachable: ollamaReachable,
OllamaModel: ollamaModel,
OllamaModelAvailable: ollamaModelAvailable,
HealthLatencyMs: healthLatencyMs,
ProbeLatencyMs: probeLatencyMs,
LastProbeAt: lastProbeAt,
+162
View File
@@ -0,0 +1,162 @@
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
namespace JobTrackerApi.Models;
/// <summary>
/// Recognises human (spoken) language names and proficiency levels in free-form CV text.
/// Language names come from the runtime's culture list plus a few hand-written aliases.
/// </summary>
public static class HumanLanguageCatalog
{
    private static readonly Dictionary<string, string> LanguageLookup = BuildLanguageLookup();
    private static readonly Regex WordRegex = new(@"\p{L}+", RegexOptions.Compiled);

    // Alternatives are tried left to right and the first that succeeds wins, so the combined
    // CEFR ranges ("a2/b1") must come BEFORE the single levels ("a2"); otherwise "A2/B1" is
    // always reported as just "A2".
    private static readonly Regex LevelRegex = new(
        @"\b(native(?:\s+speaker)?|fluent|advanced|intermediate|beginner|basic|conversational|elementary|professional\s+working\s+proficiency|working\s+proficiency|limited\s+working\s+proficiency|full\s+professional\s+proficiency|a1\s*/\s*a2|a2\s*/\s*b1|b1\s*/\s*b2|b2\s*/\s*c1|c1\s*/\s*c2|a1|a2|b1|b2|c1|c2)\b",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>Resolves text that should name exactly one language.</summary>
    /// <param name="raw">Free-form text such as "Native Norwegian speaker".</param>
    /// <returns>The canonical language name, or null when zero or several languages are found.</returns>
    public static string? NormalizeLanguageName(string? raw)
    {
        var matches = ExtractLanguageNames(raw);
        return matches.Count == 1 ? matches[0] : null;
    }

    /// <summary>Finds every known language name mentioned in the text.</summary>
    /// <param name="raw">Free-form text; null or blank yields an empty list.</param>
    /// <returns>Distinct canonical names in order of first appearance.</returns>
    public static IReadOnlyList<string> ExtractLanguageNames(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return Array.Empty<string>();

        var words = WordRegex.Matches(raw)
            .Select(match => match.Value)
            .Where(value => !string.IsNullOrWhiteSpace(value))
            .ToList();
        if (words.Count == 0) return Array.Empty<string>();

        // Try the longest phrases first (up to four words) so a multi-word name wins over a
        // shorter name contained in it; later, shorter matches may not overlap earlier ones.
        var matches = new List<(int Start, int Size, string Canonical)>();
        for (var size = Math.Min(4, words.Count); size >= 1; size--)
        {
            for (var start = 0; start <= words.Count - size; start++)
            {
                var phrase = string.Join(" ", words.GetRange(start, size));
                if (!LanguageLookup.TryGetValue(NormalizeKey(phrase), out var canonical)) continue;
                if (matches.Any(existing => RangesOverlap(existing.Start, existing.Size, start, size))) continue;
                matches.Add((start, size, canonical));
            }
        }

        return matches
            .OrderBy(match => match.Start)
            .Select(match => match.Canonical)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>Reports whether the text contains a recognised proficiency level.</summary>
    /// <param name="raw">Free-form text; may be null.</param>
    /// <returns>True when <see cref="ExtractLevel"/> would return a value.</returns>
    public static bool HasRecognizedLevel(string? raw)
    {
        return ExtractLevel(raw) is not null;
    }

    /// <summary>Extracts the first proficiency level mentioned in the text.</summary>
    /// <param name="raw">Free-form text such as "Norwegian level A2/B1"; may be null.</param>
    /// <returns>
    /// The level in display form ("Native", "Fluent", "B2", "A2/B1", ...), or null when no
    /// level is recognised.
    /// </returns>
    public static string? ExtractLevel(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return null;

        var match = LevelRegex.Match(raw);
        if (!match.Success) return null;

        var value = match.Groups[1].Value.Trim();
        var compact = Regex.Replace(value, @"\s+", " ");
        return compact.ToLowerInvariant() switch
        {
            "native speaker" => "Native",
            "native" => "Native",
            "fluent" => "Fluent",
            "advanced" => "Advanced",
            "intermediate" => "Intermediate",
            "beginner" => "Beginner",
            "basic" => "Basic",
            "conversational" => "Conversational",
            "elementary" => "Elementary",
            "professional working proficiency" => "Professional working proficiency",
            "working proficiency" => "Working proficiency",
            "limited working proficiency" => "Limited working proficiency",
            "full professional proficiency" => "Full professional proficiency",
            // CEFR codes: upper-case and drop spaces around the slash ("a2 / b1" -> "A2/B1").
            _ when Regex.IsMatch(compact, @"^[ABC][12](?:\s*/\s*[ABC][12])?$", RegexOptions.IgnoreCase) => compact.ToUpperInvariant().Replace(" ", string.Empty),
            _ => compact,
        };
    }

    // True when the half-open word ranges [startA, startA+sizeA) and [startB, startB+sizeB) intersect.
    private static bool RangesOverlap(int startA, int sizeA, int startB, int sizeB)
    {
        var endA = startA + sizeA;
        var endB = startB + sizeB;
        return startA < endB && startB < endA;
    }

    // Builds the alias -> canonical-name map. The first registration of a key wins (TryAdd).
    private static Dictionary<string, string> BuildLanguageLookup()
    {
        var map = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        void Add(string? alias, string? canonical)
        {
            var normalizedAlias = NormalizeKey(alias);
            var normalizedCanonical = NormalizeDisplayName(canonical);
            if (string.IsNullOrWhiteSpace(normalizedAlias) || string.IsNullOrWhiteSpace(normalizedCanonical)) return;
            map.TryAdd(normalizedAlias, normalizedCanonical);
        }

        // Register each culture's English and native language name, both mapping to the English name.
        foreach (var culture in CultureInfo.GetCultures(CultureTypes.NeutralCultures | CultureTypes.SpecificCultures))
        {
            var english = CleanCultureLanguageName(culture.EnglishName);
            var native = CleanCultureLanguageName(culture.NativeName);
            Add(english, english);
            Add(native, english);
        }

        // Baseline names so common CV languages still resolve when the runtime exposes no
        // culture data (for example in globalization-invariant mode). TryAdd leaves any
        // culture-derived entry above untouched.
        foreach (var name in new[] { "English", "Norwegian", "German", "French", "Spanish", "Swedish", "Danish" })
        {
            Add(name, name);
        }

        Add("norsk", "Norwegian");
        Add("bokmal", "Norwegian");
        Add("bokmål", "Norwegian");
        Add("nynorsk", "Norwegian");
        Add("mandarin", "Chinese");
        Add("cantonese", "Chinese");
        Add("farsi", "Persian");
        Add("persian", "Persian");
        return map;
    }

    // Strips the region/script suffix from a culture name, e.g. "English (United Kingdom)" -> "English".
    private static string? CleanCultureLanguageName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;

        var cleaned = value.Trim();
        var parenIndex = cleaned.IndexOf('(');
        if (parenIndex > 0) cleaned = cleaned[..parenIndex].Trim();
        var commaIndex = cleaned.IndexOf(',');
        if (commaIndex > 0) cleaned = cleaned[..commaIndex].Trim();
        return NormalizeDisplayName(cleaned);
    }

    // Collapses whitespace and title-cases each word; all-caps words of up to three letters are kept as-is.
    private static string? NormalizeDisplayName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;

        var cleaned = Regex.Replace(value.Trim(), @"\s+", " ");
        return string.Join(" ", cleaned.Split(' ', StringSplitOptions.RemoveEmptyEntries)
            .Select(word => word.Length <= 3 && word.All(char.IsUpper)
                ? word
                : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant()));
    }

    // Lookup key: lower-case, diacritics removed, every non-letter run collapsed to one space.
    private static string NormalizeKey(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return string.Empty;

        var decomposed = value.Trim().Normalize(NormalizationForm.FormD);
        var builder = new StringBuilder(decomposed.Length);
        foreach (var ch in decomposed)
        {
            // After decomposition, accents are separate non-spacing marks; skip them.
            if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark) continue;
            builder.Append(char.ToLowerInvariant(ch));
        }

        return Regex.Replace(builder.ToString().Normalize(NormalizationForm.FormC), @"[^\p{L}]+", " ").Trim();
    }
}
+207 -10
View File
@@ -144,7 +144,7 @@ public static class StructuredCvProfileJson
profile.Version = string.IsNullOrWhiteSpace(profile.Version) ? "1" : profile.Version.Trim();
profile.Metadata ??= new StructuredCvMetadata();
profile.Metadata.Fields ??= new Dictionary<string, StructuredCvFieldMetadata>();
profile.Contact ??= new StructuredCvContact();
profile.Contact = NormalizeContact(profile.Contact);
profile.Summary = CleanList(profile.Summary);
profile.Jobs = (profile.Jobs ?? new List<StructuredCvJob>())
.Select(NormalizeJob)
@@ -178,20 +178,206 @@ public static class StructuredCvProfileJson
return profile;
}
/// <summary>
/// Cleans every contact field in place, creating an empty contact when none was supplied.
/// </summary>
/// <param name="contact">The contact to clean; may be null.</param>
/// <returns>The same instance (or a new empty one) with each field normalised.</returns>
private static StructuredCvContact NormalizeContact(StructuredCvContact? contact)
{
    var result = contact ?? new StructuredCvContact();

    // Free-text fields go through TrimOrNull.
    result.FullName = TrimOrNull(result.FullName);
    result.Headline = TrimOrNull(result.Headline);
    result.Email = TrimOrNull(result.Email);
    result.Phone = TrimOrNull(result.Phone);

    // Structured fields each have a dedicated normaliser.
    result.Location = NormalizeLocationValue(result.Location);
    result.Website = NormalizeWebsite(result.Website);
    result.LinkedIn = NormalizeLinkedIn(result.LinkedIn);

    return result;
}
// Normalises one job entry in place: cleans the scalar fields, works out which value is the
// job title and which is the company, validates location and dates, and drops bullets that
// carry no task information. A null job becomes a new empty job. Returns the same instance.
private static StructuredCvJob NormalizeJob(StructuredCvJob? job)
{
job ??= new StructuredCvJob();
// First pass: basic cleanup of the raw fields via TrimOrNull / CleanList.
job.Title = TrimOrNull(job.Title);
job.Company = TrimOrNull(job.Company);
job.Location = TrimOrNull(job.Location);
job.Start = TrimOrNull(job.Start);
job.End = TrimOrNull(job.End);
job.Bullets = CleanList(job.Bullets);
// Second pass: field-specific validation. Each helper returns null for a value that does
// not look like the kind of data the field should hold.
var title = NormalizeJobTitle(job.Title);
var company = NormalizeCompanyName(job.Company);
var location = NormalizeLocationValue(job.Location);
// "<title> at <company>" in the title, with no company given: split it into both fields.
if (!string.IsNullOrWhiteSpace(title) && company is null)
{
var atSplit = Regex.Match(title, @"^(?<title>.+?)\s+at\s+(?<company>.+)$", RegexOptions.IgnoreCase);
if (atSplit.Success)
{
title = NormalizeJobTitle(atSplit.Groups["title"].Value);
company = NormalizeCompanyName(atSplit.Groups["company"].Value);
}
}
// Both present but each unambiguously looks like the other kind of value: swap them.
if (!string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(company))
{
var titleLooksLikeCompany = LooksLikeCompanyName(title) && !LooksLikeJobTitle(title);
var companyLooksLikeTitle = LooksLikeJobTitle(company) && !LooksLikeCompanyName(company);
if (titleLooksLikeCompany && companyLooksLikeTitle)
{
(title, company) = (company, title);
}
}
// Title is really a company name: move it to company when that slot is free, and clear the title either way.
if (!string.IsNullOrWhiteSpace(title) && !LooksLikeJobTitle(title) && LooksLikeCompanyName(title))
{
if (company is null) company = title;
title = null;
}
// Company is really a job title and no title is set: move it across.
if (!string.IsNullOrWhiteSpace(company) && !LooksLikeCompanyName(company) && LooksLikeJobTitle(company) && title is null)
{
title = company;
company = null;
}
job.Title = title;
job.Company = company;
job.Location = location;
// Dates that do not match a recognised date format are cleared.
job.Start = NormalizeDateValue(job.Start);
job.End = NormalizeDateValue(job.End);
// Strip bullet markers, then drop bullets that only repeat metadata. This must run after
// job.Title / job.Company are assigned because the filter compares against them.
job.Bullets = CleanList(job.Bullets)
.Select(NormalizeBullet)
.Where(bullet => bullet is not null)
.Select(bullet => bullet!)
.Where(bullet => IsUsefulJobBullet(bullet, job.Title, job.Company))
.ToList();
job.Skills = CleanList(job.Skills);
// An end value of "present"/"current" marks the job as current even if the flag was not set.
job.IsCurrent = job.IsCurrent || string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
return job;
}
/// <summary>
/// Removes surrounding whitespace and any leading bullet markers ('-', '•', '*') from a bullet line.
/// </summary>
/// <param name="value">The raw bullet text; may be null.</param>
/// <returns>The bullet text without markers, or null when nothing but markers/whitespace remains.</returns>
private static string? NormalizeBullet(string? value)
{
    if (string.IsNullOrWhiteSpace(value)) return null;

    var stripped = value.Trim().TrimStart('-', '•', '*', ' ');

    // A marker-only line such as "- " would otherwise come back as "" rather than null.
    return stripped.Length == 0 ? null : stripped;
}
/// <summary>
/// Decides whether a bullet describes actual work, as opposed to repeating job metadata.
/// </summary>
/// <param name="value">The bullet text; may be null.</param>
/// <param name="title">The job's title, used to detect a bullet that only repeats it.</param>
/// <param name="company">The job's company, used to detect a bullet that only repeats it.</param>
/// <returns>
/// False for blank bullets, date ranges, section headings, "Skills:" lines, repeats of the
/// title or company, and single words shorter than 12 characters; true otherwise.
/// </returns>
private static bool IsUsefulJobBullet(string? value, string? title, string? company)
{
    if (TrimOrNull(value) is not { } text)
    {
        return false;
    }

    var isStructuralNoise = LooksLikeDateRange(text)
        || LooksLikeSectionHeading(text)
        || text.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase);
    if (isStructuralNoise)
    {
        return false;
    }

    if (title is not null && text.Equals(title, StringComparison.OrdinalIgnoreCase))
    {
        return false;
    }

    if (company is not null && text.Equals(company, StringComparison.OrdinalIgnoreCase))
    {
        return false;
    }

    // A short single token is treated as a stray label, not a task description.
    return text.Length >= 12 || text.Contains(' ');
}
/// <summary>
/// Cleans a job-title candidate and rejects values that are clearly something else.
/// </summary>
/// <param name="value">The raw title; may be null.</param>
/// <returns>
/// The title with whitespace collapsed and edge punctuation removed, or null when the value
/// is blank, a date range, a section heading, or a URL/e-mail address.
/// </returns>
private static string? NormalizeJobTitle(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    var isNotATitle = LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate);
    if (isNotATitle)
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var stripped = collapsed.Trim(' ', '|', ',', '-', ':');
    return string.IsNullOrWhiteSpace(stripped) ? null : stripped;
}
/// <summary>
/// Cleans a company-name candidate and rejects values that are clearly something else.
/// </summary>
/// <param name="value">The raw company text; may be null.</param>
/// <returns>
/// The name with whitespace collapsed and edge punctuation removed, or null when the value is
/// blank, a date range, a section heading, a URL/e-mail address, a "Skills:" line, or
/// contains both a period and a space.
/// </returns>
private static string? NormalizeCompanyName(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    var isNotACompany = LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate);
    if (isNotACompany)
    {
        return null;
    }

    if (candidate.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    // Values containing both a period and a space are rejected outright.
    var hasPeriod = candidate.Contains('.');
    if (hasPeriod && candidate.Contains(' '))
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var stripped = collapsed.Trim(' ', '|', ',', '-', ':');
    return string.IsNullOrWhiteSpace(stripped) ? null : stripped;
}
/// <summary>
/// Validates a location such as "Warwickshire, England, UK" and re-joins its parts with ", ".
/// </summary>
/// <param name="value">The raw location text; may be null.</param>
/// <returns>
/// The normalised location, or null when the value is blank, looks like a date, heading or
/// URL/e-mail, contains a digit, is longer than 80 characters, has more than four
/// comma-separated parts, or has a part that is not made of letters, apostrophes, hyphens,
/// periods and spaces.
/// </returns>
private static string? NormalizeLocationValue(string? value)
{
    var raw = TrimOrNull(value);
    if (raw is null)
    {
        return null;
    }

    if (LooksLikeDateRange(raw) || LooksLikeSectionHeading(raw) || LooksLikeUrlOrEmail(raw))
    {
        return null;
    }

    if (raw.Any(char.IsDigit) || raw.Length > 80)
    {
        return null;
    }

    var collapsed = Regex.Replace(raw, @"\s+", " ").Trim(' ', '|', ';', ':');
    var segments = collapsed.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    if (segments.Length is 0 or > 4)
    {
        return null;
    }

    foreach (var segment in segments)
    {
        // Each part must start with a letter and be at least two characters long.
        if (!Regex.IsMatch(segment, @"^[\p{L}][\p{L}'\-. ]+$"))
        {
            return null;
        }
    }

    return string.Join(", ", segments);
}
/// <summary>
/// Reduces a website value to its lower-case host name, e.g. "https://example.co.uk/about"
/// becomes "example.co.uk".
/// </summary>
/// <param name="value">The raw website text, with or without a scheme; may be null.</param>
/// <returns>
/// The host name, or null when the value is blank, mentions linkedin.com, is not a valid
/// absolute URI, or its host is not a dotted domain name.
/// </returns>
private static string? NormalizeWebsite(string? value)
{
    var raw = TrimOrNull(value);
    if (raw is null)
    {
        return null;
    }

    // LinkedIn links are excluded from the website field.
    if (raw.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    // Bare domains need a scheme before they can be parsed as an absolute URI.
    var absolute = raw.Contains("://", StringComparison.Ordinal) ? raw : $"https://{raw}";
    if (!Uri.TryCreate(absolute, UriKind.Absolute, out var parsed))
    {
        return null;
    }

    var host = parsed.Host.Trim().Trim('.').ToLowerInvariant();
    if (string.IsNullOrWhiteSpace(host))
    {
        return null;
    }

    var isDottedDomain = Regex.IsMatch(host, @"^(?:[a-z0-9-]+\.)+[a-z]{2,}$", RegexOptions.IgnoreCase);
    return isDottedDomain ? host : null;
}
/// <summary>
/// Normalises a LinkedIn profile link to "https://www.linkedin.com/in/&lt;profile&gt;" form,
/// dropping the original scheme, host variant, query string and trailing slash.
/// </summary>
/// <param name="value">The raw link, with or without a scheme; may be null.</param>
/// <returns>
/// The canonical profile URL, or null when the value is blank, is not a valid absolute URI,
/// is not hosted on linkedin.com (or a subdomain of it), or does not have an /in/ or /pub/
/// profile path.
/// </returns>
private static string? NormalizeLinkedIn(string? value)
{
    var trimmed = TrimOrNull(value);
    if (trimmed is null) return null;

    var candidate = trimmed;
    if (!candidate.Contains("://", StringComparison.Ordinal)) candidate = $"https://{candidate}";
    if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri)) return null;

    // Match the host exactly or as a subdomain. A substring test would accept hosts such as
    // "notlinkedin.com" or "linkedin.com.example.org" and rewrite them into a LinkedIn URL.
    var host = uri.Host;
    var isLinkedInHost = host.Equals("linkedin.com", StringComparison.OrdinalIgnoreCase)
        || host.EndsWith(".linkedin.com", StringComparison.OrdinalIgnoreCase);
    if (!isLinkedInHost) return null;

    var path = uri.AbsolutePath.TrimEnd('/');
    if (!Regex.IsMatch(path, @"^/(in|pub)/[^/]+(?:/[^/]+){0,2}$", RegexOptions.IgnoreCase)) return null;

    return $"https://www.linkedin.com{path}";
}
/// <summary>
/// Keeps a date value only when it matches one of the recognised date formats.
/// </summary>
/// <param name="value">The raw date text; may be null.</param>
/// <returns>The cleaned value when <c>LooksLikeDateRange</c> accepts it; otherwise null.</returns>
private static string? NormalizeDateValue(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    return LooksLikeDateRange(candidate) ? candidate : null;
}
/// <summary>
/// Tests whether the whole value is a single date or a "date - date" range.
/// </summary>
/// <remarks>
/// A date is d/m/yyyy, "&lt;month name&gt; yyyy" (full or abbreviated English month), a
/// four-digit year, or the words "Present"/"Current". The range separator may be a hyphen,
/// an en dash or an em dash. Matching is case-insensitive.
/// </remarks>
/// <param name="value">The text to test.</param>
/// <returns>True when the entire value is a date or date range.</returns>
private static bool LooksLikeDateRange(string value)
{
    const string DatePoint = @"(?:\d{1,2}/\d{1,2}/\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|\d{4}|Present|Current)";

    // CVs commonly use an en dash (U+2013) or em dash (U+2014) between dates, not just '-'.
    const string Separator = @"\s*[-\u2013\u2014]\s*";

    const string Pattern = "^" + DatePoint + "(?:" + Separator + DatePoint + ")?$";

    return Regex.IsMatch(value, Pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Tests whether the value contains an e-mail or web-address marker.
/// </summary>
/// <param name="value">The text to test.</param>
/// <returns>
/// True when the value contains '@', or contains "www.", "http://" or "https://" in any casing.
/// </returns>
private static bool LooksLikeUrlOrEmail(string value)
{
    if (value.Contains('@'))
    {
        return true;
    }

    string[] urlMarkers = { "www.", "http://", "https://" };
    foreach (var marker in urlMarkers)
    {
        if (value.Contains(marker, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Tests whether the value is exactly one of the known CV section headings.
/// </summary>
/// <param name="value">The text to test.</param>
/// <returns>True when the whole value equals a known heading, ignoring case.</returns>
private static bool LooksLikeSectionHeading(string value)
{
    string[] knownHeadings =
    {
        "Work Experience",
        "Experience",
        "Employment History",
        "Education",
        "Skills",
        "Languages",
        "Interests",
        "Contact",
        "Professional Summary",
        "Summary",
    };

    foreach (var heading in knownHeadings)
    {
        if (value.Equals(heading, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Heuristic: does the value read like a job title?
/// </summary>
/// <param name="value">The text to test.</param>
/// <returns>
/// False for blank values, date ranges and URLs/e-mails. Otherwise true when the value
/// contains a known role word, or has at most six space-separated words and does not look
/// like a company name.
/// </returns>
private static bool LooksLikeJobTitle(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
    {
        return false;
    }

    var hasRoleWord = Regex.IsMatch(value, @"\b(developer|engineer|manager|lead|architect|consultant|specialist|analyst|administrator|coordinator|director|designer|intern|officer|owner|founder|teacher|researcher|writer|editor|producer|assistant|technician|supervisor|head)\b", RegexOptions.IgnoreCase);
    if (hasRoleWord)
    {
        return true;
    }

    // Fallback: a short phrase counts as a title unless it looks like a company name.
    var wordCount = value.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length;
    return wordCount <= 6 && !LooksLikeCompanyName(value);
}
/// <summary>
/// Heuristic for deciding whether a text fragment reads like an organisation name.
/// Blank text, date ranges and URL/e-mail fragments are rejected outright. Otherwise
/// the text qualifies when it contains an organisation keyword (case-insensitive),
/// an ampersand, or a standalone run of two or more upper-case ASCII letters.
/// </summary>
/// <param name="value">Text fragment to classify.</param>
/// <returns><c>true</c> when the fragment is treated as a company name.</returns>
private static bool LooksLikeCompanyName(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
    {
        return false;
    }

    const string organisationKeywordPattern = @"\b(inc|llc|ltd|limited|plc|corp|corporation|company|group|university|college|council|municipality|kommune|bank|studio|agency|institute|hospital|school|technologies|technology|systems|solutions|consulting|consultants|partners|foundation|ministry|government)\b";
    if (Regex.IsMatch(value, organisationKeywordPattern, RegexOptions.IgnoreCase))
    {
        return true;
    }

    if (value.Contains('&'))
    {
        return true;
    }

    // Case-sensitive on purpose: only genuine upper-case runs such as "IBM" count.
    return Regex.IsMatch(value, @"\b[A-Z]{2,}\b");
}
private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
{
education ??= new StructuredCvEducation();
@@ -207,8 +393,13 @@ public static class StructuredCvProfileJson
/// <summary>
/// Normalizes a language entry in place and returns it. A null entry is replaced
/// by a fresh, empty <c>StructuredCvLanguage</c>.
/// </summary>
/// <param name="language">Entry to normalize; may be null.</param>
/// <returns>The same (or newly created) instance with Name, Level and Notes normalized.</returns>
private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language)
{
    language ??= new StructuredCvLanguage();
    // NOTE(review): the next two assignments are overwritten further down. They look like
    // the pre-change lines of this diff hunk shown next to their replacement; confirm
    // against the real file before relying on them.
    language.Name = TrimOrNull(language.Name);
    language.Level = TrimOrNull(language.Level);
    var originalName = TrimOrNull(language.Name);
    var normalizedName = HumanLanguageCatalog.NormalizeLanguageName(originalName);
    // The level comes from the Level field first and falls back to the name text.
    var normalizedLevel = HumanLanguageCatalog.ExtractLevel(language.Level) ?? HumanLanguageCatalog.ExtractLevel(originalName);
    // The name is kept only when both a normalized name and a level were resolved;
    // otherwise it is cleared.
    language.Name = normalizedName is not null && normalizedLevel is not null ? normalizedName : null;
    language.Level = normalizedLevel;
    language.Notes = TrimOrNull(language.Notes);
    return language;
}
@@ -360,7 +551,13 @@ public static class StructuredCvProfileJson
}
}
return new StructuredCvLanguage { Name = name.NullIfWhitespace(), Level = level, Notes = notes };
var normalizedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(item);
return new StructuredCvLanguage
{
Name = normalizedLevel is not null ? HumanLanguageCatalog.NormalizeLanguageName(name) : null,
Level = normalizedLevel,
Notes = notes,
};
})
.Where(language => !string.IsNullOrWhiteSpace(language.Name))
.ToList();
+8 -2
View File
@@ -25,7 +25,7 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re
## Quickstart (Docker)
This runs: frontend (nginx), backend API, and the AI service.
This runs: frontend (nginx), backend API, the local AI service, and an Ollama container for hybrid CV block classification.
1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`).
@@ -108,9 +108,15 @@ The API calls a local FastAPI service to generate summaries. If its not runni
With Docker (recommended):
```bash
docker compose up --build ai-service
# One command for local Ollama startup + pull + AI-service restart
OLLAMA_MODEL=qwen2.5:7b ./scripts/start-ollama-cv.sh
# Then start the rest of the app if needed
docker compose up --build -d backend frontend
```
The first Ollama startup is usually quick, but the first model pull and first generation can take a while. After the model is cached in the `ollama_data` volume, later restarts are much faster.
Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`).
## Configuration
+5 -1
View File
@@ -52,6 +52,8 @@ AUTH_ADMIN_EMAIL=you@example.com
AUTH_ADMIN_PASSWORD=replace_with_strong_password
APP_PUBLIC_BASE_URL=https://your-domain.example
AI_SERVICE_BASE_URL=http://ai-service:8001
OLLAMA_BASE_URL=http://ollama:11434
OLLAMA_MODEL=qwen2.5:7b
EMAIL_FOLLOWUPREMINDERS_ENABLED=true
EMAIL_FOLLOWUPREMINDERS_UPCOMINGDAYS=2
# Optional backward-compatible alias if older config still references the previous name:
@@ -87,7 +89,8 @@ If this app is going to be a real production service on Ubuntu:
2. Gitea Actions runs tests
3. if green, workflow uploads repo to server
4. `deploy/deploy.sh` links `/opt/job-tracker/shared/.env` into the repo checkout, then runs `docker compose build && docker compose up -d`
5. workflow checks service status after deployment
5. if `OLLAMA_MODEL` is set, the deploy script waits for Ollama, pulls the configured model if missing, then restarts `ai-service` so hybrid CV classification can use it
6. workflow checks service status after deployment
## Post-deploy verification you should also do manually the first time
- confirm reverse proxy routes to the frontend correctly
@@ -96,3 +99,4 @@ If this app is going to be a real production service on Ubuntu:
- confirm AI service container is reachable from backend
- confirm reminder and admin/system pages load
- verify follow-up reminder emails are enabled only when intended and that links open the correct job/tab
+5
View File
@@ -45,6 +45,11 @@ build_with_recovery
# Force recreation so updated port mappings, env vars, and container config always apply on deploy.
compose up -d --force-recreate --remove-orphans
# Optional post-deploy step: runs only when OLLAMA_MODEL is set to a non-empty value.
# NOTE(review): the helper is invoked through a path relative to the current
# directory; confirm this script has already changed to the repository root.
if [ -n "${OLLAMA_MODEL:-}" ]; then
  echo "Post-deploy Ollama warmup enabled for model: ${OLLAMA_MODEL}"
  ./scripts/start-ollama-cv.sh
fi
sleep 5
compose ps
+26
View File
@@ -71,8 +71,13 @@ services:
build:
context: ./tools/summarizer
dockerfile: Dockerfile
environment:
- OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-http://ollama:11434}
- OLLAMA_MODEL=${OLLAMA_MODEL:-qwen2.5:7b}
ports:
- "8001:8001"
depends_on:
- ollama
networks:
- default
- shared_services
@@ -83,8 +88,29 @@ services:
timeout: 10s
retries: 3
# Ollama runtime; ai-service is pointed at it through OLLAMA_BASE_URL.
ollama:
  # NOTE(review): "latest" is a floating tag, so deploys may pick up a different
  # Ollama version over time; consider pinning.
  image: ollama/ollama:latest
  ports:
    # Publishes the Ollama HTTP API on host port 11434.
    # NOTE(review): ai-service reaches Ollama over the compose network
    # (http://ollama:11434), so confirm host publication is actually needed.
    - "11434:11434"
  environment:
    - OLLAMA_HOST=0.0.0.0:11434
  volumes:
    # Named volume mounted at /root/.ollama.
    - ollama_data:/root/.ollama
  networks:
    - default
    - shared_services
  restart: unless-stopped
  # NOTE(review): presumably requires a GPU-enabled Docker runtime on the host;
  # confirm the service still starts on CPU-only machines.
  gpus: all
  healthcheck:
    test: ["CMD", "ollama", "list"]
    interval: 20s
    timeout: 15s
    retries: 10
    start_period: 20s
volumes:
jobtracker_data:
ollama_data:
networks:
shared_services:
+33 -14
View File
@@ -1,8 +1,9 @@
import React, { useCallback, useEffect, useMemo, useRef, useState } from "react";
import { Alert, Avatar, Box, Button, Chip, Divider, FormControl, InputLabel, LinearProgress, MenuItem, Paper, Select, TextField, Typography } from "@mui/material";
import { Accordion, AccordionDetails, AccordionSummary, Alert, Avatar, Box, Button, Chip, Divider, FormControl, InputLabel, LinearProgress, MenuItem, Paper, Select, TextField, Typography } from "@mui/material";
import DeleteOutlineIcon from "@mui/icons-material/DeleteOutline";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import PhotoCameraOutlinedIcon from "@mui/icons-material/PhotoCameraOutlined";
import { api } from "../api";
@@ -399,22 +400,40 @@ export default function ProfilePage() {
>
{reprocessingCv ? t("profileCvReprocessing") : t("profileCvReprocess")}
</Button>
<Button variant="text" disabled={!profileCvText.trim()} onClick={() => navigator.clipboard.writeText(profileCvText)}>
{t("profileCopyCvText")}
</Button>
</Box>
</Box>
{uploadingCv ? <LinearProgress sx={{ mb: 1.5 }} /> : null}
<TextField
label={t("profileCvTextLabel")}
value={profileCvText}
onChange={(e) => setProfileCvText(e.target.value)}
helperText={t("profileCvTextHelp")}
multiline
minRows={12}
disabled={!isLocal}
fullWidth
/>
<Alert severity="info" sx={{ mb: 2, borderRadius: 2.5 }}>
{t("profileCvStructuredDefaultHint")}
</Alert>
<Accordion disableGutters elevation={0} sx={{ mb: 2, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper", "&:before": { display: "none" } }}>
<AccordionSummary expandIcon={<ExpandMoreIcon />}>
<Box sx={{ display: "flex", justifyContent: "space-between", gap: 1.5, alignItems: "center", width: "100%", pr: 1 }}>
<Box>
<Typography variant="subtitle1" sx={{ fontWeight: 800 }}>{t("profileCvRawPanelTitle")}</Typography>
<Typography variant="body2" sx={{ color: "text.secondary" }}>{t("profileCvRawPanelHelp")}</Typography>
</Box>
<Chip size="small" label={t("profileCvSectionWordCount", { count: cvWordCount })} />
</Box>
</AccordionSummary>
<AccordionDetails>
<TextField
label={t("profileCvTextLabel")}
value={profileCvText}
onChange={(e) => setProfileCvText(e.target.value)}
helperText={t("profileCvTextHelp")}
multiline
minRows={12}
disabled={!isLocal}
fullWidth
/>
<Box sx={{ mt: 1.5, display: "flex", justifyContent: "flex-end" }}>
<Button variant="text" disabled={!profileCvText.trim()} onClick={() => navigator.clipboard.writeText(profileCvText)}>
{t("profileCopyCvText")}
</Button>
</Box>
</AccordionDetails>
</Accordion>
<Box sx={{ mt: 2, p: 1.5, borderRadius: 3, border: "1px solid", borderColor: "divider", backgroundColor: "background.paper" }}>
<Box sx={{ display: "flex", justifyContent: "space-between", gap: 2, flexWrap: "wrap", alignItems: "center", mb: 1.5 }}>
<Box>
+7
View File
@@ -147,10 +147,17 @@ test('profile page loads persisted structured cv and can re-parse it', async ()
expect(screen.getByText(/extraction history/i)).toBeInTheDocument();
expect(screen.getByText(/resume.pdf/i)).toBeInTheDocument();
expect(screen.getByText(/current run/i)).toBeInTheDocument();
expect(screen.getAllByText(/original extraction/i).length).toBeGreaterThan(0);
const originalExtractionToggle = screen.getByRole('button', { name: /original extraction/i });
expect(originalExtractionToggle).toHaveAttribute('aria-expanded', 'false');
expect(screen.getAllByText(/professional summary/i).length).toBeGreaterThan(0);
expect(screen.getByLabelText(/full name/i)).toHaveValue('Demo User');
expect(screen.getByText(/high 92%/i)).toBeInTheDocument();
fireEvent.click(originalExtractionToggle);
expect(originalExtractionToggle).toHaveAttribute('aria-expanded', 'true');
expect(await screen.findByLabelText(/profile cv \/ master resume text/i)).toHaveValue('Professional Summary\nBuilt backend systems');
const analyzeButton = screen.getByRole('button', { name: /analyze sections/i });
await waitFor(() => expect(analyzeButton).toBeEnabled());
fireEvent.click(analyzeButton);
+79
View File
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")/.."
MODEL="${OLLAMA_MODEL:-qwen2.5:7b}"
OLLAMA_WAIT_SECONDS="${OLLAMA_WAIT_SECONDS:-180}"
PULL_WAIT_SECONDS="${OLLAMA_PULL_WAIT_SECONDS:-1800}"
# Thin wrapper that forwards all arguments unchanged to `docker compose`.
compose() {
  docker compose "$@"
}
# Polls the ollama container every 3 seconds until `ollama list` succeeds inside it.
# Returns 0 as soon as it does, or 1 once OLLAMA_WAIT_SECONDS have elapsed.
wait_for_ollama() {
  local give_up_at=$((SECONDS + OLLAMA_WAIT_SECONDS))

  until [ "$SECONDS" -ge "$give_up_at" ]; do
    compose exec -T ollama ollama list >/dev/null 2>&1 && return 0
    sleep 3
  done

  return 1
}
# Succeeds when $MODEL appears in the first column of `ollama list` output
# (header row skipped) inside the ollama container.
#
# Ollama lists every model as name:tag and stores an untagged name under the
# implicit "latest" tag. An untagged $MODEL (e.g. "llama3") is therefore compared
# as "llama3:latest"; without this the check never matches after a successful
# pull and the caller waits until its timeout.
model_present() {
  local wanted="$MODEL"

  # Inspect only the last path segment so a registry host with a port
  # (host:5000/model) is not mistaken for a tagged name.
  case "${wanted##*/}" in
    *:*) ;;
    *) wanted="${wanted}:latest" ;;
  esac

  # grep reads all input (no -q) so upstream commands are not killed by SIGPIPE,
  # which would turn a match into a failure under `set -o pipefail`.
  compose exec -T ollama ollama list 2>/dev/null | awk 'NR>1 {print $1}' | grep -Fx -- "$wanted" >/dev/null 2>&1
}
# Polls every 5 seconds until model_present succeeds.
# Returns 0 as soon as it does, or 1 once PULL_WAIT_SECONDS have elapsed.
wait_for_model() {
  local give_up_at=$((SECONDS + PULL_WAIT_SECONDS))

  until [ "$SECONDS" -ge "$give_up_at" ]; do
    model_present && return 0
    sleep 5
  done

  return 1
}
# --- Main flow -------------------------------------------------------------
# 1) Bring up the ollama container and wait until it answers `ollama list`.
echo "Starting Ollama service..."
compose up -d ollama
if ! wait_for_ollama; then
  echo "Ollama did not become ready within ${OLLAMA_WAIT_SECONDS}s."
  # Log output is best-effort diagnostics; a failure here must not mask the real error.
  compose logs --tail=200 ollama || true
  exit 1
fi
echo "Ollama is responding."

# 2) Pull the model only when it is not already listed.
if model_present; then
  echo "Model already present: $MODEL"
else
  echo "Pulling Ollama model: $MODEL"
  compose exec -T ollama ollama pull "$MODEL" || {
    echo "Model pull command failed."
    compose logs --tail=200 ollama || true
    exit 1
  }
fi

# 3) Confirm the model is actually listed before continuing.
if ! wait_for_model; then
  echo "Model ${MODEL} did not appear within ${PULL_WAIT_SECONDS}s."
  compose exec -T ollama ollama list || true
  exit 1
fi
echo "Ollama model ready: $MODEL"

# 4) Make sure ai-service is up.
# NOTE(review): `compose up -d` presumably leaves an already-running, unchanged
# container untouched, so this may not be a restart as the message says; confirm
# whether a real restart is required.
echo "Restarting AI service so it can use the ready Ollama model."
compose up -d ai-service
# Only the first reported state line is compared (case-insensitively) with "running".
if ! compose ps ai-service --format '{{.State}}' 2>/dev/null | head -n 1 | tr '[:upper:]' '[:lower:]' | grep -qx 'running'; then
  echo "AI service is not running after Ollama warmup."
  compose logs --tail=200 ai-service || true
  exit 1
fi
echo "Ollama warmup complete."
+25 -2
View File
@@ -8,6 +8,7 @@ This service runs a local Hugging Face summarization model and also exposes docu
- OCR fallback for scanned PDFs
- OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`)
- DOCX / TXT / MD extraction
- optional Ollama-backed CV block classification for harder sectioning
## Install
@@ -36,8 +37,30 @@ The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can b
- `GET /health` — health check and runtime capabilities
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`
- `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata
- `POST /cv/classify-block` — JSON body `{ "block": "..." }`, uses Ollama when `OLLAMA_MODEL` is configured
## Notes
- Model weights are downloaded on first run.
## Ollama
Set these before starting the service if you want the hybrid CV classifier enabled:
```bash
export OLLAMA_BASE_URL=http://ollama:11434
export OLLAMA_MODEL=qwen2.5:7b
```
Choose the model by setting `OLLAMA_MODEL` and then warming it with the helper script:
```bash
OLLAMA_MODEL=qwen2.5:7b ./scripts/start-ollama-cv.sh
```
Equivalent manual flow:
```bash
docker compose up -d ollama
docker compose exec ollama ollama pull qwen2.5:7b
docker compose up -d ai-service
```
- Model weights are downloaded on first pull.
- OCR quality depends on scan quality and language support.
- Default OCR language is English (`eng`).
+130
View File
@@ -8,9 +8,13 @@ from docx import Document
import fitz
import hashlib
import io
import json
import os
import re
import torch
import pytesseract
from urllib import request as urllib_request
from urllib.error import URLError, HTTPError
app = FastAPI(title="Local AI Service")
@@ -20,6 +24,8 @@ MAX_CONTEXT_CHARS = 2200
MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
OCR_LANGUAGES = "eng"
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "")
def _load_runtime():
@@ -44,11 +50,47 @@ class SummarizeRequest(BaseModel):
top_skills: int = Field(default=8, ge=3, le=12)
class CvClassifyBlockRequest(BaseModel):
    """Request body for ``POST /cv/classify-block``: one block of CV text."""

    # Raw text of the block to classify; must be 1 to 6000 characters long.
    block: str = Field(min_length=1, max_length=6000)
def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
h = hashlib.sha256(text.encode("utf-8")).hexdigest()
return f"{h}:{max_length}:{min_length}:{top_skills}"
def _ollama_status():
configured = bool(OLLAMA_MODEL)
if not configured:
return {
"ollama_configured": False,
"ollama_reachable": False,
"ollama_model": None,
"ollama_model_available": False,
}
req = urllib_request.Request(f"{OLLAMA_BASE_URL}/api/tags", method="GET")
try:
with urllib_request.urlopen(req, timeout=5) as response:
body = json.loads(response.read().decode("utf-8"))
except Exception:
return {
"ollama_configured": True,
"ollama_reachable": False,
"ollama_model": OLLAMA_MODEL,
"ollama_model_available": False,
}
models = body.get("models") or []
names = {item.get("name") for item in models if isinstance(item, dict)}
return {
"ollama_configured": True,
"ollama_reachable": True,
"ollama_model": OLLAMA_MODEL,
"ollama_model_available": OLLAMA_MODEL in names,
}
@app.get("/health")
async def health():
return {
@@ -59,6 +101,7 @@ async def health():
"gpu_name": GPU_NAME,
"ocr_available": True,
"ocr_languages": OCR_LANGUAGES,
**_ollama_status(),
}
@@ -272,6 +315,93 @@ def _model_summarize(text: str, max_length: int, min_length: int) -> str:
return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
def _ollama_generate_json(prompt: str):
if not OLLAMA_MODEL:
raise HTTPException(status_code=503, detail="OLLAMA_MODEL is not configured.")
payload = json.dumps({
"model": OLLAMA_MODEL,
"prompt": prompt,
"stream": False,
"format": "json",
"options": {"temperature": 0.1}
}).encode("utf-8")
req = urllib_request.Request(
f"{OLLAMA_BASE_URL}/api/generate",
data=payload,
headers={"Content-Type": "application/json"},
method="POST",
)
try:
with urllib_request.urlopen(req, timeout=30) as response:
body = json.loads(response.read().decode("utf-8"))
except HTTPError as ex:
raise HTTPException(status_code=502, detail=f"Ollama request failed with {ex.code}.")
except URLError as ex:
raise HTTPException(status_code=503, detail=f"Ollama is unreachable: {ex.reason}.")
raw = (body.get("response") or "").strip()
if not raw:
raise HTTPException(status_code=502, detail="Ollama returned an empty response.")
try:
return json.loads(raw)
except json.JSONDecodeError:
start = raw.find("{")
end = raw.rfind("}")
if start >= 0 and end > start:
return json.loads(raw[start:end + 1])
raise HTTPException(status_code=502, detail="Ollama did not return valid JSON.")
@app.post("/cv/classify-block")
async def classify_cv_block(req: CvClassifyBlockRequest):
    """Classify one CV text block into a section plus optional work-experience fields.

    Builds a fixed instruction prompt around ``req.block``, sends it to Ollama via
    ``_ollama_generate_json`` and returns a dict with the keys ``section``,
    ``confidence``, ``reason``, ``title``, ``company``, ``location``, ``start``,
    ``end`` and ``bullets``. A missing or empty section becomes ``"Other"``;
    ``bullets`` is always a list of strings.

    Raises:
        HTTPException: propagated from ``_ollama_generate_json``, or 502 when the
            model's reply is valid JSON but not an object.
    """
    # Local import keeps this block self-contained.
    import asyncio

    prompt = f"""
You classify one CV text block into structured JSON.
Return ONLY valid JSON with this exact shape:
{{
"section": "Contact|Professional Summary|Work Experience|Education|Skills|Languages|Interests|Other",
"confidence": 0.0,
"reason": "short reason",
"title": string|null,
"company": string|null,
"location": string|null,
"start": string|null,
"end": string|null,
"bullets": string[]
}}
Rules:
- Preserve facts only.
- section must be one of the listed values.
- Use Work Experience only for job/employment blocks.
- For Contact blocks, keep title/company/start/end null and bullets empty.
- For non-work blocks, title/company/start/end should usually be null.
- location must look like a place, not a sentence.
- dates must be one of: year, month+year, dd/mm/yyyy, Present, Current.
- bullets should only be job tasks/achievements, not titles, companies, dates, or headings.
- If unsure, choose Other and keep fields null/empty.
Block:
{req.block.strip()}
""".strip()

    # The Ollama call is blocking urllib I/O with a 30 second timeout. Running it
    # directly in this coroutine would stall the event loop, and every other
    # request with it, so it is handed to the default thread pool.
    loop = asyncio.get_running_loop()
    parsed = await loop.run_in_executor(None, _ollama_generate_json, prompt)

    # Valid JSON is not necessarily an object; a list or scalar has no .get().
    if not isinstance(parsed, dict):
        raise HTTPException(status_code=502, detail="Ollama did not return a JSON object.")

    # Keep the documented string[] shape even when the model returns something else.
    bullets = parsed.get("bullets")
    if isinstance(bullets, str):
        bullets = [bullets] if bullets.strip() else []
    elif isinstance(bullets, list):
        bullets = [item for item in bullets if isinstance(item, str)]
    else:
        bullets = []

    return {
        "section": parsed.get("section") or "Other",
        "confidence": parsed.get("confidence"),
        "reason": parsed.get("reason"),
        "title": parsed.get("title"),
        "company": parsed.get("company"),
        "location": parsed.get("location"),
        "start": parsed.get("start"),
        "end": parsed.get("end"),
        "bullets": bullets,
    }
@app.post("/summarize")
async def summarize(req: SummarizeRequest):
if req.min_length >= req.max_length: