Improve CV parsing and profile editor flow
This commit is contained in:
@@ -0,0 +1,162 @@
|
||||
using System.Globalization;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace JobTrackerApi.Models;
|
||||
|
||||
/// <summary>
/// Recognizes human (spoken) language names and proficiency levels in free-form text such as CV lines.
/// </summary>
/// <remarks>
/// Language names are resolved through a lookup that is built once, at type initialization, from the
/// English and native names of the cultures returned by <see cref="CultureInfo.GetCultures(CultureTypes)"/>,
/// plus a handful of hand-written aliases.
/// NOTE(review): the culture list depends on the globalization data available at runtime; presumably very
/// few names are recognized when the process runs in invariant-globalization mode — confirm for the
/// deployment target.
/// </remarks>
public static class HumanLanguageCatalog
{
    // Longest language name, counted in words, that ExtractLanguageNames tries to match.
    private const int MaxPhraseWords = 4;

    // Static field initializers run in textual order and BuildLanguageLookup() uses the two helper
    // regexes below, so they must stay declared before LanguageLookup.
    private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);

    private static readonly Regex NonLetterRegex = new(@"[^\p{L}]+", RegexOptions.Compiled);

    private static readonly Regex WordRegex = new(@"\p{L}+", RegexOptions.Compiled);

    private static readonly Regex CefrLevelRegex = new(
        @"^[ABC][12](?:\s*/\s*[ABC][12])?$",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Alternation is ordered: the first alternative that matches at a position wins. The CEFR ranges
    // ("b1/b2") therefore have to come BEFORE the single codes ("b1"), otherwise the single code always
    // matches first and the range is silently truncated.
    private static readonly Regex LevelRegex = new(
        @"\b(native(?:\s+speaker)?|fluent|advanced|intermediate|beginner|basic|conversational|elementary|professional\s+working\s+proficiency|working\s+proficiency|limited\s+working\s+proficiency|full\s+professional\s+proficiency|a1\s*/\s*a2|a2\s*/\s*b1|b1\s*/\s*b2|b2\s*/\s*c1|c1\s*/\s*c2|a1|a2|b1|b2|c1|c2)\b",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Dictionary<string, string> LanguageLookup = BuildLanguageLookup();

    /// <summary>
    /// Resolves <paramref name="raw"/> to a single canonical language name.
    /// </summary>
    /// <param name="raw">Free-form text expected to mention one language.</param>
    /// <returns>
    /// The canonical name when exactly one distinct language is recognized; <c>null</c> when none or
    /// more than one is found.
    /// </returns>
    public static string? NormalizeLanguageName(string? raw)
    {
        var matches = ExtractLanguageNames(raw);
        return matches.Count == 1 ? matches[0] : null;
    }

    /// <summary>
    /// Finds every known language name mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <param name="raw">Free-form text; may be <c>null</c> or blank.</param>
    /// <returns>
    /// Canonical names in order of first appearance, without duplicates (case-insensitive). Empty when
    /// nothing is recognized.
    /// </returns>
    public static IReadOnlyList<string> ExtractLanguageNames(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            return Array.Empty<string>();
        }

        // Runs of letters only; digits, punctuation and whitespace act as separators.
        var words = WordRegex.Matches(raw).Select(match => match.Value).ToArray();
        if (words.Length == 0)
        {
            return Array.Empty<string>();
        }

        // Longest phrases are tried first so a multi-word name claims its words before any shorter
        // name contained in it can match the same span.
        var matches = new List<(int Start, int Size, string Canonical)>();
        for (var size = Math.Min(MaxPhraseWords, words.Length); size >= 1; size--)
        {
            for (var start = 0; start <= words.Length - size; start++)
            {
                var phrase = string.Join(" ", words, start, size);
                if (!LanguageLookup.TryGetValue(NormalizeKey(phrase), out var canonical))
                {
                    continue;
                }

                if (matches.Any(existing => RangesOverlap(existing.Start, existing.Size, start, size)))
                {
                    continue;
                }

                matches.Add((start, size, canonical));
            }
        }

        return matches
            .OrderBy(match => match.Start)
            .Select(match => match.Canonical)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>
    /// Tells whether <paramref name="raw"/> contains a proficiency level that <see cref="ExtractLevel"/> recognizes.
    /// </summary>
    public static bool HasRecognizedLevel(string? raw)
    {
        return ExtractLevel(raw) is not null;
    }

    /// <summary>
    /// Extracts the first proficiency level mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <param name="raw">Free-form text; may be <c>null</c> or blank.</param>
    /// <returns>
    /// A display label ("Native", "Fluent", "Limited working proficiency", ...) or an upper-cased CEFR
    /// code without spaces ("B2", "B1/B2"); <c>null</c> when no level is found.
    /// </returns>
    public static string? ExtractLevel(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            return null;
        }

        var match = LevelRegex.Match(raw);
        if (!match.Success)
        {
            return null;
        }

        // Collapse inner whitespace so "working   proficiency" hits the same switch arm as the tidy form.
        var compact = WhitespaceRegex.Replace(match.Groups[1].Value.Trim(), " ");
        return compact.ToLowerInvariant() switch
        {
            "native speaker" or "native" => "Native",
            "fluent" => "Fluent",
            "advanced" => "Advanced",
            "intermediate" => "Intermediate",
            "beginner" => "Beginner",
            "basic" => "Basic",
            "conversational" => "Conversational",
            "elementary" => "Elementary",
            "professional working proficiency" => "Professional working proficiency",
            "working proficiency" => "Working proficiency",
            "limited working proficiency" => "Limited working proficiency",
            "full professional proficiency" => "Full professional proficiency",
            _ when CefrLevelRegex.IsMatch(compact) => compact.ToUpperInvariant().Replace(" ", string.Empty),
            _ => compact,
        };
    }

    /// <summary>Half-open interval overlap test for [start, start + size) word ranges.</summary>
    private static bool RangesOverlap(int startA, int sizeA, int startB, int sizeB)
    {
        var endA = startA + sizeA;
        var endB = startB + sizeB;
        return startA < endB && startB < endA;
    }

    /// <summary>
    /// Builds the alias-to-canonical-name lookup. Keys are produced by <see cref="NormalizeKey"/>; the
    /// first registration of a key wins, so culture-derived names take precedence over the manual aliases.
    /// </summary>
    private static Dictionary<string, string> BuildLanguageLookup()
    {
        var map = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        void Add(string? alias, string? canonical)
        {
            var normalizedAlias = NormalizeKey(alias);
            var normalizedCanonical = NormalizeDisplayName(canonical);
            if (string.IsNullOrWhiteSpace(normalizedAlias) || string.IsNullOrWhiteSpace(normalizedCanonical))
            {
                return;
            }

            map.TryAdd(normalizedAlias, normalizedCanonical);
        }

        foreach (var culture in CultureInfo.GetCultures(CultureTypes.NeutralCultures | CultureTypes.SpecificCultures))
        {
            // The invariant culture has an empty name and is reported as "Invariant Language (...)",
            // which is not a human language and must not become a recognizable name.
            if (culture.Name.Length == 0)
            {
                continue;
            }

            var english = CleanCultureLanguageName(culture.EnglishName);
            var native = CleanCultureLanguageName(culture.NativeName);
            Add(english, english);
            Add(native, english);
        }

        Add("norsk", "Norwegian");
        Add("bokmal", "Norwegian");
        Add("bokmål", "Norwegian");
        Add("nynorsk", "Norwegian");
        Add("mandarin", "Chinese");
        Add("cantonese", "Chinese");
        Add("farsi", "Persian");
        Add("persian", "Persian");

        return map;
    }

    /// <summary>
    /// Reduces a culture display name such as "Chinese (Simplified, China)" to its language part by
    /// cutting at the first '(' and then at the first ','.
    /// </summary>
    private static string? CleanCultureLanguageName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return null;
        }

        var cleaned = value.Trim();

        // "> 0" (not ">= 0"): a name that starts with the separator is left untouched.
        var parenIndex = cleaned.IndexOf('(');
        if (parenIndex > 0)
        {
            cleaned = cleaned[..parenIndex].Trim();
        }

        var commaIndex = cleaned.IndexOf(',');
        if (commaIndex > 0)
        {
            cleaned = cleaned[..commaIndex].Trim();
        }

        return NormalizeDisplayName(cleaned);
    }

    /// <summary>
    /// Produces the display form of a name: single spaces, each word capitalized and the rest lower-cased.
    /// Words of up to three characters that are entirely upper-case are kept as they are.
    /// </summary>
    private static string? NormalizeDisplayName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return null;
        }

        var cleaned = WhitespaceRegex.Replace(value.Trim(), " ");
        return string.Join(" ", cleaned.Split(' ', StringSplitOptions.RemoveEmptyEntries)
            .Select(word => word.Length <= 3 && word.All(char.IsUpper)
                ? word
                : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant()));
    }

    /// <summary>
    /// Produces the lookup key for a name: diacritics removed, lower-cased, and every run of non-letter
    /// characters replaced by a single space.
    /// </summary>
    private static string NormalizeKey(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // Decompose so that accents become separate non-spacing marks which can then be dropped.
        var decomposed = value.Trim().Normalize(NormalizationForm.FormD);
        var builder = new StringBuilder(decomposed.Length);
        foreach (var ch in decomposed)
        {
            if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
            {
                continue;
            }

            builder.Append(char.ToLowerInvariant(ch));
        }

        return NonLetterRegex.Replace(builder.ToString().Normalize(NormalizationForm.FormC), " ").Trim();
    }
}
|
||||
@@ -144,7 +144,7 @@ public static class StructuredCvProfileJson
|
||||
profile.Version = string.IsNullOrWhiteSpace(profile.Version) ? "1" : profile.Version.Trim();
|
||||
profile.Metadata ??= new StructuredCvMetadata();
|
||||
profile.Metadata.Fields ??= new Dictionary<string, StructuredCvFieldMetadata>();
|
||||
profile.Contact ??= new StructuredCvContact();
|
||||
profile.Contact = NormalizeContact(profile.Contact);
|
||||
profile.Summary = CleanList(profile.Summary);
|
||||
profile.Jobs = (profile.Jobs ?? new List<StructuredCvJob>())
|
||||
.Select(NormalizeJob)
|
||||
@@ -178,20 +178,206 @@ public static class StructuredCvProfileJson
|
||||
return profile;
|
||||
}
|
||||
|
||||
/// <summary>
/// Cleans every field of the contact section in place and returns the same instance.
/// A <c>null</c> contact is replaced by a new, empty one.
/// </summary>
/// <param name="contact">The contact section to clean; may be <c>null</c>.</param>
/// <returns>The cleaned contact, never <c>null</c>.</returns>
private static StructuredCvContact NormalizeContact(StructuredCvContact? contact)
{
    var result = contact ?? new StructuredCvContact();

    // Plain text fields only go through TrimOrNull.
    result.FullName = TrimOrNull(result.FullName);
    result.Headline = TrimOrNull(result.Headline);
    result.Email = TrimOrNull(result.Email);
    result.Phone = TrimOrNull(result.Phone);

    // Structured fields have dedicated normalizers that may reject the value entirely.
    result.Location = NormalizeLocationValue(result.Location);
    result.Website = NormalizeWebsite(result.Website);
    result.LinkedIn = NormalizeLinkedIn(result.LinkedIn);

    return result;
}
|
||||
|
||||
/// <summary>
/// Cleans one job entry in place: trims its fields, untangles title and company when they were
/// parsed into the wrong slot, validates location and dates, and filters the bullet list.
/// </summary>
/// <param name="job">The job entry to clean; a <c>null</c> entry is replaced by a new, empty one.</param>
/// <returns>The cleaned job entry, never <c>null</c>.</returns>
private static StructuredCvJob NormalizeJob(StructuredCvJob? job)
{
    var entry = job ?? new StructuredCvJob();

    entry.Title = TrimOrNull(entry.Title);
    entry.Company = TrimOrNull(entry.Company);
    entry.Location = TrimOrNull(entry.Location);
    entry.Start = TrimOrNull(entry.Start);
    entry.End = TrimOrNull(entry.End);
    entry.Bullets = CleanList(entry.Bullets);

    var role = NormalizeJobTitle(entry.Title);
    var employer = NormalizeCompanyName(entry.Company);
    var place = NormalizeLocationValue(entry.Location);

    // "Developer at Acme" with no separate company: split the title on " at ".
    if (employer is null && !string.IsNullOrWhiteSpace(role))
    {
        var atSplit = Regex.Match(role, @"^(?<title>.+?)\s+at\s+(?<company>.+)$", RegexOptions.IgnoreCase);
        if (atSplit.Success)
        {
            role = NormalizeJobTitle(atSplit.Groups["title"].Value);
            employer = NormalizeCompanyName(atSplit.Groups["company"].Value);
        }
    }

    // Both present but each clearly looks like the other kind of value: swap them.
    if (!string.IsNullOrWhiteSpace(role) && !string.IsNullOrWhiteSpace(employer))
    {
        var roleIsReallyCompany = LooksLikeCompanyName(role) && !LooksLikeJobTitle(role);
        var employerIsReallyTitle = LooksLikeJobTitle(employer) && !LooksLikeCompanyName(employer);
        if (roleIsReallyCompany && employerIsReallyTitle)
        {
            var previousRole = role;
            role = employer;
            employer = previousRole;
        }
    }

    // A "title" that only looks like a company moves to the company slot when that slot is free;
    // either way it is no longer kept as a title.
    if (!string.IsNullOrWhiteSpace(role) && !LooksLikeJobTitle(role) && LooksLikeCompanyName(role))
    {
        employer ??= role;
        role = null;
    }

    // The mirror case: a "company" that only looks like a title fills an empty title slot.
    if (role is null
        && !string.IsNullOrWhiteSpace(employer)
        && !LooksLikeCompanyName(employer)
        && LooksLikeJobTitle(employer))
    {
        role = employer;
        employer = null;
    }

    entry.Title = role;
    entry.Company = employer;
    entry.Location = place;
    entry.Start = NormalizeDateValue(entry.Start);
    entry.End = NormalizeDateValue(entry.End);

    // Keep only bullets that survive marker stripping and the usefulness filter.
    var keptBullets = new List<string>();
    foreach (var rawBullet in CleanList(entry.Bullets))
    {
        var bullet = NormalizeBullet(rawBullet);
        if (bullet is not null && IsUsefulJobBullet(bullet, entry.Title, entry.Company))
        {
            keptBullets.Add(bullet);
        }
    }

    entry.Bullets = keptBullets;
    entry.Skills = CleanList(entry.Skills);

    // An end value of "present"/"current" marks the job as ongoing even if the flag was not set.
    var endSaysOngoing = string.Equals(entry.End, "present", StringComparison.OrdinalIgnoreCase)
        || string.Equals(entry.End, "current", StringComparison.OrdinalIgnoreCase);
    entry.IsCurrent = entry.IsCurrent || endSaysOngoing;

    return entry;
}
|
||||
|
||||
/// <summary>
/// Strips surrounding whitespace and any leading bullet markers ('-', '•', '*') from a bullet line.
/// </summary>
/// <param name="value">The raw bullet text; may be <c>null</c> or blank.</param>
/// <returns>
/// The bullet text without its leading markers, or <c>null</c> when the input is blank or consists
/// only of markers (so callers never receive an empty string).
/// </returns>
private static string? NormalizeBullet(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    var trimmed = value.Trim();

    // Skip markers and ANY whitespace between or after them, so "-\tText" loses the tab as well
    // as the dash.
    var start = 0;
    while (start < trimmed.Length
        && (trimmed[start] is '-' or '•' or '*' || char.IsWhiteSpace(trimmed[start])))
    {
        start++;
    }

    // Nothing but markers: report "no bullet" instead of an empty string.
    return start == trimmed.Length ? null : trimmed[start..];
}
|
||||
|
||||
/// <summary>
/// Decides whether a bullet carries real content for a job entry.
/// </summary>
/// <param name="value">The bullet text.</param>
/// <param name="title">The job's title, used to drop bullets that merely repeat it.</param>
/// <param name="company">The job's company, used to drop bullets that merely repeat it.</param>
/// <returns>
/// <c>false</c> for blank text, date ranges, section headings, "Skills:" lines, repeats of the title
/// or company (case-insensitive), and single words shorter than 12 characters; <c>true</c> otherwise.
/// </returns>
private static bool IsUsefulJobBullet(string? value, string? title, string? company)
{
    var text = TrimOrNull(value);
    if (text is null)
    {
        return false;
    }

    var isStructuralNoise = LooksLikeDateRange(text)
        || LooksLikeSectionHeading(text)
        || text.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase);
    var repeatsTitle = title is not null && text.Equals(title, StringComparison.OrdinalIgnoreCase);
    var repeatsCompany = company is not null && text.Equals(company, StringComparison.OrdinalIgnoreCase);

    // A lone short token is treated as parsing debris rather than a real bullet.
    var isLoneShortWord = text.Length < 12 && !text.Contains(' ');

    return !(isStructuralNoise || repeatsTitle || repeatsCompany || isLoneShortWord);
}
|
||||
|
||||
/// <summary>
/// Cleans a job title candidate.
/// </summary>
/// <param name="value">The raw title; may be <c>null</c>.</param>
/// <returns>
/// The title with whitespace collapsed and surrounding separator characters removed, or <c>null</c>
/// when the value is empty, a date range, a section heading, or contains a URL/e-mail marker.
/// </returns>
private static string? NormalizeJobTitle(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    var isNotATitle = LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate);
    if (isNotATitle)
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");

    // Separators left over from lines such as "Developer | Acme" or "Title: ...".
    var stripped = collapsed.Trim(' ', '|', ',', '-', ':');
    if (string.IsNullOrWhiteSpace(stripped))
    {
        return null;
    }

    return stripped;
}
|
||||
|
||||
/// <summary>
/// Cleans a company name candidate.
/// </summary>
/// <param name="value">The raw company text; may be <c>null</c>.</param>
/// <returns>
/// The name with whitespace collapsed and surrounding separator characters removed, or <c>null</c>
/// when the value is empty, a date range, a section heading, contains a URL/e-mail marker, starts
/// with "Skills:", or contains both a '.' and a space.
/// </returns>
private static string? NormalizeCompanyName(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    if (LooksLikeDateRange(candidate) || LooksLikeSectionHeading(candidate) || LooksLikeUrlOrEmail(candidate))
    {
        return null;
    }

    var isSkillsLine = candidate.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase);

    // A dot together with a space is treated as sentence-like text rather than a name.
    // NOTE(review): this also rejects names such as "Acme Inc." — confirm that is intended.
    var looksLikeSentence = candidate.Contains('.') && candidate.Contains(' ');
    if (isSkillsLine || looksLikeSentence)
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var stripped = collapsed.Trim(' ', '|', ',', '-', ':');
    if (string.IsNullOrWhiteSpace(stripped))
    {
        return null;
    }

    return stripped;
}
|
||||
|
||||
/// <summary>
/// Validates and tidies a location such as "Oslo, Norway".
/// </summary>
/// <param name="value">The raw location text; may be <c>null</c>.</param>
/// <returns>
/// The comma-separated parts re-joined with ", ", or <c>null</c> when the value is empty, looks like
/// a date range, section heading, URL or e-mail, contains a digit, is longer than 80 characters, has
/// more than four parts, or has a part that is not made of letters, apostrophes, hyphens, dots and spaces.
/// </returns>
private static string? NormalizeLocationValue(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    if (LooksLikeDateRange(candidate) || LooksLikeSectionHeading(candidate) || LooksLikeUrlOrEmail(candidate))
    {
        return null;
    }

    // Digits (street numbers, postal codes, phone numbers) and very long text are rejected outright.
    if (candidate.Length > 80 || candidate.Any(char.IsDigit))
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ").Trim(' ', '|', ';', ':');
    var parts = collapsed.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    if (parts.Length is 0 or > 4)
    {
        return null;
    }

    foreach (var part in parts)
    {
        // Each part must start with a letter and be at least two characters long.
        if (!Regex.IsMatch(part, @"^[\p{L}][\p{L}'’\-. ]+$"))
        {
            return null;
        }
    }

    return string.Join(", ", parts);
}
|
||||
|
||||
/// <summary>
/// Reduces a personal website value to its lower-cased host name.
/// </summary>
/// <param name="value">The raw website text, with or without a scheme; may be <c>null</c>.</param>
/// <returns>
/// The host (for example "example.com"), or <c>null</c> when the value is empty, mentions
/// "linkedin.com", cannot be parsed as an absolute URI, or its host is not a dotted name ending in
/// an alphabetic top-level label.
/// </returns>
private static string? NormalizeWebsite(string? value)
{
    var text = TrimOrNull(value);
    if (text is null)
    {
        return null;
    }

    // LinkedIn links are excluded here; they have their own field and normalizer.
    if (text.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    // Uri.TryCreate needs a scheme to treat the text as absolute, so assume https when none is given.
    var candidate = text.Contains("://", StringComparison.Ordinal) ? text : $"https://{text}";
    if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri))
    {
        return null;
    }

    var host = uri.Host.Trim().Trim('.').ToLowerInvariant();
    if (string.IsNullOrWhiteSpace(host))
    {
        return null;
    }

    var isDottedHostName = Regex.IsMatch(host, @"^(?:[a-z0-9-]+\.)+[a-z]{2,}$", RegexOptions.IgnoreCase);
    return isDottedHostName ? host : null;
}
|
||||
|
||||
/// <summary>
/// Normalizes a LinkedIn profile link to the form <c>https://www.linkedin.com/in/...</c>.
/// </summary>
/// <param name="value">The raw link, with or without a scheme; may be <c>null</c>.</param>
/// <returns>
/// The canonical profile URL, or <c>null</c> when the value is empty, is not an absolute URI, is not
/// hosted on linkedin.com (or one of its subdomains), or its path is not an <c>/in/</c> or
/// <c>/pub/</c> profile path.
/// </returns>
private static string? NormalizeLinkedIn(string? value)
{
    var trimmed = TrimOrNull(value);
    if (trimmed is null)
    {
        return null;
    }

    // Uri.TryCreate needs a scheme to treat the text as absolute, so assume https when none is given.
    var candidate = trimmed.Contains("://", StringComparison.Ordinal) ? trimmed : $"https://{trimmed}";
    if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri))
    {
        return null;
    }

    // Match the host exactly or as a subdomain. A plain substring test would also accept hosts such
    // as "notlinkedin.com" or "linkedin.com.evil.example" and rewrite them into a LinkedIn URL.
    var host = uri.Host.TrimEnd('.');
    var isLinkedInHost = host.Equals("linkedin.com", StringComparison.OrdinalIgnoreCase)
        || host.EndsWith(".linkedin.com", StringComparison.OrdinalIgnoreCase);
    if (!isLinkedInHost)
    {
        return null;
    }

    // Accept "/in/<id>" or "/pub/<id>" with at most two further path segments.
    var path = uri.AbsolutePath.TrimEnd('/');
    if (!Regex.IsMatch(path, @"^/(in|pub)/[^/]+(?:/[^/]+){0,2}$", RegexOptions.IgnoreCase))
    {
        return null;
    }

    return $"https://www.linkedin.com{path}";
}
|
||||
|
||||
/// <summary>
/// Keeps a date value only when it has a recognized date or date-range shape.
/// </summary>
/// <param name="value">The raw date text; may be <c>null</c>.</param>
/// <returns>The trimmed value when <c>LooksLikeDateRange</c> accepts it; otherwise <c>null</c>.</returns>
private static string? NormalizeDateValue(string? value)
{
    var text = TrimOrNull(value);
    if (text is null)
    {
        return null;
    }

    return LooksLikeDateRange(text) ? text : null;
}
|
||||
|
||||
// One endpoint of a date range: a numeric d/m/yyyy-style date, a month name (full or abbreviated)
// followed by a four-digit year, a bare four-digit year, or the words "Present"/"Current".
private const string DateRangeEndpointPattern =
    @"(?:\d{1,2}/\d{1,2}/\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|\d{4}|Present|Current)";

// Built once and reused: the pattern is long and this check runs for titles, companies, locations
// and every bullet. The endpoint sub-pattern is shared so both sides of the range cannot drift apart.
private static readonly Regex DateRangeRegex = new(
    $@"^{DateRangeEndpointPattern}(?:\s*[-–]\s*{DateRangeEndpointPattern})?$",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Tells whether the whole of <paramref name="value"/> is a single date or a date range.
/// </summary>
/// <param name="value">The text to test; surrounding whitespace is not tolerated.</param>
/// <returns>
/// <c>true</c> for one endpoint, or two endpoints separated by a hyphen or en dash (optionally
/// surrounded by whitespace), matched case-insensitively; <c>false</c> otherwise.
/// </returns>
private static bool LooksLikeDateRange(string value)
{
    return DateRangeRegex.IsMatch(value);
}
|
||||
|
||||
/// <summary>
/// Cheap check for text that contains a URL or an e-mail address.
/// </summary>
/// <param name="value">The text to test.</param>
/// <returns>
/// <c>true</c> when the text contains '@', or (case-insensitively) "www.", "http://" or "https://".
/// </returns>
private static bool LooksLikeUrlOrEmail(string value)
{
    if (value.Contains('@'))
    {
        return true;
    }

    foreach (var marker in new[] { "www.", "http://", "https://" })
    {
        if (value.Contains(marker, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Tells whether the text is exactly one of the known CV section headings.
/// </summary>
/// <param name="value">The text to test; it must match a heading in full, ignoring case.</param>
/// <returns><c>true</c> when the text equals a known heading; otherwise <c>false</c>.</returns>
private static bool LooksLikeSectionHeading(string value)
{
    var knownHeadings = new[]
    {
        "Work Experience",
        "Experience",
        "Employment History",
        "Education",
        "Skills",
        "Languages",
        "Interests",
        "Contact",
        "Professional Summary",
        "Summary",
    };

    return Array.Exists(knownHeadings, heading => value.Equals(heading, StringComparison.OrdinalIgnoreCase));
}
|
||||
|
||||
/// <summary>
/// Heuristic: does the text read like a job title?
/// </summary>
/// <param name="value">The text to test.</param>
/// <returns>
/// <c>false</c> for blank text, date ranges and URLs/e-mails. Otherwise <c>true</c> when the text
/// contains a typical role word ("developer", "manager", ...), or when it has at most six
/// space-separated words and does not look like a company name.
/// </returns>
private static bool LooksLikeJobTitle(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
    {
        return false;
    }

    if (Regex.IsMatch(value, @"\b(developer|engineer|manager|lead|architect|consultant|specialist|analyst|administrator|coordinator|director|designer|intern|officer|owner|founder|teacher|researcher|writer|editor|producer|assistant|technician|supervisor|head)\b", RegexOptions.IgnoreCase))
    {
        return true;
    }

    // Fallback: short text that is not recognizably a company is assumed to be a title.
    var wordCount = value.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length;
    return wordCount <= 6 && !LooksLikeCompanyName(value);
}
|
||||
|
||||
/// <summary>
/// Heuristic: does the text read like a company or institution name?
/// </summary>
/// <param name="value">The text to test.</param>
/// <returns>
/// <c>false</c> for blank text, date ranges and URLs/e-mails. Otherwise <c>true</c> when the text
/// contains a typical organization word ("ltd", "university", ...), an ampersand, or a word of two
/// or more upper-case ASCII letters.
/// </returns>
private static bool LooksLikeCompanyName(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
    {
        return false;
    }

    if (Regex.IsMatch(value, @"\b(inc|llc|ltd|limited|plc|corp|corporation|company|group|university|college|council|municipality|kommune|bank|studio|agency|institute|hospital|school|technologies|technology|systems|solutions|consulting|consultants|partners|foundation|ministry|government)\b", RegexOptions.IgnoreCase))
    {
        return true;
    }

    if (value.Contains('&'))
    {
        return true;
    }

    // Case-sensitive on purpose: only genuinely upper-case words (acronyms) count.
    return Regex.IsMatch(value, @"\b[A-Z]{2,}\b");
}
|
||||
|
||||
private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
|
||||
{
|
||||
education ??= new StructuredCvEducation();
|
||||
@@ -207,8 +393,13 @@ public static class StructuredCvProfileJson
|
||||
/// <summary>
/// Cleans one language entry in place, keeping it only when both a known language and a
/// proficiency level can be recognized.
/// </summary>
/// <param name="language">The entry to clean; a <c>null</c> entry is replaced by a new, empty one.</param>
/// <returns>
/// The same entry with <c>Name</c> set to the canonical language name (or <c>null</c> when the name
/// or the level is not recognized), <c>Level</c> set to the recognized level (or <c>null</c>), and
/// <c>Notes</c> passed through <c>TrimOrNull</c>.
/// </returns>
private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language)
{
    var entry = language ?? new StructuredCvLanguage();

    var rawName = TrimOrNull(entry.Name);
    var rawLevel = TrimOrNull(entry.Level);

    var canonicalName = HumanLanguageCatalog.NormalizeLanguageName(rawName);

    // The level may sit in its own field or be embedded in the name text (e.g. "German B2").
    var level = HumanLanguageCatalog.ExtractLevel(rawLevel) ?? HumanLanguageCatalog.ExtractLevel(rawName);

    // A language without a recognizable level is dropped by clearing its name.
    if (canonicalName is not null && level is not null)
    {
        entry.Name = canonicalName;
    }
    else
    {
        entry.Name = null;
    }

    entry.Level = level;
    entry.Notes = TrimOrNull(entry.Notes);
    return entry;
}
|
||||
@@ -360,7 +551,13 @@ public static class StructuredCvProfileJson
|
||||
}
|
||||
}
|
||||
|
||||
return new StructuredCvLanguage { Name = name.NullIfWhitespace(), Level = level, Notes = notes };
|
||||
var normalizedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(item);
|
||||
return new StructuredCvLanguage
|
||||
{
|
||||
Name = normalizedLevel is not null ? HumanLanguageCatalog.NormalizeLanguageName(name) : null,
|
||||
Level = normalizedLevel,
|
||||
Notes = notes,
|
||||
};
|
||||
})
|
||||
.Where(language => !string.IsNullOrWhiteSpace(language.Name))
|
||||
.ToList();
|
||||
|
||||
Reference in New Issue
Block a user