using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;

namespace JobTrackerApi.Models;

/// <summary>
/// Recognizes human (spoken) language names and proficiency levels inside free-form text.
/// </summary>
/// <remarks>
/// Language names are looked up in a table built once from the English and native names of
/// the cultures reported by <see cref="CultureInfo.GetCultures(CultureTypes)"/>, plus a few
/// hand-written aliases. Lookups ignore case and non-spacing diacritics.
/// </remarks>
public static class HumanLanguageCatalog
{
    /// <summary>Longest phrase, in words, that is tried as a single language name.</summary>
    private const int MaxPhraseWords = 4;

    // Static field initializers run in textual order. The two regexes below are used by
    // NormalizeDisplayName / NormalizeKey, which BuildLanguageLookup calls, so they must stay
    // declared ABOVE LanguageLookup.
    private static readonly Regex WhitespaceRunRegex = new(@"\s+", RegexOptions.Compiled);

    private static readonly Regex NonLetterRunRegex = new(@"[^\p{L}]+", RegexOptions.Compiled);

    /// <summary>Maps a normalized alias (see <see cref="NormalizeKey"/>) to its display name.</summary>
    private static readonly Dictionary<string, string> LanguageLookup = BuildLanguageLookup();

    // Combining marks (\p{M}) are part of a word: without them, decomposed (NFD) input such as
    // "bokma\u030Al" and scripts that rely on combining vowel signs would be split mid-word.
    private static readonly Regex WordRegex = new(@"[\p{L}\p{M}]+", RegexOptions.Compiled);

    // Alternation is ordered: the first alternative that matches wins. The combined CEFR ranges
    // ("b1/b2") are therefore listed BEFORE the single codes ("b1"); otherwise the single code
    // always matches first and the range is never reported.
    private static readonly Regex LevelRegex = new(
        @"\b(native(?:\s+speaker)?|fluent|advanced|intermediate|beginner|basic|conversational|elementary|professional\s+working\s+proficiency|working\s+proficiency|limited\s+working\s+proficiency|full\s+professional\s+proficiency|a1\s*/\s*a2|a2\s*/\s*b1|b1\s*/\s*b2|b2\s*/\s*c1|c1\s*/\s*c2|a1|a2|b1|b2|c1|c2)\b",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>Matches a single CEFR code ("B2") or a slash-separated pair ("B2 / C1").</summary>
    private static readonly Regex CefrCodeRegex = new(
        @"^[ABC][12](?:\s*/\s*[ABC][12])?$",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Resolves <paramref name="raw"/> to a single canonical language name.
    /// </summary>
    /// <param name="raw">Free-form text expected to mention one language.</param>
    /// <returns>
    /// The canonical name when exactly one distinct language is recognized; otherwise
    /// <see langword="null"/> (none recognized, or more than one).
    /// </returns>
    public static string? NormalizeLanguageName(string? raw)
    {
        var names = ExtractLanguageNames(raw);
        return names.Count == 1 ? names[0] : null;
    }

    /// <summary>
    /// Finds every recognized language name mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <param name="raw">Free-form text; may be <see langword="null"/> or blank.</param>
    /// <returns>
    /// Canonical names in order of first appearance, without duplicates (case-insensitive).
    /// Empty when the input is blank or nothing is recognized.
    /// </returns>
    public static IReadOnlyList<string> ExtractLanguageNames(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            return Array.Empty<string>();
        }

        var words = WordRegex.Matches(raw).Select(match => match.Value).ToArray();
        if (words.Length == 0)
        {
            return Array.Empty<string>();
        }

        var matches = new List<(int Start, int Size, string Canonical)>();

        // Try the longest phrases first so a multi-word name claims its words before any
        // shorter name contained in it can.
        for (var size = Math.Min(MaxPhraseWords, words.Length); size >= 1; size--)
        {
            for (var start = 0; start <= words.Length - size; start++)
            {
                var phrase = string.Join(" ", words, start, size);
                if (!LanguageLookup.TryGetValue(NormalizeKey(phrase), out var canonical))
                {
                    continue;
                }

                // Skip windows that reuse a word already claimed by an accepted match.
                if (matches.Any(existing => RangesOverlap(existing.Start, existing.Size, start, size)))
                {
                    continue;
                }

                matches.Add((start, size, canonical));
            }
        }

        return matches
            .OrderBy(match => match.Start)
            .Select(match => match.Canonical)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>
    /// Reports whether <paramref name="raw"/> contains a proficiency level that
    /// <see cref="ExtractLevel"/> recognizes.
    /// </summary>
    public static bool HasRecognizedLevel(string? raw) => ExtractLevel(raw) is not null;

    /// <summary>
    /// Extracts the first proficiency level mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <param name="raw">Free-form text; may be <see langword="null"/> or blank.</param>
    /// <returns>
    /// A display form of the level: a fixed label for the descriptive levels ("Native",
    /// "Fluent", ...), or an upper-cased CEFR code with spaces removed ("B2", "B1/B2").
    /// <see langword="null"/> when the input is blank or contains no recognized level.
    /// </returns>
    public static string? ExtractLevel(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            return null;
        }

        var match = LevelRegex.Match(raw);
        if (!match.Success)
        {
            return null;
        }

        // Collapse internal whitespace so "working   proficiency" compares equal to the labels below.
        var compact = WhitespaceRunRegex.Replace(match.Groups[1].Value.Trim(), " ");

        return compact.ToLowerInvariant() switch
        {
            "native speaker" or "native" => "Native",
            "fluent" => "Fluent",
            "advanced" => "Advanced",
            "intermediate" => "Intermediate",
            "beginner" => "Beginner",
            "basic" => "Basic",
            "conversational" => "Conversational",
            "elementary" => "Elementary",
            "professional working proficiency" => "Professional working proficiency",
            "working proficiency" => "Working proficiency",
            "limited working proficiency" => "Limited working proficiency",
            "full professional proficiency" => "Full professional proficiency",
            _ when CefrCodeRegex.IsMatch(compact) => compact.ToUpperInvariant().Replace(" ", string.Empty),
            _ => compact,
        };
    }

    /// <summary>
    /// Tests whether the half-open ranges [startA, startA + sizeA) and
    /// [startB, startB + sizeB) share at least one index.
    /// </summary>
    private static bool RangesOverlap(int startA, int sizeA, int startB, int sizeB)
    {
        var endA = startA + sizeA;
        var endB = startB + sizeB;
        return startA < endB && startB < endA;
    }

    /// <summary>
    /// Builds the alias-to-display-name table from culture data and the hand-written aliases.
    /// The first registration of an alias wins, so culture-derived entries take precedence
    /// over the aliases added afterwards.
    /// </summary>
    private static Dictionary<string, string> BuildLanguageLookup()
    {
        var map = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        void Add(string? alias, string? canonical)
        {
            var normalizedAlias = NormalizeKey(alias);
            var normalizedCanonical = NormalizeDisplayName(canonical);
            if (string.IsNullOrWhiteSpace(normalizedAlias) || string.IsNullOrWhiteSpace(normalizedCanonical))
            {
                return;
            }

            map.TryAdd(normalizedAlias, normalizedCanonical);
        }

        foreach (var culture in CultureInfo.GetCultures(CultureTypes.NeutralCultures | CultureTypes.SpecificCultures))
        {
            // The invariant culture (empty Name) is not a human language; its English name
            // would otherwise register "Invariant Language" as a recognizable language.
            if (culture.Name.Length == 0)
            {
                continue;
            }

            var english = CleanCultureLanguageName(culture.EnglishName);
            var native = CleanCultureLanguageName(culture.NativeName);
            Add(english, english);
            Add(native, english);
        }

        Add("norsk", "Norwegian");
        Add("bokmal", "Norwegian");
        Add("bokmål", "Norwegian");
        Add("nynorsk", "Norwegian");
        Add("mandarin", "Chinese");
        Add("cantonese", "Chinese");
        Add("farsi", "Persian");
        Add("persian", "Persian");

        return map;
    }

    /// <summary>
    /// Reduces a culture display name to its leading part: everything from the first
    /// '(' or ',' onward is dropped, and the remainder is passed through
    /// <see cref="NormalizeDisplayName"/>.
    /// </summary>
    private static string? CleanCultureLanguageName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return null;
        }

        var cleaned = value.Trim();

        var parenIndex = cleaned.IndexOf('(');
        if (parenIndex > 0)
        {
            cleaned = cleaned[..parenIndex].Trim();
        }

        var commaIndex = cleaned.IndexOf(',');
        if (commaIndex > 0)
        {
            cleaned = cleaned[..commaIndex].Trim();
        }

        return NormalizeDisplayName(cleaned);
    }

    /// <summary>
    /// Produces the display form of a name: whitespace runs collapsed to one space and each
    /// word capitalized (first character upper-cased, the rest lower-cased). Words of up to
    /// three characters that are entirely upper-case are kept unchanged.
    /// </summary>
    private static string? NormalizeDisplayName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return null;
        }

        var cleaned = WhitespaceRunRegex.Replace(value.Trim(), " ");
        var words = cleaned
            .Split(' ', StringSplitOptions.RemoveEmptyEntries)
            .Select(word => word.Length <= 3 && word.All(char.IsUpper)
                ? word
                : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant());

        return string.Join(" ", words);
    }

    /// <summary>
    /// Produces the lookup key for a name: non-spacing marks removed, lower-cased, and every
    /// run of non-letter characters replaced by a single space. Returns an empty string for
    /// blank input.
    /// </summary>
    private static string NormalizeKey(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // Decompose first so accents become separate non-spacing marks that can be dropped.
        var decomposed = value.Trim().Normalize(NormalizationForm.FormD);
        var builder = new StringBuilder(decomposed.Length);
        foreach (var ch in decomposed)
        {
            if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
            {
                continue;
            }

            builder.Append(char.ToLowerInvariant(ch));
        }

        var recomposed = builder.ToString().Normalize(NormalizationForm.FormC);
        return NonLetterRunRegex.Replace(recomposed, " ").Trim();
    }
}