163 lines
6.3 KiB
C#
163 lines
6.3 KiB
C#
using System.Globalization;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
|
|
namespace JobTrackerApi.Models;
|
|
|
|
/// <summary>
/// Recognizes human-language names and proficiency levels inside free-form text.
/// Language names are resolved through a lookup built from the runtime's culture data
/// plus a small set of curated aliases; levels are matched with a fixed vocabulary
/// (descriptive terms and CEFR codes such as <c>B2</c> or <c>B1/B2</c>).
/// </summary>
public static class HumanLanguageCatalog
{
    // NOTE: static field initializers run in textual order. BuildLanguageLookup() calls
    // NormalizeKey/NormalizeDisplayName, which use the regexes below, so every regex
    // MUST be declared before LanguageLookup.

    /// <summary>Longest phrase (in words) that is tried as a single language name.</summary>
    private const int MaxPhraseWords = 4;

    /// <summary>Runs of whitespace, collapsed to a single space when normalizing.</summary>
    private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// A word is a run of letters and combining marks. Marks are included so that
    /// decomposed input (letter + combining accent) is not split in the middle of a word.
    /// </summary>
    private static readonly Regex WordRegex = new(@"[\p{L}\p{M}]+", RegexOptions.Compiled);

    /// <summary>Anything that is not part of a word; must stay the complement of <see cref="WordRegex"/>.</summary>
    private static readonly Regex KeySeparatorRegex = new(@"[^\p{L}\p{M}]+", RegexOptions.Compiled);

    /// <summary>A CEFR code, optionally followed by "/" and a second code.</summary>
    private static readonly Regex CefrRegex = new(
        @"^[ABC][12](?:\s*/\s*[ABC][12])?$",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Alternation picks the first alternative that matches at a position, so the CEFR
    // ranges ("b1/b2") have to be listed BEFORE the single codes ("b1"); otherwise the
    // single code always wins and the range is never captured.
    private static readonly Regex LevelRegex = new(
        @"\b(native(?:\s+speaker)?|fluent|advanced|intermediate|beginner|basic|conversational|elementary|professional\s+working\s+proficiency|working\s+proficiency|limited\s+working\s+proficiency|full\s+professional\s+proficiency|a1\s*/\s*a2|a2\s*/\s*b1|b1\s*/\s*b2|b2\s*/\s*c1|c1\s*/\s*c2|a1|a2|b1|b2|c1|c2)\b",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>Normalized alias (see <see cref="NormalizeKey"/>) to canonical display name.</summary>
    private static readonly Dictionary<string, string> LanguageLookup = BuildLanguageLookup();

    /// <summary>
    /// Resolves <paramref name="raw"/> to a single canonical language name.
    /// </summary>
    /// <param name="raw">Free-form text, may be <c>null</c>.</param>
    /// <returns>
    /// The canonical name when exactly one distinct language is recognized;
    /// <c>null</c> when none or more than one is found.
    /// </returns>
    public static string? NormalizeLanguageName(string? raw)
    {
        var found = ExtractLanguageNames(raw);
        return found.Count == 1 ? found[0] : null;
    }

    /// <summary>
    /// Finds every recognized language name in <paramref name="raw"/>.
    /// </summary>
    /// <param name="raw">Free-form text, may be <c>null</c>.</param>
    /// <returns>
    /// Distinct canonical names (case-insensitive) ordered by first appearance in the text;
    /// an empty list when the input is blank or nothing is recognized.
    /// </returns>
    public static IReadOnlyList<string> ExtractLanguageNames(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return Array.Empty<string>();

        var words = WordRegex.Matches(raw).Select(match => match.Value).ToArray();
        if (words.Length == 0) return Array.Empty<string>();

        // claimed[i] is true once word i belongs to an accepted match. Longer phrases are
        // tried first, so a multi-word name takes precedence over any shorter name inside it.
        var claimed = new bool[words.Length];
        var matches = new List<(int Start, string Canonical)>();

        for (var size = Math.Min(MaxPhraseWords, words.Length); size >= 1; size--)
        {
            for (var start = 0; start + size <= words.Length; start++)
            {
                // Skip windows that overlap an already accepted match.
                if (Array.IndexOf(claimed, true, start, size) >= 0) continue;

                var phrase = string.Join(" ", words, start, size);
                if (!LanguageLookup.TryGetValue(NormalizeKey(phrase), out var canonical)) continue;

                Array.Fill(claimed, true, start, size);
                matches.Add((start, canonical));
            }
        }

        return matches
            .OrderBy(match => match.Start)
            .Select(match => match.Canonical)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>Returns <c>true</c> when <see cref="ExtractLevel"/> finds a level in <paramref name="raw"/>.</summary>
    /// <param name="raw">Free-form text, may be <c>null</c>.</param>
    public static bool HasRecognizedLevel(string? raw)
    {
        return ExtractLevel(raw) is not null;
    }

    /// <summary>
    /// Extracts the first proficiency level mentioned in <paramref name="raw"/>.
    /// </summary>
    /// <param name="raw">Free-form text, may be <c>null</c>.</param>
    /// <returns>
    /// A display form of the level ("Native", "Fluent", "Working proficiency", ...), an
    /// upper-cased CEFR code or range without spaces ("B2", "B1/B2"), or <c>null</c> when
    /// the input is blank or contains no recognized level.
    /// </returns>
    public static string? ExtractLevel(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return null;

        var match = LevelRegex.Match(raw);
        if (!match.Success) return null;

        // Collapse inner whitespace so "native   speaker" and "b1 / b2" compare predictably.
        var compact = WhitespaceRegex.Replace(match.Groups[1].Value.Trim(), " ");

        return compact.ToLowerInvariant() switch
        {
            "native" or "native speaker" => "Native",
            "fluent" => "Fluent",
            "advanced" => "Advanced",
            "intermediate" => "Intermediate",
            "beginner" => "Beginner",
            "basic" => "Basic",
            "conversational" => "Conversational",
            "elementary" => "Elementary",
            "professional working proficiency" => "Professional working proficiency",
            "working proficiency" => "Working proficiency",
            "limited working proficiency" => "Limited working proficiency",
            "full professional proficiency" => "Full professional proficiency",
            _ when CefrRegex.IsMatch(compact) => compact.ToUpperInvariant().Replace(" ", string.Empty),
            _ => compact,
        };
    }

    /// <summary>
    /// Builds the alias lookup from curated aliases and the runtime's culture list.
    /// </summary>
    /// <returns>A case-insensitive map from normalized alias to canonical display name.</returns>
    private static Dictionary<string, string> BuildLanguageLookup()
    {
        var map = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        // First registration of a key wins; blank aliases or canonicals are ignored.
        void Add(string? alias, string? canonical)
        {
            var normalizedAlias = NormalizeKey(alias);
            var normalizedCanonical = NormalizeDisplayName(canonical);
            if (string.IsNullOrWhiteSpace(normalizedAlias) || string.IsNullOrWhiteSpace(normalizedCanonical)) return;
            map.TryAdd(normalizedAlias, normalizedCanonical);
        }

        // Curated aliases are registered BEFORE the culture data so they cannot be shadowed
        // by a culture-derived entry with the same key. Which culture names exist depends on
        // the platform's globalization data, so registering these last would make the result
        // vary between machines (e.g. "cantonese" resolving to a culture name instead of
        // the intended "Chinese").
        Add("norsk", "Norwegian");
        Add("bokmal", "Norwegian");
        Add("bokmål", "Norwegian");
        Add("nynorsk", "Norwegian");
        Add("mandarin", "Chinese");
        Add("cantonese", "Chinese");
        Add("farsi", "Persian");
        Add("persian", "Persian");

        foreach (var culture in CultureInfo.GetCultures(CultureTypes.NeutralCultures | CultureTypes.SpecificCultures))
        {
            // The invariant culture (empty Name) carries a placeholder name, not a language.
            if (string.IsNullOrEmpty(culture.Name)) continue;

            var english = CleanCultureLanguageName(culture.EnglishName);
            var native = CleanCultureLanguageName(culture.NativeName);
            Add(english, english);
            Add(native, english);
        }

        return map;
    }

    /// <summary>
    /// Reduces a culture display name to its leading part: everything from the first
    /// "(" and then from the first "," onwards is dropped, and the rest is title-cased
    /// via <see cref="NormalizeDisplayName"/>.
    /// </summary>
    /// <param name="value">Culture display name, may be <c>null</c>.</param>
    /// <returns>The cleaned name, or <c>null</c> when the input is blank.</returns>
    private static string? CleanCultureLanguageName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;

        var cleaned = value.Trim();

        // "> 0" (not ">= 0"): a name that starts with the delimiter is left untouched
        // rather than being reduced to an empty string.
        var parenIndex = cleaned.IndexOf('(');
        if (parenIndex > 0) cleaned = cleaned[..parenIndex].Trim();

        var commaIndex = cleaned.IndexOf(',');
        if (commaIndex > 0) cleaned = cleaned[..commaIndex].Trim();

        return NormalizeDisplayName(cleaned);
    }

    /// <summary>
    /// Collapses whitespace and title-cases each word. Words of up to three characters that
    /// are entirely upper-case are kept as they are.
    /// </summary>
    /// <param name="value">Display name, may be <c>null</c>.</param>
    /// <returns>The normalized name, or <c>null</c> when the input is blank.</returns>
    private static string? NormalizeDisplayName(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return null;

        var cleaned = WhitespaceRegex.Replace(value.Trim(), " ");
        var words = cleaned
            .Split(' ', StringSplitOptions.RemoveEmptyEntries)
            .Select(word => word.Length <= 3 && word.All(char.IsUpper)
                ? word
                : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant());

        return string.Join(" ", words);
    }

    /// <summary>
    /// Produces the lookup key for a name: non-spacing marks (accents) removed,
    /// lower-cased, and every run of non-word characters replaced by a single space.
    /// </summary>
    /// <param name="value">Alias or phrase, may be <c>null</c>.</param>
    /// <returns>The normalized key, or an empty string when the input is blank.</returns>
    private static string NormalizeKey(string? value)
    {
        if (string.IsNullOrWhiteSpace(value)) return string.Empty;

        // Decompose so that accents become separate non-spacing marks that can be dropped.
        var decomposed = value.Trim().Normalize(NormalizationForm.FormD);
        var builder = new StringBuilder(decomposed.Length);
        foreach (var ch in decomposed)
        {
            if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark) continue;
            builder.Append(char.ToLowerInvariant(ch));
        }

        var recomposed = builder.ToString().Normalize(NormalizationForm.FormC);
        return KeySeparatorRegex.Replace(recomposed, " ").Trim();
    }
}
|