refactor, security updates, cv extraction upgrades

This commit is contained in:
2026-04-11 01:34:32 +02:00
parent 806b200ac5
commit 27fd70a2d7
59 changed files with 6817 additions and 1561 deletions
+115 -13
View File
@@ -670,10 +670,62 @@ public static class StructuredCvProfileJson
}
}
var leftovers = lines.Where(line => !line.Contains('@') && !line.Contains("linkedin", StringComparison.OrdinalIgnoreCase) && !line.Equals(contact.Website, StringComparison.OrdinalIgnoreCase) && !line.Equals(contact.Phone, StringComparison.OrdinalIgnoreCase)).ToList();
if (leftovers.Count > 0) contact.FullName ??= leftovers[0].Trim();
if (leftovers.Count > 1) contact.Headline ??= leftovers[1].Trim();
if (leftovers.Count > 2) contact.Location ??= leftovers[2].Trim();
var leftovers = lines.Where(line => !line.Contains('@')
&& !line.Contains("linkedin", StringComparison.OrdinalIgnoreCase)
&& !line.Equals(contact.Website, StringComparison.OrdinalIgnoreCase)
&& !line.Equals(contact.Phone, StringComparison.OrdinalIgnoreCase))
.ToList();
var plausibleName = leftovers.FirstOrDefault(line => LooksLikePersonName(line));
contact.FullName ??= plausibleName?.Trim();
contact.FullName ??= GuessNameFromLinkedIn(contact.LinkedIn);
contact.FullName ??= GuessNameFromEmail(contact.Email);
var remaining = leftovers.Where(line => !string.Equals(line, contact.FullName, StringComparison.OrdinalIgnoreCase)).ToList();
var addressLike = remaining.Where(LooksLikeAddressish).ToList();
if (remaining.Count > 1 && !LooksLikeAddressish(remaining[0])) contact.Headline ??= remaining[0].Trim();
contact.Location ??= addressLike.LastOrDefault()?.Trim();
if (string.IsNullOrWhiteSpace(contact.Location))
{
var nonHeadline = remaining.Where(line => !string.Equals(line, contact.Headline, StringComparison.OrdinalIgnoreCase)).ToList();
contact.Location ??= nonHeadline.LastOrDefault()?.Trim();
}
}
/// <summary>
/// Heuristic check for whether a contact line resembles an address or location.
/// </summary>
/// <param name="value">A single line of text to inspect.</param>
/// <returns>
/// <c>true</c> when the line contains any digit, or contains one of the
/// hard-coded street/city keywords as a whole word (case-insensitive);
/// otherwise <c>false</c>.
/// </returns>
private static bool LooksLikeAddressish(string value)
{
    // Any digit (house number, postcode, ...) is treated as address-like.
    if (value.Any(char.IsDigit))
    {
        return true;
    }

    // Otherwise fall back to a small keyword list matched on word boundaries.
    const string addressKeywords = @"\b(street|st\.?|road|rd\.?|avenue|ave\.?|suite|city|london|new york|oslo|uk|ny)\b";
    return Regex.IsMatch(value, addressKeywords, RegexOptions.IgnoreCase);
}
/// <summary>
/// Heuristic check for whether a line looks like a person's full name.
/// </summary>
/// <param name="value">A single line of text; surrounding whitespace is ignored.</param>
/// <returns>
/// <c>true</c> when the trimmed line consists of two to four whitespace-separated
/// words, each starting with an uppercase letter and followed by at least one
/// letter, combining mark, apostrophe, backtick, dot or hyphen; otherwise <c>false</c>.
/// </returns>
private static bool LooksLikePersonName(string value)
{
    // Unicode categories (\p{Lu}, \p{L}, \p{M}) instead of [A-Z]/[A-Za-z] so that
    // names with non-ASCII letters ("José García", "Søren Åberg") are accepted.
    // \u2019 is the typographic apostrophe commonly produced by word processors ("O’Brien").
    const string namePattern = @"^\p{Lu}[\p{L}\p{M}'`\u2019.-]+(?:\s+\p{Lu}[\p{L}\p{M}'`\u2019.-]+){1,3}$";
    return Regex.IsMatch(value.Trim(), namePattern);
}
/// <summary>
/// Attempts to derive a display name from the slug of a LinkedIn profile URL.
/// </summary>
/// <param name="linkedIn">The LinkedIn URL or text, possibly <c>null</c>.</param>
/// <returns>
/// The letter-only slug segments, capitalised and joined with spaces, when at
/// least two such segments exist; otherwise <c>null</c>.
/// </returns>
private static string? GuessNameFromLinkedIn(string? linkedIn)
{
    var url = TrimOrNull(linkedIn);
    if (url is null)
    {
        return null;
    }

    // Only "/in/<slug>" and "/pub/<slug>" profile paths are recognised.
    var slugMatch = Regex.Match(url, @"linkedin\.com/(?:in|pub)/(?<slug>[a-z0-9._-]+)", RegexOptions.IgnoreCase);
    if (!slugMatch.Success)
    {
        return null;
    }

    var words = new List<string>();
    foreach (var token in Regex.Split(slugMatch.Groups["slug"].Value, @"[._-]+"))
    {
        // Skip empty pieces and anything containing a non-letter (e.g. numeric id suffixes).
        if (string.IsNullOrWhiteSpace(token) || !token.All(char.IsLetter))
        {
            continue;
        }

        words.Add(char.ToUpperInvariant(token[0]) + token[1..].ToLowerInvariant());
    }

    // A single word is not treated as a usable full name.
    return words.Count < 2 ? null : string.Join(" ", words);
}
/// <summary>
/// Attempts to derive a display name from the local part of an email address.
/// </summary>
/// <param name="email">The email address, possibly <c>null</c> or blank.</param>
/// <returns>
/// The letter-only segments of the local part (split on '.', '_' and '-'),
/// capitalised and joined with spaces, when at least two such segments exist;
/// otherwise <c>null</c>.
/// </returns>
private static string? GuessNameFromEmail(string? email)
{
    if (string.IsNullOrWhiteSpace(email))
    {
        return null;
    }

    var atIndex = email.IndexOf('@');
    if (atIndex <= 0)
    {
        return null;
    }

    var local = email[..atIndex].Trim();

    // Drop a "+tag" sub-address so "jane.doe+jobs@..." does not yield "Doe+jobs".
    var plusIndex = local.IndexOf('+');
    if (plusIndex >= 0)
    {
        local = local[..plusIndex];
    }

    if (string.IsNullOrWhiteSpace(local))
    {
        return null;
    }

    var digits = "0123456789".ToCharArray();
    var parts = Regex.Split(local, @"[._-]+", RegexOptions.None)
        // Strip numeric suffixes ("smith99" -> "smith") before validating the segment.
        .Select(part => part.Trim().TrimEnd(digits))
        // Keep only letter-only segments, mirroring the filter used for LinkedIn slugs.
        .Where(part => part.Length > 0 && part.All(char.IsLetter))
        .Select(part => char.ToUpperInvariant(part[0]) + part[1..].ToLowerInvariant())
        .ToList();

    // A single segment (e.g. "admin") is not treated as a usable full name.
    return parts.Count >= 2 ? string.Join(" ", parts) : null;
}
private static List<StructuredCvLanguage> ParseLanguages(string content)
@@ -681,15 +733,16 @@ public static class StructuredCvProfileJson
return SplitList(content)
.Select(item =>
{
var name = item;
var normalized = item.Trim();
var name = normalized;
string? level = null;
string? notes = null;
var colonIndex = item.IndexOf(':');
var colonIndex = normalized.IndexOf(':');
if (colonIndex > 0)
{
name = item[..colonIndex].Trim();
var remainder = item[(colonIndex + 1)..].Trim();
name = normalized[..colonIndex].Trim();
var remainder = normalized[(colonIndex + 1)..].Trim();
var noteMatch = Regex.Match(remainder, @"^(.*?)\s*\((.*?)\)$");
if (noteMatch.Success)
{
@@ -701,8 +754,26 @@ public static class StructuredCvProfileJson
level = remainder.NullIfWhitespace();
}
}
else
{
var dashMatch = Regex.Match(normalized, @"^(?<name>[\p{L}][\p{L}\s-]+?)\s*[-]\s*(?<level>.+)$");
if (dashMatch.Success)
{
name = dashMatch.Groups["name"].Value.Trim();
level = dashMatch.Groups["level"].Value.Trim();
}
else
{
var parenMatch = Regex.Match(normalized, @"^(?<name>[\p{L}][\p{L}\s-]+?)\s*\((?<level>.+)\)$");
if (parenMatch.Success)
{
name = parenMatch.Groups["name"].Value.Trim();
level = parenMatch.Groups["level"].Value.Trim();
}
}
}
var normalizedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(item);
var normalizedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(normalized);
return new StructuredCvLanguage
{
Name = normalizedLevel is not null ? HumanLanguageCatalog.NormalizeLanguageName(name) : null,
@@ -729,11 +800,20 @@ public static class StructuredCvProfileJson
if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
job.Title = lines[0].NullIfWhitespace();
var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();
var dateValue = metadata.Select(line => Regex.Match(line, @"(?:(?:\w+\s+)?\d{4}|Present|Current)(?:\s*[-]\s*(?:(?:\w+\s+)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace()).FirstOrDefault(value => value is not null);
if (!string.IsNullOrWhiteSpace(dateValue))
var titleDateMatch = Regex.Match(job.Title ?? string.Empty, @"(?<title>.+?)\s*[-]\s*(?<start>(?:\d{1,2}/)?\d{4})\s*(?:to|[-])\s*(?<end>(?:\d{1,2}/)?\d{4}|Present|Current)$", RegexOptions.IgnoreCase);
if (titleDateMatch.Success)
{
var parts = Regex.Split(dateValue, "\\s*[-]\\s*");
job.Title = titleDateMatch.Groups["title"].Value.NullIfWhitespace();
job.Start = titleDateMatch.Groups["start"].Value.NullIfWhitespace();
job.End = titleDateMatch.Groups["end"].Value.NullIfWhitespace();
job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
}
var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();
var dateValue = metadata.Select(line => Regex.Match(line, @"(?:(?:\d{1,2}/)?\d{4}|Present|Current)(?:\s*(?:[-]|to)\s*(?:(?:\d{1,2}/)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace()).FirstOrDefault(value => value is not null);
if (!string.IsNullOrWhiteSpace(dateValue) && string.IsNullOrWhiteSpace(job.Start))
{
var parts = Regex.Split(dateValue, "\\s*(?:[-]|to)\\s*");
job.Start = parts.FirstOrDefault().NullIfWhitespace();
job.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
@@ -752,10 +832,32 @@ public static class StructuredCvProfileJson
.Where(line => line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
.SelectMany(line => SplitList(line[(line.IndexOf(':') + 1)..]))
.ToList();
if (job.Skills.Count == 0)
{
job.Skills = job.Bullets
.SelectMany(ExtractSkillsFromBullet)
.Distinct(StringComparer.OrdinalIgnoreCase)
.ToList();
}
return string.IsNullOrWhiteSpace(job.Title) && string.IsNullOrWhiteSpace(job.Company) && job.Bullets.Count == 0 ? null : job;
}
/// <summary>
/// Lazily extracts skill-like items from a single bullet line by looking for a
/// cue word ("using", "including", "with", "technology:/technologies:", "tool:/tools:")
/// and splitting the text that follows it.
/// </summary>
/// <param name="bullet">The bullet text to scan.</param>
/// <returns>
/// Each split item after the first cue word, trimmed and with trailing dots
/// removed, whose length is between 2 and 40 characters inclusive. Yields
/// nothing when the bullet is blank or contains no cue word.
/// </returns>
private static IEnumerable<string> ExtractSkillsFromBullet(string bullet)
{
    if (string.IsNullOrWhiteSpace(bullet))
    {
        yield break;
    }

    var cue = Regex.Match(bullet, @"\b(?:using|including|with|technologies?:|tools?:)\s+(?<skills>.+)$", RegexOptions.IgnoreCase);
    if (!cue.Success)
    {
        yield break;
    }

    foreach (var candidate in SplitList(cue.Groups["skills"].Value))
    {
        var skill = candidate.Trim().TrimEnd('.');

        // Length bounds discard single characters and long sentence fragments.
        if (skill.Length is >= 2 and <= 40)
        {
            yield return skill;
        }
    }
}
private static List<StructuredCvEducation> ParseEducation(string content)
{
var blocks = SplitBlocks(content);