refactor, security updates, cv extraction upgrades
This commit is contained in:
@@ -670,10 +670,62 @@ public static class StructuredCvProfileJson
|
||||
}
|
||||
}
|
||||
|
||||
var leftovers = lines.Where(line => !line.Contains('@') && !line.Contains("linkedin", StringComparison.OrdinalIgnoreCase) && !line.Equals(contact.Website, StringComparison.OrdinalIgnoreCase) && !line.Equals(contact.Phone, StringComparison.OrdinalIgnoreCase)).ToList();
|
||||
if (leftovers.Count > 0) contact.FullName ??= leftovers[0].Trim();
|
||||
if (leftovers.Count > 1) contact.Headline ??= leftovers[1].Trim();
|
||||
if (leftovers.Count > 2) contact.Location ??= leftovers[2].Trim();
|
||||
var leftovers = lines.Where(line => !line.Contains('@')
|
||||
&& !line.Contains("linkedin", StringComparison.OrdinalIgnoreCase)
|
||||
&& !line.Equals(contact.Website, StringComparison.OrdinalIgnoreCase)
|
||||
&& !line.Equals(contact.Phone, StringComparison.OrdinalIgnoreCase))
|
||||
.ToList();
|
||||
|
||||
var plausibleName = leftovers.FirstOrDefault(line => LooksLikePersonName(line));
|
||||
contact.FullName ??= plausibleName?.Trim();
|
||||
contact.FullName ??= GuessNameFromLinkedIn(contact.LinkedIn);
|
||||
contact.FullName ??= GuessNameFromEmail(contact.Email);
|
||||
|
||||
var remaining = leftovers.Where(line => !string.Equals(line, contact.FullName, StringComparison.OrdinalIgnoreCase)).ToList();
|
||||
var addressLike = remaining.Where(LooksLikeAddressish).ToList();
|
||||
if (remaining.Count > 1 && !LooksLikeAddressish(remaining[0])) contact.Headline ??= remaining[0].Trim();
|
||||
contact.Location ??= addressLike.LastOrDefault()?.Trim();
|
||||
if (string.IsNullOrWhiteSpace(contact.Location))
|
||||
{
|
||||
var nonHeadline = remaining.Where(line => !string.Equals(line, contact.Headline, StringComparison.OrdinalIgnoreCase)).ToList();
|
||||
contact.Location ??= nonHeadline.LastOrDefault()?.Trim();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Heuristic check for whether a line of contact text resembles an address or location.
/// </summary>
/// <param name="value">The line of text to inspect.</param>
/// <returns>
/// <c>true</c> when the text contains at least one digit, or contains one of the known
/// street/place keywords as a whole word (case-insensitive); otherwise <c>false</c>.
/// </returns>
private static bool LooksLikeAddressish(string value)
{
    // Any digit at all is enough to treat the line as address-like.
    foreach (var ch in value)
    {
        if (char.IsDigit(ch)) return true;
    }

    // Otherwise fall back to a whole-word keyword lookup.
    const string keywordPattern = @"\b(street|st\.?|road|rd\.?|avenue|ave\.?|suite|city|london|new york|oslo|uk|ny)\b";
    return Regex.IsMatch(value, keywordPattern, RegexOptions.IgnoreCase);
}
|
||||
|
||||
/// <summary>
/// Heuristic check for whether a line looks like a person's full name.
/// </summary>
/// <remarks>
/// A match is two to four whitespace-separated words, each starting with an uppercase letter
/// and followed by at least one more letter, combining mark, apostrophe, backtick, period or
/// hyphen. Unicode letter categories are used so that names such as "José García" or
/// "Åse Øverland" are accepted, not only ASCII names.
/// </remarks>
/// <param name="value">The line of text to inspect; surrounding whitespace is ignored.</param>
/// <returns><c>true</c> when the trimmed text matches the name shape; otherwise <c>false</c>.</returns>
private static bool LooksLikePersonName(string value)
{
    // Blank (or null) input can never be a name; avoids throwing on value.Trim().
    if (string.IsNullOrWhiteSpace(value)) return false;

    // \p{Lu} = uppercase letter, \p{L} = any letter, \p{M} = combining mark (decomposed accents),
    // \u2019 = typographic apostrophe as commonly produced by PDF/Word extraction ("O’Brien").
    const string namePattern = @"^\p{Lu}[\p{L}\p{M}'`\u2019.-]+(?:\s+\p{Lu}[\p{L}\p{M}'`\u2019.-]+){1,3}$";
    return Regex.IsMatch(value.Trim(), namePattern);
}
|
||||
|
||||
/// <summary>
/// Attempts to derive a display name from the profile slug of a LinkedIn URL.
/// </summary>
/// <param name="linkedIn">A LinkedIn profile URL (or text containing one); may be null.</param>
/// <returns>
/// The slug's letter-only tokens, capitalised and joined with spaces, when at least two such
/// tokens exist; otherwise <c>null</c>.
/// </returns>
private static string? GuessNameFromLinkedIn(string? linkedIn)
{
    // TrimOrNull is defined elsewhere in this class; presumably it trims and yields null for blank input.
    var url = TrimOrNull(linkedIn);
    if (url is null)
    {
        return null;
    }

    var slugMatch = Regex.Match(url, @"linkedin\.com/(?:in|pub)/(?<slug>[a-z0-9._-]+)", RegexOptions.IgnoreCase);
    if (!slugMatch.Success)
    {
        return null;
    }

    var words = new List<string>();
    foreach (var token in Regex.Split(slugMatch.Groups["slug"].Value, @"[._-]+"))
    {
        // Tokens containing digits or other non-letters (e.g. trailing numeric ids) are skipped.
        if (string.IsNullOrWhiteSpace(token) || !token.All(char.IsLetter))
        {
            continue;
        }

        words.Add(char.ToUpperInvariant(token[0]) + token[1..].ToLowerInvariant());
    }

    // A single token is not treated as a full name.
    return words.Count < 2 ? null : string.Join(" ", words);
}
|
||||
|
||||
/// <summary>
/// Attempts to derive a display name from the local part of an e-mail address.
/// </summary>
/// <remarks>
/// The local part is split on '.', '_' and '-'. A "+tag" sub-address suffix and trailing digits
/// on each token are discarded, and only letter-only tokens are kept, so that
/// "john.smith99+cv@example.com" yields "John Smith" rather than a name containing digits or
/// the tag. This mirrors the letter-only filtering applied by <c>GuessNameFromLinkedIn</c>.
/// </remarks>
/// <param name="email">The e-mail address; may be null or blank.</param>
/// <returns>
/// The capitalised tokens joined with spaces when at least two usable tokens exist;
/// otherwise <c>null</c>.
/// </returns>
private static string? GuessNameFromEmail(string? email)
{
    if (string.IsNullOrWhiteSpace(email)) return null;

    var atIndex = email.IndexOf('@');
    if (atIndex <= 0) return null; // no '@', or nothing before it

    var local = email[..atIndex].Trim();

    // Drop a "+tag" sub-address suffix; it is routing information, not part of the name.
    var plusIndex = local.IndexOf('+');
    if (plusIndex >= 0) local = local[..plusIndex];
    if (string.IsNullOrWhiteSpace(local)) return null;

    var digits = "0123456789".ToCharArray();
    var parts = Regex.Split(local, @"[._-]+")
        .Select(part => part.Trim().TrimEnd(digits)) // "smith99" -> "smith"
        .Where(part => part.Length > 0 && part.All(char.IsLetter))
        .Select(part => char.ToUpperInvariant(part[0]) + part[1..].ToLowerInvariant())
        .ToList();

    // A single token (e.g. "jsmith") is not treated as a full name.
    return parts.Count >= 2 ? string.Join(" ", parts) : null;
}
|
||||
|
||||
private static List<StructuredCvLanguage> ParseLanguages(string content)
|
||||
@@ -681,15 +733,16 @@ public static class StructuredCvProfileJson
|
||||
return SplitList(content)
|
||||
.Select(item =>
|
||||
{
|
||||
var name = item;
|
||||
var normalized = item.Trim();
|
||||
var name = normalized;
|
||||
string? level = null;
|
||||
string? notes = null;
|
||||
|
||||
var colonIndex = item.IndexOf(':');
|
||||
var colonIndex = normalized.IndexOf(':');
|
||||
if (colonIndex > 0)
|
||||
{
|
||||
name = item[..colonIndex].Trim();
|
||||
var remainder = item[(colonIndex + 1)..].Trim();
|
||||
name = normalized[..colonIndex].Trim();
|
||||
var remainder = normalized[(colonIndex + 1)..].Trim();
|
||||
var noteMatch = Regex.Match(remainder, @"^(.*?)\s*\((.*?)\)$");
|
||||
if (noteMatch.Success)
|
||||
{
|
||||
@@ -701,8 +754,26 @@ public static class StructuredCvProfileJson
|
||||
level = remainder.NullIfWhitespace();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
var dashMatch = Regex.Match(normalized, @"^(?<name>[\p{L}][\p{L}\s-]+?)\s*[–-]\s*(?<level>.+)$");
|
||||
if (dashMatch.Success)
|
||||
{
|
||||
name = dashMatch.Groups["name"].Value.Trim();
|
||||
level = dashMatch.Groups["level"].Value.Trim();
|
||||
}
|
||||
else
|
||||
{
|
||||
var parenMatch = Regex.Match(normalized, @"^(?<name>[\p{L}][\p{L}\s-]+?)\s*\((?<level>.+)\)$");
|
||||
if (parenMatch.Success)
|
||||
{
|
||||
name = parenMatch.Groups["name"].Value.Trim();
|
||||
level = parenMatch.Groups["level"].Value.Trim();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var normalizedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(item);
|
||||
var normalizedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(normalized);
|
||||
return new StructuredCvLanguage
|
||||
{
|
||||
Name = normalizedLevel is not null ? HumanLanguageCatalog.NormalizeLanguageName(name) : null,
|
||||
@@ -729,11 +800,20 @@ public static class StructuredCvProfileJson
|
||||
if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
|
||||
job.Title = lines[0].NullIfWhitespace();
|
||||
|
||||
var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();
|
||||
var dateValue = metadata.Select(line => Regex.Match(line, @"(?:(?:\w+\s+)?\d{4}|Present|Current)(?:\s*[-–]\s*(?:(?:\w+\s+)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace()).FirstOrDefault(value => value is not null);
|
||||
if (!string.IsNullOrWhiteSpace(dateValue))
|
||||
var titleDateMatch = Regex.Match(job.Title ?? string.Empty, @"(?<title>.+?)\s*[-–]\s*(?<start>(?:\d{1,2}/)?\d{4})\s*(?:to|[-–])\s*(?<end>(?:\d{1,2}/)?\d{4}|Present|Current)$", RegexOptions.IgnoreCase);
|
||||
if (titleDateMatch.Success)
|
||||
{
|
||||
var parts = Regex.Split(dateValue, "\\s*[-–]\\s*");
|
||||
job.Title = titleDateMatch.Groups["title"].Value.NullIfWhitespace();
|
||||
job.Start = titleDateMatch.Groups["start"].Value.NullIfWhitespace();
|
||||
job.End = titleDateMatch.Groups["end"].Value.NullIfWhitespace();
|
||||
job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();
|
||||
var dateValue = metadata.Select(line => Regex.Match(line, @"(?:(?:\d{1,2}/)?\d{4}|Present|Current)(?:\s*(?:[-–]|to)\s*(?:(?:\d{1,2}/)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace()).FirstOrDefault(value => value is not null);
|
||||
if (!string.IsNullOrWhiteSpace(dateValue) && string.IsNullOrWhiteSpace(job.Start))
|
||||
{
|
||||
var parts = Regex.Split(dateValue, "\\s*(?:[-–]|to)\\s*");
|
||||
job.Start = parts.FirstOrDefault().NullIfWhitespace();
|
||||
job.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
|
||||
job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
|
||||
@@ -752,10 +832,32 @@ public static class StructuredCvProfileJson
|
||||
.Where(line => line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
|
||||
.SelectMany(line => SplitList(line[(line.IndexOf(':') + 1)..]))
|
||||
.ToList();
|
||||
if (job.Skills.Count == 0)
|
||||
{
|
||||
job.Skills = job.Bullets
|
||||
.SelectMany(ExtractSkillsFromBullet)
|
||||
.Distinct(StringComparer.OrdinalIgnoreCase)
|
||||
.ToList();
|
||||
}
|
||||
|
||||
return string.IsNullOrWhiteSpace(job.Title) && string.IsNullOrWhiteSpace(job.Company) && job.Bullets.Count == 0 ? null : job;
|
||||
}
|
||||
|
||||
/// <summary>
/// Lazily extracts skill names from a bullet by taking the text that follows the first cue
/// ("using", "including", "with", "technology:/technologies:", "tool:/tools:") up to the end
/// of the bullet and splitting it into items.
/// </summary>
/// <param name="bullet">The bullet text to scan.</param>
/// <returns>
/// Each split item, trimmed and with trailing periods removed, whose length is between
/// 2 and 40 characters inclusive. Yields nothing for blank input or when no cue is found.
/// </returns>
private static IEnumerable<string> ExtractSkillsFromBullet(string bullet)
{
    if (string.IsNullOrWhiteSpace(bullet))
    {
        yield break;
    }

    var cue = Regex.Match(bullet, @"\b(?:using|including|with|technologies?:|tools?:)\s+(?<skills>.+)$", RegexOptions.IgnoreCase);
    if (!cue.Success)
    {
        yield break;
    }

    // SplitList is defined elsewhere in this class; it breaks the captured text into individual items.
    foreach (var candidate in SplitList(cue.Groups["skills"].Value))
    {
        var skill = candidate.Trim().TrimEnd('.');

        // Discard fragments that are too short or too long to be a plausible skill name.
        if (skill.Length is < 2 or > 40)
        {
            continue;
        }

        yield return skill;
    }
}
|
||||
|
||||
private static List<StructuredCvEducation> ParseEducation(string content)
|
||||
{
|
||||
var blocks = SplitBlocks(content);
|
||||
|
||||
Reference in New Issue
Block a user