Improve CV rewrite flow and parser accuracy

This commit is contained in:
2026-04-01 11:30:37 +02:00
parent f402213526
commit f22c6791a7
9 changed files with 581 additions and 55 deletions
+70 -8
View File
@@ -12,6 +12,13 @@ public static class StructuredCvProfileJson
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
};
// Names of programming languages, platforms and tools, compared case-insensitively
// (StringComparer.OrdinalIgnoreCase). LooksLikeSkillToken tests membership in this set;
// judging by the name, a value containing one of these is presumably not a place name.
private static readonly HashSet<string> NonLocationTokens = new(StringComparer.OrdinalIgnoreCase)
{
"python", "ruby", "sql", "mysql", "postgresql", "postgres", "sqlite", "javascript", "typescript",
"react", "node", "node.js", "c#", ".net", "asp.net", "java", "azure", "aws", "gcp", "docker",
"kubernetes", "terraform", "git", "github", "gitlab", "ci/cd", "rest", "graphql", "php", "golang", "go"
};
/// <summary>
/// Creates a new <see cref="StructuredCvProfile"/> and passes it through <c>Normalize</c>.
/// </summary>
/// <returns>The result of normalizing a freshly constructed profile.</returns>
public static StructuredCvProfile Empty()
{
    var blankProfile = new StructuredCvProfile();
    return Normalize(blankProfile);
}
public static StructuredCvProfile Deserialize(string? json)
@@ -291,10 +298,12 @@ public static class StructuredCvProfileJson
if (LooksLikeDateRange(trimmed) || LooksLikeSectionHeading(trimmed) || LooksLikeUrlOrEmail(trimmed)) return null;
if (trimmed.Any(char.IsDigit) || trimmed.Length > 80) return null;
var normalized = Regex.Replace(trimmed, @"\s+", " ").Trim(' ', '|', ';', ':');
var normalized = Regex.Replace(trimmed, @"\s+[A-Z](?:\s+[A-Z]){2,}(?:\b.*)?$", string.Empty).Trim();
normalized = Regex.Replace(normalized, @"\s+", " ").Trim(' ', '|', ';', ':');
var parts = normalized.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
if (parts.Length == 0 || parts.Length > 4) return null;
if (parts.Any(part => !Regex.IsMatch(part, @"^[\p{L}][\p{L}'\-. ]+$"))) return null;
if (parts.Any(LooksLikeSkillToken)) return null;
return string.Join(", ", parts);
}
@@ -378,15 +387,60 @@ public static class StructuredCvProfileJson
|| Regex.IsMatch(value, @"\b[A-Z]{2,}\b");
}
/// <summary>
/// Reports whether <paramref name="value"/> is one of the entries in <c>NonLocationTokens</c>.
/// </summary>
/// <param name="value">Candidate token; surrounding dots and spaces are tolerated.</param>
/// <returns><c>true</c> when the token (in any of the trimmed forms below) is in the set.</returns>
private static bool LooksLikeSkillToken(string value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return false;
    }

    // Try the token as written first, then with only trailing punctuation removed, so that
    // entries which themselves start with a dot (".net") are still recognised. Stripping the
    // leading dot unconditionally would turn ".net" into "net", which is not in the set.
    if (NonLocationTokens.Contains(candidate) || NonLocationTokens.Contains(candidate.TrimEnd('.', ' ')))
    {
        return true;
    }

    // Finally strip both ends, which is the form the lookup has always accepted ("...python.").
    return NonLocationTokens.Contains(candidate.Trim('.', ' '));
}
/// <summary>
/// Heuristically decides whether <paramref name="value"/> mentions a qualification keyword
/// (for example "BSc", "Diploma", "Level 3", "Master's").
/// </summary>
/// <param name="value">Text to inspect; must not be <c>null</c>.</param>
/// <returns><c>true</c> when any keyword occurs as a whole word, ignoring case.</returns>
private static bool LooksLikeQualification(string value)
{
    // Whole-word keyword alternation. Compared with a bare keyword list this also accepts:
    //  - the possessive written without an apostrophe or with a typographic one
    //    ("Masters", "Bachelor\u2019s"), which is how many CVs spell it;
    //  - simple plurals ("A-levels", "GCSEs", "diplomas", "certificates").
    // The trailing \b still prevents partial hits such as "mastery".
    const string pattern =
        @"\b(level\s*\d+|nvqs?|btecs?|gcses?|a-?levels?|diplomas?|certificates?|certifications?"
        + @"|bachelor(?:['\u2019]?s)?|master(?:['\u2019]?s)?|phd|doctorate|mba|ba|bsc|msc|ma"
        + @"|associate|apprenticeships?|degrees?|ict)\b";

    return Regex.IsMatch(value, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Heuristically decides whether <paramref name="value"/> contains a word that typically
/// appears in the name of an educational institution.
/// </summary>
/// <param name="value">Text to inspect; must not be <c>null</c>.</param>
/// <returns><c>true</c> when one of the keywords occurs as a whole word, ignoring case.</returns>
private static bool LooksLikeInstitutionName(string value)
{
    const string institutionKeywords = @"\b(university|college|school|academy|institute|faculty|campus|council|polytechnic)\b";

    var keywordHit = Regex.Match(value, institutionKeywords, RegexOptions.IgnoreCase);
    return keywordHit.Success;
}
/// <summary>
/// Cleans up a qualification string, discarding values that look like a date range,
/// a URL/e-mail address or a section heading.
/// </summary>
/// <param name="value">Raw qualification text; may be <c>null</c>.</param>
/// <returns>
/// The text with whitespace runs collapsed to single spaces and leading/trailing
/// spaces, '|', ';' and ':' removed, or <c>null</c> when nothing usable remains.
/// </returns>
private static string? NormalizeQualification(string? value)
{
    if (TrimOrNull(value) is not { } candidate)
    {
        return null;
    }

    // Reject text that is really some other kind of CV line.
    var isOtherKindOfLine = LooksLikeDateRange(candidate)
        || LooksLikeUrlOrEmail(candidate)
        || LooksLikeSectionHeading(candidate);
    if (isOtherKindOfLine)
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ';', ':');
    if (string.IsNullOrWhiteSpace(cleaned))
    {
        return null;
    }

    return cleaned;
}
/// <summary>
/// Cleans up an institution name, discarding values that look like a date range,
/// a URL/e-mail address or a section heading.
/// </summary>
/// <param name="value">Raw institution text; may be <c>null</c>.</param>
/// <returns>
/// The name with whitespace runs collapsed to single spaces and leading/trailing
/// spaces, '|', ';' and ':' removed, or <c>null</c> when nothing usable remains.
/// </returns>
private static string? NormalizeInstitution(string? value)
{
    var name = TrimOrNull(value);
    if (name is null)
    {
        return null;
    }

    // Each check short-circuits in the same order: date range, URL/e-mail, section heading.
    if (LooksLikeDateRange(name))
    {
        return null;
    }

    if (LooksLikeUrlOrEmail(name))
    {
        return null;
    }

    if (LooksLikeSectionHeading(name))
    {
        return null;
    }

    name = Regex.Replace(name, @"\s+", " ").Trim(' ', '|', ';', ':');
    return string.IsNullOrWhiteSpace(name) ? null : name;
}
/// <summary>
/// Normalizes every field of an education entry in place and, when the qualification and
/// institution appear to have been captured the wrong way round, swaps them.
/// </summary>
/// <param name="education">Entry to normalize; a new empty entry is used when <c>null</c>.</param>
/// <returns>The same (or newly created) entry with normalized fields.</returns>
private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
{
    education ??= new StructuredCvEducation();

    // Each field is trimmed first and then run through its field-specific normalizer.
    education.Qualification = NormalizeQualification(TrimOrNull(education.Qualification));
    education.Institution = NormalizeInstitution(TrimOrNull(education.Institution));
    education.Location = NormalizeLocationValue(TrimOrNull(education.Location));
    education.Start = NormalizeDateValue(TrimOrNull(education.Start));
    education.End = NormalizeDateValue(TrimOrNull(education.End));
    education.Details = CleanList(education.Details);

    var qualification = education.Qualification;
    var institution = education.Institution;
    if (string.IsNullOrWhiteSpace(qualification) || string.IsNullOrWhiteSpace(institution))
    {
        // Nothing to compare unless both values are present.
        return education;
    }

    // Swap only when BOTH values are unambiguous: the qualification reads purely like an
    // institution name and the institution reads purely like a qualification.
    var qualificationReadsAsInstitution = LooksLikeInstitutionName(qualification) && !LooksLikeQualification(qualification);
    var institutionReadsAsQualification = LooksLikeQualification(institution) && !LooksLikeInstitutionName(institution);
    if (qualificationReadsAsInstitution && institutionReadsAsQualification)
    {
        education.Qualification = institution;
        education.Institution = qualification;
    }

    return education;
}
@@ -588,7 +642,11 @@ public static class StructuredCvProfileJson
job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
}
var metadataWithoutDates = metadata.Select(line => line.Replace(dateValue ?? string.Empty, string.Empty).Trim(' ', '|', ',', '-')).Where(line => !string.IsNullOrWhiteSpace(line)).ToList();
var metadataWithoutDates = metadata
.Select(line => string.IsNullOrWhiteSpace(dateValue) ? line : line.Replace(dateValue, string.Empty))
.Select(line => line.Trim(' ', '|', ',', '-'))
.Where(line => !string.IsNullOrWhiteSpace(line))
.ToList();
if (metadataWithoutDates.Count > 0) job.Company = metadataWithoutDates[0].NullIfWhitespace();
if (metadataWithoutDates.Count > 1) job.Location = metadataWithoutDates[1].NullIfWhitespace();
@@ -625,7 +683,11 @@ public static class StructuredCvProfileJson
education.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
}
var metadataWithoutDates = metadata.Select(line => line.Replace(dateValue ?? string.Empty, string.Empty).Trim(' ', '|', ',', '-')).Where(line => !string.IsNullOrWhiteSpace(line)).ToList();
var metadataWithoutDates = metadata
.Select(line => string.IsNullOrWhiteSpace(dateValue) ? line : line.Replace(dateValue, string.Empty))
.Select(line => line.Trim(' ', '|', ',', '-'))
.Where(line => !string.IsNullOrWhiteSpace(line))
.ToList();
if (metadataWithoutDates.Count > 0) education.Institution = metadataWithoutDates[0].NullIfWhitespace();
if (metadataWithoutDates.Count > 1) education.Location = metadataWithoutDates[1].NullIfWhitespace();