Improve CV rewrite flow and parser accuracy
This commit is contained in:
@@ -12,6 +12,13 @@ public static class StructuredCvProfileJson
|
||||
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
|
||||
};
|
||||
|
||||
/// <summary>
/// Technology and tooling keywords that must not be accepted as location parts.
/// Lookups are case-insensitive (ordinal); consulted by <c>LooksLikeSkillToken</c>.
/// </summary>
private static readonly HashSet<string> NonLocationTokens = new(StringComparer.OrdinalIgnoreCase)
{
    // Languages and runtimes
    "python", "ruby", "javascript", "typescript", "java", "php", "golang", "go", "c#", ".net", "asp.net",
    // Databases and query languages
    "sql", "mysql", "postgresql", "postgres", "sqlite", "graphql",
    // Frameworks and API styles
    "react", "node", "node.js", "rest",
    // Cloud platforms and infrastructure
    "azure", "aws", "gcp", "docker", "kubernetes", "terraform",
    // Source control and delivery
    "git", "github", "gitlab", "ci/cd",
};
|
||||
|
||||
/// <summary>
/// Builds a profile from a default-constructed <see cref="StructuredCvProfile"/> passed through <c>Normalize</c>.
/// </summary>
/// <returns>The result of <c>Normalize</c> applied to a new, otherwise untouched profile.</returns>
public static StructuredCvProfile Empty()
{
    var blankProfile = new StructuredCvProfile();
    return Normalize(blankProfile);
}
|
||||
|
||||
public static StructuredCvProfile Deserialize(string? json)
|
||||
@@ -291,10 +298,12 @@ public static class StructuredCvProfileJson
|
||||
if (LooksLikeDateRange(trimmed) || LooksLikeSectionHeading(trimmed) || LooksLikeUrlOrEmail(trimmed)) return null;
|
||||
if (trimmed.Any(char.IsDigit) || trimmed.Length > 80) return null;
|
||||
|
||||
var normalized = Regex.Replace(trimmed, @"\s+", " ").Trim(' ', '|', ';', ':');
|
||||
var normalized = Regex.Replace(trimmed, @"\s+[A-Z](?:\s+[A-Z]){2,}(?:\b.*)?$", string.Empty).Trim();
|
||||
normalized = Regex.Replace(normalized, @"\s+", " ").Trim(' ', '|', ';', ':');
|
||||
var parts = normalized.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
|
||||
if (parts.Length == 0 || parts.Length > 4) return null;
|
||||
if (parts.Any(part => !Regex.IsMatch(part, @"^[\p{L}][\p{L}'’\-. ]+$"))) return null;
|
||||
if (parts.Any(LooksLikeSkillToken)) return null;
|
||||
|
||||
return string.Join(", ", parts);
|
||||
}
|
||||
@@ -378,15 +387,60 @@ public static class StructuredCvProfileJson
|
||||
|| Regex.IsMatch(value, @"\b[A-Z]{2,}\b");
|
||||
}
|
||||
|
||||
/// <summary>
/// Reports whether <paramref name="value"/> is one of the known technology keywords in
/// <c>NonLocationTokens</c> (case-insensitive), which disqualifies it as a location part.
/// </summary>
/// <param name="value">Candidate token, e.g. one comma-separated part of a location line.</param>
/// <returns><c>true</c> when the token, as written or with surrounding dots/spaces removed, is a known keyword.</returns>
private static bool LooksLikeSkillToken(string value)
{
    // NOTE(review): TrimOrNull is assumed to yield null for null/blank input - confirm against its definition.
    var token = TrimOrNull(value);
    if (token is null)
    {
        return false;
    }

    // Check the token as written first: stripping dots up front would turn ".net" into "net",
    // which is not in the set, so the ".net" entry could never match.
    if (NonLocationTokens.Contains(token))
    {
        return true;
    }

    // Fall back to the form without sentence punctuation ("Python." -> "Python").
    return NonLocationTokens.Contains(token.Trim('.', ' '));
}
|
||||
|
||||
/// <summary>
/// Reports whether <paramref name="value"/> contains a qualification keyword as a whole word
/// (for example "level 3", "nvq", "bsc", "diploma", "degree").
/// </summary>
/// <param name="value">Text to inspect.</param>
/// <returns><c>true</c> when any keyword matches, ignoring case.</returns>
private static bool LooksLikeQualification(string value)
{
    // CultureInvariant keeps the case-insensitive match independent of the thread culture;
    // under tr-TR/az casing rules "DIPLOMA" or "CERTIFICATE" would otherwise fail to match.
    return Regex.IsMatch(
        value,
        @"\b(level\s*\d+|nvq|btec|gcse|a-?level|diploma|certificate|certification|bachelor(?:'s)?|master(?:'s)?|phd|doctorate|mba|ba|bsc|msc|ma|associate|apprenticeship|degree|ict)\b",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
}
|
||||
|
||||
/// <summary>
/// Reports whether <paramref name="value"/> contains an institution keyword as a whole word
/// (university, college, school, academy, institute, faculty, campus, council, polytechnic).
/// </summary>
/// <param name="value">Text to inspect.</param>
/// <returns><c>true</c> when any keyword matches, ignoring case.</returns>
private static bool LooksLikeInstitutionName(string value)
{
    // CultureInvariant keeps the case-insensitive match independent of the thread culture;
    // under tr-TR/az casing rules "UNIVERSITY" or "INSTITUTE" would otherwise fail to match.
    return Regex.IsMatch(
        value,
        @"\b(university|college|school|academy|institute|faculty|campus|council|polytechnic)\b",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
}
|
||||
|
||||
/// <summary>
/// Cleans a qualification string: rejects values that look like a date range, a URL/e-mail or a
/// section heading, collapses whitespace runs to single spaces, and strips leading/trailing
/// spaces, pipes, semicolons and colons.
/// </summary>
/// <param name="value">Raw qualification text; may be <c>null</c>.</param>
/// <returns>The cleaned text, or <c>null</c> when the input is rejected or nothing remains.</returns>
private static string? NormalizeQualification(string? value)
{
    if (TrimOrNull(value) is not { } candidate)
    {
        return null;
    }

    // The rejection checks run on the trimmed but not yet whitespace-collapsed text.
    var isNoise = LooksLikeDateRange(candidate)
        || LooksLikeUrlOrEmail(candidate)
        || LooksLikeSectionHeading(candidate);
    if (isNoise)
    {
        return null;
    }

    var singleSpaced = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = singleSpaced.Trim(' ', '|', ';', ':');
    return string.IsNullOrWhiteSpace(cleaned) ? null : cleaned;
}
|
||||
|
||||
/// <summary>
/// Cleans an institution string: rejects values that look like a date range, a URL/e-mail or a
/// section heading, collapses whitespace runs to single spaces, and strips leading/trailing
/// spaces, pipes, semicolons and colons.
/// </summary>
/// <param name="value">Raw institution text; may be <c>null</c>.</param>
/// <returns>The cleaned text, or <c>null</c> when the input is rejected or nothing remains.</returns>
private static string? NormalizeInstitution(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null
        || LooksLikeDateRange(candidate)
        || LooksLikeUrlOrEmail(candidate)
        || LooksLikeSectionHeading(candidate))
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ';', ':');
    if (string.IsNullOrWhiteSpace(cleaned))
    {
        return null;
    }

    return cleaned;
}
|
||||
|
||||
/// <summary>
/// Normalises an education entry in place: each text field is trimmed via <c>TrimOrNull</c> and then
/// passed through its field-specific normaliser, <c>Details</c> goes through <c>CleanList</c>, and
/// qualification and institution are exchanged when each clearly looks like the other.
/// </summary>
/// <param name="education">Entry to normalise; a new instance is created when <c>null</c>.</param>
/// <returns>The same (or newly created) instance with normalised fields.</returns>
private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
{
    var entry = education ?? new StructuredCvEducation();

    // Trim first, then apply the field-specific clean-up to the trimmed value.
    entry.Qualification = NormalizeQualification(TrimOrNull(entry.Qualification));
    entry.Institution = NormalizeInstitution(TrimOrNull(entry.Institution));
    entry.Location = NormalizeLocationValue(TrimOrNull(entry.Location));
    entry.Start = NormalizeDateValue(TrimOrNull(entry.Start));
    entry.End = NormalizeDateValue(TrimOrNull(entry.End));
    entry.Details = CleanList(entry.Details);

    // A swap is only considered when both fields carry text.
    if (string.IsNullOrWhiteSpace(entry.Qualification) || string.IsNullOrWhiteSpace(entry.Institution))
    {
        return entry;
    }

    var qualification = entry.Qualification;
    var institution = entry.Institution;

    var qualificationIsInstitution = LooksLikeInstitutionName(qualification) && !LooksLikeQualification(qualification);
    var institutionIsQualification = LooksLikeQualification(institution) && !LooksLikeInstitutionName(institution);

    // Exchange only when BOTH fields are unambiguously in the wrong slot.
    if (qualificationIsInstitution && institutionIsQualification)
    {
        entry.Qualification = institution;
        entry.Institution = qualification;
    }

    return entry;
}
|
||||
|
||||
@@ -588,7 +642,11 @@ public static class StructuredCvProfileJson
|
||||
job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
var metadataWithoutDates = metadata.Select(line => line.Replace(dateValue ?? string.Empty, string.Empty).Trim(' ', '|', ',', '-')).Where(line => !string.IsNullOrWhiteSpace(line)).ToList();
|
||||
var metadataWithoutDates = metadata
|
||||
.Select(line => string.IsNullOrWhiteSpace(dateValue) ? line : line.Replace(dateValue, string.Empty))
|
||||
.Select(line => line.Trim(' ', '|', ',', '-'))
|
||||
.Where(line => !string.IsNullOrWhiteSpace(line))
|
||||
.ToList();
|
||||
if (metadataWithoutDates.Count > 0) job.Company = metadataWithoutDates[0].NullIfWhitespace();
|
||||
if (metadataWithoutDates.Count > 1) job.Location = metadataWithoutDates[1].NullIfWhitespace();
|
||||
|
||||
@@ -625,7 +683,11 @@ public static class StructuredCvProfileJson
|
||||
education.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
|
||||
}
|
||||
|
||||
var metadataWithoutDates = metadata.Select(line => line.Replace(dateValue ?? string.Empty, string.Empty).Trim(' ', '|', ',', '-')).Where(line => !string.IsNullOrWhiteSpace(line)).ToList();
|
||||
var metadataWithoutDates = metadata
|
||||
.Select(line => string.IsNullOrWhiteSpace(dateValue) ? line : line.Replace(dateValue, string.Empty))
|
||||
.Select(line => line.Trim(' ', '|', ',', '-'))
|
||||
.Where(line => !string.IsNullOrWhiteSpace(line))
|
||||
.ToList();
|
||||
if (metadataWithoutDates.Count > 0) education.Institution = metadataWithoutDates[0].NullIfWhitespace();
|
||||
if (metadataWithoutDates.Count > 1) education.Location = metadataWithoutDates[1].NullIfWhitespace();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user