1039 lines
51 KiB
C#
1039 lines
51 KiB
C#
using System.Text.Json;
|
||
using System.Text.Json.Serialization;
|
||
using System.Text.RegularExpressions;
|
||
|
||
namespace JobTrackerApi.Models;
|
||
|
||
public static class StructuredCvProfileJson
|
||
{
|
||
/// <summary>
/// Serializer settings shared by <c>Deserialize</c> and <c>Serialize</c>: web defaults,
/// case-insensitive property matching, and null-valued properties omitted on write.
/// </summary>
private static readonly JsonSerializerOptions SerializerOptions =
    new JsonSerializerOptions(JsonSerializerDefaults.Web)
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        PropertyNameCaseInsensitive = true,
    };
|
||
|
||
/// <summary>
/// Technology names that must never be accepted as (part of) a location.
/// Lookups are case-insensitive.
/// </summary>
private static readonly HashSet<string> NonLocationTokens = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    // Languages and runtimes
    "python", "ruby", "javascript", "typescript", "java", "php", "golang", "go", "c#", ".net", "asp.net",
    // Databases and query languages
    "sql", "mysql", "postgresql", "postgres", "sqlite", "graphql",
    // Frameworks and API styles
    "react", "node", "node.js", "rest",
    // Cloud and infrastructure
    "azure", "aws", "gcp", "docker", "kubernetes", "terraform",
    // Source control and delivery
    "git", "github", "gitlab", "ci/cd",
};
|
||
|
||
/// <summary>Creates a new profile with no content and runs it through <c>Normalize</c>.</summary>
public static StructuredCvProfile Empty()
{
    var blank = new StructuredCvProfile();
    return Normalize(blank);
}
|
||
|
||
/// <summary>
/// Reads a stored profile from JSON, accepting both supported shapes: a JSON array
/// (treated as a list of sections) and a JSON object (treated as a full profile).
/// </summary>
/// <param name="json">Raw JSON text; may be null, empty or whitespace.</param>
/// <returns>
/// A normalized profile. Blank input, a root that is neither array nor object, and any
/// failure while parsing or converting all yield an empty profile instead of throwing.
/// </returns>
public static StructuredCvProfile Deserialize(string? json)
{
    if (string.IsNullOrWhiteSpace(json)) return Empty();

    try
    {
        // Parse once and deserialize from the parsed root instead of re-parsing the text.
        using var doc = JsonDocument.Parse(json);
        var root = doc.RootElement;

        switch (root.ValueKind)
        {
            case JsonValueKind.Array:
            {
                var sections = root.Deserialize<List<StructuredCvSection>>(SerializerOptions)
                    ?? new List<StructuredCvSection>();
                return FromSections(sections);
            }

            case JsonValueKind.Object:
            {
                var profile = root.Deserialize<StructuredCvProfile>(SerializerOptions)
                    ?? new StructuredCvProfile();
                return Normalize(profile);
            }

            default:
                return Empty();
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: malformed or incompatible stored JSON must never
        // break the caller, so any failure degrades to an empty profile.
        return Empty();
    }
}
|
||
|
||
/// <summary>
/// Normalizes <paramref name="profile"/> (a null profile becomes an empty one) and
/// writes it as JSON using the shared serializer options.
/// </summary>
public static string Serialize(StructuredCvProfile? profile)
    => JsonSerializer.Serialize(Normalize(profile), SerializerOptions);
|
||
|
||
/// <summary>
/// Combines two profiles, letting <paramref name="preferred"/> win wherever it has data.
/// </summary>
/// <remarks>
/// Both inputs are normalized first (normalization mutates the instances passed in).
/// Contact fields are filled only where the preferred value is null. Summary, skills and
/// interests are unioned case-insensitively; languages are unioned by name. Jobs, education,
/// certifications, projects, other sections and sections are taken from the fallback only
/// when the preferred profile has none. Metadata fields missing from the preferred profile
/// are copied from the fallback.
/// </remarks>
/// <returns>The (mutated and re-normalized) preferred profile.</returns>
public static StructuredCvProfile Merge(StructuredCvProfile? preferred, StructuredCvProfile? fallback)
{
    var target = Normalize(preferred);
    var backup = Normalize(fallback);

    // Contact: keep what the preferred profile has, fill the gaps from the fallback.
    var mine = target.Contact;
    var theirs = backup.Contact;
    mine.FullName ??= theirs.FullName;
    mine.Headline ??= theirs.Headline;
    mine.Email ??= theirs.Email;
    mine.Phone ??= theirs.Phone;
    mine.Location ??= theirs.Location;
    mine.Website ??= theirs.Website;
    mine.LinkedIn ??= theirs.LinkedIn;

    var ignoreCase = StringComparer.OrdinalIgnoreCase;

    if (target.Summary.Count == 0)
    {
        target.Summary = backup.Summary;
    }
    else
    {
        target.Summary = target.Summary.Concat(backup.Summary).Distinct(ignoreCase).ToList();
    }

    // Structured entry lists are all-or-nothing: never interleave two sources.
    if (target.Jobs.Count == 0) target.Jobs = backup.Jobs;
    if (target.Education.Count == 0) target.Education = backup.Education;
    if (target.Certifications.Count == 0) target.Certifications = backup.Certifications;
    if (target.Projects.Count == 0) target.Projects = backup.Projects;

    if (target.Skills.Count == 0)
    {
        target.Skills = backup.Skills;
    }
    else
    {
        target.Skills = target.Skills.Concat(backup.Skills).Distinct(ignoreCase).ToList();
    }

    if (target.Languages.Count == 0)
    {
        target.Languages = backup.Languages;
    }
    else
    {
        // First occurrence per language name wins, so preferred entries shadow fallback ones.
        target.Languages = target.Languages
            .Concat(backup.Languages)
            .GroupBy(entry => entry.Name ?? string.Empty, ignoreCase)
            .Select(sameName => sameName.First())
            .ToList();
    }

    if (target.Interests.Count == 0)
    {
        target.Interests = backup.Interests;
    }
    else
    {
        target.Interests = target.Interests.Concat(backup.Interests).Distinct(ignoreCase).ToList();
    }

    if (target.OtherSections.Count == 0) target.OtherSections = backup.OtherSections;
    if (target.Sections.Count == 0) target.Sections = backup.Sections;

    // Existing metadata keys are left untouched; only missing ones are added.
    foreach (var field in backup.Metadata.Fields)
    {
        target.Metadata.Fields.TryAdd(field.Key, field.Value);
    }

    return Normalize(target);
}
|
||
|
||
/// <summary>
/// Builds a structured profile from free-text sections by dispatching each section,
/// by its lower-cased name, to the matching parser.
/// </summary>
/// <remarks>
/// Sections with an unrecognized name are preserved as "other" sections. When several
/// sections map to the same field, the last one assigned wins. The result is normalized.
/// </remarks>
public static StructuredCvProfile FromSections(IEnumerable<StructuredCvSection>? sections)
{
    var cleaned = NormalizeSections(sections);
    var result = new StructuredCvProfile { Sections = cleaned };

    foreach (var current in cleaned)
    {
        var heading = current.Name.Trim().ToLowerInvariant();
        var body = current.Content;

        switch (heading)
        {
            case "contact":
                ApplyContact(result.Contact, body);
                break;

            case "professional summary" or "summary":
                result.Summary = SplitList(body);
                break;

            case "skills" or "core skills" or "technical skills":
                result.Skills = SplitList(body);
                break;

            case "languages":
                result.Languages = ParseLanguages(body);
                break;

            case "interests":
                result.Interests = SplitList(body);
                break;

            case "work experience" or "experience" or "employment history":
                result.Jobs = ParseJobs(body);
                break;

            case "education":
                result.Education = ParseEducation(body);
                break;

            case "certifications" or "certificates":
                result.Certifications = ParseCertifications(body);
                break;

            case "projects" or "selected projects":
                result.Projects = ParseProjects(body);
                break;

            default:
                // Unknown heading: keep the content under its original (non-lowercased) title.
                var extra = new StructuredCvOtherSection
                {
                    Title = current.Name,
                    Items = SplitList(body),
                };
                result.OtherSections.Add(extra);
                break;
        }
    }

    return Normalize(result);
}
|
||
|
||
/// <summary>
/// Brings a profile into canonical form. The given instance is mutated and returned;
/// a null argument is replaced by a new profile.
/// </summary>
/// <remarks>
/// Defaults the version to "1", makes sure the metadata containers exist, normalizes the
/// contact and every entry list, and drops entries without identifying content. Sections
/// that survive cleaning are kept; otherwise they are rebuilt from the structured fields.
/// </remarks>
public static StructuredCvProfile Normalize(StructuredCvProfile? profile)
{
    profile ??= new StructuredCvProfile();

    if (string.IsNullOrWhiteSpace(profile.Version))
    {
        profile.Version = "1";
    }
    else
    {
        profile.Version = profile.Version.Trim();
    }

    profile.Metadata ??= new StructuredCvMetadata();
    profile.Metadata.Fields ??= new Dictionary<string, StructuredCvFieldMetadata>();

    profile.Contact = NormalizeContact(profile.Contact);
    profile.Summary = CleanList(profile.Summary);

    var jobs = profile.Jobs ?? new List<StructuredCvJob>();
    profile.Jobs = jobs.Select(NormalizeJob).Where(JobHasContent).ToList();

    var education = profile.Education ?? new List<StructuredCvEducation>();
    profile.Education = education.Select(NormalizeEducation).Where(EducationHasContent).ToList();

    var certifications = profile.Certifications ?? new List<StructuredCvCertification>();
    profile.Certifications = certifications.Select(NormalizeCertification).Where(CertificationHasContent).ToList();

    var projects = profile.Projects ?? new List<StructuredCvProject>();
    profile.Projects = projects.Select(NormalizeProject).Where(ProjectHasContent).ToList();

    profile.Skills = CleanList(profile.Skills);

    var languages = profile.Languages ?? new List<StructuredCvLanguage>();
    profile.Languages = languages
        .Select(NormalizeLanguage)
        .Where(entry => !string.IsNullOrWhiteSpace(entry.Name))
        .ToList();

    profile.Interests = CleanList(profile.Interests);

    var others = profile.OtherSections ?? new List<StructuredCvOtherSection>();
    profile.OtherSections = others
        .Select(other => new StructuredCvOtherSection
        {
            Title = TrimOrNull(other?.Title),
            Items = CleanList(other?.Items),
        })
        .Where(other => !string.IsNullOrWhiteSpace(other.Title) || other.Items.Count > 0)
        .ToList();

    // Must run last: rebuilding sections reads the already-normalized fields above.
    var existingSections = NormalizeSections(profile.Sections);
    profile.Sections = existingSections.Count > 0 ? existingSections : BuildSections(profile);
    return profile;

    // An entry is kept when it has at least one identifying field or any detail lines.
    static bool JobHasContent(StructuredCvJob job) =>
        !string.IsNullOrWhiteSpace(job.Title)
        || !string.IsNullOrWhiteSpace(job.Company)
        || job.Bullets.Count > 0;

    static bool EducationHasContent(StructuredCvEducation item) =>
        !string.IsNullOrWhiteSpace(item.Qualification)
        || !string.IsNullOrWhiteSpace(item.Institution)
        || item.Details.Count > 0;

    static bool CertificationHasContent(StructuredCvCertification item) =>
        !string.IsNullOrWhiteSpace(item.Name)
        || !string.IsNullOrWhiteSpace(item.Issuer)
        || item.Details.Count > 0;

    static bool ProjectHasContent(StructuredCvProject item) =>
        !string.IsNullOrWhiteSpace(item.Name)
        || !string.IsNullOrWhiteSpace(item.Role)
        || item.Bullets.Count > 0;
}
|
||
|
||
/// <summary>
/// Cleans every contact field in place (creating the contact when it is null):
/// plain text fields are trimmed, while location, website and LinkedIn go through
/// their dedicated normalizers and may end up null.
/// </summary>
private static StructuredCvContact NormalizeContact(StructuredCvContact? contact)
{
    var result = contact ?? new StructuredCvContact();

    // Free-text fields only need trimming.
    result.FullName = TrimOrNull(result.FullName);
    result.Headline = TrimOrNull(result.Headline);
    result.Email = TrimOrNull(result.Email);
    result.Phone = TrimOrNull(result.Phone);

    // Structured fields are validated as well as cleaned.
    result.Location = NormalizeLocationValue(result.Location);
    result.Website = NormalizeWebsite(result.Website);
    result.LinkedIn = NormalizeLinkedIn(result.LinkedIn);

    return result;
}
|
||
|
||
/// <summary>
/// Cleans a job entry in place (creating it when null) and repairs common
/// title/company mix-ups before normalizing dates, bullets and skills.
/// </summary>
/// <remarks>
/// Repair steps, applied in this order:
/// 1. a title of the form "X at Y" with no company is split into title X and company Y;
/// 2. title and company are swapped when each clearly looks like the other;
/// 3. a title that only looks like a company is moved to the company (if empty) and cleared;
/// 4. a company that only looks like a title is moved to the title when the title is empty.
/// </remarks>
private static StructuredCvJob NormalizeJob(StructuredCvJob? job)
{
    job ??= new StructuredCvJob();

    var role = NormalizeJobTitle(job.Title);
    var employer = NormalizeCompanyName(job.Company);
    var place = NormalizeLocationValue(job.Location);

    // Step 1: "Developer at Acme" packed into the title field.
    if (employer is null && !string.IsNullOrWhiteSpace(role))
    {
        var packed = Regex.Match(role, @"^(?<title>.+?)\s+at\s+(?<company>.+)$", RegexOptions.IgnoreCase);
        if (packed.Success)
        {
            role = NormalizeJobTitle(packed.Groups["title"].Value);
            employer = NormalizeCompanyName(packed.Groups["company"].Value);
        }
    }

    // Step 2: both present but evidently in each other's slot.
    if (!string.IsNullOrWhiteSpace(role) && !string.IsNullOrWhiteSpace(employer))
    {
        var roleReadsAsCompany = LooksLikeCompanyName(role) && !LooksLikeJobTitle(role);
        var employerReadsAsRole = LooksLikeJobTitle(employer) && !LooksLikeCompanyName(employer);
        if (roleReadsAsCompany && employerReadsAsRole)
        {
            var previousRole = role;
            role = employer;
            employer = previousRole;
        }
    }

    // Step 3: the title is really a company name.
    if (!string.IsNullOrWhiteSpace(role) && !LooksLikeJobTitle(role) && LooksLikeCompanyName(role))
    {
        employer ??= role;
        role = null;
    }

    // Step 4: the company is really a job title and the title slot is free.
    if (role is null
        && !string.IsNullOrWhiteSpace(employer)
        && !LooksLikeCompanyName(employer)
        && LooksLikeJobTitle(employer))
    {
        role = employer;
        employer = null;
    }

    job.Title = role;
    job.Company = employer;
    job.Location = place;
    job.Start = NormalizeDateValue(job.Start);
    job.End = NormalizeDateValue(job.End);

    // Strip bullet markers, then drop lines that merely repeat headings, dates, title or company.
    var keptBullets = new List<string>();
    foreach (var raw in CleanList(job.Bullets))
    {
        var bullet = NormalizeBullet(raw);
        if (bullet is not null && IsUsefulJobBullet(bullet, job.Title, job.Company))
        {
            keptBullets.Add(bullet);
        }
    }

    job.Bullets = keptBullets;
    job.Skills = CleanList(job.Skills);

    // An end date of "present"/"current" marks the job as ongoing even if the flag was not set.
    if (!job.IsCurrent)
    {
        job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase)
            || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
    }

    return job;
}
|
||
|
||
/// <summary>
/// Strips surrounding whitespace and leading bullet markers ('-', '•', '*') from a line.
/// </summary>
/// <param name="value">Raw bullet text; may be null.</param>
/// <returns>
/// The cleaned text, or null when the input is blank or consists of nothing but
/// bullet markers (so callers filtering on null never keep an empty bullet).
/// </returns>
private static string? NormalizeBullet(string? value)
{
    if (string.IsNullOrWhiteSpace(value)) return null;

    var text = value.Trim().TrimStart('-', '•', '*', ' ');

    // A marker-only line such as "-" or "• " leaves nothing behind; report it as missing.
    return text.Length == 0 ? null : text;
}
|
||
|
||
/// <summary>
/// Decides whether a bullet carries real content for a job entry.
/// </summary>
/// <remarks>
/// Rejected: blank text, date ranges, section headings, "Skills:" lines, lines that
/// merely repeat the job title or company (case-insensitive), and single tokens
/// shorter than 12 characters.
/// </remarks>
private static bool IsUsefulJobBullet(string? value, string? title, string? company)
{
    if (TrimOrNull(value) is not { } text) return false;

    var isStructuralNoise = LooksLikeDateRange(text)
        || LooksLikeSectionHeading(text)
        || text.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase);
    if (isStructuralNoise) return false;

    var repeatsTitle = title is not null && text.Equals(title, StringComparison.OrdinalIgnoreCase);
    if (repeatsTitle) return false;

    var repeatsCompany = company is not null && text.Equals(company, StringComparison.OrdinalIgnoreCase);
    if (repeatsCompany) return false;

    // A lone short word is almost never a meaningful achievement line.
    var isShortSingleToken = text.Length < 12 && !text.Contains(' ');
    return !isShortSingleToken;
}
|
||
|
||
/// <summary>
/// Cleans a job title: rejects values that are really dates, section headings,
/// URLs or e-mail addresses, collapses whitespace runs, and trims separator
/// characters (space, '|', ',', '-', ':') from both ends.
/// </summary>
/// <returns>The cleaned title, or null when nothing usable remains.</returns>
private static string? NormalizeJobTitle(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null
        || LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate))
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ',', '-', ':');
    return string.IsNullOrWhiteSpace(cleaned) ? null : cleaned;
}
|
||
|
||
/// <summary>
/// Cleans a company name: rejects dates, section headings, URLs/e-mails, "Skills:"
/// lines and values containing both a period and a space, then collapses whitespace
/// and trims separator characters (space, '|', ',', '-', ':') from both ends.
/// </summary>
/// <returns>The cleaned name, or null when the value is rejected or empty.</returns>
private static string? NormalizeCompanyName(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null) return null;

    var isOtherKindOfLine = LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate);
    if (isOtherKindOfLine) return null;

    if (candidate.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase)) return null;

    // Period plus space: presumably sentence-like text rather than a name.
    var hasPeriodAndSpace = candidate.Contains('.') && candidate.Contains(' ');
    if (hasPeriodAndSpace) return null;

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ',', '-', ':');
    return string.IsNullOrWhiteSpace(cleaned) ? null : cleaned;
}
|
||
|
||
/// <summary>
/// Validates and cleans a location such as "Oslo, Norway".
/// </summary>
/// <remarks>
/// Rejects dates, headings, URLs/e-mails, anything containing a digit and anything longer
/// than 80 characters. Then cuts off a trailing run of three or more single capital
/// letters, anything from "remote"/"hybrid" onward, and anything from a job-role word
/// onward. What remains must be one to four comma-separated parts made of letters,
/// apostrophes, hyphens, periods and spaces, none of which is a known technology token.
/// </remarks>
/// <returns>The parts re-joined with ", ", or null when the value is not a plausible location.</returns>
private static string? NormalizeLocationValue(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null) return null;

    if (LooksLikeDateRange(candidate) || LooksLikeSectionHeading(candidate) || LooksLikeUrlOrEmail(candidate))
    {
        return null;
    }

    if (candidate.Length > 80 || candidate.Any(char.IsDigit)) return null;

    // Trailing spaced-out capitals, e.g. "London S K I L L S".
    var text = Regex.Replace(candidate, @"\s+[A-Z](?:\s+[A-Z]){2,}(?:\b.*)?$", string.Empty).Trim();

    // Work-mode suffixes.
    text = Regex.Replace(text, @"\b(?:remote|hybrid)\b.*$", string.Empty, RegexOptions.IgnoreCase).Trim();

    // Job-role words that leaked into the location.
    text = Regex.Replace(
        text,
        @"\b(?:sales representative|developer|engineer|manager|consultant|analyst|designer|specialist|technician)\b.*$",
        string.Empty,
        RegexOptions.IgnoreCase).Trim();

    text = Regex.Replace(text, @"\s+", " ").Trim(' ', '|', ';', ':');

    var pieces = text.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    if (pieces.Length == 0 || pieces.Length > 4) return null;

    var everyPieceIsWordLike = pieces.All(piece => Regex.IsMatch(piece, @"^[\p{L}][\p{L}'’\-. ]+$"));
    if (!everyPieceIsWordLike) return null;

    if (pieces.Any(LooksLikeSkillToken)) return null;

    return string.Join(", ", pieces);
}
|
||
|
||
/// <summary>
/// Reduces a website value to its lower-cased host name.
/// </summary>
/// <remarks>
/// LinkedIn addresses are rejected here. A missing scheme is assumed to be https so the
/// value can be parsed as an absolute URI. The host must consist of dot-separated labels
/// ending in an alphabetic top-level label of at least two letters.
/// </remarks>
/// <returns>The host (path, query and scheme discarded), or null when the value is not usable.</returns>
private static string? NormalizeWebsite(string? value)
{
    var raw = TrimOrNull(value);
    if (raw is null) return null;
    if (raw.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase)) return null;

    var hasScheme = raw.Contains("://", StringComparison.Ordinal);
    var address = hasScheme ? raw : $"https://{raw}";

    if (!Uri.TryCreate(address, UriKind.Absolute, out var parsed)) return null;

    var host = parsed.Host.Trim().Trim('.').ToLowerInvariant();
    if (string.IsNullOrWhiteSpace(host)) return null;

    var isDomainName = Regex.IsMatch(host, @"^(?:[a-z0-9-]+\.)+[a-z]{2,}$", RegexOptions.IgnoreCase);
    return isDomainName ? host : null;
}
|
||
|
||
/// <summary>
/// Canonicalizes a LinkedIn profile link to <c>https://www.linkedin.com/{path}</c>.
/// </summary>
/// <remarks>
/// A missing scheme is assumed to be https. The host must be exactly linkedin.com or one
/// of its subdomains; look-alike hosts that merely contain the text (for example
/// "notlinkedin.com" or "linkedin.com.example.org") are rejected rather than being
/// rewritten into a genuine-looking LinkedIn URL. The path must start with /in/ or /pub/,
/// followed by one to three segments.
/// </remarks>
/// <returns>The canonical profile URL, or null when the value is not a LinkedIn profile link.</returns>
private static string? NormalizeLinkedIn(string? value)
{
    var trimmed = TrimOrNull(value);
    if (trimmed is null) return null;

    var candidate = trimmed;
    if (!candidate.Contains("://", StringComparison.Ordinal)) candidate = $"https://{candidate}";
    if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri)) return null;

    var host = uri.Host;
    var isLinkedInHost = host.Equals("linkedin.com", StringComparison.OrdinalIgnoreCase)
        || host.EndsWith(".linkedin.com", StringComparison.OrdinalIgnoreCase);
    if (!isLinkedInHost) return null;

    var path = uri.AbsolutePath.TrimEnd('/');
    if (!Regex.IsMatch(path, @"^/(in|pub)/[^/]+(?:/[^/]+){0,2}$", RegexOptions.IgnoreCase)) return null;

    return $"https://www.linkedin.com{path}";
}
|
||
|
||
/// <summary>
/// Returns the trimmed value when it reads as a date or date range; otherwise null.
/// </summary>
private static string? NormalizeDateValue(string? value)
{
    if (TrimOrNull(value) is { } text && LooksLikeDateRange(text))
    {
        return text;
    }

    return null;
}
|
||
|
||
/// <summary>
/// One date point: d/m/yyyy-style numeric date, "Month yyyy" (short or long English
/// month name), a bare four-digit year, or the words "Present" / "Current".
/// </summary>
private const string DatePointPattern =
    @"(?:\d{1,2}/\d{1,2}/\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|\d{4}|Present|Current)";

/// <summary>
/// A single date point, optionally followed by a hyphen or en dash and a second date point.
/// Built once from <see cref="DatePointPattern"/> so both ends of the range always share
/// the same definition.
/// </summary>
private static readonly Regex DateRangeRegex = new Regex(
    "^" + DatePointPattern + @"(?:\s*[-–]\s*" + DatePointPattern + ")?$",
    RegexOptions.IgnoreCase);

/// <summary>
/// Tells whether the whole value is a date or a date range such as "2019",
/// "Jan 2020 - Present" or "03/04/2020–05/06/2021" (case-insensitive).
/// </summary>
/// <param name="value">Text to test; it is matched as-is, without trimming.</param>
private static bool LooksLikeDateRange(string value)
{
    return DateRangeRegex.IsMatch(value);
}
|
||
|
||
/// <summary>
/// Cheap check for web or e-mail content: true when the value contains '@',
/// "www.", "http://" or "https://" (the textual markers are case-insensitive).
/// </summary>
private static bool LooksLikeUrlOrEmail(string value)
{
    if (value.Contains('@')) return true;

    foreach (var marker in new[] { "www.", "http://", "https://" })
    {
        if (value.Contains(marker, StringComparison.OrdinalIgnoreCase)) return true;
    }

    return false;
}
|
||
|
||
/// <summary>
/// Section titles used by this class when parsing sections and when rebuilding them.
/// Kept in one place so heading detection covers every title handled elsewhere in the
/// class, including the skills, certification and project variants.
/// </summary>
private static readonly HashSet<string> KnownSectionHeadings = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    "Contact",
    "Professional Summary", "Summary",
    "Work Experience", "Experience", "Employment History",
    "Education",
    "Certifications", "Certificates",
    "Projects", "Selected Projects",
    "Skills", "Core Skills", "Technical Skills",
    "Languages",
    "Interests",
};

/// <summary>
/// Tells whether the value is exactly (ignoring case) one of the known CV section titles,
/// so a stray heading is not mistaken for a title, company, qualification or bullet.
/// </summary>
/// <param name="value">Text to test; it is compared as-is, without trimming.</param>
private static bool LooksLikeSectionHeading(string value)
{
    return KnownSectionHeadings.Contains(value);
}
|
||
|
||
/// <summary>
/// Heuristic: does the value read like a job title?
/// </summary>
/// <remarks>
/// Blank values, dates, URLs and e-mails never do. Otherwise a role keyword
/// (developer, manager, analyst, ...) is enough; failing that, any value of at most
/// six space-separated words that does not look like a company name is accepted.
/// </remarks>
private static bool LooksLikeJobTitle(string value)
{
    if (string.IsNullOrWhiteSpace(value)) return false;
    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value)) return false;

    var mentionsRole = Regex.IsMatch(
        value,
        @"\b(developer|engineer|manager|lead|architect|consultant|specialist|analyst|administrator|coordinator|director|designer|intern|officer|owner|founder|teacher|researcher|writer|editor|producer|assistant|technician|supervisor|head)\b",
        RegexOptions.IgnoreCase);
    if (mentionsRole) return true;

    var wordCount = value.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length;
    return wordCount <= 6 && !LooksLikeCompanyName(value);
}
|
||
|
||
/// <summary>
/// Heuristic: does the value read like an organization name?
/// </summary>
/// <remarks>
/// Blank values, dates, URLs and e-mails never do. Otherwise any of the following is
/// enough: an organization keyword (inc, ltd, university, bank, ...), an ampersand,
/// or a standalone run of two or more upper-case ASCII letters.
/// </remarks>
private static bool LooksLikeCompanyName(string value)
{
    if (string.IsNullOrWhiteSpace(value)) return false;
    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value)) return false;

    var mentionsOrganization = Regex.IsMatch(
        value,
        @"\b(inc|llc|ltd|limited|plc|corp|corporation|company|group|university|college|council|municipality|kommune|bank|studio|agency|institute|hospital|school|technologies|technology|systems|solutions|consulting|consultants|partners|foundation|ministry|government)\b",
        RegexOptions.IgnoreCase);
    if (mentionsOrganization) return true;

    if (value.Contains('&')) return true;

    // Case-sensitive on purpose: only genuinely upper-case runs count.
    return Regex.IsMatch(value, @"\b[A-Z]{2,}\b");
}
|
||
|
||
/// <summary>
/// Tells whether the value is one of the known technology tokens (case-insensitive).
/// </summary>
/// <remarks>
/// The value is tested as written, then with trailing periods/spaces removed, and finally
/// with periods/spaces removed from both ends. Testing the unstripped forms first matters
/// for tokens that start with a period, such as ".net", which would otherwise be reduced
/// to "net" and never match.
/// </remarks>
private static bool LooksLikeSkillToken(string value)
{
    var token = TrimOrNull(value);
    if (token is null) return false;

    if (NonLocationTokens.Contains(token)) return true;

    // Sentence punctuation after the token, e.g. "Docker." or ".NET.".
    var withoutTrailing = token.TrimEnd('.', ' ');
    if (withoutTrailing.Length > 0 && NonLocationTokens.Contains(withoutTrailing)) return true;

    var stripped = token.Trim('.', ' ');
    return stripped.Length > 0 && NonLocationTokens.Contains(stripped);
}
|
||
|
||
/// <summary>
/// Heuristic: true when the value contains a whole-word qualification term
/// (degree names and abbreviations, diploma/certificate terms, "level N", ...),
/// ignoring case.
/// </summary>
private static bool LooksLikeQualification(string value)
{
    const string qualificationTerms =
        @"\b(level\s*\d+|nvq|btec|gcse|a-?level|diploma|certificate|certification|bachelor(?:'s)?|master(?:'s)?|phd|doctorate|mba|ba|bsc|msc|ma|associate|apprenticeship|degree|ict)\b";

    var match = Regex.Match(value, qualificationTerms, RegexOptions.IgnoreCase);
    return match.Success;
}
|
||
|
||
/// <summary>
/// Heuristic: true when the value contains a whole-word institution term
/// (university, college, school, academy, ...), ignoring case.
/// </summary>
private static bool LooksLikeInstitutionName(string value)
{
    const string institutionTerms =
        @"\b(university|college|school|academy|institute|faculty|campus|council|polytechnic)\b";

    var match = Regex.Match(value, institutionTerms, RegexOptions.IgnoreCase);
    return match.Success;
}
|
||
|
||
/// <summary>
/// Cleans a qualification-style title: rejects dates, URLs/e-mails and section
/// headings, collapses whitespace runs, and trims separator characters
/// (space, '|', ';', ':') from both ends.
/// </summary>
/// <returns>The cleaned text, or null when the value is rejected or empty.</returns>
private static string? NormalizeQualification(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null
        || LooksLikeDateRange(candidate)
        || LooksLikeUrlOrEmail(candidate)
        || LooksLikeSectionHeading(candidate))
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ';', ':');
    return string.IsNullOrWhiteSpace(cleaned) ? null : cleaned;
}
|
||
|
||
/// <summary>
/// Cleans an institution or issuer name: rejects dates, URLs/e-mails and section
/// headings, collapses whitespace runs, and trims separator characters
/// (space, '|', ';', ':') from both ends.
/// </summary>
/// <returns>The cleaned name, or null when the value is rejected or empty.</returns>
private static string? NormalizeInstitution(string? value)
{
    if (TrimOrNull(value) is not { } candidate) return null;

    var isOtherKindOfLine = LooksLikeDateRange(candidate)
        || LooksLikeUrlOrEmail(candidate)
        || LooksLikeSectionHeading(candidate);
    if (isOtherKindOfLine) return null;

    var cleaned = Regex.Replace(candidate, @"\s+", " ").Trim(' ', '|', ';', ':');
    if (string.IsNullOrWhiteSpace(cleaned)) return null;

    return cleaned;
}
|
||
|
||
/// <summary>
/// Maps a qualification description onto one of the coarse levels
/// "PhD", "Master", "Bachelor", "Diploma/Certificate", "Secondary" or "Other".
/// </summary>
/// <param name="explicitValue">A level supplied with the record; used when non-blank.</param>
/// <param name="qualificationText">Qualification title; used only when no explicit level is given.</param>
/// <returns>
/// The level name, or null when both inputs are blank. Rules are tried from the highest
/// level down and the first match wins; text matching no rule is classified as "Other".
/// </returns>
private static string? NormalizeQualificationLevel(string? explicitValue, string? qualificationText)
{
    var candidate = TrimOrNull(explicitValue) ?? TrimOrNull(qualificationText);
    if (candidate is null) return null;

    // Ordered from highest to lowest level; order is significant because some
    // abbreviations could otherwise be shadowed by a lower-level rule.
    var rules = new (string Pattern, string Level)[]
    {
        (@"\b(phd|doctorate|dphil)\b", "PhD"),
        // "m\.eng" replaces an accidental duplicate of "meng", so "M.Eng" is recognized too.
        (@"\b(master(?:'s)?|msc|m\.sc|ma|m\.a|mba|meng|m\.eng)\b", "Master"),
        (@"\b(bachelor(?:'s)?|bsc|b\.sc|ba|b\.a|beng|llb|undergraduate degree)\b", "Bachelor"),
        (@"\b(diploma|certificate|certification|nvq|btec|level\s*\d+|apprenticeship|associate degree)\b", "Diploma/Certificate"),
        (@"\b(gcse|a-?level|secondary|high school|gymnasium)\b", "Secondary"),
    };

    foreach (var (pattern, level) in rules)
    {
        if (Regex.IsMatch(candidate, pattern, RegexOptions.IgnoreCase)) return level;
    }

    return "Other";
}
|
||
|
||
/// <summary>
/// Cleans an education entry in place (creating it when null) and swaps qualification
/// and institution when each clearly looks like the other.
/// </summary>
/// <remarks>
/// The level supplied with the record is captured before anything is overwritten. After
/// a swap the level is derived again from that original value and the corrected
/// qualification; reusing the level computed before the swap would pin the result to
/// whatever the institution text produced.
/// </remarks>
private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
{
    education ??= new StructuredCvEducation();

    // Remember what the record actually said before the derived level replaces it.
    var suppliedLevel = education.QualificationLevel;

    education.Qualification = NormalizeQualification(education.Qualification);
    education.QualificationLevel = NormalizeQualificationLevel(suppliedLevel, education.Qualification);
    education.Institution = NormalizeInstitution(education.Institution);
    education.Location = NormalizeLocationValue(education.Location);
    education.Start = NormalizeDateValue(education.Start);
    education.End = NormalizeDateValue(education.End);
    education.Details = CleanList(education.Details);

    if (!string.IsNullOrWhiteSpace(education.Qualification) && !string.IsNullOrWhiteSpace(education.Institution))
    {
        var qualificationLooksInstitutional = LooksLikeInstitutionName(education.Qualification)
            && !LooksLikeQualification(education.Qualification);
        var institutionLooksQualification = LooksLikeQualification(education.Institution)
            && !LooksLikeInstitutionName(education.Institution);

        if (qualificationLooksInstitutional && institutionLooksQualification)
        {
            (education.Qualification, education.Institution) = (education.Institution, education.Qualification);

            // Derive the level from the real qualification now that it is in the right field.
            education.QualificationLevel = NormalizeQualificationLevel(suppliedLevel, education.Qualification);
        }
    }

    return education;
}
|
||
|
||
/// <summary>
/// Cleans a certification entry in place (creating it when null): the name is cleaned
/// like a qualification, the issuer like an institution, and location, date and
/// details go through their respective normalizers.
/// </summary>
private static StructuredCvCertification NormalizeCertification(StructuredCvCertification? certification)
{
    var result = certification ?? new StructuredCvCertification();

    result.Name = NormalizeQualification(result.Name);
    result.Issuer = NormalizeInstitution(result.Issuer);
    result.Location = NormalizeLocationValue(result.Location);
    result.Date = NormalizeDateValue(result.Date);
    result.Details = CleanList(result.Details);

    return result;
}
|
||
|
||
/// <summary>
/// Cleans a project entry in place (creating it when null): the name is cleaned like
/// a qualification title, the role like a job title, bullet markers are stripped, and
/// location, dates and skills go through their respective normalizers.
/// </summary>
private static StructuredCvProject NormalizeProject(StructuredCvProject? project)
{
    project ??= new StructuredCvProject();

    project.Name = NormalizeQualification(project.Name);
    project.Role = NormalizeJobTitle(project.Role);
    project.Location = NormalizeLocationValue(project.Location);
    project.Start = NormalizeDateValue(project.Start);
    project.End = NormalizeDateValue(project.End);

    // Keep every bullet the bullet normalizer does not reject.
    var cleanedBullets = new List<string>();
    foreach (var raw in CleanList(project.Bullets))
    {
        var bullet = NormalizeBullet(raw);
        if (bullet is not null)
        {
            cleanedBullets.Add(bullet);
        }
    }

    project.Bullets = cleanedBullets;
    project.Skills = CleanList(project.Skills);
    return project;
}
|
||
|
||
/// <summary>
/// Cleans a spoken-language entry in place (creating it when null).
/// </summary>
/// <remarks>
/// The level is taken from the level field or, failing that, extracted from the original
/// name text. The name is kept only when both a normalized name and a level are
/// available; otherwise it is set to null.
/// </remarks>
private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language)
{
    language ??= new StructuredCvLanguage();

    var rawName = TrimOrNull(language.Name);
    var canonicalName = HumanLanguageCatalog.NormalizeLanguageName(rawName);

    var level = HumanLanguageCatalog.ExtractLevel(language.Level);
    level ??= HumanLanguageCatalog.ExtractLevel(rawName);

    // Without a level the entry is treated as unusable: the name is cleared.
    language.Name = level is null ? null : canonicalName;
    language.Level = level;
    language.Notes = TrimOrNull(language.Notes);
    return language;
}
|
||
|
||
/// <summary>
/// Produces cleaned copies of the given sections, dropping those without content.
/// </summary>
/// <remarks>
/// A blank name becomes "General", name and content are trimmed, and a positive word
/// count is kept as supplied; otherwise it is recomputed from the original content.
/// A null sequence yields an empty list.
/// </remarks>
private static List<StructuredCvSection> NormalizeSections(IEnumerable<StructuredCvSection>? sections)
{
    var cleaned = new List<StructuredCvSection>();

    foreach (var section in sections ?? Array.Empty<StructuredCvSection>())
    {
        var copy = new StructuredCvSection
        {
            Name = string.IsNullOrWhiteSpace(section?.Name) ? "General" : section.Name.Trim(),
            Content = section?.Content?.Trim() ?? string.Empty,
            WordCount = section?.WordCount is > 0 ? section.WordCount : CountWords(section?.Content),
        };

        if (!string.IsNullOrWhiteSpace(copy.Content))
        {
            cleaned.Add(copy);
        }
    }

    return cleaned;
}
|
||
|
||
/// <summary>
/// Renders the structured fields of a profile back into text sections.
/// </summary>
/// <remarks>
/// Output order: Contact, Professional Summary, Work Experience, Education,
/// Certifications, Projects, Skills, Languages, Interests, then any other sections.
/// Each entry is written as a "### heading" line, a " | "-separated meta line, "- "
/// bullet lines and, where present, a "Skills:" or "Level:" line. Sections that end up
/// without content are not emitted.
/// </remarks>
private static List<StructuredCvSection> BuildSections(StructuredCvProfile profile)
{
    var result = new List<StructuredCvSection>();

    // Contact
    var contact = profile.Contact;
    var contactLines = new List<string>();
    var contactValues = new[]
    {
        contact.FullName, contact.Headline, contact.Email, contact.Phone,
        contact.Location, contact.Website, contact.LinkedIn,
    };
    foreach (var contactValue in contactValues)
    {
        AddIf(contactLines, contactValue);
    }

    AddSectionIfAny(result, "Contact", contactLines);
    AddSectionIfAny(result, "Professional Summary", profile.Summary);

    // Work experience
    if (profile.Jobs.Count > 0)
    {
        var jobLines = new List<string>();
        foreach (var job in profile.Jobs)
        {
            AddIf(jobLines, $"### {job.Title}".Trim());
            AddIf(jobLines, JoinMeta(job.Company, job.Location, FormatDateRange(job.Start, job.End, job.IsCurrent)));
            foreach (var bullet in job.Bullets)
            {
                jobLines.Add($"- {bullet}");
            }

            if (job.Skills.Count > 0)
            {
                jobLines.Add($"Skills: {string.Join(", ", job.Skills)}");
            }

            CloseEntry(jobLines);
        }

        AddSectionIfAny(result, "Work Experience", jobLines);
    }

    // Education
    if (profile.Education.Count > 0)
    {
        var educationLines = new List<string>();
        foreach (var education in profile.Education)
        {
            AddIf(educationLines, $"### {education.Qualification}".Trim());
            AddIf(educationLines, JoinMeta(education.Institution, education.Location, FormatDateRange(education.Start, education.End, false)));
            if (!string.IsNullOrWhiteSpace(education.QualificationLevel))
            {
                AddIf(educationLines, $"Level: {education.QualificationLevel}");
            }

            foreach (var detail in education.Details)
            {
                educationLines.Add($"- {detail}");
            }

            CloseEntry(educationLines);
        }

        AddSectionIfAny(result, "Education", educationLines);
    }

    // Certifications
    if (profile.Certifications.Count > 0)
    {
        var certificationLines = new List<string>();
        foreach (var certification in profile.Certifications)
        {
            AddIf(certificationLines, $"### {certification.Name}".Trim());
            AddIf(certificationLines, JoinMeta(certification.Issuer, certification.Location, certification.Date));
            foreach (var detail in certification.Details)
            {
                certificationLines.Add($"- {detail}");
            }

            CloseEntry(certificationLines);
        }

        AddSectionIfAny(result, "Certifications", certificationLines);
    }

    // Projects
    if (profile.Projects.Count > 0)
    {
        var projectLines = new List<string>();
        foreach (var project in profile.Projects)
        {
            AddIf(projectLines, $"### {project.Name}".Trim());
            AddIf(projectLines, JoinMeta(project.Role, project.Location, FormatDateRange(project.Start, project.End, false)));
            foreach (var bullet in project.Bullets)
            {
                projectLines.Add($"- {bullet}");
            }

            if (project.Skills.Count > 0)
            {
                AddIf(projectLines, $"Skills: {string.Join(", ", project.Skills)}");
            }

            CloseEntry(projectLines);
        }

        AddSectionIfAny(result, "Projects", projectLines);
    }

    AddSectionIfAny(result, "Skills", profile.Skills);

    // Languages: "Name: Level (Notes)", with the optional parts omitted when blank.
    if (profile.Languages.Count > 0)
    {
        var languageLines = new List<string>();
        foreach (var language in profile.Languages)
        {
            var entry = language.Name ?? string.Empty;
            if (!string.IsNullOrWhiteSpace(language.Level)) entry = $"{entry}: {language.Level}";
            if (!string.IsNullOrWhiteSpace(language.Notes)) entry = $"{entry} ({language.Notes})";
            languageLines.Add(entry);
        }

        AddSectionIfAny(result, "Languages", languageLines);
    }

    AddSectionIfAny(result, "Interests", profile.Interests);

    foreach (var other in profile.OtherSections)
    {
        AddSectionIfAny(result, other.Title ?? "Other", other.Items);
    }

    return NormalizeSections(result);

    // Joins the non-blank values of an entry's meta line with " | ".
    static string JoinMeta(params string?[] values) =>
        string.Join(" | ", values.Where(value => !string.IsNullOrWhiteSpace(value)));

    // Appends a blank separator after an entry unless the list already ends with one.
    static void CloseEntry(List<string> lines)
    {
        if (lines.Count > 0 && !string.IsNullOrWhiteSpace(lines[^1]))
        {
            lines.Add(string.Empty);
        }
    }
}
|
||
|
||
/// <summary>
/// Appends a section built from the given lines, unless no content remains once
/// blank lines are removed.
/// </summary>
/// <param name="sections">List that receives the new section.</param>
/// <param name="name">Section name, used as given.</param>
/// <param name="lines">Candidate lines; null is treated as empty. Kept lines are trimmed and joined with "\n".</param>
private static void AddSectionIfAny(List<StructuredCvSection> sections, string name, IEnumerable<string>? lines)
{
    var kept = new List<string>();
    foreach (var line in lines ?? Array.Empty<string>())
    {
        if (!string.IsNullOrWhiteSpace(line))
        {
            kept.Add(line.Trim());
        }
    }

    var content = string.Join("\n", kept).Trim();
    if (string.IsNullOrWhiteSpace(content)) return;

    var section = new StructuredCvSection
    {
        Name = name,
        Content = content,
        WordCount = CountWords(content),
    };
    sections.Add(section);
}
|
||
|
||
/// <summary>
/// Appends the trimmed value to <paramref name="lines"/>; null, empty and
/// whitespace-only values are ignored.
/// </summary>
private static void AddIf(List<string> lines, string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return;
    }

    var text = value.Trim();
    lines.Add(text);
}
|
||
|
||
/// <summary>
/// Fills the empty fields of <paramref name="contact"/> from the free text of a
/// contact section. Fields that already have a value are never overwritten.
/// </summary>
/// <remarks>
/// E-mail and phone are found by pattern anywhere in the text. The first line mentioning
/// "linkedin" becomes the LinkedIn value, and the first line that looks like a web
/// address becomes the website. A line that merely contains a period counts as a web
/// address only when it does not read as a person's name; otherwise lines such as
/// "John A. Smith" would be captured as the website and lost as name candidates.
/// Of the remaining lines, the first that reads as a person's name becomes the full name
/// (falling back to a guess from the LinkedIn slug, then from the e-mail address), a
/// leading non-address line becomes the headline when more than one line remains, and
/// the last address-like line (or, failing that, the last remaining non-headline line)
/// becomes the location.
/// </remarks>
/// <param name="contact">Contact to fill in; mutated in place.</param>
/// <param name="content">Raw text of the contact section.</param>
private static void ApplyContact(StructuredCvContact contact, string content)
{
    var lines = content.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    contact.Email ??= Regex.Match(content, @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", RegexOptions.IgnoreCase).Value.NullIfWhitespace();
    contact.Phone ??= Regex.Match(content, @"(?<!\w)(?:\+?\d[\d\s().-]{6,}\d)", RegexOptions.IgnoreCase).Value.NullIfWhitespace();

    foreach (var line in lines)
    {
        if (contact.LinkedIn is null && line.Contains("linkedin", StringComparison.OrdinalIgnoreCase))
        {
            contact.LinkedIn = line.Trim();
            continue;
        }

        if (contact.Website is not null) continue;

        var hasUrlMarker = line.Contains("http", StringComparison.OrdinalIgnoreCase)
            || line.Contains("www.", StringComparison.OrdinalIgnoreCase);

        // A bare period is weak evidence: initials and honorifics contain one too.
        var looksLikeBareDomain = line.Contains(".") && !line.Contains('@') && !LooksLikePersonName(line);

        if (hasUrlMarker || looksLikeBareDomain)
        {
            contact.Website = line.Trim();
        }
    }

    // Lines not already consumed as e-mail, LinkedIn, website or phone.
    var leftovers = lines.Where(line => !line.Contains('@')
            && !line.Contains("linkedin", StringComparison.OrdinalIgnoreCase)
            && !line.Equals(contact.Website, StringComparison.OrdinalIgnoreCase)
            && !line.Equals(contact.Phone, StringComparison.OrdinalIgnoreCase))
        .ToList();

    var plausibleName = leftovers.FirstOrDefault(line => LooksLikePersonName(line));
    contact.FullName ??= plausibleName?.Trim();
    contact.FullName ??= GuessNameFromLinkedIn(contact.LinkedIn);
    contact.FullName ??= GuessNameFromEmail(contact.Email);

    var remaining = leftovers.Where(line => !string.Equals(line, contact.FullName, StringComparison.OrdinalIgnoreCase)).ToList();
    var addressLike = remaining.Where(LooksLikeAddressish).ToList();

    // With a single remaining line there is nothing to tell a headline from a location.
    if (remaining.Count > 1 && !LooksLikeAddressish(remaining[0])) contact.Headline ??= remaining[0].Trim();

    contact.Location ??= addressLike.LastOrDefault()?.Trim();
    if (string.IsNullOrWhiteSpace(contact.Location))
    {
        var nonHeadline = remaining.Where(line => !string.Equals(line, contact.Headline, StringComparison.OrdinalIgnoreCase)).ToList();
        contact.Location ??= nonHeadline.LastOrDefault()?.Trim();
    }
}
|
||
|
||
/// <summary>
/// Heuristic: true when the value contains a digit or a whole-word address term
/// (street, road, avenue, suite, city, or one of a few hard-coded place names),
/// ignoring case.
/// </summary>
private static bool LooksLikeAddressish(string value)
{
    foreach (var character in value)
    {
        if (char.IsDigit(character)) return true;
    }

    const string addressTerms = @"\b(street|st\.?|road|rd\.?|avenue|ave\.?|suite|city|london|new york|oslo|uk|ny)\b";
    return Regex.IsMatch(value, addressTerms, RegexOptions.IgnoreCase);
}
|
||
|
||
/// <summary>
/// Heuristic check for a person's name: two to four whitespace-separated words, each
/// starting with an uppercase letter and continuing with letters, combining marks,
/// apostrophes, backticks, dots or hyphens.
/// </summary>
/// <remarks>
/// Uses Unicode categories rather than <c>[A-Z]</c>/<c>[A-Za-z]</c> so that names such as
/// "Søren Ødegård" or "José García" are recognised; every ASCII name the previous
/// pattern accepted is still accepted.
/// </remarks>
private static bool LooksLikePersonName(string value)
{
    return Regex.IsMatch(value.Trim(), @"^\p{Lu}[\p{L}\p{M}'’`.-]+(?:\s+\p{Lu}[\p{L}\p{M}'’`.-]+){1,3}$");
}
|
||
|
||
/// <summary>
/// Derives a display name from a LinkedIn profile URL by splitting the
/// <c>/in/</c> or <c>/pub/</c> slug on '.', '_' and '-' and capitalising the letter-only pieces.
/// </summary>
/// <returns>The joined name when at least two letter-only pieces exist; otherwise null.</returns>
private static string? GuessNameFromLinkedIn(string? linkedIn)
{
    var url = TrimOrNull(linkedIn);
    if (url is null) return null;

    var slugMatch = Regex.Match(url, @"linkedin\.com/(?:in|pub)/(?<slug>[a-z0-9._-]+)", RegexOptions.IgnoreCase);
    if (!slugMatch.Success) return null;

    var words = new List<string>();
    foreach (var token in Regex.Split(slugMatch.Groups["slug"].Value, @"[._-]+"))
    {
        // Pieces containing digits (e.g. the trailing profile id) are not name words.
        if (string.IsNullOrWhiteSpace(token) || !token.All(ch => char.IsLetter(ch))) continue;
        words.Add(char.ToUpperInvariant(token[0]) + token[1..].ToLowerInvariant());
    }

    return words.Count < 2 ? null : string.Join(" ", words);
}
|
||
|
||
/// <summary>
/// Derives a display name from the local part of an e-mail address by splitting it on
/// '.', '_' and '-' and capitalising each piece.
/// </summary>
/// <remarks>
/// A "+tag" sub-address is ignored, digits at either end of a piece are stripped, and only
/// letter-only pieces are kept, so "john.doe92+jobs@x.com" yields "John Doe" rather than
/// "John Doe92+jobs". This mirrors the letter-only filter in <c>GuessNameFromLinkedIn</c>.
/// </remarks>
/// <returns>The joined name when at least two usable pieces exist; otherwise null.</returns>
private static string? GuessNameFromEmail(string? email)
{
    if (string.IsNullOrWhiteSpace(email)) return null;

    var atIndex = email.IndexOf('@');
    if (atIndex < 0) return null;

    var local = email[..atIndex].Trim();

    // Drop a "+tag" sub-address; it is routing information, never part of the name.
    var plusIndex = local.IndexOf('+');
    if (plusIndex >= 0) local = local[..plusIndex];
    if (string.IsNullOrWhiteSpace(local)) return null;

    var digits = "0123456789".ToCharArray();
    var parts = Regex.Split(local, @"[._-]+", RegexOptions.None)
        .Select(part => part.Trim().Trim(digits))
        .Where(part => part.Length > 0 && part.All(ch => char.IsLetter(ch)))
        .Select(part => char.ToUpperInvariant(part[0]) + part[1..].ToLowerInvariant())
        .ToList();

    return parts.Count >= 2 ? string.Join(" ", parts) : null;
}
|
||
|
||
/// <summary>
/// Parses a free-text language list into structured entries. Each item may be written as
/// "Name: Level", "Name: Level (notes)", "Name - Level" or "Name (Level)".
/// Items for which no level can be extracted get a null name and are dropped.
/// </summary>
private static List<StructuredCvLanguage> ParseLanguages(string content)
{
    var languages = new List<StructuredCvLanguage>();

    foreach (var item in SplitList(content))
    {
        var entry = item.Trim();
        var (name, level, notes) = SplitLanguageEntry(entry);

        // Prefer the explicitly separated level; fall back to scanning the whole entry.
        var resolvedLevel = HumanLanguageCatalog.ExtractLevel(level) ?? HumanLanguageCatalog.ExtractLevel(entry);
        var language = new StructuredCvLanguage
        {
            Name = resolvedLevel is not null ? HumanLanguageCatalog.NormalizeLanguageName(name) : null,
            Level = resolvedLevel,
            Notes = notes,
        };

        if (!string.IsNullOrWhiteSpace(language.Name)) languages.Add(language);
    }

    return languages;

    // Splits one list item into its raw name, level and notes according to the separator used.
    static (string Name, string? Level, string? Notes) SplitLanguageEntry(string entry)
    {
        var colonIndex = entry.IndexOf(':');
        if (colonIndex > 0)
        {
            var label = entry[..colonIndex].Trim();
            var remainder = entry[(colonIndex + 1)..].Trim();
            var withNotes = Regex.Match(remainder, @"^(.*?)\s*\((.*?)\)$");
            if (withNotes.Success)
            {
                return (label, withNotes.Groups[1].Value.NullIfWhitespace(), withNotes.Groups[2].Value.NullIfWhitespace());
            }

            return (label, remainder.NullIfWhitespace(), null);
        }

        var dashed = Regex.Match(entry, @"^(?<name>[\p{L}][\p{L}\s-]+?)\s*[–-]\s*(?<level>.+)$");
        if (dashed.Success)
        {
            return (dashed.Groups["name"].Value.Trim(), dashed.Groups["level"].Value.Trim(), null);
        }

        var parenthesized = Regex.Match(entry, @"^(?<name>[\p{L}][\p{L}\s-]+?)\s*\((?<level>.+)\)$");
        if (parenthesized.Success)
        {
            return (parenthesized.Groups["name"].Value.Trim(), parenthesized.Groups["level"].Value.Trim(), null);
        }

        return (entry, null, null);
    }
}
|
||
|
||
/// <summary>
/// Splits the experience section into blocks and parses each one, discarding blocks
/// that yield no job.
/// </summary>
private static List<StructuredCvJob> ParseJobs(string content)
{
    var jobs = new List<StructuredCvJob>();
    foreach (var block in SplitBlocks(content))
    {
        if (ParseJobBlock(block) is { } job) jobs.Add(job);
    }

    return jobs;
}
|
||
|
||
/// <summary>
/// Parses one experience block. The first line is the title (optionally "Title - start to end"),
/// the non-bullet lines that follow supply dates, company and location, bullet lines become
/// <c>Bullets</c>, and "Skills:" lines become <c>Skills</c> (falling back to skills mined from bullets).
/// </summary>
/// <returns>The parsed job, or null when the block has no title, company or bullets.</returns>
private static StructuredCvJob? ParseJobBlock(string block)
{
    static bool IsSkillsLine(string line) => line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase);

    static bool IsOngoing(string? end) =>
        string.Equals(end, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(end, "current", StringComparison.OrdinalIgnoreCase);

    var lines = block.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
    if (lines.Count == 0) return null;

    var job = new StructuredCvJob();
    if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
    job.Title = lines[0].NullIfWhitespace();

    // A heading such as "Engineer - 01/2020 to Present" carries the date range inline.
    var titleDateMatch = Regex.Match(job.Title ?? string.Empty, @"(?<title>.+?)\s*[-–]\s*(?<start>(?:\d{1,2}/)?\d{4})\s*(?:to|[-–])\s*(?<end>(?:\d{1,2}/)?\d{4}|Present|Current)$", RegexOptions.IgnoreCase);
    if (titleDateMatch.Success)
    {
        job.Title = titleDateMatch.Groups["title"].Value.NullIfWhitespace();
        job.Start = titleDateMatch.Groups["start"].Value.NullIfWhitespace();
        job.End = titleDateMatch.Groups["end"].Value.NullIfWhitespace();
        job.IsCurrent = IsOngoing(job.End);
    }

    // "Skills:" lines are consumed separately below; leaving them in the metadata would let one
    // be picked up as the company or location (ParseProjectBlock excludes them as well).
    var metadata = lines.Skip(1)
        .TakeWhile(line => !IsBullet(line))
        .Where(line => !IsSkillsLine(line))
        .ToList();

    // Word boundaries keep "Present"/"Current" from matching inside words such as
    // "Representative" or "Currently", and keep a year from matching inside a longer number.
    // Without them the bogus match would also be deleted from the company/location text below.
    var dateValue = metadata
        .Select(line => Regex.Match(line, @"\b(?:(?:\d{1,2}/)?\d{4}|Present|Current)\b(?:\s*(?:[-–]|to)\s*(?:(?:\d{1,2}/)?\d{4}|Present|Current)\b)?", RegexOptions.IgnoreCase).Value.NullIfWhitespace())
        .FirstOrDefault(value => value is not null);
    if (!string.IsNullOrWhiteSpace(dateValue) && string.IsNullOrWhiteSpace(job.Start))
    {
        var parts = Regex.Split(dateValue, "\\s*(?:[-–]|to)\\s*");
        job.Start = parts.FirstOrDefault().NullIfWhitespace();
        job.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
        job.IsCurrent = IsOngoing(job.End);
    }

    // Whatever remains of the metadata once the date text is removed is company, then location.
    var metadataWithoutDates = metadata
        .Select(line => string.IsNullOrWhiteSpace(dateValue) ? line : line.Replace(dateValue, string.Empty))
        .Select(line => line.Trim(' ', '|', ',', '-'))
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .ToList();
    if (metadataWithoutDates.Count > 0) job.Company = metadataWithoutDates[0].NullIfWhitespace();
    if (metadataWithoutDates.Count > 1) job.Location = metadataWithoutDates[1].NullIfWhitespace();

    job.Bullets = lines.Where(IsBullet).Select(line => line.Trim().TrimStart('-', '•', '*', ' ')).Where(line => !string.IsNullOrWhiteSpace(line)).ToList();
    job.Skills = lines
        .Where(IsSkillsLine)
        .SelectMany(line => SplitList(line[(line.IndexOf(':') + 1)..]))
        .ToList();
    if (job.Skills.Count == 0)
    {
        // No explicit skills line: mine "using X, Y" style phrases from the bullets instead.
        job.Skills = job.Bullets
            .SelectMany(ExtractSkillsFromBullet)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    return string.IsNullOrWhiteSpace(job.Title) && string.IsNullOrWhiteSpace(job.Company) && job.Bullets.Count == 0 ? null : job;
}
|
||
|
||
/// <summary>
/// Pulls candidate skill names out of a bullet by taking the text after the first
/// "using", "including", "with", "technology:/technologies:" or "tool:/tools:" cue and splitting it as a list.
/// Only candidates of 2 to 40 characters (after removing trailing dots) are returned.
/// </summary>
private static IEnumerable<string> ExtractSkillsFromBullet(string bullet)
{
    if (string.IsNullOrWhiteSpace(bullet)) return Enumerable.Empty<string>();

    var cue = Regex.Match(bullet, @"\b(?:using|including|with|technologies?:|tools?:)\s+(?<skills>.+)$", RegexOptions.IgnoreCase);
    if (!cue.Success) return Enumerable.Empty<string>();

    return SplitList(cue.Groups["skills"].Value)
        .Select(candidate => candidate.Trim().TrimEnd('.'))
        .Where(candidate => candidate.Length is >= 2 and <= 40);
}
|
||
|
||
/// <summary>
/// Splits the education section into blocks and parses each one, discarding blocks
/// that yield no entry.
/// </summary>
private static List<StructuredCvEducation> ParseEducation(string content)
{
    return SplitBlocks(content)
        .Select(ParseEducationBlock)
        .OfType<StructuredCvEducation>() // filters out the null results
        .ToList();
}
|
||
|
||
/// <summary>
/// Parses one education block: the first line is the qualification, the non-bullet lines
/// that follow supply the date range, institution and location, and bullet lines become details.
/// </summary>
/// <returns>The parsed entry, or null when it has no qualification, institution or details.</returns>
private static StructuredCvEducation? ParseEducationBlock(string block)
{
    var rows = block.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
    if (rows.Count == 0) return null;

    if (rows[0].StartsWith("###", StringComparison.Ordinal)) rows[0] = rows[0].TrimStart('#', ' ');
    var entry = new StructuredCvEducation { Qualification = rows[0].NullIfWhitespace() };

    // Header rows are everything after the heading up to (not including) the first bullet.
    var header = new List<string>();
    foreach (var row in rows.Skip(1))
    {
        if (IsBullet(row)) break;
        header.Add(row);
    }

    // The first header row containing a year (optionally "Word YYYY - Word YYYY") supplies the dates.
    string? dateRange = null;
    foreach (var row in header)
    {
        dateRange = Regex.Match(row, @"(?:(?:\w+\s+)?\d{4})(?:\s*[-–]\s*(?:(?:\w+\s+)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace();
        if (dateRange is not null) break;
    }

    if (dateRange is not null)
    {
        var endpoints = Regex.Split(dateRange, "\\s*[-–]\\s*");
        entry.Start = endpoints[0].NullIfWhitespace();
        entry.End = endpoints.Length > 1 ? endpoints[1].NullIfWhitespace() : null;
    }

    // With the date text removed, the remaining header rows are institution, then location.
    var remainder = new List<string>();
    foreach (var row in header)
    {
        var stripped = (dateRange is null ? row : row.Replace(dateRange, string.Empty)).Trim(' ', '|', ',', '-');
        if (!string.IsNullOrWhiteSpace(stripped)) remainder.Add(stripped);
    }

    if (remainder.Count > 0) entry.Institution = remainder[0].NullIfWhitespace();
    if (remainder.Count > 1) entry.Location = remainder[1].NullIfWhitespace();

    entry.Details = rows
        .Skip(1)
        .Where(IsBullet)
        .Select(row => row.Trim().TrimStart('-', '•', '*', ' '))
        .Where(row => !string.IsNullOrWhiteSpace(row))
        .ToList();
    entry.QualificationLevel = NormalizeQualificationLevel(null, entry.Qualification);

    var isEmpty = string.IsNullOrWhiteSpace(entry.Qualification)
        && string.IsNullOrWhiteSpace(entry.Institution)
        && entry.Details.Count == 0;
    return isEmpty ? null : entry;
}
|
||
|
||
/// <summary>
/// Splits the certifications section into blocks and parses each one, discarding blocks
/// that yield no certification.
/// </summary>
private static List<StructuredCvCertification> ParseCertifications(string content)
{
    var certifications = new List<StructuredCvCertification>();
    foreach (var block in SplitBlocks(content))
    {
        var parsed = ParseCertificationBlock(block);
        if (parsed is null) continue;
        certifications.Add(parsed);
    }

    return certifications;
}
|
||
|
||
/// <summary>
/// Parses one certification block: the first line is the name, the non-bullet lines that
/// follow supply the date, issuer and location, and bullet lines become details.
/// </summary>
/// <returns>The parsed certification, or null when it has neither a name nor an issuer.</returns>
private static StructuredCvCertification? ParseCertificationBlock(string block)
{
    var lines = block.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
    if (lines.Count == 0) return null;

    var certification = new StructuredCvCertification();
    if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
    certification.Name = lines[0].NullIfWhitespace();

    var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();

    // A date is a four-digit year optionally preceded by one word, or "Present"/"Current".
    // Word boundaries stop "Present"/"Current" from matching inside words such as
    // "Presentation" or "Currently"; without them the fragment became the date and was then
    // cut out of the issuer text below.
    certification.Date = metadata
        .Select(line => Regex.Match(line, @"\b(?:(?:\w+\s+)?\d{4}|Present|Current)\b", RegexOptions.IgnoreCase).Value.NullIfWhitespace())
        .FirstOrDefault(value => value is not null);

    // With the date text removed, the remaining metadata lines are issuer, then location.
    var metadataWithoutDates = metadata
        .Select(line => string.IsNullOrWhiteSpace(certification.Date) ? line : line.Replace(certification.Date, string.Empty))
        .Select(line => line.Trim(' ', '|', ',', '-'))
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .ToList();
    if (metadataWithoutDates.Count > 0) certification.Issuer = metadataWithoutDates[0].NullIfWhitespace();
    if (metadataWithoutDates.Count > 1) certification.Location = metadataWithoutDates[1].NullIfWhitespace();

    certification.Details = lines.Skip(1).Where(IsBullet).Select(line => line.Trim().TrimStart('-', '•', '*', ' ')).Where(line => !string.IsNullOrWhiteSpace(line)).ToList();
    return string.IsNullOrWhiteSpace(certification.Name) && string.IsNullOrWhiteSpace(certification.Issuer) ? null : certification;
}
|
||
|
||
/// <summary>
/// Splits the projects section into blocks and parses each one, discarding blocks
/// that yield no project.
/// </summary>
private static List<StructuredCvProject> ParseProjects(string content)
{
    return SplitBlocks(content)
        .Select(ParseProjectBlock)
        .OfType<StructuredCvProject>() // filters out the null results
        .ToList();
}
|
||
|
||
/// <summary>
/// Parses one project block: the first line is the name, the lines that follow (up to the first
/// bullet or "Skills:" line) supply the date range, role and location, bullet lines become
/// <c>Bullets</c>, and "Skills:" lines become <c>Skills</c>.
/// </summary>
/// <returns>The parsed project, or null when it has no name, role or bullets.</returns>
private static StructuredCvProject? ParseProjectBlock(string block)
{
    var lines = block.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
    if (lines.Count == 0) return null;

    var project = new StructuredCvProject();
    if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
    project.Name = lines[0].NullIfWhitespace();

    var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line) && !line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase)).ToList();

    // Word boundaries stop "Present"/"Current" from matching inside role text such as
    // "Presenter" or "Representative"; without them the fragment became the start date and
    // was then cut out of the role below (leaving e.g. "er").
    var dateValue = metadata
        .Select(line => Regex.Match(line, @"\b(?:(?:\w+\s+)?\d{4}|Present|Current)\b(?:\s*[-–]\s*(?:(?:\w+\s+)?\d{4}|Present|Current)\b)?", RegexOptions.IgnoreCase).Value.NullIfWhitespace())
        .FirstOrDefault(value => value is not null);
    if (!string.IsNullOrWhiteSpace(dateValue))
    {
        var parts = Regex.Split(dateValue, "\\s*[-–]\\s*");
        project.Start = parts.FirstOrDefault().NullIfWhitespace();
        project.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
    }

    // With the date text removed, the remaining metadata lines are role, then location.
    var metadataWithoutDates = metadata
        .Select(line => string.IsNullOrWhiteSpace(dateValue) ? line : line.Replace(dateValue, string.Empty))
        .Select(line => line.Trim(' ', '|', ',', '-'))
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .ToList();
    if (metadataWithoutDates.Count > 0) project.Role = metadataWithoutDates[0].NullIfWhitespace();
    if (metadataWithoutDates.Count > 1) project.Location = metadataWithoutDates[1].NullIfWhitespace();

    project.Bullets = lines.Where(IsBullet).Select(line => line.Trim().TrimStart('-', '•', '*', ' ')).Where(line => !string.IsNullOrWhiteSpace(line)).ToList();
    project.Skills = lines
        .Where(line => line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
        .SelectMany(line => SplitList(line[(line.IndexOf(':') + 1)..]))
        .ToList();
    return string.IsNullOrWhiteSpace(project.Name) && string.IsNullOrWhiteSpace(project.Role) && project.Bullets.Count == 0 ? null : project;
}
|
||
|
||
/// <summary>
/// Splits section text into entry blocks. When the text contains "### " headings, each block
/// starts at a heading line; otherwise blocks are separated by blank lines.
/// Blocks are trimmed and empty ones are dropped.
/// </summary>
private static List<string> SplitBlocks(string content)
{
    var text = content.Replace("\r\n", "\n").Trim();
    var blocks = new List<string>();
    if (string.IsNullOrWhiteSpace(text)) return blocks;

    var hasHeadings = text.Contains("### ", StringComparison.Ordinal);
    var pieces = hasHeadings
        ? Regex.Split(text, @"(?=^###\s+)", RegexOptions.Multiline)
        : Regex.Split(text, @"\n\s*\n");

    foreach (var piece in pieces)
    {
        var block = piece.Trim();
        if (!string.IsNullOrWhiteSpace(block)) blocks.Add(block);
    }

    return blocks;
}
|
||
|
||
/// <summary>
/// Returns true when the first non-whitespace character of the line is '-', '•' or '*'.
/// </summary>
private static bool IsBullet(string value)
{
    var text = value.TrimStart();
    return text.Length > 0 && text[0] is '-' or '•' or '*';
}
|
||
|
||
/// <summary>
/// Breaks free text into list items: one per line, with comma-containing lines split on commas
/// unless the line starts with '-'. Leading bullet characters are stripped, blanks are dropped,
/// and duplicates are removed case-insensitively, keeping the first occurrence.
/// </summary>
private static List<string> SplitList(string? content)
{
    var items = new List<string>();
    if (string.IsNullOrWhiteSpace(content)) return items;

    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (var line in content.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries))
    {
        // A dash-led line is kept as one item even when it contains commas.
        var startsWithDash = line.TrimStart().StartsWith("-", StringComparison.Ordinal);
        var pieces = line.Contains(',') && !startsWithDash
            ? line.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            : new[] { line };

        foreach (var piece in pieces)
        {
            var item = piece.Trim().TrimStart('-', '•', '*', ' ');
            if (!string.IsNullOrWhiteSpace(item) && seen.Add(item)) items.Add(item);
        }
    }

    return items;
}
|
||
|
||
/// <summary>
/// Returns the trimmed, non-blank values with case-insensitive duplicates removed,
/// keeping the first occurrence. A null sequence yields an empty list.
/// </summary>
private static List<string> CleanList(IEnumerable<string>? values)
{
    var cleaned = new List<string>();
    if (values is null) return cleaned;

    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (var raw in values)
    {
        var text = raw?.Trim() ?? string.Empty;
        if (string.IsNullOrWhiteSpace(text)) continue;
        if (seen.Add(text)) cleaned.Add(text);
    }

    return cleaned;
}
|
||
|
||
/// <summary>
/// Counts whitespace-separated words; null, empty or whitespace-only content counts as zero.
/// </summary>
private static int CountWords(string? content) =>
    string.IsNullOrWhiteSpace(content)
        ? 0
        : content.Trim().Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries).Length;
|
||
|
||
/// <summary>
/// Returns the trimmed value, or null when the value is null, empty or whitespace-only.
/// </summary>
private static string? TrimOrNull(string? value)
{
    if (string.IsNullOrWhiteSpace(value)) return null;
    return value.Trim();
}
|
||
|
||
/// <summary>
/// Formats a start/end pair as "start - end".
/// </summary>
/// <param name="start">Start date text; when blank, only <paramref name="end"/> is returned.</param>
/// <param name="end">End date text; when blank, the range is shown as running to "Present".</param>
/// <param name="isCurrent">When true the end is always rendered as "Present".</param>
/// <returns>Null when both dates are blank; otherwise the formatted range.</returns>
private static string? FormatDateRange(string? start, string? end, bool isCurrent)
{
    if (string.IsNullOrWhiteSpace(start) && string.IsNullOrWhiteSpace(end)) return null;
    if (string.IsNullOrWhiteSpace(start)) return end;

    // Treat an empty or whitespace end like a missing one; a bare `??` let "" through
    // and produced "2020 - " with a dangling separator.
    var endLabel = isCurrent || string.IsNullOrWhiteSpace(end) ? "Present" : end;
    return $"{start} - {endLabel}";
}
|
||
|
||
/// <summary>
/// Extension helper: returns the trimmed value, or null when the value is null, empty
/// or whitespace-only.
/// </summary>
private static string? NullIfWhitespace(this string? value)
{
    if (string.IsNullOrWhiteSpace(value)) return null;
    return value.Trim();
}
|
||
}
|