Files
jobtrackingapp/Models/StructuredCvProfileJson.cs
T

711 lines
33 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Text.RegularExpressions;
namespace JobTrackerApi.Models;
public static class StructuredCvProfileJson
{
/// <summary>
/// Serializer settings shared by <see cref="Deserialize"/> and <see cref="Serialize"/>:
/// web defaults, case-insensitive property matching on read, and null-valued
/// properties omitted on write.
/// </summary>
private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web)
{
PropertyNameCaseInsensitive = true,
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
};
/// <summary>
/// Creates a blank profile that has already been passed through <see cref="Normalize"/>.
/// </summary>
/// <returns>A normalized profile built from a fresh <c>StructuredCvProfile</c>.</returns>
public static StructuredCvProfile Empty()
{
    var blank = new StructuredCvProfile();
    return Normalize(blank);
}
/// <summary>
/// Reads a profile from JSON text. A JSON array is treated as a list of sections and
/// converted with <see cref="FromSections"/>; a JSON object is read as a full profile
/// and normalized. Blank input, any other root kind, and any failure while parsing or
/// mapping all produce an empty profile.
/// </summary>
/// <param name="json">The JSON text; may be null or blank.</param>
/// <returns>A normalized profile; never null.</returns>
public static StructuredCvProfile Deserialize(string? json)
{
    if (string.IsNullOrWhiteSpace(json))
    {
        return Empty();
    }

    try
    {
        using var document = JsonDocument.Parse(json);
        switch (document.RootElement.ValueKind)
        {
            case JsonValueKind.Array:
            {
                var parsedSections = JsonSerializer.Deserialize<List<StructuredCvSection>>(json, SerializerOptions);
                return FromSections(parsedSections ?? new List<StructuredCvSection>());
            }

            case JsonValueKind.Object:
            {
                var parsedProfile = JsonSerializer.Deserialize<StructuredCvProfile>(json, SerializerOptions);
                return Normalize(parsedProfile ?? new StructuredCvProfile());
            }

            default:
                return Empty();
        }
    }
    catch
    {
        // Best effort: whatever went wrong, the caller gets an empty profile instead of an exception.
        return Empty();
    }
}
/// <summary>
/// Normalizes the given profile (a null profile becomes a blank one) and writes it as JSON
/// using the shared serializer options.
/// </summary>
/// <param name="profile">The profile to write; may be null.</param>
/// <returns>The JSON text of the normalized profile.</returns>
public static string Serialize(StructuredCvProfile? profile)
{
    var normalized = Normalize(profile);
    return JsonSerializer.Serialize(normalized, SerializerOptions);
}
/// <summary>
/// Combines two profiles, favouring <paramref name="preferred"/>. Both inputs are normalized
/// first. Contact fields are taken from the fallback only where the preferred value is null.
/// Summary, skills and interests are unioned (case-insensitive, preferred entries first) unless
/// the preferred list is empty, in which case the fallback list is used as is. Languages are
/// unioned by name, keeping the first entry per name. Jobs, education, other sections and
/// sections are taken from the fallback only when the preferred list is empty. Metadata fields
/// missing from the preferred profile are copied from the fallback.
/// </summary>
/// <param name="preferred">The profile whose values win; may be null.</param>
/// <param name="fallback">The profile used to fill gaps; may be null.</param>
/// <returns>The normalized merged profile.</returns>
public static StructuredCvProfile Merge(StructuredCvProfile? preferred, StructuredCvProfile? fallback)
{
    var target = Normalize(preferred);
    var donor = Normalize(fallback);

    // Contact details: only fill the gaps.
    var contact = target.Contact;
    var donorContact = donor.Contact;
    contact.FullName ??= donorContact.FullName;
    contact.Headline ??= donorContact.Headline;
    contact.Email ??= donorContact.Email;
    contact.Phone ??= donorContact.Phone;
    contact.Location ??= donorContact.Location;
    contact.Website ??= donorContact.Website;
    contact.LinkedIn ??= donorContact.LinkedIn;

    if (target.Summary.Count == 0)
    {
        target.Summary = donor.Summary;
    }
    else
    {
        target.Summary = target.Summary
            .Concat(donor.Summary)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    if (target.Jobs.Count == 0)
    {
        target.Jobs = donor.Jobs;
    }

    if (target.Education.Count == 0)
    {
        target.Education = donor.Education;
    }

    if (target.Skills.Count == 0)
    {
        target.Skills = donor.Skills;
    }
    else
    {
        target.Skills = target.Skills
            .Concat(donor.Skills)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    if (target.Languages.Count == 0)
    {
        target.Languages = donor.Languages;
    }
    else
    {
        // One entry per language name; the preferred profile's entry comes first and therefore wins.
        target.Languages = target.Languages
            .Concat(donor.Languages)
            .GroupBy(language => language.Name ?? string.Empty, StringComparer.OrdinalIgnoreCase)
            .Select(sameName => sameName.First())
            .ToList();
    }

    if (target.Interests.Count == 0)
    {
        target.Interests = donor.Interests;
    }
    else
    {
        target.Interests = target.Interests
            .Concat(donor.Interests)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    if (target.OtherSections.Count == 0)
    {
        target.OtherSections = donor.OtherSections;
    }

    if (target.Sections.Count == 0)
    {
        target.Sections = donor.Sections;
    }

    foreach (var field in donor.Metadata.Fields)
    {
        if (target.Metadata.Fields.ContainsKey(field.Key))
        {
            continue;
        }

        target.Metadata.Fields[field.Key] = field.Value;
    }

    return Normalize(target);
}
/// <summary>
/// Builds a profile from free-text sections. Sections are normalized, then each one is routed
/// by its lower-cased, trimmed name: contact, summary, skills, languages, interests, work
/// experience and education are parsed into their structured fields; every other section is
/// kept as an "other" section with its content split into items.
/// </summary>
/// <param name="sections">The sections to interpret; may be null.</param>
/// <returns>The normalized profile, with the normalized sections attached.</returns>
public static StructuredCvProfile FromSections(IEnumerable<StructuredCvSection>? sections)
{
    var cleaned = NormalizeSections(sections);
    var result = new StructuredCvProfile { Sections = cleaned };

    foreach (var entry in cleaned)
    {
        var heading = entry.Name.Trim().ToLowerInvariant();
        var body = entry.Content;

        if (heading is "contact")
        {
            ApplyContact(result.Contact, body);
        }
        else if (heading is "professional summary" or "summary")
        {
            result.Summary = SplitList(body);
        }
        else if (heading is "skills" or "core skills" or "technical skills")
        {
            result.Skills = SplitList(body);
        }
        else if (heading is "languages")
        {
            result.Languages = ParseLanguages(body);
        }
        else if (heading is "interests")
        {
            result.Interests = SplitList(body);
        }
        else if (heading is "work experience" or "experience" or "employment history")
        {
            result.Jobs = ParseJobs(body);
        }
        else if (heading is "education")
        {
            result.Education = ParseEducation(body);
        }
        else
        {
            // Unrecognised heading: keep it under its original (un-lowered) name.
            result.OtherSections.Add(new StructuredCvOtherSection
            {
                Title = entry.Name,
                Items = SplitList(body),
            });
        }
    }

    return Normalize(result);
}
/// <summary>
/// Cleans a profile in place and returns it. A null profile is replaced by a new one.
/// Missing collections and metadata are created, text values are trimmed, and entries
/// that carry no usable data are dropped.
/// </summary>
/// <param name="profile">The profile to clean; may be null.</param>
/// <returns>The same instance that was passed in (or a new one when it was null).</returns>
public static StructuredCvProfile Normalize(StructuredCvProfile? profile)
{
profile ??= new StructuredCvProfile();
// A blank version falls back to "1".
profile.Version = string.IsNullOrWhiteSpace(profile.Version) ? "1" : profile.Version.Trim();
profile.Metadata ??= new StructuredCvMetadata();
profile.Metadata.Fields ??= new Dictionary<string, StructuredCvFieldMetadata>();
profile.Contact = NormalizeContact(profile.Contact);
profile.Summary = CleanList(profile.Summary);
// Keep a job only if it has a title, a company, or at least one bullet after normalization.
profile.Jobs = (profile.Jobs ?? new List<StructuredCvJob>())
.Select(NormalizeJob)
.Where(job => !string.IsNullOrWhiteSpace(job.Title)
|| !string.IsNullOrWhiteSpace(job.Company)
|| job.Bullets.Count > 0)
.ToList();
// Keep an education entry only if it has a qualification, an institution, or details.
profile.Education = (profile.Education ?? new List<StructuredCvEducation>())
.Select(NormalizeEducation)
.Where(education => !string.IsNullOrWhiteSpace(education.Qualification)
|| !string.IsNullOrWhiteSpace(education.Institution)
|| education.Details.Count > 0)
.ToList();
profile.Skills = CleanList(profile.Skills);
// Languages without a name after normalization are dropped.
profile.Languages = (profile.Languages ?? new List<StructuredCvLanguage>())
.Select(NormalizeLanguage)
.Where(language => !string.IsNullOrWhiteSpace(language.Name))
.ToList();
profile.Interests = CleanList(profile.Interests);
// Other sections are rebuilt as new objects; one with neither a title nor items is dropped.
profile.OtherSections = (profile.OtherSections ?? new List<StructuredCvOtherSection>())
.Select(section => new StructuredCvOtherSection
{
Title = TrimOrNull(section?.Title),
Items = CleanList(section?.Items),
})
.Where(section => !string.IsNullOrWhiteSpace(section.Title) || section.Items.Count > 0)
.ToList();
// Existing sections win when any survive normalization; otherwise they are regenerated
// from the structured fields.
var normalizedSections = NormalizeSections(profile.Sections);
profile.Sections = normalizedSections.Count > 0 ? normalizedSections : BuildSections(profile);
return profile;
}
/// <summary>
/// Cleans a contact record in place: name, headline, email and phone are trimmed (blank
/// becomes null), while location, website and LinkedIn go through their dedicated normalizers.
/// A null contact is replaced by a new, empty one.
/// </summary>
/// <param name="contact">The contact to clean; may be null.</param>
/// <returns>The cleaned contact instance.</returns>
private static StructuredCvContact NormalizeContact(StructuredCvContact? contact)
{
    var result = contact ?? new StructuredCvContact();

    result.FullName = TrimOrNull(result.FullName);
    result.Headline = TrimOrNull(result.Headline);
    result.Email = TrimOrNull(result.Email);
    result.Phone = TrimOrNull(result.Phone);

    result.Location = NormalizeLocationValue(result.Location);
    result.Website = NormalizeWebsite(result.Website);
    result.LinkedIn = NormalizeLinkedIn(result.LinkedIn);

    return result;
}
/// <summary>
/// Cleans a job entry in place. Title, company, location and dates go through their
/// normalizers; title and company are then re-assigned using the LooksLikeJobTitle /
/// LooksLikeCompanyName heuristics; bullets and skills are cleaned. A null job is
/// replaced by a new, empty one.
/// </summary>
/// <param name="job">The job to clean; may be null.</param>
/// <returns>The cleaned job instance.</returns>
private static StructuredCvJob NormalizeJob(StructuredCvJob? job)
{
job ??= new StructuredCvJob();
var title = NormalizeJobTitle(job.Title);
var company = NormalizeCompanyName(job.Company);
var location = NormalizeLocationValue(job.Location);
// No company but a title of the form "<title> at <company>": split it into both fields.
if (!string.IsNullOrWhiteSpace(title) && company is null)
{
var atSplit = Regex.Match(title, @"^(?<title>.+?)\s+at\s+(?<company>.+)$", RegexOptions.IgnoreCase);
if (atSplit.Success)
{
title = NormalizeJobTitle(atSplit.Groups["title"].Value);
company = NormalizeCompanyName(atSplit.Groups["company"].Value);
}
}
// Both present but each one only looks like the other kind of value: swap them.
if (!string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(company))
{
var titleLooksLikeCompany = LooksLikeCompanyName(title) && !LooksLikeJobTitle(title);
var companyLooksLikeTitle = LooksLikeJobTitle(company) && !LooksLikeCompanyName(company);
if (titleLooksLikeCompany && companyLooksLikeTitle)
{
(title, company) = (company, title);
}
}
// A title that only looks like a company is moved to the company slot when that slot is
// empty; the title is cleared either way.
if (!string.IsNullOrWhiteSpace(title) && !LooksLikeJobTitle(title) && LooksLikeCompanyName(title))
{
if (company is null) company = title;
title = null;
}
// A company that only looks like a job title is moved to the title slot, but only when
// no title is set.
if (!string.IsNullOrWhiteSpace(company) && !LooksLikeCompanyName(company) && LooksLikeJobTitle(company) && title is null)
{
title = company;
company = null;
}
job.Title = title;
job.Company = company;
job.Location = location;
job.Start = NormalizeDateValue(job.Start);
job.End = NormalizeDateValue(job.End);
// Strip bullet markers, then drop bullets that IsUsefulJobBullet rejects
// (it is given the final title and company assigned above).
job.Bullets = CleanList(job.Bullets)
.Select(NormalizeBullet)
.Where(bullet => bullet is not null)
.Select(bullet => bullet!)
.Where(bullet => IsUsefulJobBullet(bullet, job.Title, job.Company))
.ToList();
job.Skills = CleanList(job.Skills);
// An end value of "present" or "current" (any casing) also marks the job as current.
job.IsCurrent = job.IsCurrent || string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
return job;
}
/// <summary>
/// Trims a bullet line and strips leading bullet markers ('-', '•', '*') and spaces.
/// </summary>
/// <param name="value">The raw bullet text; may be null.</param>
/// <returns>
/// The bullet text without its marker, or null when the input is blank or consists only of
/// markers (for example "-" or "• *").
/// </returns>
private static string? NormalizeBullet(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    var text = value.Trim().TrimStart('-', '•', '*', ' ');

    // A marker-only line leaves nothing behind; report that as null rather than as an empty
    // string so callers filtering on "is not null" do not let empty bullets through.
    return string.IsNullOrWhiteSpace(text) ? null : text;
}
/// <summary>
/// Decides whether a bullet carries real content. Rejected: blank text, text that looks like
/// a date range or a section heading, text starting with "Skills:", text that merely repeats
/// the job title or company (case-insensitive), and a single word shorter than 12 characters.
/// </summary>
/// <param name="value">The bullet text.</param>
/// <param name="title">The job title to compare against; may be null.</param>
/// <param name="company">The company name to compare against; may be null.</param>
/// <returns>True when the bullet should be kept.</returns>
private static bool IsUsefulJobBullet(string? value, string? title, string? company)
{
    var text = TrimOrNull(value);
    if (text is null)
    {
        return false;
    }

    var isNoise = LooksLikeDateRange(text)
        || LooksLikeSectionHeading(text)
        || text.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase);
    if (isNoise)
    {
        return false;
    }

    var repeatsTitle = title is not null && text.Equals(title, StringComparison.OrdinalIgnoreCase);
    var repeatsCompany = company is not null && text.Equals(company, StringComparison.OrdinalIgnoreCase);
    if (repeatsTitle || repeatsCompany)
    {
        return false;
    }

    // Anything with a space, or at least 12 characters long, counts as content.
    return text.Length >= 12 || text.Contains(' ');
}
/// <summary>
/// Cleans a job title: rejects blank values and values that look like a date range, a section
/// heading, or a URL/email; otherwise collapses whitespace runs to single spaces and strips
/// surrounding spaces, pipes, commas, hyphens and colons.
/// </summary>
/// <param name="value">The raw title; may be null.</param>
/// <returns>The cleaned title, or null when nothing usable remains.</returns>
private static string? NormalizeJobTitle(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null
        || LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate))
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ',', '-', ':');
    if (string.IsNullOrWhiteSpace(cleaned))
    {
        return null;
    }

    return cleaned;
}
/// <summary>
/// Cleans a company name: rejects blank values, values that look like a date range, a section
/// heading or a URL/email, values starting with "Skills:", and values containing both a period
/// and a space; otherwise collapses whitespace runs and strips surrounding spaces, pipes,
/// commas, hyphens and colons.
/// </summary>
/// <param name="value">The raw company name; may be null.</param>
/// <returns>The cleaned name, or null when nothing usable remains.</returns>
private static string? NormalizeCompanyName(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    var looksLikeSomethingElse = LooksLikeDateRange(candidate)
        || LooksLikeSectionHeading(candidate)
        || LooksLikeUrlOrEmail(candidate);
    if (looksLikeSomethingElse)
    {
        return null;
    }

    if (candidate.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    // A period together with a space is rejected outright.
    if (candidate.Contains('.') && candidate.Contains(' '))
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ");
    var cleaned = collapsed.Trim(' ', '|', ',', '-', ':');
    return string.IsNullOrWhiteSpace(cleaned) ? null : cleaned;
}
/// <summary>
/// Cleans a location. Rejected: blank values; values that look like a date range, section
/// heading or URL/email; values containing a digit; values longer than 80 characters; values
/// with more than four comma-separated parts; and values where any part is not a letter
/// followed by letters, apostrophes, hyphens, periods or spaces.
/// </summary>
/// <param name="value">The raw location; may be null.</param>
/// <returns>The parts re-joined with ", ", or null when the value is rejected.</returns>
private static string? NormalizeLocationValue(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    if (LooksLikeDateRange(candidate) || LooksLikeSectionHeading(candidate) || LooksLikeUrlOrEmail(candidate))
    {
        return null;
    }

    if (candidate.Any(char.IsDigit) || candidate.Length > 80)
    {
        return null;
    }

    var collapsed = Regex.Replace(candidate, @"\s+", " ").Trim(' ', '|', ';', ':');
    var segments = collapsed.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    if (segments.Length is 0 or > 4)
    {
        return null;
    }

    var everySegmentIsPlaceLike = segments.All(segment => Regex.IsMatch(segment, @"^[\p{L}][\p{L}'\-. ]+$"));
    return everySegmentIsPlaceLike ? string.Join(", ", segments) : null;
}
/// <summary>
/// Reduces a website value to its lower-cased host name. Values mentioning "linkedin.com" are
/// rejected. "https://" is prepended when the value has no scheme separator. The value must
/// parse as an absolute URI, and the host must be dot-separated labels ending in a label of at
/// least two letters.
/// </summary>
/// <param name="value">The raw website value; may be null.</param>
/// <returns>The host name, or null when the value is rejected.</returns>
private static string? NormalizeWebsite(string? value)
{
    var input = TrimOrNull(value);
    if (input is null)
    {
        return null;
    }

    if (input.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    var absolute = input.Contains("://", StringComparison.Ordinal) ? input : $"https://{input}";
    if (!Uri.TryCreate(absolute, UriKind.Absolute, out var parsed))
    {
        return null;
    }

    var host = parsed.Host.Trim().Trim('.').ToLowerInvariant();
    var hostIsValid = !string.IsNullOrWhiteSpace(host)
        && Regex.IsMatch(host, @"^(?:[a-z0-9-]+\.)+[a-z]{2,}$", RegexOptions.IgnoreCase);

    return hostIsValid ? host : null;
}
/// <summary>
/// Canonicalizes a LinkedIn profile link. "https://" is prepended when the value has no scheme
/// separator. The value must parse as an absolute URI whose host contains "linkedin.com" and
/// whose path (trailing slashes removed) starts with /in/ or /pub/ followed by one to three
/// segments.
/// </summary>
/// <param name="value">The raw LinkedIn value; may be null.</param>
/// <returns>"https://www.linkedin.com" plus the path, or null when the value is rejected.</returns>
private static string? NormalizeLinkedIn(string? value)
{
    var input = TrimOrNull(value);
    if (input is null)
    {
        return null;
    }

    var absolute = input.Contains("://", StringComparison.Ordinal) ? input : $"https://{input}";
    if (!Uri.TryCreate(absolute, UriKind.Absolute, out var parsed))
    {
        return null;
    }

    if (!parsed.Host.Contains("linkedin.com", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    var profilePath = parsed.AbsolutePath.TrimEnd('/');
    var isProfilePath = Regex.IsMatch(profilePath, @"^/(in|pub)/[^/]+(?:/[^/]+){0,2}$", RegexOptions.IgnoreCase);

    return isProfilePath ? $"https://www.linkedin.com{profilePath}" : null;
}
/// <summary>
/// Keeps a trimmed date value only when <see cref="LooksLikeDateRange"/> accepts it.
/// </summary>
/// <param name="value">The raw date text; may be null.</param>
/// <returns>The trimmed value, or null when it is blank or not date-like.</returns>
private static string? NormalizeDateValue(string? value)
{
    var candidate = TrimOrNull(value);
    if (candidate is null)
    {
        return null;
    }

    return LooksLikeDateRange(candidate) ? candidate : null;
}
/// <summary>
/// Tests whether the whole value is a single date or a date range. A date is one of:
/// d/m/yyyy-style digits, an English month name or abbreviation followed by a four-digit
/// year, a bare four-digit year, or the words "Present" / "Current". A range is two dates
/// joined by a hyphen, an en dash (U+2013) or an em dash (U+2014), with optional whitespace
/// around the separator. Matching is case-insensitive.
/// </summary>
/// <param name="value">The text to test; must not be null.</param>
/// <returns>True when the entire value is a date or a date range.</returns>
private static bool LooksLikeDateRange(string value)
{
    // One endpoint of a range. Composed below so both endpoints share the same definition.
    const string DatePoint =
        @"(?:\d{1,2}/\d{1,2}/\d{4}|(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{4}|\d{4}|Present|Current)";

    // CVs routinely use typographic dashes ("2019 – 2021"), so accept them next to the hyphen.
    // The dashes are written as regex escapes to keep look-alike characters out of the source.
    const string Separator = @"\s*[-\u2013\u2014]\s*";

    const string Pattern = "^" + DatePoint + "(?:" + Separator + DatePoint + ")?$";

    return Regex.IsMatch(value, Pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Tests whether the value contains an '@' sign, or "www.", "http://" or "https://"
/// (the latter three case-insensitively).
/// </summary>
/// <param name="value">The text to test; must not be null.</param>
/// <returns>True when any of the markers is present.</returns>
private static bool LooksLikeUrlOrEmail(string value)
{
    if (value.Contains('@'))
    {
        return true;
    }

    string[] urlMarkers = { "www.", "http://", "https://" };
    foreach (var marker in urlMarkers)
    {
        if (value.Contains(marker, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Tests whether the value is exactly one of the known section headings, ignoring case.
/// </summary>
/// <param name="value">The text to test; must not be null.</param>
/// <returns>True when the value equals a known heading.</returns>
private static bool LooksLikeSectionHeading(string value)
{
    string[] knownHeadings =
    {
        "Work Experience",
        "Experience",
        "Employment History",
        "Education",
        "Skills",
        "Languages",
        "Interests",
        "Contact",
        "Professional Summary",
        "Summary",
    };

    foreach (var heading in knownHeadings)
    {
        if (value.Equals(heading, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Heuristic for job titles. Blank values, date ranges and URLs/emails never qualify.
/// Otherwise the value qualifies when it contains one of the listed role words, or when it
/// has at most six space-separated words and does not look like a company name.
/// </summary>
/// <param name="value">The text to test.</param>
/// <returns>True when the value is taken to be a job title.</returns>
private static bool LooksLikeJobTitle(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
    {
        return false;
    }

    var mentionsRole = Regex.IsMatch(value, @"\b(developer|engineer|manager|lead|architect|consultant|specialist|analyst|administrator|coordinator|director|designer|intern|officer|owner|founder|teacher|researcher|writer|editor|producer|assistant|technician|supervisor|head)\b", RegexOptions.IgnoreCase);
    if (mentionsRole)
    {
        return true;
    }

    var wordCount = value.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length;
    return wordCount <= 6 && !LooksLikeCompanyName(value);
}
/// <summary>
/// Heuristic for company names. Blank values, date ranges and URLs/emails never qualify.
/// Otherwise the value qualifies when it contains one of the listed organisation words
/// (case-insensitive), an ampersand, or a standalone run of two or more capital letters.
/// </summary>
/// <param name="value">The text to test.</param>
/// <returns>True when the value is taken to be a company name.</returns>
private static bool LooksLikeCompanyName(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    if (LooksLikeDateRange(value) || LooksLikeUrlOrEmail(value))
    {
        return false;
    }

    if (value.Contains('&'))
    {
        return true;
    }

    var mentionsOrganisation = Regex.IsMatch(value, @"\b(inc|llc|ltd|limited|plc|corp|corporation|company|group|university|college|council|municipality|kommune|bank|studio|agency|institute|hospital|school|technologies|technology|systems|solutions|consulting|consultants|partners|foundation|ministry|government)\b", RegexOptions.IgnoreCase);
    if (mentionsOrganisation)
    {
        return true;
    }

    // Case-sensitive on purpose: only upper-case runs count.
    return Regex.IsMatch(value, @"\b[A-Z]{2,}\b");
}
/// <summary>
/// Cleans an education entry in place: every text field is trimmed (blank becomes null) and
/// the details list is cleaned. A null entry is replaced by a new, empty one.
/// </summary>
/// <param name="education">The entry to clean; may be null.</param>
/// <returns>The cleaned entry instance.</returns>
private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
{
    var result = education ?? new StructuredCvEducation();

    result.Qualification = TrimOrNull(result.Qualification);
    result.Institution = TrimOrNull(result.Institution);
    result.Location = TrimOrNull(result.Location);
    result.Start = TrimOrNull(result.Start);
    result.End = TrimOrNull(result.End);
    result.Details = CleanList(result.Details);

    return result;
}
/// <summary>
/// Cleans a language entry in place using <c>HumanLanguageCatalog</c>. The level is taken from
/// the level field, falling back to the original name text. The name is kept only when both a
/// catalog name and a level were obtained; otherwise it is set to null. Notes are trimmed.
/// A null entry is replaced by a new, empty one.
/// </summary>
/// <param name="language">The entry to clean; may be null.</param>
/// <returns>The cleaned entry instance.</returns>
private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language)
{
    var result = language ?? new StructuredCvLanguage();

    var rawName = TrimOrNull(result.Name);
    var catalogName = HumanLanguageCatalog.NormalizeLanguageName(rawName);
    var level = HumanLanguageCatalog.ExtractLevel(result.Level) ?? HumanLanguageCatalog.ExtractLevel(rawName);

    var hasNameAndLevel = catalogName is not null && level is not null;
    result.Name = hasNameAndLevel ? catalogName : null;
    result.Level = level;
    result.Notes = TrimOrNull(result.Notes);

    return result;
}
/// <summary>
/// Copies sections into new, cleaned instances. A blank name becomes "General", content is
/// trimmed, and the word count is recomputed from the content unless the section already has
/// a positive one. Sections without content (including null entries) are dropped.
/// </summary>
/// <param name="sections">The sections to clean; may be null.</param>
/// <returns>A new list of cleaned sections, in the original order.</returns>
private static List<StructuredCvSection> NormalizeSections(IEnumerable<StructuredCvSection>? sections)
{
    var cleaned = new List<StructuredCvSection>();
    if (sections is null)
    {
        return cleaned;
    }

    foreach (var raw in sections)
    {
        var copy = new StructuredCvSection
        {
            Name = string.IsNullOrWhiteSpace(raw?.Name) ? "General" : raw.Name.Trim(),
            Content = raw?.Content?.Trim() ?? string.Empty,
            WordCount = raw?.WordCount is > 0 ? raw.WordCount : CountWords(raw?.Content),
        };

        if (string.IsNullOrWhiteSpace(copy.Content))
        {
            continue;
        }

        cleaned.Add(copy);
    }

    return cleaned;
}
/// <summary>
/// Renders the structured fields of a profile as text sections, in this order: Contact,
/// Professional Summary, Work Experience, Education, Skills, Languages, Interests, then each
/// other section. A section is only emitted when it has at least one non-blank line.
/// </summary>
/// <param name="profile">The profile to render; its collections are expected to be non-null.</param>
/// <returns>The generated sections after passing through NormalizeSections.</returns>
private static List<StructuredCvSection> BuildSections(StructuredCvProfile profile)
{
var sections = new List<StructuredCvSection>();
var contactLines = new List<string>();
AddIf(contactLines, profile.Contact.FullName);
AddIf(contactLines, profile.Contact.Headline);
AddIf(contactLines, profile.Contact.Email);
AddIf(contactLines, profile.Contact.Phone);
AddIf(contactLines, profile.Contact.Location);
AddIf(contactLines, profile.Contact.Website);
AddIf(contactLines, profile.Contact.LinkedIn);
AddSectionIfAny(sections, "Contact", contactLines);
AddSectionIfAny(sections, "Professional Summary", profile.Summary);
if (profile.Jobs.Count > 0)
{
var lines = new List<string>();
foreach (var job in profile.Jobs)
{
// Heading line; with a null title this trims down to a bare "###", which is still added.
AddIf(lines, $"### {job.Title}".Trim());
// Company, location and date range on one line, joined by " | "; blank parts are skipped.
var meta = string.Join(" | ", new[] { job.Company, job.Location, FormatDateRange(job.Start, job.End, job.IsCurrent) }.Where(value => !string.IsNullOrWhiteSpace(value)));
AddIf(lines, meta);
lines.AddRange(job.Bullets.Select(bullet => $"- {bullet}"));
if (job.Skills.Count > 0)
{
lines.Add($"Skills: {string.Join(", ", job.Skills)}");
}
// Blank spacer after each job. Note that AddSectionIfAny filters blank lines out again.
if (lines.Count > 0 && !string.IsNullOrWhiteSpace(lines[^1])) lines.Add(string.Empty);
}
AddSectionIfAny(sections, "Work Experience", lines);
}
if (profile.Education.Count > 0)
{
var lines = new List<string>();
foreach (var education in profile.Education)
{
// Same layout as a job: heading, " | "-joined meta line, then "- " detail lines.
AddIf(lines, $"### {education.Qualification}".Trim());
var meta = string.Join(" | ", new[] { education.Institution, education.Location, FormatDateRange(education.Start, education.End, false) }.Where(value => !string.IsNullOrWhiteSpace(value)));
AddIf(lines, meta);
lines.AddRange(education.Details.Select(detail => $"- {detail}"));
if (lines.Count > 0 && !string.IsNullOrWhiteSpace(lines[^1])) lines.Add(string.Empty);
}
AddSectionIfAny(sections, "Education", lines);
}
AddSectionIfAny(sections, "Skills", profile.Skills);
if (profile.Languages.Count > 0)
{
// One line per language: "Name", optionally ": Level", optionally " (Notes)".
AddSectionIfAny(sections, "Languages", profile.Languages.Select(language =>
{
var value = language.Name ?? string.Empty;
if (!string.IsNullOrWhiteSpace(language.Level)) value += $": {language.Level}";
if (!string.IsNullOrWhiteSpace(language.Notes)) value += $" ({language.Notes})";
return value;
}).ToList());
}
AddSectionIfAny(sections, "Interests", profile.Interests);
foreach (var other in profile.OtherSections)
{
// Untitled sections are emitted under the name "Other".
AddSectionIfAny(sections, other.Title ?? "Other", other.Items);
}
return NormalizeSections(sections);
}
/// <summary>
/// Appends a section built from the given lines. Blank lines are skipped, the rest are trimmed
/// and joined with "\n". Nothing is appended when no content remains.
/// </summary>
/// <param name="sections">The list to append to.</param>
/// <param name="name">The section name.</param>
/// <param name="lines">The candidate content lines; may be null.</param>
private static void AddSectionIfAny(List<StructuredCvSection> sections, string name, IEnumerable<string>? lines)
{
    var kept = new List<string>();
    foreach (var line in lines ?? Array.Empty<string>())
    {
        if (!string.IsNullOrWhiteSpace(line))
        {
            kept.Add(line.Trim());
        }
    }

    var content = string.Join("\n", kept).Trim();
    if (string.IsNullOrWhiteSpace(content))
    {
        return;
    }

    var section = new StructuredCvSection
    {
        Name = name,
        Content = content,
        WordCount = CountWords(content),
    };
    sections.Add(section);
}
/// <summary>
/// Appends the trimmed value to the list unless the value is null or blank.
/// </summary>
/// <param name="lines">The list to append to.</param>
/// <param name="value">The candidate value; may be null.</param>
private static void AddIf(List<string> lines, string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return;
    }

    lines.Add(value.Trim());
}
/// <summary>
/// Fills the empty fields of a contact from the text of a contact section. Email and phone are
/// found by pattern anywhere in the text. The first line mentioning "linkedin" becomes the
/// LinkedIn value and the first website-like line becomes the website. The first three
/// remaining lines (no '@', no "linkedin", not the website, not the phone) become the full
/// name, headline and location, in that order. Fields that already have a value are never
/// overwritten.
/// </summary>
/// <param name="contact">The contact to fill in place.</param>
/// <param name="content">The raw text of the contact section.</param>
private static void ApplyContact(StructuredCvContact contact, string content)
{
    // A line is website-like when it carries an explicit URL marker, or when it is one single
    // token containing a period and a letter and no '@'. The single-token rule is essential:
    // testing only for a period would classify "John A. Smith", "St. Louis" or a dotted
    // phone number as the website, removing those lines from the name/headline/location
    // candidates and blocking the real website from being picked up.
    static bool LooksLikeWebsiteLine(string line)
    {
        if (line.Contains("http", StringComparison.OrdinalIgnoreCase)
            || line.Contains("www.", StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }

        return line.Contains('.')
            && !line.Contains('@')
            && !line.Any(char.IsWhiteSpace)
            && line.Any(char.IsLetter);
    }

    var lines = content.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    contact.Email ??= Regex.Match(content, @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", RegexOptions.IgnoreCase).Value.NullIfWhitespace();
    contact.Phone ??= Regex.Match(content, @"(?<!\w)(?:\+?\d[\d\s().-]{6,}\d)", RegexOptions.IgnoreCase).Value.NullIfWhitespace();

    foreach (var line in lines)
    {
        if (line.Contains("linkedin", StringComparison.OrdinalIgnoreCase))
        {
            // LinkedIn lines never compete for the website slot, even when the LinkedIn
            // value is already set.
            contact.LinkedIn ??= line;
            continue;
        }

        if (contact.Website is null && LooksLikeWebsiteLine(line))
        {
            contact.Website = line;
        }
    }

    var leftovers = lines
        .Where(line => !line.Contains('@')
            && !line.Contains("linkedin", StringComparison.OrdinalIgnoreCase)
            && !line.Equals(contact.Website, StringComparison.OrdinalIgnoreCase)
            && !line.Equals(contact.Phone, StringComparison.OrdinalIgnoreCase))
        .ToList();

    if (leftovers.Count > 0) contact.FullName ??= leftovers[0];
    if (leftovers.Count > 1) contact.Headline ??= leftovers[1];
    if (leftovers.Count > 2) contact.Location ??= leftovers[2];
}
/// <summary>
/// Parses a languages section. Each list item may be "Name", "Name: Level" or
/// "Name: Level (Notes)". The level is resolved through <c>HumanLanguageCatalog</c>, first from
/// the level text and then from the whole item; items for which no level or no catalog name
/// is obtained are dropped.
/// </summary>
/// <param name="content">The raw text of the languages section.</param>
/// <returns>The parsed language entries.</returns>
private static List<StructuredCvLanguage> ParseLanguages(string content)
{
    var parsed = new List<StructuredCvLanguage>();

    foreach (var entry in SplitList(content))
    {
        var name = entry;
        string? levelText = null;
        string? notes = null;

        var colonAt = entry.IndexOf(':');
        if (colonAt > 0)
        {
            name = entry[..colonAt].Trim();
            var afterColon = entry[(colonAt + 1)..].Trim();

            // "Level (Notes)": split the trailing parenthesised part off as notes.
            var withNotes = Regex.Match(afterColon, @"^(.*?)\s*\((.*?)\)$");
            if (withNotes.Success)
            {
                levelText = withNotes.Groups[1].Value.NullIfWhitespace();
                notes = withNotes.Groups[2].Value.NullIfWhitespace();
            }
            else
            {
                levelText = afterColon.NullIfWhitespace();
            }
        }

        var level = HumanLanguageCatalog.ExtractLevel(levelText) ?? HumanLanguageCatalog.ExtractLevel(entry);
        var languageName = level is not null ? HumanLanguageCatalog.NormalizeLanguageName(name) : null;
        if (string.IsNullOrWhiteSpace(languageName))
        {
            continue;
        }

        parsed.Add(new StructuredCvLanguage
        {
            Name = languageName,
            Level = level,
            Notes = notes,
        });
    }

    return parsed;
}
/// <summary>
/// Splits a work-experience section into blocks and parses each block into a job, skipping
/// blocks that yield nothing.
/// </summary>
/// <param name="content">The raw text of the section.</param>
/// <returns>The parsed jobs, in block order.</returns>
private static List<StructuredCvJob> ParseJobs(string content)
{
    var jobs = new List<StructuredCvJob>();
    foreach (var block in SplitBlocks(content))
    {
        if (ParseJobBlock(block) is { } job)
        {
            jobs.Add(job);
        }
    }

    return jobs;
}
/// <summary>
/// Parses one job block. The first line (with any leading "###" removed) is the title. The
/// lines that follow, up to the first bullet, are metadata: the first date or date range
/// found there gives start/end, and what remains gives company and location. Bullet lines
/// become bullets, and lines starting with "Skills:" are split into skills.
/// </summary>
/// <param name="block">The text of a single job block.</param>
/// <returns>The parsed job, or null when the block has no title, company or bullets.</returns>
private static StructuredCvJob? ParseJobBlock(string block)
{
    var lines = block.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
    if (lines.Count == 0)
    {
        return null;
    }

    var job = new StructuredCvJob();
    if (lines[0].StartsWith("###", StringComparison.Ordinal))
    {
        lines[0] = lines[0].TrimStart('#', ' ');
    }

    job.Title = lines[0].NullIfWhitespace();

    var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();

    // Date separators: hyphen, en dash (U+2013) or em dash (U+2014), written as regex escapes.
    var dateValue = metadata
        .Select(line => Regex.Match(line, @"(?:(?:\w+\s+)?\d{4}|Present|Current)(?:\s*[-\u2013\u2014]\s*(?:(?:\w+\s+)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace())
        .FirstOrDefault(value => value is not null);
    if (dateValue is not null)
    {
        var parts = Regex.Split(dateValue, @"\s*[-\u2013\u2014]\s*");
        job.Start = parts.FirstOrDefault().NullIfWhitespace();
        job.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
        job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
    }

    var metadataWithoutDates = metadata
        // string.Replace throws ArgumentException for an empty search string, so the date
        // is only stripped when one was actually found.
        .Select(line => dateValue is null ? line : line.Replace(dateValue, string.Empty))
        // Generated sections put company, location and dates on one line joined by " | ";
        // split on the pipe so company and location land in separate fields.
        .SelectMany(line => line.Split('|'))
        .Select(part => part.Trim(' ', '|', ',', '-', '\u2013', '\u2014'))
        .Where(part => !string.IsNullOrWhiteSpace(part))
        .ToList();
    if (metadataWithoutDates.Count > 0) job.Company = metadataWithoutDates[0].NullIfWhitespace();
    if (metadataWithoutDates.Count > 1) job.Location = metadataWithoutDates[1].NullIfWhitespace();

    job.Bullets = lines
        .Where(IsBullet)
        .Select(line => line.Trim().TrimStart('-', '•', '*', ' '))
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .ToList();
    job.Skills = lines
        .Where(line => line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
        .SelectMany(line => SplitList(line[(line.IndexOf(':') + 1)..]))
        .ToList();

    var isEmpty = string.IsNullOrWhiteSpace(job.Title) && string.IsNullOrWhiteSpace(job.Company) && job.Bullets.Count == 0;
    return isEmpty ? null : job;
}
/// <summary>
/// Splits an education section into blocks and parses each block into an education entry,
/// skipping blocks that yield nothing.
/// </summary>
/// <param name="content">The raw text of the section.</param>
/// <returns>The parsed entries, in block order.</returns>
private static List<StructuredCvEducation> ParseEducation(string content)
{
    var entries = new List<StructuredCvEducation>();
    foreach (var block in SplitBlocks(content))
    {
        if (ParseEducationBlock(block) is { } entry)
        {
            entries.Add(entry);
        }
    }

    return entries;
}
/// <summary>
/// Parses one education block. The first line (with any leading "###" removed) is the
/// qualification. The lines that follow, up to the first bullet, are metadata: the first
/// date or date range found there gives start/end, and what remains gives institution and
/// location. Bullet lines after the first line become details.
/// </summary>
/// <param name="block">The text of a single education block.</param>
/// <returns>
/// The parsed entry, or null when the block has no qualification, institution or details.
/// </returns>
private static StructuredCvEducation? ParseEducationBlock(string block)
{
    var lines = block.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
    if (lines.Count == 0)
    {
        return null;
    }

    var education = new StructuredCvEducation();
    if (lines[0].StartsWith("###", StringComparison.Ordinal))
    {
        lines[0] = lines[0].TrimStart('#', ' ');
    }

    education.Qualification = lines[0].NullIfWhitespace();

    var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();

    // Date separators: hyphen, en dash (U+2013) or em dash (U+2014), written as regex escapes.
    var dateValue = metadata
        .Select(line => Regex.Match(line, @"(?:(?:\w+\s+)?\d{4})(?:\s*[-\u2013\u2014]\s*(?:(?:\w+\s+)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace())
        .FirstOrDefault(value => value is not null);
    if (dateValue is not null)
    {
        var parts = Regex.Split(dateValue, @"\s*[-\u2013\u2014]\s*");
        education.Start = parts.FirstOrDefault().NullIfWhitespace();
        education.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
    }

    var metadataWithoutDates = metadata
        // string.Replace throws ArgumentException for an empty search string, so the date
        // is only stripped when one was actually found.
        .Select(line => dateValue is null ? line : line.Replace(dateValue, string.Empty))
        // Generated sections put institution, location and dates on one line joined by
        // " | "; split on the pipe so institution and location land in separate fields.
        .SelectMany(line => line.Split('|'))
        .Select(part => part.Trim(' ', '|', ',', '-', '\u2013', '\u2014'))
        .Where(part => !string.IsNullOrWhiteSpace(part))
        .ToList();
    if (metadataWithoutDates.Count > 0) education.Institution = metadataWithoutDates[0].NullIfWhitespace();
    if (metadataWithoutDates.Count > 1) education.Location = metadataWithoutDates[1].NullIfWhitespace();

    education.Details = lines
        .Skip(1)
        .Where(IsBullet)
        .Select(line => line.Trim().TrimStart('-', '•', '*', ' '))
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .ToList();

    var isEmpty = string.IsNullOrWhiteSpace(education.Qualification) && string.IsNullOrWhiteSpace(education.Institution) && education.Details.Count == 0;
    return isEmpty ? null : education;
}
/// <summary>
/// Splits section text into entry blocks. When the text contains "### ", a new block starts
/// at every line beginning with "###" followed by whitespace; otherwise blocks are separated
/// by blank lines. Blocks are trimmed and empty ones are dropped.
/// </summary>
/// <param name="content">The section text.</param>
/// <returns>The non-empty blocks, in order.</returns>
private static List<string> SplitBlocks(string content)
{
    var text = content.Replace("\r\n", "\n").Trim();
    var blocks = new List<string>();
    if (string.IsNullOrWhiteSpace(text))
    {
        return blocks;
    }

    var usesHeadings = text.Contains("### ", StringComparison.Ordinal);
    var pieces = usesHeadings
        ? Regex.Split(text, @"(?=^###\s+)", RegexOptions.Multiline)
        : Regex.Split(text, @"\n\s*\n");

    foreach (var piece in pieces)
    {
        var trimmed = piece.Trim();
        if (!string.IsNullOrWhiteSpace(trimmed))
        {
            blocks.Add(trimmed);
        }
    }

    return blocks;
}
/// <summary>
/// Tests whether a line, ignoring leading whitespace, starts with a bullet marker:
/// '-', '•' or '*'.
/// </summary>
/// <param name="value">The line to test; must not be null.</param>
/// <returns>True when the line starts with a bullet marker.</returns>
private static bool IsBullet(string value)
{
    var text = value.TrimStart();
    if (text.Length == 0)
    {
        return false;
    }

    return text[0] is '-' or '•' or '*';
}
/// <summary>
/// Turns section text into a list of items. The text is split into lines; a line containing a
/// comma is further split on commas unless it starts with '-' (after leading whitespace).
/// Each item is trimmed, leading bullet markers ('-', '•', '*') and spaces are removed, blank
/// items are dropped, and duplicates are removed case-insensitively, keeping the first.
/// </summary>
/// <param name="content">The section text; may be null.</param>
/// <returns>The distinct items in order of first appearance.</returns>
private static List<string> SplitList(string? content)
{
    var items = new List<string>();
    if (string.IsNullOrWhiteSpace(content))
    {
        return items;
    }

    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var lines = content.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries);

    foreach (var line in lines)
    {
        var isDashBullet = line.TrimStart().StartsWith("-", StringComparison.Ordinal);
        var fragments = line.Contains(',') && !isDashBullet
            ? line.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            : new[] { line };

        foreach (var fragment in fragments)
        {
            var item = fragment.Trim().TrimStart('-', '•', '*', ' ');
            if (string.IsNullOrWhiteSpace(item))
            {
                continue;
            }

            if (seen.Add(item))
            {
                items.Add(item);
            }
        }
    }

    return items;
}
/// <summary>
/// Returns a new list with every value trimmed, blank values dropped, and duplicates removed
/// case-insensitively (the first occurrence is kept).
/// </summary>
/// <param name="values">The values to clean; may be null.</param>
/// <returns>The cleaned values in order of first appearance.</returns>
private static List<string> CleanList(IEnumerable<string>? values)
{
    var cleaned = new List<string>();
    if (values is null)
    {
        return cleaned;
    }

    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (var raw in values)
    {
        var text = raw?.Trim() ?? string.Empty;
        if (string.IsNullOrWhiteSpace(text))
        {
            continue;
        }

        if (seen.Add(text))
        {
            cleaned.Add(text);
        }
    }

    return cleaned;
}
/// <summary>
/// Counts whitespace-separated words in the text.
/// </summary>
/// <param name="content">The text to count; may be null.</param>
/// <returns>The number of words; 0 for null or blank text.</returns>
private static int CountWords(string? content)
{
    if (string.IsNullOrWhiteSpace(content))
    {
        return 0;
    }

    // Count the starts of runs of non-whitespace characters.
    var words = 0;
    var insideWord = false;
    foreach (var character in content)
    {
        if (char.IsWhiteSpace(character))
        {
            insideWord = false;
        }
        else if (!insideWord)
        {
            insideWord = true;
            words++;
        }
    }

    return words;
}
/// <summary>
/// Trims the value, mapping null and blank input to null.
/// </summary>
/// <param name="value">The text to trim; may be null.</param>
/// <returns>The trimmed text, or null when there is none.</returns>
private static string? TrimOrNull(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value.Trim();
}
/// <summary>
/// Formats a start/end pair as "start - end". Without a start, the end alone is returned
/// (or null when the end is blank too). With a start, the end is shown as "Present" when
/// the entry is current or the end is null.
/// </summary>
/// <param name="start">The start text; may be null or blank.</param>
/// <param name="end">The end text; may be null or blank.</param>
/// <param name="isCurrent">Whether the entry is ongoing.</param>
/// <returns>The formatted range, the end alone, or null.</returns>
private static string? FormatDateRange(string? start, string? end, bool isCurrent)
{
    if (string.IsNullOrWhiteSpace(start))
    {
        return string.IsNullOrWhiteSpace(end) ? null : end;
    }

    var finish = isCurrent || end is null ? "Present" : end;
    return start + " - " + finish;
}
/// <summary>
/// Extension form of trim-or-null: returns the trimmed value, or null for null or blank input.
/// </summary>
/// <param name="value">The text to trim; may be null.</param>
/// <returns>The trimmed text, or null when there is none.</returns>
private static string? NullIfWhitespace(this string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value.Trim();
}
}