Files
jobtrackingapp/Models/StructuredCvProfileJson.cs
T

504 lines
23 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Text.RegularExpressions;
namespace JobTrackerApi.Models;
public static class StructuredCvProfileJson
{
/// <summary>
/// Serializer settings shared by every read and write performed by this class.
/// Starts from <see cref="JsonSerializerDefaults.Web"/> and leaves null-valued
/// properties out of the JSON that is written.
/// </summary>
private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web)
{
// Property names are matched without regard to case when reading.
PropertyNameCaseInsensitive = true,
// Properties whose value is null are omitted when writing.
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
};
/// <summary>
/// Creates a blank profile and passes it through <see cref="Normalize"/> before returning it.
/// </summary>
/// <returns>A normalized profile built from a freshly constructed <see cref="StructuredCvProfile"/>.</returns>
public static StructuredCvProfile Empty()
{
    var blank = new StructuredCvProfile();
    return Normalize(blank);
}
/// <summary>
/// Reads a profile from JSON, accepting either a profile object or a bare array of sections.
/// </summary>
/// <param name="json">The JSON text; may be null or blank.</param>
/// <returns>
/// The normalized profile. An empty profile is returned when the input is blank, when the
/// root is neither an object nor an array, or when any exception is raised while reading.
/// </returns>
public static StructuredCvProfile Deserialize(string? json)
{
    if (string.IsNullOrWhiteSpace(json))
    {
        return Empty();
    }

    try
    {
        // Parse once up front purely to inspect the kind of the root element.
        using var document = JsonDocument.Parse(json);

        switch (document.RootElement.ValueKind)
        {
            case JsonValueKind.Array:
            {
                // An array root is treated as a list of sections.
                var parsedSections = JsonSerializer.Deserialize<List<StructuredCvSection>>(json, SerializerOptions);
                return FromSections(parsedSections ?? new List<StructuredCvSection>());
            }

            case JsonValueKind.Object:
            {
                var parsedProfile = JsonSerializer.Deserialize<StructuredCvProfile>(json, SerializerOptions);
                return Normalize(parsedProfile ?? new StructuredCvProfile());
            }

            default:
                return Empty();
        }
    }
    catch
    {
        // Best effort: any failure while reading yields an empty profile instead of an exception.
        return Empty();
    }
}
/// <summary>
/// Normalizes the given profile and writes it as JSON using the shared serializer options.
/// </summary>
/// <param name="profile">The profile to write; null is handled by <see cref="Normalize"/>.</param>
/// <returns>The JSON text.</returns>
public static string Serialize(StructuredCvProfile? profile)
    => JsonSerializer.Serialize(Normalize(profile), SerializerOptions);
/// <summary>
/// Combines two profiles, favouring <paramref name="preferred"/> and filling its gaps from
/// <paramref name="fallback"/>.
/// </summary>
/// <remarks>
/// Both arguments are passed through <see cref="Normalize"/> first, and the normalized
/// preferred profile is the object that is updated and returned. Contact fields are taken
/// from the fallback when the preferred value is null, empty or whitespace. Summary, skills
/// and interests are unioned case-insensitively; languages are unioned by name; jobs,
/// education, other sections and sections are taken from the fallback only when the
/// preferred profile has none.
/// </remarks>
/// <param name="preferred">The profile whose values win; may be null.</param>
/// <param name="fallback">The profile used to fill gaps; may be null.</param>
/// <returns>The merged, normalized profile.</returns>
public static StructuredCvProfile Merge(StructuredCvProfile? preferred, StructuredCvProfile? fallback)
{
    // A blank (not just null) preferred value must not block the fallback: contact fields
    // are not trimmed to null by Normalize, so "" or "  " can reach this point.
    static string? PreferNonBlank(string? first, string? second) =>
        string.IsNullOrWhiteSpace(first) ? (second ?? first) : first;

    var primary = Normalize(preferred);
    var secondary = Normalize(fallback);

    primary.Contact.FullName = PreferNonBlank(primary.Contact.FullName, secondary.Contact.FullName);
    primary.Contact.Headline = PreferNonBlank(primary.Contact.Headline, secondary.Contact.Headline);
    primary.Contact.Email = PreferNonBlank(primary.Contact.Email, secondary.Contact.Email);
    primary.Contact.Phone = PreferNonBlank(primary.Contact.Phone, secondary.Contact.Phone);
    primary.Contact.Location = PreferNonBlank(primary.Contact.Location, secondary.Contact.Location);
    primary.Contact.Website = PreferNonBlank(primary.Contact.Website, secondary.Contact.Website);
    primary.Contact.LinkedIn = PreferNonBlank(primary.Contact.LinkedIn, secondary.Contact.LinkedIn);

    primary.Summary = primary.Summary.Count == 0
        ? secondary.Summary
        : primary.Summary.Concat(secondary.Summary).Distinct(StringComparer.OrdinalIgnoreCase).ToList();

    if (primary.Jobs.Count == 0) primary.Jobs = secondary.Jobs;
    if (primary.Education.Count == 0) primary.Education = secondary.Education;

    primary.Skills = primary.Skills.Count == 0
        ? secondary.Skills
        : primary.Skills.Concat(secondary.Skills).Distinct(StringComparer.OrdinalIgnoreCase).ToList();

    // For languages the preferred entry wins when both profiles list the same name.
    primary.Languages = primary.Languages.Count == 0
        ? secondary.Languages
        : primary.Languages
            .Concat(secondary.Languages)
            .GroupBy(language => language.Name ?? string.Empty, StringComparer.OrdinalIgnoreCase)
            .Select(group => group.First())
            .ToList();

    primary.Interests = primary.Interests.Count == 0
        ? secondary.Interests
        : primary.Interests.Concat(secondary.Interests).Distinct(StringComparer.OrdinalIgnoreCase).ToList();

    if (primary.OtherSections.Count == 0) primary.OtherSections = secondary.OtherSections;
    if (primary.Sections.Count == 0) primary.Sections = secondary.Sections;

    return Normalize(primary);
}
/// <summary>
/// Builds a structured profile from a flat list of named text sections.
/// </summary>
/// <remarks>
/// Section names are matched case-insensitively against a fixed set of headings; anything
/// unrecognised is stored under <c>OtherSections</c>. When several sections map to the same
/// field (for example "Core Skills" and "Technical Skills"), their parsed items are
/// accumulated in order rather than the later section replacing the earlier one.
/// </remarks>
/// <param name="sections">The raw sections; may be null.</param>
/// <returns>The normalized profile, with <c>Sections</c> set to the normalized input sections.</returns>
public static StructuredCvProfile FromSections(IEnumerable<StructuredCvSection>? sections)
{
    // Appends newly parsed items to whatever an earlier section with an aliased heading produced.
    static List<T> Append<T>(IEnumerable<T>? existing, List<T> parsed) =>
        existing is null ? parsed : existing.Concat(parsed).ToList();

    var normalizedSections = NormalizeSections(sections);
    var profile = new StructuredCvProfile
    {
        Sections = normalizedSections,
    };

    foreach (var section in normalizedSections)
    {
        switch (section.Name.Trim().ToLowerInvariant())
        {
            case "contact":
                ApplyContact(profile.Contact, section.Content);
                break;
            case "professional summary":
            case "summary":
                profile.Summary = Append(profile.Summary, SplitList(section.Content));
                break;
            case "skills":
            case "core skills":
            case "technical skills":
                profile.Skills = Append(profile.Skills, SplitList(section.Content));
                break;
            case "languages":
                profile.Languages = Append(profile.Languages, ParseLanguages(section.Content));
                break;
            case "interests":
                profile.Interests = Append(profile.Interests, SplitList(section.Content));
                break;
            case "work experience":
            case "experience":
            case "employment history":
                profile.Jobs = Append(profile.Jobs, ParseJobs(section.Content));
                break;
            case "education":
                profile.Education = Append(profile.Education, ParseEducation(section.Content));
                break;
            default:
                profile.OtherSections.Add(new StructuredCvOtherSection
                {
                    Title = section.Name,
                    Items = SplitList(section.Content),
                });
                break;
        }
    }

    // Normalize removes case-insensitive duplicates from the accumulated string lists.
    return Normalize(profile);
}
/// <summary>
/// Brings a profile into canonical form and returns it.
/// </summary>
/// <remarks>
/// The supplied instance is updated in place (a new one is created only when
/// <paramref name="profile"/> is null). The version defaults to "1", string lists are
/// cleaned, entries with no usable content are dropped, and <c>Sections</c> is rebuilt
/// from the structured data when no non-empty sections are present.
/// </remarks>
/// <param name="profile">The profile to normalize; may be null.</param>
/// <returns>The normalized profile.</returns>
public static StructuredCvProfile Normalize(StructuredCvProfile? profile)
{
    var target = profile ?? new StructuredCvProfile();

    var version = target.Version;
    target.Version = string.IsNullOrWhiteSpace(version) ? "1" : version.Trim();
    target.Contact ??= new StructuredCvContact();
    target.Summary = CleanList(target.Summary);

    // Keep only jobs that carry a title, a company or at least one bullet.
    var jobs = new List<StructuredCvJob>();
    foreach (var rawJob in target.Jobs ?? new List<StructuredCvJob>())
    {
        var job = NormalizeJob(rawJob);
        var hasContent = !string.IsNullOrWhiteSpace(job.Title)
            || !string.IsNullOrWhiteSpace(job.Company)
            || job.Bullets.Count > 0;
        if (hasContent)
        {
            jobs.Add(job);
        }
    }
    target.Jobs = jobs;

    // Keep only education entries that carry a qualification, an institution or details.
    var educationEntries = new List<StructuredCvEducation>();
    foreach (var rawEducation in target.Education ?? new List<StructuredCvEducation>())
    {
        var education = NormalizeEducation(rawEducation);
        var hasContent = !string.IsNullOrWhiteSpace(education.Qualification)
            || !string.IsNullOrWhiteSpace(education.Institution)
            || education.Details.Count > 0;
        if (hasContent)
        {
            educationEntries.Add(education);
        }
    }
    target.Education = educationEntries;

    target.Skills = CleanList(target.Skills);

    // A language without a name is discarded.
    var languages = new List<StructuredCvLanguage>();
    foreach (var rawLanguage in target.Languages ?? new List<StructuredCvLanguage>())
    {
        var language = NormalizeLanguage(rawLanguage);
        if (!string.IsNullOrWhiteSpace(language.Name))
        {
            languages.Add(language);
        }
    }
    target.Languages = languages;

    target.Interests = CleanList(target.Interests);

    // Other sections are copied into fresh objects with a trimmed title and cleaned items.
    var otherSections = new List<StructuredCvOtherSection>();
    foreach (var rawOther in target.OtherSections ?? new List<StructuredCvOtherSection>())
    {
        var other = new StructuredCvOtherSection
        {
            Title = TrimOrNull(rawOther?.Title),
            Items = CleanList(rawOther?.Items),
        };
        if (!string.IsNullOrWhiteSpace(other.Title) || other.Items.Count > 0)
        {
            otherSections.Add(other);
        }
    }
    target.OtherSections = otherSections;

    // Existing non-empty sections are kept; otherwise they are generated from the fields above.
    var explicitSections = NormalizeSections(target.Sections);
    target.Sections = explicitSections.Count > 0 ? explicitSections : BuildSections(target);

    return target;
}
/// <summary>
/// Trims the text fields of a job, cleans its bullet and skill lists, and marks it as
/// current when its end value is "present" or "current" (case-insensitive).
/// </summary>
/// <param name="job">The job to normalize; a new instance is used when null.</param>
/// <returns>The same instance that was passed in (or the new one), after normalization.</returns>
private static StructuredCvJob NormalizeJob(StructuredCvJob? job)
{
    var result = job ?? new StructuredCvJob();

    result.Title = TrimOrNull(result.Title);
    result.Company = TrimOrNull(result.Company);
    result.Location = TrimOrNull(result.Location);
    result.Start = TrimOrNull(result.Start);
    result.End = TrimOrNull(result.End);
    result.Bullets = CleanList(result.Bullets);
    result.Skills = CleanList(result.Skills);

    var endMeansOngoing =
        string.Equals(result.End, "present", StringComparison.OrdinalIgnoreCase)
        || string.Equals(result.End, "current", StringComparison.OrdinalIgnoreCase);
    result.IsCurrent = result.IsCurrent || endMeansOngoing;

    return result;
}
/// <summary>
/// Trims the text fields of an education entry and cleans its detail list.
/// </summary>
/// <param name="education">The entry to normalize; a new instance is used when null.</param>
/// <returns>The same instance that was passed in (or the new one), after normalization.</returns>
private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
{
    var result = education ?? new StructuredCvEducation();

    result.Qualification = TrimOrNull(result.Qualification);
    result.Institution = TrimOrNull(result.Institution);
    result.Location = TrimOrNull(result.Location);
    result.Start = TrimOrNull(result.Start);
    result.End = TrimOrNull(result.End);
    result.Details = CleanList(result.Details);

    return result;
}
/// <summary>
/// Trims the name, level and notes of a language entry, turning blank values into null.
/// </summary>
/// <param name="language">The entry to normalize; a new instance is used when null.</param>
/// <returns>The same instance that was passed in (or the new one), after normalization.</returns>
private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language)
{
    var result = language ?? new StructuredCvLanguage();

    result.Name = TrimOrNull(result.Name);
    result.Level = TrimOrNull(result.Level);
    result.Notes = TrimOrNull(result.Notes);

    return result;
}
/// <summary>
/// Copies the given sections into fresh objects with a trimmed name and content, dropping
/// any section whose content is blank.
/// </summary>
/// <remarks>
/// A blank name becomes "General". A positive word count is kept as supplied; otherwise it
/// is computed from the section's content.
/// </remarks>
/// <param name="sections">The sections to clean; may be null or contain null entries.</param>
/// <returns>A new list of cleaned sections.</returns>
private static List<StructuredCvSection> NormalizeSections(IEnumerable<StructuredCvSection>? sections)
{
    var cleaned = new List<StructuredCvSection>();

    foreach (var section in sections ?? Array.Empty<StructuredCvSection>())
    {
        var content = section?.Content?.Trim() ?? string.Empty;
        if (string.IsNullOrWhiteSpace(content))
        {
            continue;
        }

        cleaned.Add(new StructuredCvSection
        {
            Name = string.IsNullOrWhiteSpace(section?.Name) ? "General" : section.Name.Trim(),
            Content = content,
            WordCount = section?.WordCount is > 0 ? section.WordCount : CountWords(section?.Content),
        });
    }

    return cleaned;
}
/// <summary>
/// Renders the structured fields of a profile as a list of text sections.
/// </summary>
/// <remarks>
/// Sections are emitted in this order, each only when it has content: Contact, Professional
/// Summary, Work Experience, Education, Skills, Languages, Interests, then every other
/// section. Jobs and education entries start with a "### " heading line followed by a
/// " | "-joined metadata line and "- " bullet lines.
/// </remarks>
/// <param name="profile">The profile to render; its lists and contact are read directly.</param>
/// <returns>The generated sections after passing through <see cref="NormalizeSections"/>.</returns>
private static List<StructuredCvSection> BuildSections(StructuredCvProfile profile)
{
    var built = new List<StructuredCvSection>();

    var contact = profile.Contact;
    var contactLines = new List<string>();
    foreach (var field in new[]
             {
                 contact.FullName, contact.Headline, contact.Email, contact.Phone,
                 contact.Location, contact.Website, contact.LinkedIn,
             })
    {
        AddIf(contactLines, field);
    }
    AddSectionIfAny(built, "Contact", contactLines);

    AddSectionIfAny(built, "Professional Summary", profile.Summary);

    if (profile.Jobs.Count > 0)
    {
        var jobLines = new List<string>();
        foreach (var job in profile.Jobs)
        {
            AddIf(jobLines, $"### {job.Title}".Trim());

            var metaParts = new[] { job.Company, job.Location, FormatDateRange(job.Start, job.End, job.IsCurrent) };
            AddIf(jobLines, string.Join(" | ", metaParts.Where(part => !string.IsNullOrWhiteSpace(part))));

            foreach (var bullet in job.Bullets)
            {
                jobLines.Add($"- {bullet}");
            }

            if (job.Skills.Count > 0)
            {
                jobLines.Add($"Skills: {string.Join(", ", job.Skills)}");
            }

            // Blank separator between entries.
            if (jobLines.Count > 0 && !string.IsNullOrWhiteSpace(jobLines[^1]))
            {
                jobLines.Add(string.Empty);
            }
        }
        AddSectionIfAny(built, "Work Experience", jobLines);
    }

    if (profile.Education.Count > 0)
    {
        var educationLines = new List<string>();
        foreach (var education in profile.Education)
        {
            AddIf(educationLines, $"### {education.Qualification}".Trim());

            var metaParts = new[] { education.Institution, education.Location, FormatDateRange(education.Start, education.End, false) };
            AddIf(educationLines, string.Join(" | ", metaParts.Where(part => !string.IsNullOrWhiteSpace(part))));

            foreach (var detail in education.Details)
            {
                educationLines.Add($"- {detail}");
            }

            // Blank separator between entries.
            if (educationLines.Count > 0 && !string.IsNullOrWhiteSpace(educationLines[^1]))
            {
                educationLines.Add(string.Empty);
            }
        }
        AddSectionIfAny(built, "Education", educationLines);
    }

    AddSectionIfAny(built, "Skills", profile.Skills);

    if (profile.Languages.Count > 0)
    {
        // Each language renders as "Name", "Name: Level" or "Name: Level (Notes)".
        var languageLines = new List<string>();
        foreach (var language in profile.Languages)
        {
            var text = language.Name ?? string.Empty;
            if (!string.IsNullOrWhiteSpace(language.Level))
            {
                text += $": {language.Level}";
            }
            if (!string.IsNullOrWhiteSpace(language.Notes))
            {
                text += $" ({language.Notes})";
            }
            languageLines.Add(text);
        }
        AddSectionIfAny(built, "Languages", languageLines);
    }

    AddSectionIfAny(built, "Interests", profile.Interests);

    foreach (var other in profile.OtherSections)
    {
        AddSectionIfAny(built, other.Title ?? "Other", other.Items);
    }

    return NormalizeSections(built);
}
/// <summary>
/// Appends a section built from the given lines, unless no non-blank line remains.
/// </summary>
/// <param name="sections">The list the new section is added to.</param>
/// <param name="name">The section name.</param>
/// <param name="lines">The candidate lines; blank lines are skipped and the rest are trimmed and joined with "\n".</param>
private static void AddSectionIfAny(List<StructuredCvSection> sections, string name, IEnumerable<string>? lines)
{
    var kept = new List<string>();
    foreach (var line in lines ?? Array.Empty<string>())
    {
        if (!string.IsNullOrWhiteSpace(line))
        {
            kept.Add(line.Trim());
        }
    }

    var content = string.Join("\n", kept).Trim();
    if (string.IsNullOrWhiteSpace(content))
    {
        return;
    }

    var section = new StructuredCvSection
    {
        Name = name,
        Content = content,
        WordCount = CountWords(content),
    };
    sections.Add(section);
}
/// <summary>
/// Adds the trimmed value to the list, skipping null, empty and whitespace-only values.
/// </summary>
/// <param name="lines">The list to append to.</param>
/// <param name="value">The candidate value.</param>
private static void AddIf(List<string> lines, string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return;
    }

    lines.Add(value.Trim());
}
/// <summary>
/// Fills the empty fields of a contact from the free text of a "contact" section.
/// </summary>
/// <remarks>
/// Only fields that are currently null are assigned. Email and phone are found by regular
/// expression over the whole text. The first line mentioning "linkedin" becomes the LinkedIn
/// value; the first line that looks like a URL or bare domain becomes the website. Lines
/// that are not an email, LinkedIn, website or phone line are then used, in order, as full
/// name, headline and location.
/// A line containing the detected phone number is never treated as a bare-domain website or
/// as a name/headline/location line: dotted phone formats such as "555.123.4567" would
/// otherwise satisfy the "contains a period" website heuristic.
/// </remarks>
/// <param name="contact">The contact to update in place.</param>
/// <param name="content">The raw section text.</param>
private static void ApplyContact(StructuredCvContact contact, string content)
{
    var lines = content.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    contact.Email ??= Regex.Match(content, @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", RegexOptions.IgnoreCase).Value.NullIfWhitespace();
    contact.Phone ??= Regex.Match(content, @"(?<!\w)(?:\+?\d[\d\s().-]{6,}\d)", RegexOptions.IgnoreCase).Value.NullIfWhitespace();

    var phone = contact.Phone;
    bool MentionsPhone(string line) =>
        !string.IsNullOrEmpty(phone) && line.Contains(phone, StringComparison.OrdinalIgnoreCase);

    foreach (var line in lines)
    {
        if (contact.LinkedIn is null && line.Contains("linkedin", StringComparison.OrdinalIgnoreCase))
        {
            contact.LinkedIn = line.Trim();
            continue;
        }

        if (contact.Website is not null)
        {
            continue;
        }

        var hasUrlMarker = line.Contains("http", StringComparison.OrdinalIgnoreCase)
            || line.Contains("www.", StringComparison.OrdinalIgnoreCase);

        // Weakest heuristic: any dotted text that is neither an email nor the phone number.
        var looksLikeBareDomain = line.Contains('.') && !line.Contains('@') && !MentionsPhone(line);

        if (hasUrlMarker || looksLikeBareDomain)
        {
            contact.Website = line.Trim();
        }
    }

    var leftovers = lines
        .Where(line => !line.Contains('@')
            && !line.Contains("linkedin", StringComparison.OrdinalIgnoreCase)
            && !line.Equals(contact.Website, StringComparison.OrdinalIgnoreCase)
            && !MentionsPhone(line))
        .ToList();

    if (leftovers.Count > 0) contact.FullName ??= leftovers[0].Trim();
    if (leftovers.Count > 1) contact.Headline ??= leftovers[1].Trim();
    if (leftovers.Count > 2) contact.Location ??= leftovers[2].Trim();
}
/// <summary>
/// Parses a languages section into language entries.
/// </summary>
/// <remarks>
/// The text is first split into items. An item of the form "Name: Level (Notes)" is broken
/// into its three parts; "Name: Level" yields a name and level; anything without a colon
/// after the first character is used as the name only. Entries without a name are dropped.
/// </remarks>
/// <param name="content">The raw section text.</param>
/// <returns>The parsed language entries.</returns>
private static List<StructuredCvLanguage> ParseLanguages(string content)
{
    var languages = new List<StructuredCvLanguage>();

    foreach (var entry in SplitList(content))
    {
        var name = entry;
        string? level = null;
        string? notes = null;

        var separator = entry.IndexOf(':');
        if (separator > 0)
        {
            name = entry[..separator].Trim();
            var detail = entry[(separator + 1)..].Trim();

            // A trailing parenthesised part is treated as notes; the text before it is the level.
            var withNotes = Regex.Match(detail, @"^(.*?)\s*\((.*?)\)$");
            if (withNotes.Success)
            {
                level = withNotes.Groups[1].Value.NullIfWhitespace();
                notes = withNotes.Groups[2].Value.NullIfWhitespace();
            }
            else
            {
                level = detail.NullIfWhitespace();
            }
        }

        var language = new StructuredCvLanguage { Name = name.NullIfWhitespace(), Level = level, Notes = notes };
        if (!string.IsNullOrWhiteSpace(language.Name))
        {
            languages.Add(language);
        }
    }

    return languages;
}
/// <summary>
/// Splits a work-experience section into blocks and parses each block into a job,
/// skipping blocks that do not produce one.
/// </summary>
/// <param name="content">The raw section text.</param>
/// <returns>The parsed jobs, in block order.</returns>
private static List<StructuredCvJob> ParseJobs(string content)
{
    var jobs = new List<StructuredCvJob>();

    foreach (var block in SplitBlocks(content))
    {
        if (ParseJobBlock(block) is { } job)
        {
            jobs.Add(job);
        }
    }

    return jobs;
}
/// <summary>
/// Parses one work-experience block into a job.
/// </summary>
/// <remarks>
/// The first line (with any leading "###" removed) is the title. The lines between the title
/// and the first bullet are metadata: the first date or date range found becomes start/end,
/// and the remaining metadata, split on "|", supplies company then location. Bullet lines
/// become <c>Bullets</c>, and lines starting with "Skills:" supply <c>Skills</c>.
/// Date ranges may be separated by a hyphen, en dash or em dash.
/// </remarks>
/// <param name="block">The text of a single block.</param>
/// <returns>The parsed job, or null when the block yields no title, company or bullets.</returns>
private static StructuredCvJob? ParseJobBlock(string block)
{
    var lines = block.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
    if (lines.Count == 0) return null;

    var job = new StructuredCvJob();
    if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
    job.Title = lines[0].NullIfWhitespace();

    // "Skills:" lines are handled separately below and must not be taken as company/location.
    var metadata = lines.Skip(1)
        .TakeWhile(line => !IsBullet(line))
        .Where(line => !line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
        .ToList();

    var dateValue = metadata
        .Select(line => Regex.Match(line, @"(?:(?:\w+\s+)?\d{4}|Present|Current)(?:\s*[-\u2013\u2014]\s*(?:(?:\w+\s+)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace())
        .FirstOrDefault(value => value is not null);

    if (!string.IsNullOrWhiteSpace(dateValue))
    {
        var parts = Regex.Split(dateValue, @"\s*[-\u2013\u2014]\s*");
        job.Start = parts.FirstOrDefault().NullIfWhitespace();
        job.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
        job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase) || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
    }

    // string.Replace throws ArgumentException for an empty search string, so the date is only
    // removed when one was actually found. The metadata line is split on '|' because that is
    // the separator BuildSections uses between company, location and dates.
    var metadataWithoutDates = metadata
        .Select(line => dateValue is null ? line : line.Replace(dateValue, string.Empty))
        .SelectMany(line => line.Split('|'))
        .Select(part => part.Trim(' ', '|', ',', '-', '\u2013', '\u2014'))
        .Where(part => !string.IsNullOrWhiteSpace(part))
        .ToList();

    if (metadataWithoutDates.Count > 0) job.Company = metadataWithoutDates[0].NullIfWhitespace();
    if (metadataWithoutDates.Count > 1) job.Location = metadataWithoutDates[1].NullIfWhitespace();

    job.Bullets = lines
        .Where(IsBullet)
        .Select(line => line.Trim().TrimStart('-', '•', '*', ' '))
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .ToList();

    job.Skills = lines
        .Where(line => line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
        .SelectMany(line => SplitList(line[(line.IndexOf(':') + 1)..]))
        .ToList();

    return string.IsNullOrWhiteSpace(job.Title) && string.IsNullOrWhiteSpace(job.Company) && job.Bullets.Count == 0 ? null : job;
}
/// <summary>
/// Splits an education section into blocks and parses each block into an education entry,
/// skipping blocks that do not produce one.
/// </summary>
/// <param name="content">The raw section text.</param>
/// <returns>The parsed education entries, in block order.</returns>
private static List<StructuredCvEducation> ParseEducation(string content)
{
    var entries = new List<StructuredCvEducation>();

    foreach (var block in SplitBlocks(content))
    {
        if (ParseEducationBlock(block) is { } entry)
        {
            entries.Add(entry);
        }
    }

    return entries;
}
/// <summary>
/// Parses one education block into an education entry.
/// </summary>
/// <remarks>
/// The first line (with any leading "###" removed) is the qualification. The lines between
/// it and the first bullet are metadata: the first date or date range found becomes
/// start/end, and the remaining metadata, split on "|", supplies institution then location.
/// Bullet lines after the first line become <c>Details</c>.
/// Date ranges may be separated by a hyphen, en dash or em dash.
/// </remarks>
/// <param name="block">The text of a single block.</param>
/// <returns>The parsed entry, or null when the block yields no qualification, institution or details.</returns>
private static StructuredCvEducation? ParseEducationBlock(string block)
{
    var lines = block.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
    if (lines.Count == 0) return null;

    var education = new StructuredCvEducation();
    if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
    education.Qualification = lines[0].NullIfWhitespace();

    var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();

    var dateValue = metadata
        .Select(line => Regex.Match(line, @"(?:(?:\w+\s+)?\d{4})(?:\s*[-\u2013\u2014]\s*(?:(?:\w+\s+)?\d{4}|Present|Current))?", RegexOptions.IgnoreCase).Value.NullIfWhitespace())
        .FirstOrDefault(value => value is not null);

    if (!string.IsNullOrWhiteSpace(dateValue))
    {
        var parts = Regex.Split(dateValue, @"\s*[-\u2013\u2014]\s*");
        education.Start = parts.FirstOrDefault().NullIfWhitespace();
        education.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
    }

    // string.Replace throws ArgumentException for an empty search string, so the date is only
    // removed when one was actually found. The metadata line is split on '|' because that is
    // the separator BuildSections uses between institution, location and dates.
    var metadataWithoutDates = metadata
        .Select(line => dateValue is null ? line : line.Replace(dateValue, string.Empty))
        .SelectMany(line => line.Split('|'))
        .Select(part => part.Trim(' ', '|', ',', '-', '\u2013', '\u2014'))
        .Where(part => !string.IsNullOrWhiteSpace(part))
        .ToList();

    if (metadataWithoutDates.Count > 0) education.Institution = metadataWithoutDates[0].NullIfWhitespace();
    if (metadataWithoutDates.Count > 1) education.Location = metadataWithoutDates[1].NullIfWhitespace();

    education.Details = lines
        .Skip(1)
        .Where(IsBullet)
        .Select(line => line.Trim().TrimStart('-', '•', '*', ' '))
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .ToList();

    return string.IsNullOrWhiteSpace(education.Qualification) && string.IsNullOrWhiteSpace(education.Institution) && education.Details.Count == 0 ? null : education;
}
/// <summary>
/// Splits section text into entry blocks.
/// </summary>
/// <remarks>
/// When the text contains a "### " heading marker, a new block starts at every line that
/// begins with "###" followed by whitespace. Otherwise blocks are separated by blank lines.
/// Each block is trimmed and empty blocks are discarded.
/// </remarks>
/// <param name="content">The raw section text.</param>
/// <returns>The non-empty blocks, in order.</returns>
private static List<string> SplitBlocks(string content)
{
    var text = content.Replace("\r\n", "\n").Trim();
    if (string.IsNullOrWhiteSpace(text))
    {
        return new List<string>();
    }

    var hasHeadings = text.Contains("### ", StringComparison.Ordinal);
    var pieces = hasHeadings
        ? Regex.Split(text, @"(?=^###\s+)", RegexOptions.Multiline)
        : Regex.Split(text, @"\n\s*\n");

    var blocks = new List<string>();
    foreach (var piece in pieces)
    {
        var trimmed = piece.Trim();
        if (!string.IsNullOrWhiteSpace(trimmed))
        {
            blocks.Add(trimmed);
        }
    }

    return blocks;
}
/// <summary>
/// Tells whether a line, ignoring leading whitespace, starts with a bullet marker
/// ('-', '•' or '*').
/// </summary>
/// <param name="value">The line to inspect.</param>
/// <returns>True when the first non-whitespace character is a bullet marker.</returns>
private static bool IsBullet(string value)
{
    var trimmed = value.TrimStart();
    if (trimmed.Length == 0)
    {
        return false;
    }

    return trimmed[0] is '-' or '•' or '*';
}
/// <summary>
/// Breaks free text into a list of distinct items.
/// </summary>
/// <remarks>
/// The text is split into lines. A line containing a comma is further split on commas,
/// unless it starts (after leading whitespace) with "-". Leading bullet markers and spaces
/// are stripped from every item, blank items are dropped, and duplicates are removed
/// case-insensitively while keeping the first occurrence.
/// </remarks>
/// <param name="content">The raw text; may be null or blank.</param>
/// <returns>The distinct items in first-seen order; empty when there is no content.</returns>
private static List<string> SplitList(string? content)
{
    var items = new List<string>();
    if (string.IsNullOrWhiteSpace(content))
    {
        return items;
    }

    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    foreach (var line in content.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries))
    {
        var splitOnCommas = line.Contains(',') && !line.TrimStart().StartsWith("-", StringComparison.Ordinal);
        var pieces = splitOnCommas
            ? line.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            : new[] { line };

        foreach (var piece in pieces)
        {
            var item = piece.Trim().TrimStart('-', '•', '*', ' ');
            if (string.IsNullOrWhiteSpace(item))
            {
                continue;
            }

            if (seen.Add(item))
            {
                items.Add(item);
            }
        }
    }

    return items;
}
/// <summary>
/// Returns a new list holding the trimmed, non-blank values of the input with
/// case-insensitive duplicates removed (the first occurrence is kept).
/// </summary>
/// <param name="values">The values to clean; may be null or contain null entries.</param>
/// <returns>The cleaned list; empty when there is nothing to keep.</returns>
private static List<string> CleanList(IEnumerable<string>? values)
{
    var cleaned = new List<string>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    foreach (var value in values ?? Array.Empty<string>())
    {
        var trimmed = value?.Trim() ?? string.Empty;
        if (string.IsNullOrWhiteSpace(trimmed))
        {
            continue;
        }

        if (seen.Add(trimmed))
        {
            cleaned.Add(trimmed);
        }
    }

    return cleaned;
}
/// <summary>
/// Counts the whitespace-separated words in the given text.
/// </summary>
/// <param name="content">The text to count; may be null or blank.</param>
/// <returns>The number of words, or 0 for null or blank text.</returns>
private static int CountWords(string? content)
{
    if (string.IsNullOrWhiteSpace(content))
    {
        return 0;
    }

    // A word starts at every transition from whitespace (or the start) to non-whitespace.
    var words = 0;
    var insideWord = false;
    foreach (var character in content)
    {
        if (char.IsWhiteSpace(character))
        {
            insideWord = false;
        }
        else if (!insideWord)
        {
            insideWord = true;
            words++;
        }
    }

    return words;
}
/// <summary>
/// Trims the value, returning null when it is null, empty or whitespace-only.
/// </summary>
/// <param name="value">The value to trim.</param>
/// <returns>The trimmed value, or null when there is nothing left.</returns>
private static string? TrimOrNull(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value.Trim();
}
/// <summary>
/// Formats a start/end pair as display text.
/// </summary>
/// <param name="start">The start value; may be null or blank.</param>
/// <param name="end">The end value; may be null or blank.</param>
/// <param name="isCurrent">When true, the end is rendered as "Present" regardless of <paramref name="end"/>.</param>
/// <returns>
/// Null when both values are blank; <paramref name="end"/> alone when only the start is
/// blank; otherwise "start - end", with "Present" used when the entry is current or
/// <paramref name="end"/> is null.
/// </returns>
private static string? FormatDateRange(string? start, string? end, bool isCurrent)
{
    var hasStart = !string.IsNullOrWhiteSpace(start);
    var hasEnd = !string.IsNullOrWhiteSpace(end);

    if (!hasStart)
    {
        return hasEnd ? end : null;
    }

    string closing;
    if (isCurrent)
    {
        closing = "Present";
    }
    else
    {
        closing = end ?? "Present";
    }

    return $"{start} - {closing}";
}
/// <summary>
/// Extension helper that trims the value, returning null when it is null, empty or
/// whitespace-only.
/// </summary>
/// <param name="value">The value to trim.</param>
/// <returns>The trimmed value, or null when there is nothing left.</returns>
private static string? NullIfWhitespace(this string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value.Trim();
}
}