Add typed structured CV extraction

This commit is contained in:
2026-03-28 15:01:32 +01:00
parent 19a4da9382
commit 8f8a34ad9c
5 changed files with 1029 additions and 77 deletions
+68
View File
@@ -0,0 +1,68 @@
namespace JobTrackerApi.Models;
/// <summary>
/// Root object of a CV that has been broken down into typed parts.
/// Every collection property starts out as an empty list rather than null.
/// </summary>
public sealed class StructuredCvProfile
{
    /// <summary>Schema version marker; defaults to "1".</summary>
    public string Version { get; set; } = "1";

    /// <summary>Contact details; defaults to an empty contact object.</summary>
    public StructuredCvContact Contact { get; set; } = new StructuredCvContact();

    /// <summary>Summary lines.</summary>
    public List<string> Summary { get; set; } = new List<string>();

    /// <summary>Work-experience entries.</summary>
    public List<StructuredCvJob> Jobs { get; set; } = new List<StructuredCvJob>();

    /// <summary>Education entries.</summary>
    public List<StructuredCvEducation> Education { get; set; } = new List<StructuredCvEducation>();

    /// <summary>Skill names.</summary>
    public List<string> Skills { get; set; } = new List<string>();

    /// <summary>Language entries.</summary>
    public List<StructuredCvLanguage> Languages { get; set; } = new List<StructuredCvLanguage>();

    /// <summary>Interest items.</summary>
    public List<string> Interests { get; set; } = new List<string>();

    /// <summary>Titled item lists that do not belong to any of the typed properties above.</summary>
    public List<StructuredCvOtherSection> OtherSections { get; set; } = new List<StructuredCvOtherSection>();

    /// <summary>Named text sections (name, content, word count).</summary>
    public List<StructuredCvSection> Sections { get; set; } = new List<StructuredCvSection>();
}
/// <summary>
/// Contact details of the CV owner. Every field is optional and null until set.
/// </summary>
public sealed class StructuredCvContact
{
    /// <summary>Full name.</summary>
    public string? FullName { get; set; }

    /// <summary>Headline text.</summary>
    public string? Headline { get; set; }

    /// <summary>Email address.</summary>
    public string? Email { get; set; }

    /// <summary>Phone number.</summary>
    public string? Phone { get; set; }

    /// <summary>Location text.</summary>
    public string? Location { get; set; }

    /// <summary>Website reference.</summary>
    public string? Website { get; set; }

    /// <summary>LinkedIn reference.</summary>
    public string? LinkedIn { get; set; }
}
/// <summary>
/// One work-experience entry. Text fields are optional; the lists start empty.
/// </summary>
public sealed class StructuredCvJob
{
    /// <summary>Job title.</summary>
    public string? Title { get; set; }

    /// <summary>Employer name.</summary>
    public string? Company { get; set; }

    /// <summary>Location text.</summary>
    public string? Location { get; set; }

    /// <summary>Start date, kept as free text.</summary>
    public string? Start { get; set; }

    /// <summary>End date, kept as free text.</summary>
    public string? End { get; set; }

    /// <summary>Whether the position is still held.</summary>
    public bool IsCurrent { get; set; }

    /// <summary>Bullet lines belonging to this entry.</summary>
    public List<string> Bullets { get; set; } = new List<string>();

    /// <summary>Skills listed for this entry.</summary>
    public List<string> Skills { get; set; } = new List<string>();
}
/// <summary>
/// One education entry. Text fields are optional; the detail list starts empty.
/// </summary>
public sealed class StructuredCvEducation
{
    /// <summary>Qualification name.</summary>
    public string? Qualification { get; set; }

    /// <summary>Institution name.</summary>
    public string? Institution { get; set; }

    /// <summary>Location text.</summary>
    public string? Location { get; set; }

    /// <summary>Start date, kept as free text.</summary>
    public string? Start { get; set; }

    /// <summary>End date, kept as free text.</summary>
    public string? End { get; set; }

    /// <summary>Detail lines belonging to this entry.</summary>
    public List<string> Details { get; set; } = new List<string>();
}
/// <summary>
/// One language entry. All fields are optional and null until set.
/// </summary>
public sealed class StructuredCvLanguage
{
    /// <summary>Language name.</summary>
    public string? Name { get; set; }

    /// <summary>Proficiency level, kept as free text.</summary>
    public string? Level { get; set; }

    /// <summary>Additional free-text notes.</summary>
    public string? Notes { get; set; }
}
/// <summary>
/// A titled list of items for content that has no dedicated typed property.
/// </summary>
public sealed class StructuredCvOtherSection
{
    /// <summary>Section title; optional.</summary>
    public string? Title { get; set; }

    /// <summary>Items of the section; starts empty.</summary>
    public List<string> Items { get; set; } = new List<string>();
}
/// <summary>
/// A named block of CV text together with its word count.
/// </summary>
public sealed class StructuredCvSection
{
    /// <summary>Section name; defaults to the empty string.</summary>
    public string Name { get; set; } = "";

    /// <summary>Section text; defaults to the empty string.</summary>
    public string Content { get; set; } = "";

    /// <summary>Number of words in the section; defaults to 0.</summary>
    public int WordCount { get; set; }
}
+491
View File
@@ -0,0 +1,491 @@
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Text.RegularExpressions;
namespace JobTrackerApi.Models;
public static class StructuredCvProfileJson
{
// Shared System.Text.Json settings used for both reading and writing profiles:
// web defaults, case-insensitive property matching, and null properties left out of the output.
private static readonly JsonSerializerOptions SerializerOptions = new JsonSerializerOptions(JsonSerializerDefaults.Web)
{
    DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    PropertyNameCaseInsensitive = true,
};
/// <summary>Creates a profile with no content and runs it through <c>Normalize</c>.</summary>
/// <returns>A new, normalized, empty profile.</returns>
public static StructuredCvProfile Empty()
{
    var blank = new StructuredCvProfile();
    return Normalize(blank);
}
/// <summary>
/// Reads a profile from JSON. A JSON array is treated as a list of sections and converted
/// with <c>FromSections</c>; a JSON object is read as a profile and normalized.
/// </summary>
/// <param name="json">JSON text; may be null or blank.</param>
/// <returns>
/// The parsed profile, or an empty profile when the input is blank, has any other root kind,
/// or any exception is raised while reading it.
/// </returns>
public static StructuredCvProfile Deserialize(string? json)
{
    if (string.IsNullOrWhiteSpace(json))
    {
        return Empty();
    }

    try
    {
        using var document = JsonDocument.Parse(json);
        switch (document.RootElement.ValueKind)
        {
            case JsonValueKind.Array:
                var sectionList = JsonSerializer.Deserialize<List<StructuredCvSection>>(json, SerializerOptions);
                return FromSections(sectionList ?? new List<StructuredCvSection>());

            case JsonValueKind.Object:
                var parsedProfile = JsonSerializer.Deserialize<StructuredCvProfile>(json, SerializerOptions);
                return Normalize(parsedProfile ?? new StructuredCvProfile());

            default:
                return Empty();
        }
    }
    catch
    {
        // Best effort: unreadable input yields an empty profile instead of an exception.
        return Empty();
    }
}
/// <summary>Normalizes the profile and writes it as JSON using the shared serializer options.</summary>
/// <param name="profile">Profile to write; null is handled by <c>Normalize</c>.</param>
/// <returns>The JSON text.</returns>
public static string Serialize(StructuredCvProfile? profile)
{
    var normalized = Normalize(profile);
    return JsonSerializer.Serialize(normalized, SerializerOptions);
}
/// <summary>
/// Combines two profiles, taking each value from <paramref name="preferred"/> when it has one
/// and from <paramref name="fallback"/> otherwise.
/// </summary>
/// <param name="preferred">Profile whose values win; may be null.</param>
/// <param name="fallback">Profile used to fill gaps; may be null.</param>
/// <returns>The normalized, merged profile (the normalized <paramref name="preferred"/> instance).</returns>
/// <remarks>
/// Both arguments are passed through <c>Normalize</c>, which updates them in place.
/// Contact fields are filled individually; each list is taken from the fallback only when the
/// preferred list is empty.
/// </remarks>
public static StructuredCvProfile Merge(StructuredCvProfile? preferred, StructuredCvProfile? fallback)
{
    var primary = Normalize(preferred);
    var secondary = Normalize(fallback);

    // A blank string counts as "missing" so that an empty value in the preferred profile
    // does not block the fallback value (a plain ??= only reacts to null).
    static string? Pick(string? first, string? second) => string.IsNullOrWhiteSpace(first) ? second : first;

    var target = primary.Contact;
    var source = secondary.Contact;
    target.FullName = Pick(target.FullName, source.FullName);
    target.Headline = Pick(target.Headline, source.Headline);
    target.Email = Pick(target.Email, source.Email);
    target.Phone = Pick(target.Phone, source.Phone);
    target.Location = Pick(target.Location, source.Location);
    target.Website = Pick(target.Website, source.Website);
    target.LinkedIn = Pick(target.LinkedIn, source.LinkedIn);

    if (primary.Summary.Count == 0) primary.Summary = secondary.Summary;
    if (primary.Jobs.Count == 0) primary.Jobs = secondary.Jobs;
    if (primary.Education.Count == 0) primary.Education = secondary.Education;
    if (primary.Skills.Count == 0) primary.Skills = secondary.Skills;
    if (primary.Languages.Count == 0) primary.Languages = secondary.Languages;
    if (primary.Interests.Count == 0) primary.Interests = secondary.Interests;
    if (primary.OtherSections.Count == 0) primary.OtherSections = secondary.OtherSections;

    // NOTE(review): Normalize already generated Sections for the preferred profile when it had
    // structured data but no sections, so values pulled from the fallback above are not
    // reflected in Sections. Confirm whether Sections should be rebuilt after merging.
    if (primary.Sections.Count == 0) primary.Sections = secondary.Sections;

    return Normalize(primary);
}
/// <summary>
/// Builds a typed profile from named text sections by dispatching on the section name
/// (compared case-insensitively after trimming).
/// </summary>
/// <param name="sections">Sections to interpret; may be null.</param>
/// <returns>A normalized profile whose <c>Sections</c> holds the normalized input sections.</returns>
/// <remarks>
/// When several sections map to the same category (for example "Core Skills" and
/// "Technical Skills") their content is accumulated instead of the later one replacing the
/// earlier one. Sections with an unrecognized name become entries in <c>OtherSections</c>.
/// </remarks>
public static StructuredCvProfile FromSections(IEnumerable<StructuredCvSection>? sections)
{
    var normalizedSections = NormalizeSections(sections);
    var profile = new StructuredCvProfile
    {
        Sections = normalizedSections,
    };

    foreach (var section in normalizedSections)
    {
        switch (section.Name.Trim().ToLowerInvariant())
        {
            case "contact":
                ApplyContact(profile.Contact, section.Content);
                break;
            case "professional summary":
            case "summary":
                // Summary text is prose: split it by line only, never at commas.
                profile.Summary.AddRange(SplitSummaryLines(section.Content));
                break;
            case "skills":
            case "core skills":
            case "technical skills":
                profile.Skills.AddRange(SplitList(section.Content));
                break;
            case "languages":
                profile.Languages.AddRange(ParseLanguages(section.Content));
                break;
            case "interests":
                profile.Interests.AddRange(SplitList(section.Content));
                break;
            case "work experience":
            case "experience":
            case "employment history":
                profile.Jobs.AddRange(ParseJobs(section.Content));
                break;
            case "education":
                profile.Education.AddRange(ParseEducation(section.Content));
                break;
            default:
                profile.OtherSections.Add(new StructuredCvOtherSection
                {
                    Title = section.Name,
                    Items = SplitList(section.Content),
                });
                break;
        }
    }

    // Normalize trims and de-duplicates the accumulated string lists.
    return Normalize(profile);
}

// Splits summary text into non-empty lines with leading bullet markers removed.
private static List<string> SplitSummaryLines(string? content)
{
    if (string.IsNullOrWhiteSpace(content)) return new List<string>();

    return content
        .Replace("\r\n", "\n")
        .Split('\n', StringSplitOptions.RemoveEmptyEntries)
        .Select(line => line.Trim().TrimStart('-', '•', '*', ' '))
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .ToList();
}
/// <summary>
/// Brings a profile into canonical form: replaces null members with empty ones, trims text,
/// turns blank text into null, removes empty entries and de-duplicates string lists.
/// </summary>
/// <param name="profile">Profile to normalize; null produces a new empty profile.</param>
/// <returns>
/// The same instance that was passed in (updated in place), or a new instance when
/// <paramref name="profile"/> was null.
/// </returns>
/// <remarks>
/// When the profile has no sections with content, <c>Sections</c> is generated from the
/// structured fields via <c>BuildSections</c>.
/// </remarks>
public static StructuredCvProfile Normalize(StructuredCvProfile? profile)
{
    profile ??= new StructuredCvProfile();
    profile.Version = string.IsNullOrWhiteSpace(profile.Version) ? "1" : profile.Version.Trim();

    // Contact strings get the same trim-or-null treatment as every other text field, so that
    // blank values are not serialized and do not look "present" to callers.
    profile.Contact = NormalizeContact(profile.Contact);

    profile.Summary = CleanList(profile.Summary);
    profile.Jobs = (profile.Jobs ?? new List<StructuredCvJob>())
        .Select(NormalizeJob)
        .Where(job => !string.IsNullOrWhiteSpace(job.Title)
            || !string.IsNullOrWhiteSpace(job.Company)
            || job.Bullets.Count > 0)
        .ToList();
    profile.Education = (profile.Education ?? new List<StructuredCvEducation>())
        .Select(NormalizeEducation)
        .Where(education => !string.IsNullOrWhiteSpace(education.Qualification)
            || !string.IsNullOrWhiteSpace(education.Institution)
            || education.Details.Count > 0)
        .ToList();
    profile.Skills = CleanList(profile.Skills);
    profile.Languages = (profile.Languages ?? new List<StructuredCvLanguage>())
        .Select(NormalizeLanguage)
        .Where(language => !string.IsNullOrWhiteSpace(language.Name))
        .ToList();
    profile.Interests = CleanList(profile.Interests);
    profile.OtherSections = (profile.OtherSections ?? new List<StructuredCvOtherSection>())
        .Select(section => new StructuredCvOtherSection
        {
            Title = TrimOrNull(section?.Title),
            Items = CleanList(section?.Items),
        })
        .Where(section => !string.IsNullOrWhiteSpace(section.Title) || section.Items.Count > 0)
        .ToList();

    // Keep supplied sections when any has content; otherwise derive them from the typed data.
    var normalizedSections = NormalizeSections(profile.Sections);
    profile.Sections = normalizedSections.Count > 0 ? normalizedSections : BuildSections(profile);
    return profile;
}

// Trims every contact field and replaces blank values with null; null input yields a new contact.
private static StructuredCvContact NormalizeContact(StructuredCvContact? contact)
{
    contact ??= new StructuredCvContact();
    contact.FullName = TrimOrNull(contact.FullName);
    contact.Headline = TrimOrNull(contact.Headline);
    contact.Email = TrimOrNull(contact.Email);
    contact.Phone = TrimOrNull(contact.Phone);
    contact.Location = TrimOrNull(contact.Location);
    contact.Website = TrimOrNull(contact.Website);
    contact.LinkedIn = TrimOrNull(contact.LinkedIn);
    return contact;
}
/// <summary>
/// Trims the text fields of a job (blank becomes null), cleans its lists and marks it as
/// current when its end value is "present" or "current" (case-insensitive).
/// </summary>
/// <param name="job">Job to normalize; null yields a new empty job.</param>
/// <returns>The same instance, updated in place, or a new instance for null input.</returns>
private static StructuredCvJob NormalizeJob(StructuredCvJob? job)
{
    var result = job ?? new StructuredCvJob();

    result.Title = TrimOrNull(result.Title);
    result.Company = TrimOrNull(result.Company);
    result.Location = TrimOrNull(result.Location);
    result.Start = TrimOrNull(result.Start);
    result.End = TrimOrNull(result.End);
    result.Bullets = CleanList(result.Bullets);
    result.Skills = CleanList(result.Skills);

    // An explicit flag is never cleared; it is only switched on by the end marker.
    if (!result.IsCurrent)
    {
        result.IsCurrent = "present".Equals(result.End, StringComparison.OrdinalIgnoreCase)
            || "current".Equals(result.End, StringComparison.OrdinalIgnoreCase);
    }

    return result;
}
/// <summary>Trims the text fields of an education entry (blank becomes null) and cleans its detail list.</summary>
/// <param name="education">Entry to normalize; null yields a new empty entry.</param>
/// <returns>The same instance, updated in place, or a new instance for null input.</returns>
private static StructuredCvEducation NormalizeEducation(StructuredCvEducation? education)
{
    var result = education ?? new StructuredCvEducation();

    result.Qualification = TrimOrNull(result.Qualification);
    result.Institution = TrimOrNull(result.Institution);
    result.Location = TrimOrNull(result.Location);
    result.Start = TrimOrNull(result.Start);
    result.End = TrimOrNull(result.End);
    result.Details = CleanList(result.Details);

    return result;
}
/// <summary>Trims the fields of a language entry; blank values become null.</summary>
/// <param name="language">Entry to normalize; null yields a new empty entry.</param>
/// <returns>The same instance, updated in place, or a new instance for null input.</returns>
private static StructuredCvLanguage NormalizeLanguage(StructuredCvLanguage? language)
{
    var result = language ?? new StructuredCvLanguage();

    result.Name = TrimOrNull(result.Name);
    result.Level = TrimOrNull(result.Level);
    result.Notes = TrimOrNull(result.Notes);

    return result;
}
/// <summary>
/// Copies the given sections into new objects with trimmed name and content, dropping
/// sections whose content is blank.
/// </summary>
/// <param name="sections">Sections to copy; null (and null elements) are tolerated.</param>
/// <returns>
/// A new list. A blank name becomes "General"; a word count that is not positive is
/// recomputed from the content.
/// </returns>
private static List<StructuredCvSection> NormalizeSections(IEnumerable<StructuredCvSection>? sections)
{
    var result = new List<StructuredCvSection>();

    foreach (var raw in sections ?? Array.Empty<StructuredCvSection>())
    {
        var body = raw?.Content?.Trim() ?? string.Empty;
        if (string.IsNullOrWhiteSpace(body))
        {
            continue;
        }

        // From here on raw cannot be null, because it supplied non-blank content.
        var title = string.IsNullOrWhiteSpace(raw!.Name) ? "General" : raw.Name.Trim();
        var words = raw.WordCount > 0 ? raw.WordCount : CountWords(raw.Content);

        result.Add(new StructuredCvSection
        {
            Name = title,
            Content = body,
            WordCount = words,
        });
    }

    return result;
}
/// <summary>
/// Renders the typed parts of a profile as text sections, in this order: Contact,
/// Professional Summary, Work Experience, Education, Skills, Languages, Interests, and then
/// one section per entry in <c>OtherSections</c>. Parts without content produce no section.
/// </summary>
/// <param name="profile">Profile to render; its lists and contact are expected to be non-null.</param>
/// <returns>The rendered sections after passing through <c>NormalizeSections</c>.</returns>
private static List<StructuredCvSection> BuildSections(StructuredCvProfile profile)
{
    // Writes the "### heading" line followed by the " | "-joined non-blank metadata parts.
    static void OpenEntry(List<string> target, string? heading, params string?[] metaParts)
    {
        AddIf(target, $"### {heading}".Trim());
        AddIf(target, string.Join(" | ", metaParts.Where(part => !string.IsNullOrWhiteSpace(part))));
    }

    // Appends an empty separator line unless the last line is already blank.
    static void CloseEntry(List<string> target)
    {
        if (target.Count > 0 && !string.IsNullOrWhiteSpace(target[^1]))
        {
            target.Add(string.Empty);
        }
    }

    var built = new List<StructuredCvSection>();

    var contact = profile.Contact;
    var contactLines = new List<string>();
    foreach (var field in new[] { contact.FullName, contact.Headline, contact.Email, contact.Phone, contact.Location, contact.Website, contact.LinkedIn })
    {
        AddIf(contactLines, field);
    }

    AddSectionIfAny(built, "Contact", contactLines);
    AddSectionIfAny(built, "Professional Summary", profile.Summary);

    // With no jobs the line list stays empty and AddSectionIfAny adds nothing.
    var jobLines = new List<string>();
    foreach (var job in profile.Jobs)
    {
        OpenEntry(jobLines, job.Title, job.Company, job.Location, FormatDateRange(job.Start, job.End, job.IsCurrent));
        foreach (var bullet in job.Bullets)
        {
            jobLines.Add("- " + bullet);
        }

        if (job.Skills.Count > 0)
        {
            jobLines.Add("Skills: " + string.Join(", ", job.Skills));
        }

        CloseEntry(jobLines);
    }

    AddSectionIfAny(built, "Work Experience", jobLines);

    var educationLines = new List<string>();
    foreach (var entry in profile.Education)
    {
        OpenEntry(educationLines, entry.Qualification, entry.Institution, entry.Location, FormatDateRange(entry.Start, entry.End, false));
        foreach (var detail in entry.Details)
        {
            educationLines.Add("- " + detail);
        }

        CloseEntry(educationLines);
    }

    AddSectionIfAny(built, "Education", educationLines);

    AddSectionIfAny(built, "Skills", profile.Skills);

    var languageLines = new List<string>();
    foreach (var language in profile.Languages)
    {
        var text = language.Name ?? string.Empty;
        if (!string.IsNullOrWhiteSpace(language.Level))
        {
            text = $"{text}: {language.Level}";
        }

        if (!string.IsNullOrWhiteSpace(language.Notes))
        {
            text = $"{text} ({language.Notes})";
        }

        languageLines.Add(text);
    }

    AddSectionIfAny(built, "Languages", languageLines);

    AddSectionIfAny(built, "Interests", profile.Interests);

    foreach (var extra in profile.OtherSections)
    {
        AddSectionIfAny(built, extra.Title ?? "Other", extra.Items);
    }

    return NormalizeSections(built);
}
/// <summary>
/// Appends a section built from the non-blank, trimmed lines joined with "\n".
/// Nothing is added when no line has content.
/// </summary>
/// <param name="sections">List that receives the new section.</param>
/// <param name="name">Name of the section.</param>
/// <param name="lines">Candidate lines; null is treated as empty.</param>
private static void AddSectionIfAny(List<StructuredCvSection> sections, string name, IEnumerable<string>? lines)
{
    var kept = new List<string>();
    foreach (var line in lines ?? Array.Empty<string>())
    {
        if (!string.IsNullOrWhiteSpace(line))
        {
            kept.Add(line.Trim());
        }
    }

    var content = string.Join("\n", kept).Trim();
    if (content.Length == 0)
    {
        return;
    }

    var section = new StructuredCvSection
    {
        Name = name,
        Content = content,
        WordCount = CountWords(content),
    };
    sections.Add(section);
}
/// <summary>Appends the trimmed value to the list unless it is null or blank.</summary>
/// <param name="lines">List that receives the value.</param>
/// <param name="value">Candidate text.</param>
private static void AddIf(List<string> lines, string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return;
    }

    lines.Add(value.Trim());
}
/// <summary>
/// Fills the empty fields of <paramref name="contact"/> from free-text contact lines.
/// Fields that already have a value are left untouched.
/// </summary>
/// <param name="contact">Contact object to fill in place.</param>
/// <param name="content">Raw text of the contact section.</param>
/// <remarks>
/// Email and phone are found by pattern anywhere in the text. The first line mentioning
/// "linkedin" becomes the LinkedIn value and the first line that looks like a web address
/// becomes the Website. The remaining lines are taken, in order, as full name, headline and
/// location.
/// </remarks>
private static void ApplyContact(StructuredCvContact contact, string content)
{
    var lines = content.Replace("\r\n", "\n").Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    contact.Email ??= Regex.Match(content, @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", RegexOptions.IgnoreCase).Value.NullIfWhitespace();
    contact.Phone ??= Regex.Match(content, @"(?<!\w)(?:\+?\d[\d\s().-]{6,}\d)", RegexOptions.IgnoreCase).Value.NullIfWhitespace();

    foreach (var line in lines)
    {
        // LinkedIn lines are never website candidates, even when LinkedIn is already set.
        if (line.Contains("linkedin", StringComparison.OrdinalIgnoreCase))
        {
            contact.LinkedIn ??= line;
            continue;
        }

        if (contact.Website is null && LooksLikeWebsite(line))
        {
            contact.Website = line;
        }
    }

    var leftovers = lines
        .Where(line => !line.Contains('@')
            && !line.Contains("linkedin", StringComparison.OrdinalIgnoreCase)
            && !line.Equals(contact.Website, StringComparison.OrdinalIgnoreCase)
            && !IsPhoneLine(line, contact.Phone))
        .ToList();

    if (leftovers.Count > 0) contact.FullName ??= leftovers[0];
    if (leftovers.Count > 1) contact.Headline ??= leftovers[1];
    if (leftovers.Count > 2) contact.Location ??= leftovers[2];
}

// True when the line carries a URL scheme, a "www." prefix, or a bare host name such as
// "example.dev/path". A dot alone is not enough, so "Dr. Jane Smith", "St. Louis" or
// "555.123.4567" are not treated as websites.
private static bool LooksLikeWebsite(string line)
{
    if (line.Contains("http", StringComparison.OrdinalIgnoreCase) || line.Contains("www.", StringComparison.OrdinalIgnoreCase))
    {
        return true;
    }

    if (line.Contains('@'))
    {
        return false;
    }

    return Regex.IsMatch(line, @"(?<![\w.])[a-z0-9][a-z0-9-]*(?:\.[a-z0-9-]+)*\.[a-z]{2,}(?![\w.])", RegexOptions.IgnoreCase);
}

// True when the line consists of the detected phone number, optionally with a short label
// such as "Tel:" or "Mobile", so that it is not mistaken for the name or headline.
private static bool IsPhoneLine(string line, string? phone)
{
    if (string.IsNullOrWhiteSpace(phone) || !line.Contains(phone, StringComparison.Ordinal))
    {
        return false;
    }

    var remainder = line.Replace(phone, string.Empty).Trim(' ', ':', '|', ',', '-', '.');
    return remainder.Length == 0
        || Regex.IsMatch(remainder, @"^(?:phone|telephone|tel|mobile|mob|cell|t|m|p)$", RegexOptions.IgnoreCase);
}
/// <summary>
/// Parses language entries of the form "Name", "Name: Level" or "Name: Level (Notes)".
/// The text is first split into items with <c>SplitList</c>.
/// </summary>
/// <param name="content">Raw text of the languages section.</param>
/// <returns>One entry per item that has a non-blank name.</returns>
private static List<StructuredCvLanguage> ParseLanguages(string content)
{
    var parsed = new List<StructuredCvLanguage>();

    foreach (var entry in SplitList(content))
    {
        var label = entry;
        string? proficiency = null;
        string? remark = null;

        // A colon at position 0 is ignored: there would be no name in front of it.
        var separator = entry.IndexOf(':');
        if (separator > 0)
        {
            label = entry.Substring(0, separator).Trim();
            var tail = entry.Substring(separator + 1).Trim();

            var withNote = Regex.Match(tail, @"^(.*?)\s*\((.*?)\)$");
            if (withNote.Success)
            {
                proficiency = withNote.Groups[1].Value.NullIfWhitespace();
                remark = withNote.Groups[2].Value.NullIfWhitespace();
            }
            else
            {
                proficiency = tail.NullIfWhitespace();
            }
        }

        var language = new StructuredCvLanguage
        {
            Name = label.NullIfWhitespace(),
            Level = proficiency,
            Notes = remark,
        };

        if (!string.IsNullOrWhiteSpace(language.Name))
        {
            parsed.Add(language);
        }
    }

    return parsed;
}
/// <summary>Splits the text into entry blocks and parses each one as a job.</summary>
/// <param name="content">Raw text of the work-experience section.</param>
/// <returns>The jobs that could be parsed; blocks yielding null are skipped.</returns>
private static List<StructuredCvJob> ParseJobs(string content)
{
    var jobs = new List<StructuredCvJob>();

    foreach (var block in SplitBlocks(content))
    {
        if (ParseJobBlock(block) is { } job)
        {
            jobs.Add(job);
        }
    }

    return jobs;
}
/// <summary>
/// Parses one job entry: a heading line (optionally prefixed with "###"), metadata lines up
/// to the first bullet, bullet lines, and optional "Skills:" lines.
/// </summary>
/// <param name="block">Text of a single entry.</param>
/// <returns>The parsed job, or null when it has no title, no company and no bullets.</returns>
/// <remarks>
/// Metadata may be spread over several lines or joined with "|" on one line (the layout
/// written by <c>BuildSections</c>). After the date range is removed, the first remaining
/// part is the company and the second the location.
/// </remarks>
private static StructuredCvJob? ParseJobBlock(string block)
{
    var lines = block.Replace("\r\n", "\n")
        .Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
        .ToList();
    if (lines.Count == 0) return null;

    var job = new StructuredCvJob();
    if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
    job.Title = lines[0].NullIfWhitespace();

    // "Skills:" lines are read separately below; without this filter an entry that has no
    // bullets would report its skills line as company or location.
    var metadata = lines.Skip(1)
        .TakeWhile(line => !IsBullet(line))
        .Where(line => !line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
        .ToList();

    // Word boundaries keep "Present"/"Current" from matching inside other words (for example a
    // company called "Current Health"); hyphen, en dash and em dash are accepted as range separators.
    const string DatePattern = @"\b(?:(?:\w+\s+)?\d{4}|Present|Current)\b(?:\s*[-\u2013\u2014]\s*(?:(?:\w+\s+)?\d{4}|Present|Current)\b)?";
    var dateValue = metadata
        .Select(line => Regex.Match(line, DatePattern, RegexOptions.IgnoreCase).Value.NullIfWhitespace())
        .FirstOrDefault(value => value is not null);

    if (dateValue is not null)
    {
        var parts = Regex.Split(dateValue, @"\s*[-\u2013\u2014]\s*");
        job.Start = parts.FirstOrDefault().NullIfWhitespace();
        job.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
        job.IsCurrent = string.Equals(job.End, "present", StringComparison.OrdinalIgnoreCase)
            || string.Equals(job.End, "current", StringComparison.OrdinalIgnoreCase);
    }

    // string.Replace throws ArgumentException for an empty search string, so the date is only
    // removed when one was actually found.
    var metadataParts = metadata
        .Select(line => dateValue is null ? line : line.Replace(dateValue, string.Empty))
        .SelectMany(line => line.Split('|', StringSplitOptions.RemoveEmptyEntries))
        .Select(part => part.Trim(' ', ',', '-', '\u2013', '\u2014'))
        .Where(part => !string.IsNullOrWhiteSpace(part))
        .ToList();
    if (metadataParts.Count > 0) job.Company = metadataParts[0].NullIfWhitespace();
    if (metadataParts.Count > 1) job.Location = metadataParts[1].NullIfWhitespace();

    job.Bullets = lines
        .Where(IsBullet)
        .Select(line => line.Trim().TrimStart('-', '•', '*', ' '))
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .ToList();
    job.Skills = lines
        .Where(line => line.StartsWith("Skills:", StringComparison.OrdinalIgnoreCase))
        .SelectMany(line => SplitList(line[(line.IndexOf(':') + 1)..]))
        .ToList();

    var isEmpty = string.IsNullOrWhiteSpace(job.Title) && string.IsNullOrWhiteSpace(job.Company) && job.Bullets.Count == 0;
    return isEmpty ? null : job;
}
/// <summary>Splits the text into entry blocks and parses each one as an education entry.</summary>
/// <param name="content">Raw text of the education section.</param>
/// <returns>The entries that could be parsed; blocks yielding null are skipped.</returns>
private static List<StructuredCvEducation> ParseEducation(string content)
{
    var entries = new List<StructuredCvEducation>();

    foreach (var block in SplitBlocks(content))
    {
        if (ParseEducationBlock(block) is { } entry)
        {
            entries.Add(entry);
        }
    }

    return entries;
}
/// <summary>
/// Parses one education entry: a heading line (optionally prefixed with "###"), metadata
/// lines up to the first bullet, and bullet lines as details.
/// </summary>
/// <param name="block">Text of a single entry.</param>
/// <returns>The parsed entry, or null when it has no qualification, no institution and no details.</returns>
/// <remarks>
/// Metadata may be spread over several lines or joined with "|" on one line (the layout
/// written by <c>BuildSections</c>). After the date range is removed, the first remaining
/// part is the institution and the second the location.
/// </remarks>
private static StructuredCvEducation? ParseEducationBlock(string block)
{
    var lines = block.Replace("\r\n", "\n")
        .Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
        .ToList();
    if (lines.Count == 0) return null;

    var education = new StructuredCvEducation();
    if (lines[0].StartsWith("###", StringComparison.Ordinal)) lines[0] = lines[0].TrimStart('#', ' ');
    education.Qualification = lines[0].NullIfWhitespace();

    var metadata = lines.Skip(1).TakeWhile(line => !IsBullet(line)).ToList();

    // Hyphen, en dash and em dash are accepted as range separators; word boundaries keep the
    // year from matching inside longer digit runs.
    const string DatePattern = @"\b(?:\w+\s+)?\d{4}\b(?:\s*[-\u2013\u2014]\s*(?:(?:\w+\s+)?\d{4}|Present|Current)\b)?";
    var dateValue = metadata
        .Select(line => Regex.Match(line, DatePattern, RegexOptions.IgnoreCase).Value.NullIfWhitespace())
        .FirstOrDefault(value => value is not null);

    if (dateValue is not null)
    {
        var parts = Regex.Split(dateValue, @"\s*[-\u2013\u2014]\s*");
        education.Start = parts.FirstOrDefault().NullIfWhitespace();
        education.End = parts.Skip(1).FirstOrDefault().NullIfWhitespace();
    }

    // string.Replace throws ArgumentException for an empty search string, so the date is only
    // removed when one was actually found.
    var metadataParts = metadata
        .Select(line => dateValue is null ? line : line.Replace(dateValue, string.Empty))
        .SelectMany(line => line.Split('|', StringSplitOptions.RemoveEmptyEntries))
        .Select(part => part.Trim(' ', ',', '-', '\u2013', '\u2014'))
        .Where(part => !string.IsNullOrWhiteSpace(part))
        .ToList();
    if (metadataParts.Count > 0) education.Institution = metadataParts[0].NullIfWhitespace();
    if (metadataParts.Count > 1) education.Location = metadataParts[1].NullIfWhitespace();

    education.Details = lines.Skip(1)
        .Where(IsBullet)
        .Select(line => line.Trim().TrimStart('-', '•', '*', ' '))
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .ToList();

    var isEmpty = string.IsNullOrWhiteSpace(education.Qualification)
        && string.IsNullOrWhiteSpace(education.Institution)
        && education.Details.Count == 0;
    return isEmpty ? null : education;
}
/// <summary>
/// Splits section text into entry blocks. When the text contains "### " it is split in front
/// of every line that starts with "###"; otherwise it is split at blank lines.
/// </summary>
/// <param name="content">Raw section text.</param>
/// <returns>The trimmed, non-empty blocks in their original order.</returns>
private static List<string> SplitBlocks(string content)
{
    var text = content.Replace("\r\n", "\n").Trim();
    if (text.Length == 0)
    {
        return new List<string>();
    }

    var usesHeadings = text.Contains("### ", StringComparison.Ordinal);
    string[] pieces = usesHeadings
        ? Regex.Split(text, @"(?=^###\s+)", RegexOptions.Multiline)
        : Regex.Split(text, @"\n\s*\n");

    var blocks = new List<string>(pieces.Length);
    foreach (var piece in pieces)
    {
        var trimmed = piece.Trim();
        if (trimmed.Length > 0)
        {
            blocks.Add(trimmed);
        }
    }

    return blocks;
}
/// <summary>Tells whether the text, ignoring leading whitespace, starts with "-", "•" or "*".</summary>
/// <param name="value">Line to inspect.</param>
/// <returns>True when the first non-whitespace character is one of the bullet markers.</returns>
private static bool IsBullet(string value)
{
    var trimmed = value.TrimStart();
    if (trimmed.Length == 0)
    {
        return false;
    }

    return trimmed[0] is ('-' or '•' or '*');
}
/// <summary>
/// Splits text into list items: one item per line, and additionally one item per
/// comma-separated part for lines that are not bullets.
/// </summary>
/// <param name="content">Raw text; null or blank yields an empty list.</param>
/// <returns>
/// Trimmed items with leading bullet markers removed, without blanks, de-duplicated
/// case-insensitively in order of first appearance.
/// </returns>
/// <remarks>
/// Bullet lines (starting with "-", "•" or "*") are kept whole even when they contain commas.
/// </remarks>
private static List<string> SplitList(string? content)
{
    if (string.IsNullOrWhiteSpace(content)) return new List<string>();

    // All three markers that are stripped further down count as bullets here, so "•" and "*"
    // lines get the same protection from comma splitting as "-" lines.
    static bool StartsWithBulletMarker(string line)
    {
        var trimmed = line.TrimStart();
        return trimmed.Length > 0 && trimmed[0] is ('-' or '•' or '*');
    }

    return content
        .Replace("\r\n", "\n")
        .Split('\n', StringSplitOptions.RemoveEmptyEntries)
        .SelectMany(line => line.Contains(',') && !StartsWithBulletMarker(line)
            ? line.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            : new[] { line })
        .Select(item => item.Trim().TrimStart('-', '•', '*', ' '))
        .Where(item => !string.IsNullOrWhiteSpace(item))
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToList();
}
/// <summary>
/// Returns a new list with every value trimmed, blanks and nulls removed, and duplicates
/// (compared case-insensitively) dropped while keeping the first occurrence.
/// </summary>
/// <param name="values">Values to clean; null is treated as empty.</param>
/// <returns>The cleaned list, never null.</returns>
private static List<string> CleanList(IEnumerable<string>? values)
{
    var cleaned = new List<string>();
    if (values is null)
    {
        return cleaned;
    }

    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (var raw in values)
    {
        var text = raw?.Trim() ?? string.Empty;
        if (text.Length == 0)
        {
            continue;
        }

        // Add returns false for a value that is already present.
        if (seen.Add(text))
        {
            cleaned.Add(text);
        }
    }

    return cleaned;
}
/// <summary>Counts the whitespace-separated words in the text.</summary>
/// <param name="content">Text to count; null or blank counts as zero.</param>
/// <returns>The number of maximal runs of non-whitespace characters.</returns>
private static int CountWords(string? content)
{
    if (string.IsNullOrWhiteSpace(content))
    {
        return 0;
    }

    var total = 0;
    var insideWord = false;
    foreach (var ch in content)
    {
        if (char.IsWhiteSpace(ch))
        {
            insideWord = false;
        }
        else if (!insideWord)
        {
            // First character of a new word.
            insideWord = true;
            total++;
        }
    }

    return total;
}
/// <summary>Trims the text; null or blank input yields null.</summary>
/// <param name="value">Text to trim.</param>
/// <returns>The trimmed text, or null when nothing is left.</returns>
private static string? TrimOrNull(string? value)
{
    if (value is null)
    {
        return null;
    }

    var trimmed = value.Trim();
    return trimmed.Length == 0 ? null : trimmed;
}
/// <summary>Formats a start/end pair as "start - end" for display.</summary>
/// <param name="start">Start value; may be null or blank.</param>
/// <param name="end">End value; may be null or blank.</param>
/// <param name="isCurrent">When true, the end is always rendered as "Present".</param>
/// <returns>
/// Null when both values are blank; the end value alone when only the start is blank;
/// otherwise "start - end", where a blank end (or <paramref name="isCurrent"/>) renders as "Present".
/// </returns>
private static string? FormatDateRange(string? start, string? end, bool isCurrent)
{
    var hasStart = !string.IsNullOrWhiteSpace(start);
    var hasEnd = !string.IsNullOrWhiteSpace(end);

    if (!hasStart && !hasEnd) return null;
    if (!hasStart) return end;

    // A blank end is treated like a missing one, so the result never ends in a dangling "- ".
    var closing = isCurrent || !hasEnd ? "Present" : end;
    return $"{start} - {closing}";
}
/// <summary>Extension form of trim-or-null: returns the trimmed text, or null for null or blank input.</summary>
/// <param name="value">Text to trim.</param>
/// <returns>The trimmed text, or null when the input is null or only whitespace.</returns>
private static string? NullIfWhitespace(this string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value.Trim();
}
}