Add typed structured CV extraction
This commit is contained in:
@@ -26,6 +26,31 @@ public sealed class ProfileCvController : ControllerBase
|
||||
".webp",
|
||||
};
|
||||
|
||||
// Maps a heading as it may appear in a CV to the canonical section name used for it.
// Keys are compared with StringComparer.OrdinalIgnoreCase, so "SKILLS" and "skills" are the same key.
private static readonly Dictionary<string, string> SectionAliases =
    new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
    {
        // Professional Summary
        { "professional summary", "Professional Summary" },
        { "summary", "Professional Summary" },
        { "profile", "Professional Summary" },
        { "about me", "Professional Summary" },

        // Contact
        { "contact", "Contact" },
        { "contact details", "Contact" },

        // Skills
        { "core skills", "Skills" },
        { "skills", "Skills" },
        { "technical skills", "Skills" },

        // Work Experience
        { "experience", "Work Experience" },
        { "experience highlights", "Work Experience" },
        { "work experience", "Work Experience" },
        { "employment history", "Work Experience" },

        // Selected Achievements
        { "selected achievements", "Selected Achievements" },
        { "achievements", "Selected Achievements" },

        // Sections with a single spelling or a close variant
        { "projects", "Projects" },
        { "education", "Education" },
        { "certifications", "Certifications" },
        { "certificates", "Certifications" },
        { "languages", "Languages" },
        { "interests", "Interests" },
    };
|
||||
|
||||
private const long MaxFileSizeBytes = 5 * 1024 * 1024;
|
||||
|
||||
private readonly UserManager<ApplicationUser> _users;
|
||||
@@ -39,7 +64,6 @@ public sealed class ProfileCvController : ControllerBase
|
||||
|
||||
/// <summary>
/// Request payload carrying a required section name plus optional style and target-role values.
/// NOTE(review): presumably the body of the "rewrite-section" action — confirm against the action signature.
/// </summary>
public sealed record RewriteSectionRequest(string SectionName, string? Style, string? TargetRole);
|
||||
/// <summary>
/// Request payload with optional CV text.
/// NOTE(review): appears to be the request whose blank <c>Text</c> makes the parse action fall back to the
/// user's stored CV text — confirm against the action signature.
/// </summary>
public sealed record ParseCvRequest(string? Text);
|
||||
public sealed record ParsedCvSectionDto(string Name, string Content, int WordCount);
|
||||
|
||||
[HttpPost("upload")]
|
||||
[RequestSizeLimit(MaxFileSizeBytes)]
|
||||
@@ -86,16 +110,18 @@ public sealed class ProfileCvController : ControllerBase
|
||||
return BadRequest("The uploaded CV file could not be read or was empty.");
|
||||
}
|
||||
|
||||
text = (await MaybeReconstructStructuredCvAsync(text, HttpContext.RequestAborted)).Trim();
|
||||
var structuredCv = await BuildStructuredCvAsync(text, HttpContext.RequestAborted);
|
||||
|
||||
user.ProfileCvText = text;
|
||||
user.ProfileCvStructureJson = JsonSerializer.Serialize(
|
||||
ParseSections(text).Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content))).ToList());
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var result = await _users.UpdateAsync(user);
|
||||
if (!result.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
|
||||
}
|
||||
|
||||
return Ok(new { imported = true, characters = text.Length });
|
||||
return Ok(new { imported = true, characters = text.Length, structuredCv, sections = structuredCv.Sections });
|
||||
}
|
||||
|
||||
[HttpPost("rebuild")]
|
||||
@@ -117,15 +143,15 @@ public sealed class ProfileCvController : ControllerBase
|
||||
}
|
||||
|
||||
user.ProfileCvText = rebuilt.Trim();
|
||||
user.ProfileCvStructureJson = JsonSerializer.Serialize(
|
||||
ParseSections(user.ProfileCvText).Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content))).ToList());
|
||||
var structuredCv = await BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted);
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var result = await _users.UpdateAsync(user);
|
||||
if (!result.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
|
||||
}
|
||||
|
||||
return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText });
|
||||
return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections });
|
||||
}
|
||||
|
||||
[HttpPost("rewrite-section")]
|
||||
@@ -162,18 +188,15 @@ public sealed class ProfileCvController : ControllerBase
|
||||
var source = string.IsNullOrWhiteSpace(request?.Text) ? user.ProfileCvText : request!.Text;
|
||||
if (string.IsNullOrWhiteSpace(source)) return BadRequest("Add or import CV text before parsing sections.");
|
||||
|
||||
var sections = ParseSections(source)
|
||||
.Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content)))
|
||||
.ToList();
|
||||
|
||||
user.ProfileCvStructureJson = JsonSerializer.Serialize(sections);
|
||||
var structuredCv = await BuildStructuredCvAsync(source, HttpContext.RequestAborted);
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var update = await _users.UpdateAsync(user);
|
||||
if (!update.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", update.Errors.Select(e => e.Description)));
|
||||
}
|
||||
|
||||
return Ok(new { sections, totalWords = CountWords(source) });
|
||||
return Ok(new { structuredCv, sections = structuredCv.Sections, totalWords = CountWords(source) });
|
||||
}
|
||||
|
||||
[HttpPost("improve")]
|
||||
@@ -195,15 +218,91 @@ public sealed class ProfileCvController : ControllerBase
|
||||
}
|
||||
|
||||
user.ProfileCvText = improved.Trim();
|
||||
user.ProfileCvStructureJson = JsonSerializer.Serialize(
|
||||
ParseSections(user.ProfileCvText).Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content))).ToList());
|
||||
var structuredCv = await BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted);
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var result = await _users.UpdateAsync(user);
|
||||
if (!result.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
|
||||
}
|
||||
|
||||
return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText });
|
||||
return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections });
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the structured profile for a CV text. Sections found by <c>ParseSections</c> become a
/// fallback profile, an AI extraction is attempted, and both are combined through
/// <c>StructuredCvProfileJson.Merge</c> before the result is normalized.
/// </summary>
/// <param name="text">The CV text to structure.</param>
/// <param name="cancellationToken">Token forwarded to the AI extraction step.</param>
/// <returns>The normalized structured profile.</returns>
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    // Heuristic pass: one structured section per parsed section, with its word count.
    var heuristicSections = new List<StructuredCvSection>();
    foreach (var (name, content) in ParseSections(text))
    {
        heuristicSections.Add(new StructuredCvSection
        {
            Name = name,
            Content = content,
            WordCount = CountWords(content),
        });
    }

    var heuristicProfile = StructuredCvProfileJson.FromSections(heuristicSections);
    var heuristicContact = heuristicProfile.Contact;
    if (heuristicContact.FullName is null)
    {
        heuristicContact.FullName = GuessFullName(text);
    }

    // AI pass: may yield null, in which case Merge receives null as its first argument.
    var aiProfile = await TryExtractStructuredCvAsync(text, cancellationToken);
    var combined = StructuredCvProfileJson.Merge(aiProfile, heuristicProfile);

    // The merged profile may still lack a name, so the guess is applied once more.
    var combinedContact = combined.Contact;
    if (combinedContact.FullName is null)
    {
        combinedContact.FullName = GuessFullName(text);
    }

    return StructuredCvProfileJson.Normalize(combined);
}
|
||||
|
||||
/// <summary>
/// Asks the AI service to extract the CV into structured JSON and deserializes the reply.
/// </summary>
/// <param name="text">The CV text sent to the AI service.</param>
/// <param name="cancellationToken">Checked before the AI call is started.</param>
/// <returns>
/// The extracted profile, or <c>null</c> when the reply is blank, contains no JSON object,
/// cannot be deserialized, or carries no meaningful content.
/// </returns>
/// <exception cref="OperationCanceledException">The token was already cancelled before the AI call.</exception>
private async Task<StructuredCvProfile?> TryExtractStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    // Do not start an AI round-trip for a request that has already been aborted.
    cancellationToken.ThrowIfCancellationRequested();

    // NOTE(review): the meaning of the two numeric arguments is not visible here — confirm against the AI service.
    var structuredJson = await _aiService.SummarizeSectionAsync(
        "Extract this CV into structured JSON. Return only valid JSON with this exact top-level shape: { \"version\": \"1\", \"contact\": { \"fullName\": string|null, \"headline\": string|null, \"email\": string|null, \"phone\": string|null, \"location\": string|null, \"website\": string|null, \"linkedin\": string|null }, \"summary\": string[], \"jobs\": [{ \"title\": string|null, \"company\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"isCurrent\": boolean, \"bullets\": string[], \"skills\": string[] }], \"education\": [{ \"qualification\": string|null, \"institution\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"details\": string[] }], \"skills\": string[], \"languages\": [{ \"name\": string|null, \"level\": string|null, \"notes\": string|null }], \"interests\": string[], \"otherSections\": [{ \"title\": string|null, \"items\": string[] }] }. Preserve facts only. Do not invent anything. If a field is unknown, use null or an empty array. Keep wording close to the source. Put unmatched content in otherSections.",
        text,
        3200,
        900);

    if (string.IsNullOrWhiteSpace(structuredJson)) return null;

    var extracted = ExtractJsonObject(structuredJson);
    if (string.IsNullOrWhiteSpace(extracted)) return null;

    StructuredCvProfile? parsed;
    try
    {
        parsed = StructuredCvProfileJson.Deserialize(extracted);
    }
    catch (System.Text.Json.JsonException)
    {
        // Model output is untrusted text: malformed JSON is an expected failure for a Try-method,
        // so report "nothing extracted" instead of failing the whole request.
        // NOTE(review): harmless if Deserialize already swallows JSON errors itself.
        return null;
    }

    return parsed is not null && IsMeaningfullyStructured(parsed) ? parsed : null;
}
|
||||
|
||||
/// <summary>
/// Tells whether a profile carries any usable content: a non-blank full name or at least one
/// entry in any of its lists.
/// </summary>
/// <param name="profile">The profile to inspect.</param>
/// <returns><c>true</c> when at least one field holds content; otherwise <c>false</c>.</returns>
private static bool IsMeaningfullyStructured(StructuredCvProfile profile)
{
    if (profile is null) return false;

    // Null-tolerant on purpose: the extraction prompt allows the model to answer "null" for unknown
    // fields, and an explicit JSON null can leave a member unset after deserialization.
    return !string.IsNullOrWhiteSpace(profile.Contact?.FullName)
        || profile.Summary is { Count: > 0 }
        || profile.Jobs is { Count: > 0 }
        || profile.Education is { Count: > 0 }
        || profile.Skills is { Count: > 0 }
        || profile.Languages is { Count: > 0 }
        || profile.Interests is { Count: > 0 }
        || profile.OtherSections is { Count: > 0 };
}
|
||||
|
||||
/// <summary>
/// Pulls the outermost <c>{ ... }</c> span out of a raw reply, first removing a surrounding
/// Markdown code fence when the trimmed reply starts with one.
/// </summary>
/// <param name="raw">The raw reply text.</param>
/// <returns>
/// The text from the first <c>{</c> through the last <c>}</c>, or <c>null</c> when there is no
/// opening brace or no closing brace after it.
/// </returns>
private static string? ExtractJsonObject(string raw)
{
    var candidate = raw.Trim();

    var isFenced = candidate.StartsWith("```", StringComparison.Ordinal);
    if (isFenced)
    {
        // Strips the opening fence (with optional "json" tag) and the closing fence.
        candidate = Regex.Replace(candidate, "^```(?:json)?\\s*|\\s*```$", string.Empty, RegexOptions.IgnoreCase);
    }

    var openIndex = candidate.IndexOf('{');
    if (openIndex < 0)
    {
        return null;
    }

    var closeIndex = candidate.LastIndexOf('}');
    if (closeIndex <= openIndex)
    {
        return null;
    }

    return candidate.Substring(openIndex, closeIndex - openIndex + 1);
}
|
||||
|
||||
/// <summary>
/// Guesses the candidate's full name from the first six non-empty lines of the CV text.
/// </summary>
/// <remarks>
/// A line qualifies when, after removing leading <c>#</c> markers, it is 4–80 characters long,
/// contains no <c>@</c> and no digit, and consists of two to five words that each start with an
/// uppercase letter. Letters are matched by Unicode category so accented names
/// (e.g. "José García", "Zoë O’Brien") are recognised, not only ASCII ones.
/// NOTE(review): capitalised multi-word headings such as "Professional Summary" also satisfy this
/// pattern; the function relies on the name appearing first.
/// </remarks>
/// <param name="source">The CV text.</param>
/// <returns>The first qualifying line, or <c>null</c> when none of the inspected lines qualifies.</returns>
private static string? GuessFullName(string source)
{
    if (string.IsNullOrWhiteSpace(source)) return null;

    var normalized = source.Replace("\r\n", "\n");
    var candidateLines = normalized
        .Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
        .Take(6);

    foreach (var line in candidateLines)
    {
        // Lines are already trimmed by the split; only Markdown heading markers remain to strip.
        var cleaned = line.TrimStart('#').Trim();
        if (cleaned.Length < 4 || cleaned.Length > 80) continue;

        // E-mail addresses and anything with digits (phones, dates, postcodes) are never a name.
        if (cleaned.Contains('@') || Regex.IsMatch(cleaned, @"\d")) continue;

        // \p{Lu} = uppercase letter, \p{L} = any letter, \p{M} = combining mark.
        if (!Regex.IsMatch(cleaned, @"^\p{Lu}[\p{L}\p{M}'’`.-]+(?:\s+\p{Lu}[\p{L}\p{M}'’`.-]+){1,4}$")) continue;

        return cleaned;
    }

    return null;
}
|
||||
|
||||
private static int CountWords(string? text)
|
||||
@@ -215,25 +314,6 @@ public sealed class ProfileCvController : ControllerBase
|
||||
private static List<(string Name, string Content)> ParseSections(string source)
|
||||
{
|
||||
var lines = source.Replace("\r\n", "\n").Split('\n');
|
||||
var aliases = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
|
||||
{
|
||||
["professional summary"] = "Professional Summary",
|
||||
["summary"] = "Professional Summary",
|
||||
["profile"] = "Professional Summary",
|
||||
["core skills"] = "Core Skills",
|
||||
["skills"] = "Core Skills",
|
||||
["technical skills"] = "Core Skills",
|
||||
["experience"] = "Experience Highlights",
|
||||
["experience highlights"] = "Experience Highlights",
|
||||
["work experience"] = "Experience Highlights",
|
||||
["selected achievements"] = "Selected Achievements",
|
||||
["achievements"] = "Selected Achievements",
|
||||
["projects"] = "Projects",
|
||||
["education"] = "Education",
|
||||
["certifications"] = "Certifications",
|
||||
["certificates"] = "Certifications",
|
||||
};
|
||||
|
||||
var sections = new List<(string Name, List<string> Lines)>();
|
||||
var currentName = "General";
|
||||
var currentLines = new List<string>();
|
||||
@@ -251,16 +331,11 @@ public sealed class ProfileCvController : ControllerBase
|
||||
foreach (var raw in lines)
|
||||
{
|
||||
var line = raw.Trim();
|
||||
var normalized = line.TrimEnd(':').Trim();
|
||||
var looksLikeHeading = normalized.Length > 0
|
||||
&& normalized.Length <= 40
|
||||
&& !normalized.Contains('.')
|
||||
&& aliases.ContainsKey(normalized.ToLowerInvariant());
|
||||
|
||||
if (looksLikeHeading)
|
||||
var canonicalHeading = CanonicalizeSectionHeading(line);
|
||||
if (canonicalHeading is not null)
|
||||
{
|
||||
Flush();
|
||||
currentName = aliases[normalized.ToLowerInvariant()];
|
||||
currentName = canonicalHeading;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -280,6 +355,56 @@ public sealed class ProfileCvController : ControllerBase
|
||||
.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
/// Returns the CV text, asking the AI service to reconstruct it into readable markdown first when
/// it looks like a flattened PDF extraction.
/// </summary>
/// <param name="text">The extracted CV text.</param>
/// <param name="cancellationToken">Checked before the AI call is started.</param>
/// <returns>
/// The trimmed input when no reconstruction is needed or the AI reply is blank; otherwise the
/// trimmed reconstructed text.
/// </returns>
/// <exception cref="OperationCanceledException">The token was already cancelled before the AI call.</exception>
private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    var normalized = text.Trim();
    if (!LooksLikeFlattenedCvExtraction(normalized))
    {
        return normalized;
    }

    // Do not start an AI round-trip for a request that has already been aborted.
    cancellationToken.ThrowIfCancellationRequested();

    // NOTE(review): the meaning of the two numeric arguments is not visible here — confirm against the AI service.
    var reconstructed = await _aiService.SummarizeSectionAsync(
        "Reconstruct this CV text extracted from a PDF into a clean, readable master CV in markdown. Preserve facts only. Recover clear sections such as Contact, Professional Summary, Work Experience, Education, Skills, Languages, and Interests when present. Split contact details onto their own lines, turn noisy all-caps/spaced headings into normal headings, keep dates with the correct roles and employers, and remove layout/OCR artifacts. Do not invent employers, titles, dates, or metrics. Return only the reconstructed CV text.",
        normalized,
        2800,
        900);

    // A blank reply means reconstruction failed; keep the original text rather than losing it.
    return string.IsNullOrWhiteSpace(reconstructed) ? normalized : reconstructed.Trim();
}
|
||||
|
||||
/// <summary>
/// Heuristically decides whether CV text looks like a flattened extraction, i.e. one where the
/// layout collapsed into a few long lines or left letter-spaced headings behind.
/// </summary>
/// <param name="text">The CV text to inspect.</param>
/// <returns>
/// <c>true</c> when any of these holds: at most 6 non-blank lines with at least 500 characters;
/// 3 or more letter-spaced uppercase runs (e.g. "S K I L L S"); 3 or more distinct known sections
/// mentioned within at most 12 non-blank lines; or " + " together with a bullet glyph within at
/// most 10 non-blank lines. <c>false</c> for blank input.
/// </returns>
private static bool LooksLikeFlattenedCvExtraction(string text)
{
    if (string.IsNullOrWhiteSpace(text)) return false;

    var normalized = text.Replace("\r\n", "\n");
    var lineCount = normalized.Split('\n').Count(line => !string.IsNullOrWhiteSpace(line));
    var spacedHeadingCount = Regex.Matches(normalized, @"\b(?:[A-Z]\s){3,}[A-Z]\b").Count;

    // Count distinct canonical sections, not raw alias hits: aliases overlap ("experience" is a
    // substring of "work experience"), so a single heading must not count more than once.
    var knownHeadingHits = SectionAliases
        .Where(alias => normalized.Contains(alias.Key, StringComparison.OrdinalIgnoreCase))
        .Select(alias => alias.Value)
        .Distinct(StringComparer.Ordinal)
        .Count();

    var bulletCount = Regex.Matches(normalized, @"[•●▪◦]").Count;

    return (lineCount <= 6 && normalized.Length >= 500)
        || spacedHeadingCount >= 3
        || (knownHeadingHits >= 3 && lineCount <= 12)
        || (normalized.Contains(" + ") && bulletCount > 0 && lineCount <= 10);
}
|
||||
|
||||
/// <summary>
/// Maps a line to its canonical section name when the line is a recognised section heading.
/// </summary>
/// <remarks>
/// Leading <c>#</c> markers and trailing colons are ignored. A line is rejected when it is empty,
/// longer than 60 characters, contains a period, or contains a run of two or more spaces.
/// </remarks>
/// <param name="line">A single line of CV text.</param>
/// <returns>The canonical section name from <c>SectionAliases</c>, or <c>null</c> when the line is not a known heading.</returns>
private static string? CanonicalizeSectionHeading(string line)
{
    if (string.IsNullOrWhiteSpace(line)) return null;

    var normalized = line.Trim();
    if (normalized.StartsWith("#", StringComparison.Ordinal))
    {
        normalized = normalized.TrimStart('#').Trim();
    }

    normalized = normalized.TrimEnd(':').Trim();
    if (normalized.Length == 0 || normalized.Length > 60) return null;

    // Reject sentences (period) and column-gapped text (two or more consecutive spaces). A single
    // space must stay allowed, otherwise multi-word aliases such as "work experience" can never match.
    if (normalized.Contains('.') || normalized.Contains("  ", StringComparison.Ordinal)) return null;

    return SectionAliases.TryGetValue(normalized, out var canonical) ? canonical : null;
}
|
||||
|
||||
private static async Task<string> ExtractTextAsync(IFormFile file, string extension)
|
||||
{
|
||||
if (string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase) || string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase))
|
||||
|
||||
Reference in New Issue
Block a user