Add typed structured CV extraction
This commit is contained in:
@@ -58,45 +58,99 @@ namespace JobTrackerApi.Controllers
|
||||
return "Hi there,";
|
||||
}
|
||||
|
||||
private sealed record CvSectionRecord(string? Name, string? Content, int? WordCount);
|
||||
|
||||
private static string BuildStructuredCvContext(ApplicationUser? user)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(user?.ProfileCvStructureJson)) return string.Empty;
|
||||
var structured = StructuredCvProfileJson.Deserialize(user?.ProfileCvStructureJson);
|
||||
var blocks = new List<string>();
|
||||
|
||||
try
|
||||
var contactLines = new List<string>();
|
||||
if (!string.IsNullOrWhiteSpace(structured.Contact.FullName)) contactLines.Add($"Name: {structured.Contact.FullName}");
|
||||
if (!string.IsNullOrWhiteSpace(structured.Contact.Headline)) contactLines.Add($"Headline: {structured.Contact.Headline}");
|
||||
if (!string.IsNullOrWhiteSpace(structured.Contact.Email)) contactLines.Add($"Email: {structured.Contact.Email}");
|
||||
if (!string.IsNullOrWhiteSpace(structured.Contact.Location)) contactLines.Add($"Location: {structured.Contact.Location}");
|
||||
if (!string.IsNullOrWhiteSpace(structured.Contact.LinkedIn)) contactLines.Add($"LinkedIn: {structured.Contact.LinkedIn}");
|
||||
if (contactLines.Count > 0) blocks.Add($"Contact:\n{string.Join("\n", contactLines)}");
|
||||
|
||||
if (structured.Summary.Count > 0)
|
||||
{
|
||||
var sections = JsonSerializer.Deserialize<List<CvSectionRecord>>(user.ProfileCvStructureJson);
|
||||
if (sections is null || sections.Count == 0) return string.Empty;
|
||||
blocks.Add($"Summary:\n- {string.Join("\n- ", structured.Summary.Take(4))}");
|
||||
}
|
||||
|
||||
var preferredOrder = new[]
|
||||
if (structured.Skills.Count > 0)
|
||||
{
|
||||
blocks.Add($"Skills:\n{string.Join(", ", structured.Skills.Take(16))}");
|
||||
}
|
||||
|
||||
if (structured.Jobs.Count > 0)
|
||||
{
|
||||
var jobBlocks = structured.Jobs.Take(3).Select(job =>
|
||||
{
|
||||
"Professional Summary",
|
||||
"Core Skills",
|
||||
"Experience Highlights",
|
||||
"Selected Achievements",
|
||||
"Projects",
|
||||
"Education",
|
||||
"Certifications",
|
||||
};
|
||||
|
||||
var ordered = preferredOrder
|
||||
.Select(name => sections.FirstOrDefault(section => string.Equals(section.Name?.Trim(), name, StringComparison.OrdinalIgnoreCase)))
|
||||
.Where(section => section is not null)
|
||||
.Concat(sections.Where(section => !preferredOrder.Contains(section.Name ?? string.Empty, StringComparer.OrdinalIgnoreCase)))
|
||||
.Where(section => !string.IsNullOrWhiteSpace(section?.Content))
|
||||
.Take(6)
|
||||
.Select(section => $"{section!.Name}:\n{section.Content!.Trim()}")
|
||||
.ToList();
|
||||
|
||||
return ordered.Count > 0
|
||||
? $"Structured CV sections:\n{string.Join("\n\n", ordered)}"
|
||||
: string.Empty;
|
||||
var header = string.Join(" | ", new[] { job.Title, job.Company, job.Location, FormatStructuredDateRange(job.Start, job.End, job.IsCurrent) }.Where(value => !string.IsNullOrWhiteSpace(value)));
|
||||
var bullets = job.Bullets.Take(3).Select(bullet => $"- {bullet}");
|
||||
return string.Join("\n", new[] { header }.Concat(bullets).Where(value => !string.IsNullOrWhiteSpace(value)));
|
||||
}).Where(value => !string.IsNullOrWhiteSpace(value)).ToList();
|
||||
if (jobBlocks.Count > 0) blocks.Add($"Work Experience:\n{string.Join("\n\n", jobBlocks)}");
|
||||
}
|
||||
catch
|
||||
|
||||
if (structured.Education.Count > 0)
|
||||
{
|
||||
return string.Empty;
|
||||
var items = structured.Education.Take(3).Select(education => string.Join(" | ", new[] { education.Qualification, education.Institution, education.Location, FormatStructuredDateRange(education.Start, education.End, false) }.Where(value => !string.IsNullOrWhiteSpace(value))));
|
||||
blocks.Add($"Education:\n- {string.Join("\n- ", items)}");
|
||||
}
|
||||
|
||||
if (structured.Languages.Count > 0)
|
||||
{
|
||||
var items = structured.Languages.Take(5).Select(language => string.Join(": ", new[] { language.Name, language.Level }.Where(value => !string.IsNullOrWhiteSpace(value))));
|
||||
blocks.Add($"Languages:\n- {string.Join("\n- ", items)}");
|
||||
}
|
||||
|
||||
if (structured.OtherSections.Count > 0)
|
||||
{
|
||||
var items = structured.OtherSections.Take(2)
|
||||
.Where(section => !string.IsNullOrWhiteSpace(section.Title) && section.Items.Count > 0)
|
||||
.Select(section => $"{section.Title}: {string.Join("; ", section.Items.Take(4))}")
|
||||
.ToList();
|
||||
if (items.Count > 0) blocks.Add($"Other sections:\n- {string.Join("\n- ", items)}");
|
||||
}
|
||||
|
||||
if (blocks.Count == 0 && structured.Sections.Count > 0)
|
||||
{
|
||||
blocks.AddRange(structured.Sections.Take(6).Select(section => $"{section.Name}:\n{section.Content}"));
|
||||
}
|
||||
|
||||
return blocks.Count > 0
|
||||
? $"Structured CV:\n{string.Join("\n\n", blocks)}"
|
||||
: string.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds one newline-separated text corpus from the user's raw CV text and the
/// structured CV profile, used for keyword matching against the CV.
/// </summary>
/// <param name="user">The user whose CV data is read; may be null.</param>
/// <returns>All non-blank corpus parts joined by newlines (empty when nothing is available).</returns>
private static string BuildCvSearchCorpus(ApplicationUser? user)
{
    var profile = StructuredCvProfileJson.Deserialize(user?.ProfileCvStructureJson);
    var corpus = new List<string>();

    // Raw CV text goes first, followed by the structured fields.
    var rawCvText = user?.ProfileCvText;
    if (!string.IsNullOrWhiteSpace(rawCvText))
    {
        corpus.Add(rawCvText!);
    }

    var headline = profile.Contact.Headline;
    if (!string.IsNullOrWhiteSpace(headline))
    {
        corpus.Add(headline!);
    }

    if (profile.Summary.Count > 0)
    {
        corpus.Add(string.Join("\n", profile.Summary));
    }

    if (profile.Skills.Count > 0)
    {
        corpus.Add(string.Join("\n", profile.Skills));
    }

    if (profile.Jobs.Count > 0)
    {
        // Per job: non-blank title/company/location, then all bullets, then all skills.
        var jobTerms = new List<string?>();
        foreach (var job in profile.Jobs)
        {
            jobTerms.AddRange(new[] { job.Title, job.Company, job.Location }.Where(term => !string.IsNullOrWhiteSpace(term)));
            jobTerms.AddRange(job.Bullets);
            jobTerms.AddRange(job.Skills);
        }

        corpus.Add(string.Join("\n", jobTerms));
    }

    if (profile.Education.Count > 0)
    {
        // Per entry: non-blank qualification/institution/location, then all details.
        var educationTerms = new List<string?>();
        foreach (var education in profile.Education)
        {
            educationTerms.AddRange(new[] { education.Qualification, education.Institution, education.Location }.Where(term => !string.IsNullOrWhiteSpace(term)));
            educationTerms.AddRange(education.Details);
        }

        corpus.Add(string.Join("\n", educationTerms));
    }

    if (profile.Languages.Count > 0)
    {
        // One line per language: "name level notes" with blank fields dropped.
        var languageLines = new List<string>();
        foreach (var language in profile.Languages)
        {
            var fields = new[] { language.Name, language.Level, language.Notes }.Where(field => !string.IsNullOrWhiteSpace(field));
            languageLines.Add(string.Join(" ", fields));
        }

        corpus.Add(string.Join("\n", languageLines));
    }

    // A joined part can still be blank if every element was blank, so filter once more.
    return string.Join("\n", corpus.Where(part => !string.IsNullOrWhiteSpace(part)));
}
|
||||
|
||||
/// <summary>
/// Formats a start/end pair from the structured CV as a display range.
/// </summary>
/// <param name="start">Start date text; may be null or blank.</param>
/// <param name="end">End date text; may be null or blank.</param>
/// <param name="isCurrent">When true and a start is present, the range ends with "Present".</param>
/// <returns>
/// Null when both dates are missing; the end alone when only the end is known;
/// otherwise "start - end", with "Present" when the entry is current or has no usable end.
/// </returns>
private static string? FormatStructuredDateRange(string? start, string? end, bool isCurrent)
{
    // Blank strings count as missing, so an empty end can never yield a dangling "2020 - ".
    var startText = string.IsNullOrWhiteSpace(start) ? null : start.Trim();
    var endText = string.IsNullOrWhiteSpace(end) ? null : end.Trim();

    if (startText is null)
    {
        // Covers both "nothing known" (null) and "only the end is known".
        return endText;
    }

    var closing = isCurrent || endText is null ? "Present" : endText;
    return $"{startText} - {closing}";
}
|
||||
|
||||
private async Task<List<string>> BuildListFromAiAsync(string instruction, string context, CancellationToken cancellationToken, string fallbackPrefix)
|
||||
@@ -1729,7 +1783,7 @@ namespace JobTrackerApi.Controllers
|
||||
return BadRequest("This job does not have enough description or notes to compare against your CV.");
|
||||
}
|
||||
|
||||
var normalizedCv = cvText.ToLowerInvariant();
|
||||
var normalizedCv = BuildCvSearchCorpus(user).ToLowerInvariant();
|
||||
var jobTags = SkillTagger.Detect(jobText).Distinct(StringComparer.OrdinalIgnoreCase).ToList();
|
||||
var strengths = jobTags.Where(tag => normalizedCv.Contains(tag.ToLowerInvariant())).Take(8).ToList();
|
||||
var gaps = jobTags.Where(tag => !normalizedCv.Contains(tag.ToLowerInvariant())).Take(8).ToList();
|
||||
|
||||
@@ -26,6 +26,31 @@ public sealed class ProfileCvController : ControllerBase
|
||||
".webp",
|
||||
};
|
||||
|
||||
// Maps heading text as it may appear in a CV (matched case-insensitively) to the
// canonical section name. Entries are grouped by canonical name.
private static readonly Dictionary<string, string> SectionAliases = new(StringComparer.OrdinalIgnoreCase)
{
    // Professional Summary
    { "professional summary", "Professional Summary" },
    { "summary", "Professional Summary" },
    { "profile", "Professional Summary" },
    { "about me", "Professional Summary" },

    // Contact
    { "contact", "Contact" },
    { "contact details", "Contact" },

    // Skills
    { "core skills", "Skills" },
    { "skills", "Skills" },
    { "technical skills", "Skills" },

    // Work Experience
    { "experience", "Work Experience" },
    { "experience highlights", "Work Experience" },
    { "work experience", "Work Experience" },
    { "employment history", "Work Experience" },

    // Selected Achievements
    { "selected achievements", "Selected Achievements" },
    { "achievements", "Selected Achievements" },

    // Sections with a single canonical spelling (plus "certificates")
    { "projects", "Projects" },
    { "education", "Education" },
    { "certifications", "Certifications" },
    { "certificates", "Certifications" },
    { "languages", "Languages" },
    { "interests", "Interests" },
};
|
||||
|
||||
private const long MaxFileSizeBytes = 5 * 1024 * 1024;
|
||||
|
||||
private readonly UserManager<ApplicationUser> _users;
|
||||
@@ -39,7 +64,6 @@ public sealed class ProfileCvController : ControllerBase
|
||||
|
||||
public sealed record RewriteSectionRequest(string SectionName, string? Style, string? TargetRole);
|
||||
public sealed record ParseCvRequest(string? Text);
|
||||
public sealed record ParsedCvSectionDto(string Name, string Content, int WordCount);
|
||||
|
||||
[HttpPost("upload")]
|
||||
[RequestSizeLimit(MaxFileSizeBytes)]
|
||||
@@ -86,16 +110,18 @@ public sealed class ProfileCvController : ControllerBase
|
||||
return BadRequest("The uploaded CV file could not be read or was empty.");
|
||||
}
|
||||
|
||||
text = (await MaybeReconstructStructuredCvAsync(text, HttpContext.RequestAborted)).Trim();
|
||||
var structuredCv = await BuildStructuredCvAsync(text, HttpContext.RequestAborted);
|
||||
|
||||
user.ProfileCvText = text;
|
||||
user.ProfileCvStructureJson = JsonSerializer.Serialize(
|
||||
ParseSections(text).Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content))).ToList());
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var result = await _users.UpdateAsync(user);
|
||||
if (!result.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
|
||||
}
|
||||
|
||||
return Ok(new { imported = true, characters = text.Length });
|
||||
return Ok(new { imported = true, characters = text.Length, structuredCv, sections = structuredCv.Sections });
|
||||
}
|
||||
|
||||
[HttpPost("rebuild")]
|
||||
@@ -117,15 +143,15 @@ public sealed class ProfileCvController : ControllerBase
|
||||
}
|
||||
|
||||
user.ProfileCvText = rebuilt.Trim();
|
||||
user.ProfileCvStructureJson = JsonSerializer.Serialize(
|
||||
ParseSections(user.ProfileCvText).Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content))).ToList());
|
||||
var structuredCv = await BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted);
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var result = await _users.UpdateAsync(user);
|
||||
if (!result.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
|
||||
}
|
||||
|
||||
return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText });
|
||||
return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections });
|
||||
}
|
||||
|
||||
[HttpPost("rewrite-section")]
|
||||
@@ -162,18 +188,15 @@ public sealed class ProfileCvController : ControllerBase
|
||||
var source = string.IsNullOrWhiteSpace(request?.Text) ? user.ProfileCvText : request!.Text;
|
||||
if (string.IsNullOrWhiteSpace(source)) return BadRequest("Add or import CV text before parsing sections.");
|
||||
|
||||
var sections = ParseSections(source)
|
||||
.Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content)))
|
||||
.ToList();
|
||||
|
||||
user.ProfileCvStructureJson = JsonSerializer.Serialize(sections);
|
||||
var structuredCv = await BuildStructuredCvAsync(source, HttpContext.RequestAborted);
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var update = await _users.UpdateAsync(user);
|
||||
if (!update.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", update.Errors.Select(e => e.Description)));
|
||||
}
|
||||
|
||||
return Ok(new { sections, totalWords = CountWords(source) });
|
||||
return Ok(new { structuredCv, sections = structuredCv.Sections, totalWords = CountWords(source) });
|
||||
}
|
||||
|
||||
[HttpPost("improve")]
|
||||
@@ -195,15 +218,91 @@ public sealed class ProfileCvController : ControllerBase
|
||||
}
|
||||
|
||||
user.ProfileCvText = improved.Trim();
|
||||
user.ProfileCvStructureJson = JsonSerializer.Serialize(
|
||||
ParseSections(user.ProfileCvText).Select(section => new ParsedCvSectionDto(section.Name, section.Content, CountWords(section.Content))).ToList());
|
||||
var structuredCv = await BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted);
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var result = await _users.UpdateAsync(user);
|
||||
if (!result.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
|
||||
}
|
||||
|
||||
return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText });
|
||||
return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections });
|
||||
}
|
||||
|
||||
/// <summary>
/// Produces the structured CV profile for the given CV text by combining a
/// heading-based parse with an AI extraction attempt.
/// </summary>
/// <param name="text">The CV text to structure.</param>
/// <param name="cancellationToken">Forwarded to the AI extraction step.</param>
/// <returns>The merged profile after <c>StructuredCvProfileJson.Normalize</c>.</returns>
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    // Heading-based sections are the baseline handed to Merge as the fallback.
    var headingSections = new List<StructuredCvSection>();
    foreach (var (name, content) in ParseSections(text))
    {
        headingSections.Add(new StructuredCvSection
        {
            Name = name,
            Content = content,
            WordCount = CountWords(content),
        });
    }

    var headingProfile = StructuredCvProfileJson.FromSections(headingSections);
    if (headingProfile.Contact.FullName is null)
    {
        headingProfile.Contact.FullName = GuessFullName(text);
    }

    // May be null when the AI result is empty, has no JSON object, or carries no data.
    var aiProfile = await TryExtractStructuredCvAsync(text, cancellationToken);

    // NOTE(review): presumably Merge prefers the AI values and fills gaps from the fallback — confirm.
    var combined = StructuredCvProfileJson.Merge(aiProfile, headingProfile);
    if (combined.Contact.FullName is null)
    {
        combined.Contact.FullName = GuessFullName(text);
    }

    return StructuredCvProfileJson.Normalize(combined);
}
|
||||
|
||||
/// <summary>
/// Asks the AI service to extract the CV into the structured JSON shape and parses the reply.
/// </summary>
/// <param name="text">The CV text to send for extraction.</param>
/// <param name="cancellationToken">
/// NOTE(review): not forwarded to the AI call in this block — confirm whether
/// <c>SummarizeSectionAsync</c> accepts a token.
/// </param>
/// <returns>
/// The parsed profile, or null when the reply is blank, contains no JSON object,
/// or the parsed profile carries no data.
/// </returns>
private async Task<StructuredCvProfile?> TryExtractStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    const string extractionPrompt = "Extract this CV into structured JSON. Return only valid JSON with this exact top-level shape: { \"version\": \"1\", \"contact\": { \"fullName\": string|null, \"headline\": string|null, \"email\": string|null, \"phone\": string|null, \"location\": string|null, \"website\": string|null, \"linkedin\": string|null }, \"summary\": string[], \"jobs\": [{ \"title\": string|null, \"company\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"isCurrent\": boolean, \"bullets\": string[], \"skills\": string[] }], \"education\": [{ \"qualification\": string|null, \"institution\": string|null, \"location\": string|null, \"start\": string|null, \"end\": string|null, \"details\": string[] }], \"skills\": string[], \"languages\": [{ \"name\": string|null, \"level\": string|null, \"notes\": string|null }], \"interests\": string[], \"otherSections\": [{ \"title\": string|null, \"items\": string[] }] }. Preserve facts only. Do not invent anything. If a field is unknown, use null or an empty array. Keep wording close to the source. Put unmatched content in otherSections.";

    var aiReply = await _aiService.SummarizeSectionAsync(extractionPrompt, text, 3200, 900);
    if (string.IsNullOrWhiteSpace(aiReply))
    {
        return null;
    }

    // The reply may wrap the JSON in code fences or prose; keep only the object itself.
    var jsonPayload = ExtractJsonObject(aiReply);
    if (string.IsNullOrWhiteSpace(jsonPayload))
    {
        return null;
    }

    var profile = StructuredCvProfileJson.Deserialize(jsonPayload);
    if (!IsMeaningfullyStructured(profile))
    {
        return null;
    }

    return profile;
}
|
||||
|
||||
/// <summary>
/// Reports whether a structured profile contains any data: a contact full name
/// or at least one entry in any of its collections.
/// </summary>
/// <param name="profile">The profile to inspect.</param>
/// <returns>True when at least one field or collection is populated.</returns>
private static bool IsMeaningfullyStructured(StructuredCvProfile profile)
{
    // Checks run in the same order as the fields are listed; the first hit wins.
    if (!string.IsNullOrWhiteSpace(profile.Contact.FullName))
    {
        return true;
    }

    if (profile.Summary.Count > 0) return true;
    if (profile.Jobs.Count > 0) return true;
    if (profile.Education.Count > 0) return true;
    if (profile.Skills.Count > 0) return true;
    if (profile.Languages.Count > 0) return true;
    if (profile.Interests.Count > 0) return true;

    return profile.OtherSections.Count > 0;
}
|
||||
|
||||
/// <summary>
/// Pulls the outermost JSON object out of raw text that may be wrapped in
/// Markdown code fences or surrounded by other text.
/// </summary>
/// <param name="raw">The raw text to search.</param>
/// <returns>
/// The substring from the first '{' to the last '}' inclusive, or null when no
/// such pair exists in that order.
/// </returns>
private static string? ExtractJsonObject(string raw)
{
    var candidate = raw.Trim();

    var isFenced = candidate.StartsWith("```", StringComparison.Ordinal);
    if (isFenced)
    {
        // Drop a leading ``` / ```json marker and a trailing ``` marker.
        candidate = Regex.Replace(candidate, "^```(?:json)?\\s*|\\s*```$", string.Empty, RegexOptions.IgnoreCase);
    }

    var openIndex = candidate.IndexOf('{');
    if (openIndex < 0)
    {
        return null;
    }

    var closeIndex = candidate.LastIndexOf('}');
    if (closeIndex <= openIndex)
    {
        return null;
    }

    return candidate.Substring(openIndex, closeIndex - openIndex + 1);
}
|
||||
|
||||
/// <summary>
/// Guesses the candidate's full name from the first few non-empty lines of CV text.
/// </summary>
/// <param name="source">The CV text; blank input yields null.</param>
/// <returns>
/// The first of the leading six non-empty lines that looks like a name: 4-80 characters,
/// no '@' or digits, and two to five words each starting with an uppercase letter.
/// Null when none qualifies.
/// </returns>
private static string? GuessFullName(string source)
{
    if (string.IsNullOrWhiteSpace(source))
    {
        return null;
    }

    var leadingLines = source
        .Replace("\r\n", "\n")
        .Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
        .Take(6);

    foreach (var line in leadingLines)
    {
        // Strip Markdown heading markers such as "# Jane Doe".
        var cleaned = line.TrimStart('#').Trim();
        if (cleaned.Length < 4 || cleaned.Length > 80)
        {
            continue;
        }

        // Lines containing an e-mail marker or digits are treated as contact details.
        if (cleaned.Contains('@') || Regex.IsMatch(cleaned, @"\d"))
        {
            continue;
        }

        // A document title line is shaped like a name but is never one.
        if (string.Equals(cleaned, "Curriculum Vitae", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Unicode letter classes so names such as "José García" or "Zoë Müller" are accepted,
        // not only ASCII ones.
        if (!Regex.IsMatch(cleaned, @"^\p{Lu}[\p{L}\p{M}'`.-]+(?:\s+\p{Lu}[\p{L}\p{M}'`.-]+){1,4}$"))
        {
            continue;
        }

        return cleaned;
    }

    return null;
}
|
||||
|
||||
private static int CountWords(string? text)
|
||||
@@ -215,25 +314,6 @@ public sealed class ProfileCvController : ControllerBase
|
||||
private static List<(string Name, string Content)> ParseSections(string source)
|
||||
{
|
||||
var lines = source.Replace("\r\n", "\n").Split('\n');
|
||||
var aliases = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
|
||||
{
|
||||
["professional summary"] = "Professional Summary",
|
||||
["summary"] = "Professional Summary",
|
||||
["profile"] = "Professional Summary",
|
||||
["core skills"] = "Core Skills",
|
||||
["skills"] = "Core Skills",
|
||||
["technical skills"] = "Core Skills",
|
||||
["experience"] = "Experience Highlights",
|
||||
["experience highlights"] = "Experience Highlights",
|
||||
["work experience"] = "Experience Highlights",
|
||||
["selected achievements"] = "Selected Achievements",
|
||||
["achievements"] = "Selected Achievements",
|
||||
["projects"] = "Projects",
|
||||
["education"] = "Education",
|
||||
["certifications"] = "Certifications",
|
||||
["certificates"] = "Certifications",
|
||||
};
|
||||
|
||||
var sections = new List<(string Name, List<string> Lines)>();
|
||||
var currentName = "General";
|
||||
var currentLines = new List<string>();
|
||||
@@ -251,16 +331,11 @@ public sealed class ProfileCvController : ControllerBase
|
||||
foreach (var raw in lines)
|
||||
{
|
||||
var line = raw.Trim();
|
||||
var normalized = line.TrimEnd(':').Trim();
|
||||
var looksLikeHeading = normalized.Length > 0
|
||||
&& normalized.Length <= 40
|
||||
&& !normalized.Contains('.')
|
||||
&& aliases.ContainsKey(normalized.ToLowerInvariant());
|
||||
|
||||
if (looksLikeHeading)
|
||||
var canonicalHeading = CanonicalizeSectionHeading(line);
|
||||
if (canonicalHeading is not null)
|
||||
{
|
||||
Flush();
|
||||
currentName = aliases[normalized.ToLowerInvariant()];
|
||||
currentName = canonicalHeading;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -280,6 +355,56 @@ public sealed class ProfileCvController : ControllerBase
|
||||
.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
/// Returns the trimmed CV text, first passing it through the AI service for
/// reconstruction when it looks like a flattened extraction.
/// </summary>
/// <param name="text">The extracted CV text.</param>
/// <param name="cancellationToken">
/// NOTE(review): not forwarded to the AI call in this block — confirm whether
/// <c>SummarizeSectionAsync</c> accepts a token.
/// </param>
/// <returns>
/// The reconstructed text (trimmed) when reconstruction ran and produced content;
/// otherwise the trimmed input.
/// </returns>
private async Task<string> MaybeReconstructStructuredCvAsync(string text, CancellationToken cancellationToken)
{
    var trimmedText = text.Trim();

    if (LooksLikeFlattenedCvExtraction(trimmedText))
    {
        const string reconstructionPrompt = "Reconstruct this CV text extracted from a PDF into a clean, readable master CV in markdown. Preserve facts only. Recover clear sections such as Contact, Professional Summary, Work Experience, Education, Skills, Languages, and Interests when present. Split contact details onto their own lines, turn noisy all-caps/spaced headings into normal headings, keep dates with the correct roles and employers, and remove layout/OCR artifacts. Do not invent employers, titles, dates, or metrics. Return only the reconstructed CV text.";

        var rebuilt = await _aiService.SummarizeSectionAsync(reconstructionPrompt, trimmedText, 2800, 900);
        if (!string.IsNullOrWhiteSpace(rebuilt))
        {
            return rebuilt.Trim();
        }
    }

    // Either the text did not need reconstruction or the AI returned nothing usable.
    return trimmedText;
}
|
||||
|
||||
/// <summary>
/// Heuristically decides whether CV text has lost its layout, i.e. long content
/// squeezed onto very few lines or headings rendered as spaced capitals.
/// </summary>
/// <param name="text">The CV text to inspect; blank input yields false.</param>
/// <returns>
/// True when any signal fires: at most 6 non-blank lines with at least 500 characters;
/// three or more spaced-capital runs (e.g. "S K I L L S"); three or more distinct known
/// sections mentioned within at most 12 non-blank lines; or " + " plus bullet glyphs
/// within at most 10 non-blank lines.
/// </returns>
private static bool LooksLikeFlattenedCvExtraction(string text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return false;
    }

    var normalized = text.Replace("\r\n", "\n");
    var lineCount = normalized.Split('\n').Count(line => !string.IsNullOrWhiteSpace(line));
    var spacedHeadingCount = Regex.Matches(normalized, @"\b(?:[A-Z]\s){3,}[A-Z]\b").Count;

    // Count distinct canonical sections, not raw alias keys: a single "Work Experience"
    // heading contains both "experience" and "work experience" and must score once.
    var knownHeadingHits = SectionAliases
        .Where(alias => normalized.Contains(alias.Key, StringComparison.OrdinalIgnoreCase))
        .Select(alias => alias.Value)
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .Count();

    var bulletCount = Regex.Matches(normalized, @"[•●▪◦]").Count;

    return (lineCount <= 6 && normalized.Length >= 500)
        || spacedHeadingCount >= 3
        || (knownHeadingHits >= 3 && lineCount <= 12)
        || (normalized.Contains(" + ") && bulletCount > 0 && lineCount <= 10);
}
|
||||
|
||||
/// <summary>
/// Maps a line of CV text to its canonical section name when the line is a known heading.
/// </summary>
/// <param name="line">A single line of CV text; may carry Markdown '#' markers or a trailing ':'.</param>
/// <returns>
/// The canonical name from <c>SectionAliases</c> when the cleaned line matches an alias
/// (case-insensitively); otherwise null.
/// </returns>
private static string? CanonicalizeSectionHeading(string line)
{
    if (string.IsNullOrWhiteSpace(line))
    {
        return null;
    }

    // Strip Markdown heading markers ("## Skills") and a trailing colon ("Skills:").
    var heading = line.Trim().TrimStart('#').Trim().TrimEnd(':').Trim();
    if (heading.Length == 0)
    {
        return null;
    }

    // The lookup is an exact, case-insensitive match against the alias table, so sentences,
    // over-long lines and column-joined text are rejected by the lookup itself. The earlier
    // pre-check that bailed out on any space made every multi-word alias
    // ("work experience", "professional summary", ...) unreachable.
    return SectionAliases.TryGetValue(heading, out var canonical) ? canonical : null;
}
|
||||
|
||||
private static async Task<string> ExtractTextAsync(IFormFile file, string extension)
|
||||
{
|
||||
if (string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase) || string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase))
|
||||
|
||||
Reference in New Issue
Block a user