Improve structured CV fallback extraction

2026-03-28 22:56:55 +01:00
parent 3b6588397e
commit d8ab312f59
3 changed files with 325 additions and 9 deletions
@@ -231,7 +231,8 @@ public sealed class ProfileCvController : ControllerBase

    private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
    {
-        var fallbackSections = ParseSections(text)
+        var parseSource = NormalizeTextForStructuredParsing(text);
+        var fallbackSections = ParseSections(parseSource)
            .Select(section => new StructuredCvSection
            {
                Name = section.Name,
@@ -240,11 +241,14 @@ public sealed class ProfileCvController : ControllerBase
            })
            .ToList();

-        var fallback = StructuredCvProfileJson.FromSections(fallbackSections);
-        fallback.Contact.FullName ??= GuessFullName(text);
-        var extracted = await TryExtractStructuredCvAsync(text, cancellationToken);
+        var sectionFallback = StructuredCvProfileJson.FromSections(fallbackSections);
+        var heuristicFallback = BuildHeuristicStructuredCv(parseSource, text);
+        heuristicFallback.Sections = new List<StructuredCvSection>();
+        var fallback = StructuredCvProfileJson.Merge(heuristicFallback, sectionFallback);
+        fallback.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(fallback.Contact.Email);
+        var extracted = await TryExtractStructuredCvAsync(parseSource, cancellationToken);
        var merged = StructuredCvProfileJson.Merge(extracted, fallback);
-        merged.Contact.FullName ??= GuessFullName(text);
+        merged.Contact.FullName ??= GuessFullName(text) ?? GuessFullNameFromEmail(merged.Contact.Email);
        return StructuredCvProfileJson.Normalize(merged);
    }

@@ -305,12 +309,265 @@ public sealed class ProfileCvController : ControllerBase
        return null;
    }

+    /// <summary>
+    /// Guesses a display name from the local part of an e-mail address,
+    /// e.g. "jane.doe@example.com" becomes "Jane Doe".
+    /// </summary>
+    /// <param name="email">E-mail address to inspect; may be null or blank.</param>
+    /// <returns>
+    /// The capitalised name parts joined by spaces, or <c>null</c> when the address is missing,
+    /// has no local part, or yields fewer than two name parts.
+    /// </returns>
+    private static string? GuessFullNameFromEmail(string? email)
+    {
+        if (string.IsNullOrWhiteSpace(email)) return null;
+
+        var atIndex = email.IndexOf('@');
+        if (atIndex <= 0) return null;
+
+        var localPart = email[..atIndex].Trim();
+
+        // Drop a "+tag" sub-address so "jane.doe+jobs@..." does not produce "Doe+jobs".
+        var plusIndex = localPart.IndexOf('+');
+        if (plusIndex >= 0) localPart = localPart[..plusIndex];
+        if (string.IsNullOrWhiteSpace(localPart)) return null;
+
+        // Digits are treated as separators as well, so "john_smith99" gives "John Smith"
+        // instead of carrying the number into the guessed name.
+        var parts = Regex.Split(localPart, @"[._\d-]+")
+            .Select(part => part.Trim())
+            .Where(part => part.Length > 0)
+            .Select(part => char.ToUpperInvariant(part[0]) + part[1..].ToLowerInvariant())
+            .ToList();
+
+        // A single token is more likely a handle than a first/last name pair.
+        return parts.Count >= 2 ? string.Join(" ", parts) : null;
+    }
+
+    /// <summary>
+    /// Prepares raw CV text for section parsing. Line endings are unified and the text is trimmed;
+    /// when <c>LooksLikeFlattenedCvExtraction</c> reports a flattened extraction, "## " section
+    /// headings and line breaks around bullets and date ranges are re-inserted.
+    /// </summary>
+    /// <param name="source">Raw extracted CV text.</param>
+    /// <returns>The normalised text, or an empty string for null/blank input.</returns>
+    private static string NormalizeTextForStructuredParsing(string source)
+    {
+        if (string.IsNullOrWhiteSpace(source)) return string.Empty;
+
+        var text = source.Replace("\r\n", "\n").Trim();
+        if (!LooksLikeFlattenedCvExtraction(text)) return text;
+
+        // Letter-spaced headings ("E D U C A T I O N"): collapse the spacing and, when the letters
+        // equal the letters of a known alias, replace the run with that alias' heading.
+        text = Regex.Replace(text, @"\b([A-Z](?:\s+[A-Z]){2,})\b", match =>
+        {
+            var collapsed = Regex.Replace(match.Value, @"\s+", string.Empty);
+            foreach (var alias in SectionAliases)
+            {
+                var aliasLettersOnly = Regex.Replace(alias.Key, @"[^A-Za-z]", string.Empty);
+                if (collapsed.Equals(aliasLettersOnly, StringComparison.OrdinalIgnoreCase))
+                {
+                    return $"\n\n## {alias.Value}\n";
+                }
+            }
+
+            return match.Value;
+        });
+
+        // Plain alias occurrences, longest alias first so multi-word aliases win over their parts.
+        // The trailing lookbehind rejects matches on a line that already starts with "##":
+        // without it, a later (shorter) alias re-matches words inside a heading inserted by an
+        // earlier step and splits that heading apart. It is placed after the alias so the
+        // backwards scan only runs once the alias text itself has matched.
+        foreach (var alias in SectionAliases.OrderByDescending(pair => pair.Key.Length))
+        {
+            text = Regex.Replace(
+                text,
+                $@"(?<!#)\b{Regex.Escape(alias.Key)}\b(?<!^##[^\n]*)",
+                $"\n\n## {alias.Value}\n",
+                RegexOptions.IgnoreCase | RegexOptions.Multiline);
+        }
+
+        // Put "+" items, bullet glyphs and year ranges on their own lines.
+        text = Regex.Replace(text, @"\s+\+\s+", "\n+ ");
+        text = Regex.Replace(text, @"\s*([•●▪◦])\s*", "\n- ");
+        text = Regex.Replace(text, @"\s+(\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))\b", "\n$1\n", RegexOptions.IgnoreCase);
+
+        // Collapse the blank-line runs produced by the insertions above.
+        text = Regex.Replace(text, @"\n{3,}", "\n\n");
+
+        return text.Trim();
+    }
+
+    /// <summary>
+    /// Builds a best-effort structured CV using regular expressions only.
+    /// Contact details, skills and the letter-spaced summary/interests fallbacks are read from
+    /// <paramref name="rawSource"/>; section-based fields are read from the sections parsed out of
+    /// <paramref name="parseSource"/>.
+    /// </summary>
+    /// <param name="parseSource">Text passed to <c>ParseSections</c> after line-ending unification.</param>
+    /// <param name="rawSource">Original CV text.</param>
+    /// <returns>The heuristic profile after <c>StructuredCvProfileJson.Normalize</c>.</returns>
+    private static StructuredCvProfile BuildHeuristicStructuredCv(string parseSource, string rawSource)
+    {
+        var profile = new StructuredCvProfile();
+        var normalized = parseSource.Replace("\r\n", "\n").Trim();
+
+        profile.Contact.Email = NullIfWhitespace(Regex.Match(rawSource, @"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", RegexOptions.IgnoreCase).Value);
+        profile.Contact.Phone = NullIfWhitespace(Regex.Match(rawSource, @"(?<!\w)(?:\+?\d[\d\s().-]{6,}\d)", RegexOptions.IgnoreCase).Value);
+
+        // The host part must not belong to an e-mail address: the lookbehind rejects candidates
+        // that start inside or right after an address ("...@example.com"), and the lookahead
+        // rejects candidates that run into an "@" ("john.doe@..."). Without these guards the first
+        // match is usually a fragment of the e-mail address rather than a website.
+        profile.Contact.Website = NullIfWhitespace(Regex.Match(rawSource, @"(?<![@A-Z0-9._%+-])\b(?:https?://)?(?:www\.)?[A-Z0-9.-]+\.[A-Z]{2,}(?![A-Z0-9._%+-]*@)(?:/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]*)?", RegexOptions.IgnoreCase).Value);
+        profile.Contact.LinkedIn = NullIfWhitespace(Regex.Match(rawSource, @"(?:linkedin(?:\.com)?/[A-Z0-9._~:/?#\[\]@!$&'()*+,;=-]+)", RegexOptions.IgnoreCase).Value);
+        profile.Contact.FullName = GuessFullName(rawSource) ?? GuessFullNameFromEmail(profile.Contact.Email);
+
+        // "City, Country"-shaped capitalised words; case-sensitive on purpose.
+        profile.Contact.Location = NullIfWhitespace(Regex.Match(rawSource, @"\b[A-Z][a-z]+(?:[\s-][A-Z][a-z]+)*,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b").Value);
+
+        // Sections are (Name, Content) value tuples, so FirstOrDefault yields a tuple with a
+        // null Content when no section has the requested name.
+        var sections = ParseSections(normalized);
+
+        // Summary: a letter-spaced "ABOUT ME" / "PROFILE" / "SUMMARY" heading in the raw text takes
+        // precedence; its body runs up to the next letter-spaced heading or the end of the text.
+        var summarySection = sections.FirstOrDefault(section => section.Name == "Professional Summary");
+        var flattenedSummary = Regex.Match(
+            rawSource,
+            @"(?:A\s+B\s+O\s+U\s+T\s+M\s+E|P\s+R\s+O\s+F\s+I\s+L\s+E|S\s+U\s+M\s+M\s+A\s+R\s+Y)\s*(?<body>.*?)(?=(?:I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S|E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|C\s+O\s+N\s+T\s+A\s+C\s+T|$))",
+            RegexOptions.IgnoreCase | RegexOptions.Singleline);
+        if (flattenedSummary.Success)
+        {
+            profile.Summary = SplitSentences(flattenedSummary.Groups["body"].Value, 5);
+        }
+        else if (!string.IsNullOrWhiteSpace(summarySection.Content))
+        {
+            profile.Summary = SplitSentences(summarySection.Content, 5);
+        }
+
+        // Interests: the parsed section wins; otherwise look for a letter-spaced heading.
+        var interestsSection = sections.FirstOrDefault(section => section.Name == "Interests");
+        if (!string.IsNullOrWhiteSpace(interestsSection.Content))
+        {
+            profile.Interests = SplitListLike(interestsSection.Content);
+        }
+        else
+        {
+            var flattenedInterests = Regex.Match(
+                rawSource,
+                @"I\s+N\s+T\s+E\s+R\s+E\s+S\s+T\s+S\s*(?<body>.*?)(?=(?:E\s+X\s+P\s+E\s+R\s+I\s+E\s+N\s+C\s+E|C\s+O\s+N\s+T\s+A\s+C\s+T|E\s+D\s+U\s+C\s+A\s+T\s+I\s+O\s+N|$))",
+                RegexOptions.IgnoreCase | RegexOptions.Singleline);
+            if (flattenedInterests.Success)
+            {
+                profile.Interests = SplitSentences(flattenedInterests.Groups["body"].Value, 4);
+            }
+        }
+
+        // Languages: scan the dedicated section when present, otherwise the whole raw text.
+        var languagesSection = sections.FirstOrDefault(section => section.Name == "Languages");
+        if (!string.IsNullOrWhiteSpace(languagesSection.Content))
+        {
+            profile.Languages = ParseLanguagesHeuristically(languagesSection.Content);
+        }
+        else
+        {
+            profile.Languages = ParseLanguagesHeuristically(rawSource);
+        }
+
+        // Skills: fixed keyword list, de-duplicated case-insensitively.
+        var skills = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
+        foreach (Match match in Regex.Matches(rawSource, @"(?<![A-Za-z0-9])(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)(?![A-Za-z0-9])", RegexOptions.IgnoreCase))
+        {
+            skills.Add(match.Value.Trim());
+        }
+        profile.Skills = skills.ToList();
+
+        var educationSection = sections.FirstOrDefault(section => section.Name == "Education");
+        if (!string.IsNullOrWhiteSpace(educationSection.Content))
+        {
+            profile.Education = ParseEducationHeuristically(educationSection.Content);
+        }
+
+        var experienceSection = sections.FirstOrDefault(section => section.Name == "Work Experience");
+        if (!string.IsNullOrWhiteSpace(experienceSection.Content))
+        {
+            profile.Jobs = ParseJobsHeuristically(experienceSection.Content);
+        }
+
+        // Last resort for the summary: the first sentences of the "General" section.
+        if (profile.OtherSections.Count == 0 && sections.Any(section => section.Name == "General"))
+        {
+            var general = sections.First(section => section.Name == "General");
+            if (!string.IsNullOrWhiteSpace(general.Content) && profile.Summary.Count == 0)
+            {
+                profile.Summary = SplitSentences(general.Content, 3);
+            }
+        }
+
+        return StructuredCvProfileJson.Normalize(profile);
+    }
+
+    /// <summary>
+    /// Splits text into sentences at whitespace that follows '.', '!' or '?', and returns at most
+    /// <paramref name="limit"/> trimmed sentences that are longer than 20 characters.
+    /// </summary>
+    /// <param name="content">Text to split; CRLF pairs are replaced by a space first.</param>
+    /// <param name="limit">Maximum number of sentences to return.</param>
+    private static List<string> SplitSentences(string content, int limit)
+    {
+        var singleLine = content.Replace("\r\n", " ");
+        var sentences = new List<string>();
+
+        foreach (var fragment in Regex.Split(singleLine, @"(?<=[.!?])\s+"))
+        {
+            if (sentences.Count >= limit) break;
+
+            var sentence = fragment.Trim();
+            if (sentence.Length > 20) sentences.Add(sentence);
+        }
+
+        return sentences;
+    }
+
+    /// <summary>
+    /// Splits list-like text on newlines, commas and semicolons, strips leading bullet markers
+    /// ('-', '•', '*') and spaces, and returns the items longer than one character without
+    /// case-insensitive duplicates, in order of first appearance.
+    /// </summary>
+    /// <param name="content">Text containing the list.</param>
+    private static List<string> SplitListLike(string content)
+    {
+        char[] separators = { '\n', ',', ';' };
+        char[] leadingMarkers = { '-', '•', '*', ' ' };
+
+        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
+        var items = new List<string>();
+
+        var entries = content
+            .Replace("\r\n", "\n")
+            .Split(separators, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
+
+        foreach (var entry in entries)
+        {
+            var item = entry.Trim().TrimStart(leadingMarkers);
+            if (item.Length > 1 && seen.Add(item))
+            {
+                items.Add(item);
+            }
+        }
+
+        return items;
+    }
+
+    /// <summary>
+    /// Finds mentions of a fixed set of language names and, when one follows in the same
+    /// clause (no newline or '.', ',', ';', ':' in between), a proficiency level.
+    /// </summary>
+    /// <param name="content">Text to scan.</param>
+    /// <returns>
+    /// One entry per language (names compared case-insensitively). When a language is mentioned
+    /// several times, the first mention that carries a level is kept.
+    /// </returns>
+    private static List<StructuredCvLanguage> ParseLanguagesHeuristically(string content)
+    {
+        var languages = new List<StructuredCvLanguage>();
+
+        // "Native speaker" is listed before "Native": alternatives are tried left to right, so
+        // with the shorter one first the longer one could never match.
+        foreach (Match match in Regex.Matches(content, @"\b(English|Norwegian|Norsk|German|French|Spanish|Swedish|Danish)\b(?:[^\n.,;:]*?\b(Native speaker|Native|Fluent|Advanced|Intermediate|Beginner|A1|A2|B1|B2|C1|C2)\b)?", RegexOptions.IgnoreCase))
+        {
+            var name = NullIfWhitespace(match.Groups[1].Value);
+            var level = NullIfWhitespace(match.Groups[2].Value);
+            if (name is null) continue;
+            languages.Add(new StructuredCvLanguage { Name = name, Level = level });
+        }
+
+        // OrderBy is stable, so within a group the mentions with a level come first in their
+        // original order, followed by the mentions without one.
+        return languages
+            .GroupBy(language => language.Name, StringComparer.OrdinalIgnoreCase)
+            .Select(group => group.OrderBy(language => language.Level is null).First())
+            .ToList();
+    }
+
+    /// <summary>
+    /// Turns the content of an education section into entries, one per block of text separated
+    /// by blank lines. Within a block, the first "+ " line is the institution, the first line
+    /// that is neither a "+ " line nor starts with a year range is the qualification (falling
+    /// back to the block's first line), and "- " lines become details.
+    /// </summary>
+    /// <param name="content">Education section text.</param>
+    private static List<StructuredCvEducation> ParseEducationHeuristically(string content)
+    {
+        var entries = new List<StructuredCvEducation>();
+
+        foreach (var rawBlock in Regex.Split(content, @"\n\s*\n"))
+        {
+            var block = rawBlock.Trim();
+            if (block.Length == 0) continue;
+
+            var lines = block.Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
+            if (lines.Count == 0) continue;
+
+            string? start = null;
+            string? end = null;
+            var period = Regex.Match(block, @"\b(\d{4})\s*[-–]\s*(\d{4}|Present|Current)\b", RegexOptions.IgnoreCase);
+            if (period.Success)
+            {
+                start = period.Groups[1].Value;
+                end = period.Groups[2].Value;
+            }
+
+            var institution = lines
+                .FirstOrDefault(line => line.StartsWith("+ ", StringComparison.Ordinal))
+                ?.TrimStart('+', ' ');
+
+            // lines is non-empty at this point, so the fallback always supplies a qualification.
+            var qualification = lines.FirstOrDefault(line =>
+                !line.StartsWith("+ ", StringComparison.Ordinal) && !Regex.IsMatch(line, @"^\d{4}\s*[-–]"));
+            qualification ??= lines[0];
+
+            var details = lines
+                .Where(line => line.StartsWith("- ", StringComparison.Ordinal))
+                .Select(line => line[2..].Trim())
+                .ToList();
+
+            entries.Add(new StructuredCvEducation
+            {
+                Qualification = TitleCasePreservingAcronyms(qualification),
+                Institution = TitleCasePreservingAcronyms(institution),
+                Start = start,
+                End = end,
+                Details = details,
+            });
+        }
+
+        return entries;
+    }
+
+    /// <summary>
+    /// Extracts jobs from a work-experience section. A job starts with an upper-case title line
+    /// followed by a line holding a year range; its body runs until the next such pair or the
+    /// end of the text. The first "+ " line of the body is the company, the remaining text is
+    /// split into at most six bullet sentences.
+    /// </summary>
+    /// <param name="content">Work-experience section text.</param>
+    private static List<StructuredCvJob> ParseJobsHeuristically(string content)
+    {
+        var text = content.Replace("\r\n", "\n");
+        var jobPattern = new Regex(@"(?<title>[A-Z][A-Z\s/&-]{3,})\s*\n(?<dates>\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))(?<body>.*?)(?=(?:\n[A-Z][A-Z\s/&-]{3,}\s*\n\d{4}\s*[-–]\s*(?:\d{4}|Present|Current))|\z)", RegexOptions.Singleline);
+        var result = new List<StructuredCvJob>();
+
+        foreach (Match jobMatch in jobPattern.Matches(text))
+        {
+            var body = jobMatch.Groups["body"].Value.Trim();
+
+            // Company comes from the first "+" line; all "+" lines are removed before the
+            // remaining body is split into bullet sentences.
+            var company = NullIfWhitespace(Regex.Match(body, @"\+\s*([^\n]+)").Groups[1].Value);
+            var bodyWithoutPlusLines = Regex.Replace(body, @"\+\s*[^\n]+", string.Empty);
+            var bullets = SplitSentences(bodyWithoutPlusLines, 6);
+
+            var dateParts = Regex.Split(jobMatch.Groups["dates"].Value, @"\s*[-–]\s*");
+            var rawStart = dateParts.FirstOrDefault();
+            var rawEnd = dateParts.Skip(1).FirstOrDefault();
+            var isCurrent =
+                string.Equals(rawEnd, "present", StringComparison.OrdinalIgnoreCase)
+                || string.Equals(rawEnd, "current", StringComparison.OrdinalIgnoreCase);
+
+            // Skills are bullet fragments that are exactly one of the known keywords.
+            var skills = bullets
+                .SelectMany(SplitListLike)
+                .Where(item => Regex.IsMatch(item, @"^(?:C#|\.NET|ASP\.NET|SQL|JavaScript|TypeScript|Python|Ruby on Rails|Ruby|React|Azure DevOps|GitHub|CI/CD)$", RegexOptions.IgnoreCase))
+                .Distinct(StringComparer.OrdinalIgnoreCase)
+                .ToList();
+
+            result.Add(new StructuredCvJob
+            {
+                Title = TitleCasePreservingAcronyms(jobMatch.Groups["title"].Value),
+                Company = company,
+                Start = NullIfWhitespace(rawStart),
+                End = NullIfWhitespace(rawEnd),
+                IsCurrent = isCurrent,
+                Bullets = bullets,
+                Skills = skills,
+            });
+        }
+
+        return result;
+    }
+
+    /// <summary>
+    /// Title-cases each word of <paramref name="value"/> (first character upper-cased, the rest
+    /// lower-cased) while leaving words of up to three characters that are entirely upper-case
+    /// untouched, so short acronyms such as "IT" survive.
+    /// </summary>
+    /// <param name="value">Text to convert; may be null or blank.</param>
+    /// <returns>The converted words joined by single spaces, or <c>null</c> for null/blank input.</returns>
+    private static string? TitleCasePreservingAcronyms(string? value)
+    {
+        if (string.IsNullOrWhiteSpace(value)) return null;
+
+        // Split on any whitespace, not just ' ': job titles are captured with a pattern that
+        // allows newlines and tabs, and splitting on spaces alone left "SENIOR\nDEVELOPER" as a
+        // single word that came out as "Senior\ndeveloper".
+        var words = value
+            .Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries)
+            .Select(word => word.Length <= 3 && word.All(char.IsUpper)
+                ? word
+                : char.ToUpperInvariant(word[0]) + word[1..].ToLowerInvariant())
+            .ToArray();
+
+        return string.Join(" ", words);
+    }
+
    /// <summary>
    /// Counts the whitespace-separated words in <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text to count; may be null.</param>
    /// <returns>The number of words, or 0 for null/blank input.</returns>
    private static int CountWords(string? text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return 0;
        }

        // A null separator array makes Split break on every white-space character.
        var words = text.Trim().Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        return words.Length;
    }

+    /// <summary>
+    /// Returns the trimmed <paramref name="value"/>, or <c>null</c> when it is null, empty or
+    /// consists only of white space.
+    /// </summary>
+    private static string? NullIfWhitespace(string? value)
+    {
+        if (string.IsNullOrWhiteSpace(value))
+        {
+            return null;
+        }
+
+        return value.Trim();
+    }
+
    private static List<(string Name, string Content)> ParseSections(string source)
    {
        var lines = source.Replace("\r\n", "\n").Split('\n');