using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text.Json;
using System.Text.RegularExpressions;

namespace JobTrackerApi.Services.JobImport;

/// <summary>
/// Extracts a job posting from a page's embedded JSON-LD (<c>application/ld+json</c>)
/// by looking for a node whose <c>@type</c> is <c>JobPosting</c>.
/// </summary>
public sealed class UniversalJobParser
{
    private const string ParserName = "universal";

    // Captures the body of every <script type="application/ld+json"> element in the "json" group.
    private static readonly Regex JsonLdScriptRegex = new(
        @"<script[^>]+type\s*=\s*[""']application/ld\+json[""'][^>]*>(?<json>[\s\S]*?)</script\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Scans <paramref name="html"/> for JSON-LD script blocks and returns the first
    /// <c>JobPosting</c> that has both a non-blank title and a non-blank description.
    /// </summary>
    /// <param name="html">Raw HTML of the page.</param>
    /// <param name="url">URL the HTML came from; copied into <c>SourceUrl</c> of the result.</param>
    /// <returns>
    /// A successful result for the first usable posting; otherwise a failed result whose
    /// <c>Error</c> is "Empty HTML." (null/blank input) or "No JobPosting schema found.".
    /// </returns>
    public JobImportResult Parse(string html, string url)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return new JobImportResult
            {
                SourceUrl = url,
                Success = false,
                Parser = ParserName,
                Error = "Empty HTML."
            };
        }

        foreach (Match m in JsonLdScriptRegex.Matches(html))
        {
            var json = m.Groups["json"].Value.Trim();
            if (json.Length == 0)
            {
                continue;
            }

            // Some sites emit slightly malformed payloads; try each best-effort candidate in turn.
            foreach (var candidate in SplitJsonLdPayload(json))
            {
                if (!TryParseJobPosting(candidate, url, out var result))
                {
                    continue;
                }

                return result with { Parser = ParserName, Success = true };
            }
        }

        return new JobImportResult
        {
            SourceUrl = url,
            Success = false,
            Parser = ParserName,
            Error = "No JobPosting schema found."
        };
    }

    /// <summary>
    /// Yields the payload as-is, then (only if it differs) a variant with surrounding
    /// whitespace and trailing semicolons removed.
    /// </summary>
    private static IEnumerable<string> SplitJsonLdPayload(string raw)
    {
        // Many pages have valid JSON; keep it simple. If parsing fails, try trimming common junk.
        yield return raw;

        var trimmed = raw.Trim().TrimEnd(';');
        if (!string.Equals(trimmed, raw, StringComparison.Ordinal))
        {
            yield return trimmed;
        }
    }

    /// <summary>
    /// Parses <paramref name="json"/> and, if it contains a JobPosting node, maps it to a result.
    /// </summary>
    /// <returns>
    /// <c>true</c> only when a JobPosting node was found and it has a non-blank title and description.
    /// Malformed JSON yields <c>false</c> with a failed placeholder in <paramref name="result"/>.
    /// </returns>
    private static bool TryParseJobPosting(string json, string url, out JobImportResult result)
    {
        result = new JobImportResult { SourceUrl = url, Parser = ParserName, Success = false };

        try
        {
            using var doc = JsonDocument.Parse(json);

            var node = FindJobPostingNode(doc.RootElement);
            if (node is null)
            {
                return false;
            }

            var job = node.Value;
            var title = GetString(job, "title");
            var company = GetString(job, "hiringOrganization", "name")
                          ?? GetString(job, "hiringOrganization", "legalName");
            var location = ExtractLocation(job);
            var deadline = ParseDateTime(GetString(job, "validThrough"));

            // Descriptions are commonly HTML fragments; reduce them to plain text.
            var description = HtmlExtract.ToPlainText(GetString(job, "description"));

            result = new JobImportResult
            {
                SourceUrl = url,
                Title = title,
                Company = company,
                Location = location,
                Description = description,
                Deadline = deadline,
                Success = !string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(description),
                Parser = ParserName
            };

            return result.Success;
        }
        catch (JsonException)
        {
            // Not valid JSON: the caller moves on to the next candidate payload.
            return false;
        }
    }

    /// <summary>
    /// Depth-first search for the first JobPosting node. Search order: the element itself,
    /// the entries of an <c>@graph</c> array, every property value, then array elements.
    /// </summary>
    private static JsonElement? FindJobPostingNode(JsonElement root)
    {
        // Accept: { "@type":"JobPosting", ... }
        if (IsJobPosting(root))
        {
            return root;
        }

        // Accept: { "@graph":[...] } or nested objects.
        if (root.ValueKind == JsonValueKind.Object)
        {
            if (root.TryGetProperty("@graph", out var graph) && graph.ValueKind == JsonValueKind.Array)
            {
                foreach (var el in graph.EnumerateArray())
                {
                    var found = FindJobPostingNode(el);
                    if (found is not null)
                    {
                        return found;
                    }
                }
            }

            foreach (var prop in root.EnumerateObject())
            {
                var found = FindJobPostingNode(prop.Value);
                if (found is not null)
                {
                    return found;
                }
            }
        }

        // Accept: top-level or nested arrays.
        if (root.ValueKind == JsonValueKind.Array)
        {
            foreach (var el in root.EnumerateArray())
            {
                var found = FindJobPostingNode(el);
                if (found is not null)
                {
                    return found;
                }
            }
        }

        return null;
    }

    /// <summary>
    /// Returns <c>true</c> when <paramref name="el"/> is an object whose <c>@type</c> is the string
    /// "JobPosting" or an array containing that string (compared case-insensitively).
    /// </summary>
    private static bool IsJobPosting(JsonElement el)
    {
        if (el.ValueKind != JsonValueKind.Object)
        {
            return false;
        }

        if (!el.TryGetProperty("@type", out var typeEl))
        {
            return false;
        }

        if (typeEl.ValueKind == JsonValueKind.String)
        {
            return string.Equals(typeEl.GetString(), "JobPosting", StringComparison.OrdinalIgnoreCase);
        }

        if (typeEl.ValueKind == JsonValueKind.Array)
        {
            foreach (var t in typeEl.EnumerateArray())
            {
                if (t.ValueKind == JsonValueKind.String
                    && string.Equals(t.GetString(), "JobPosting", StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
        }

        return false;
    }

    /// <summary>
    /// Walks <paramref name="path"/> through nested objects and returns the final value as text.
    /// </summary>
    /// <returns>
    /// The string value, or the raw text of a number; <c>null</c> if any path segment is missing,
    /// an intermediate value is not an object, or the final value is of any other kind.
    /// </returns>
    private static string? GetString(JsonElement el, params string[] path)
    {
        var cur = el;
        for (var i = 0; i < path.Length; i++)
        {
            if (cur.ValueKind != JsonValueKind.Object)
            {
                return null;
            }

            if (!cur.TryGetProperty(path[i], out var next))
            {
                return null;
            }

            cur = next;
        }

        return cur.ValueKind switch
        {
            JsonValueKind.String => cur.GetString(),
            JsonValueKind.Number => cur.ToString(),
            _ => null
        };
    }

    /// <summary>
    /// Builds a "city, region, country" string from the first address found under
    /// <c>jobLocation</c>, skipping blank parts. Returns <c>null</c> when nothing usable exists.
    /// </summary>
    private static string? ExtractLocation(JsonElement job)
    {
        // jobLocation can be object or array; address fields vary.
        if (!job.TryGetProperty("jobLocation", out var jl))
        {
            return null;
        }

        var addr = FindFirstAddress(jl);
        if (addr is null)
        {
            return null;
        }

        var city = GetString(addr.Value, "addressLocality");
        var region = GetString(addr.Value, "addressRegion");

        // addressCountry may be plain text or a nested object carrying a "name".
        var country = GetString(addr.Value, "addressCountry")
                      ?? GetString(addr.Value, "addressCountry", "name");

        var parts = new[] { city, region, country }
            .Where(x => !string.IsNullOrWhiteSpace(x))
            .ToArray();

        return parts.Length == 0 ? null : string.Join(", ", parts);
    }

    /// <summary>
    /// Returns the first object-valued <c>address</c> property of a location object, searching
    /// array elements in order. Returns <c>null</c> if none is found.
    /// </summary>
    private static JsonElement? FindFirstAddress(JsonElement jobLocation)
    {
        if (jobLocation.ValueKind == JsonValueKind.Object)
        {
            if (jobLocation.TryGetProperty("address", out var a) && a.ValueKind == JsonValueKind.Object)
            {
                return a;
            }

            return null;
        }

        if (jobLocation.ValueKind == JsonValueKind.Array)
        {
            foreach (var el in jobLocation.EnumerateArray())
            {
                var addr = FindFirstAddress(el);
                if (addr is not null)
                {
                    return addr;
                }
            }
        }

        return null;
    }

    /// <summary>
    /// Parses <paramref name="raw"/> with the invariant culture, converting values that carry an
    /// offset to UTC. Returns <c>null</c> for blank or unparseable input.
    /// </summary>
    private static DateTime? ParseDateTime(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            return null;
        }

        if (DateTime.TryParse(raw, CultureInfo.InvariantCulture, DateTimeStyles.AdjustToUniversal, out var dt))
        {
            return dt;
        }

        return null;
    }
}

/// <summary>
/// Small regex-based helpers for pulling text out of HTML without a full parser.
/// </summary>
internal static class HtmlExtract
{
    // Captures the contents of the first <title> element in the "t" group.
    private static readonly Regex TitleRegex = new(
        @"<title\b[^>]*>(?<t>[\s\S]*?)</title\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Matches a whole <meta ...> tag.
    private static readonly Regex MetaTagRegex = new(
        @"<meta\b[^>]*>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex TagRegex = new(@"<[^>]+>", RegexOptions.Compiled);

    private static readonly Regex WsRegex = new(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Returns the entity-decoded, trimmed text of the page's <c>&lt;title&gt;</c> element,
    /// or <c>null</c> when there is none.
    /// </summary>
    public static string? ReadTitle(string html)
    {
        var m = TitleRegex.Match(html);
        if (!m.Success)
        {
            return null;
        }

        return DecodeHtmlEntities(m.Groups["t"].Value).Trim();
    }

    /// <summary>
    /// Very small meta extractor: picks up OpenGraph (<c>property</c>) and standard (<c>name</c>)
    /// meta tags. Keys are case-insensitive; the first occurrence of a key wins.
    /// </summary>
    public static Dictionary<string, string> ReadMeta(string html)
    {
        var dict = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match m in MetaTagRegex.Matches(html))
        {
            var tag = m.Value;
            var key = GetAttr(tag, "property") ?? GetAttr(tag, "name");
            var content = GetAttr(tag, "content");

            if (string.IsNullOrWhiteSpace(key) || string.IsNullOrWhiteSpace(content))
            {
                continue;
            }

            // TryAdd keeps the first value seen for a key.
            dict.TryAdd(key, DecodeHtmlEntities(content).Trim());
        }

        return dict;
    }

    /// <summary>
    /// Decodes HTML entities, replaces tags with spaces and collapses whitespace.
    /// Returns <c>null</c> for blank input or when nothing remains after stripping.
    /// </summary>
    public static string? ToPlainText(string? htmlOrText)
    {
        if (string.IsNullOrWhiteSpace(htmlOrText))
        {
            return null;
        }

        var s = DecodeHtmlEntities(htmlOrText);
        s = TagRegex.Replace(s, " ");
        s = WsRegex.Replace(s, " ").Trim();

        return s.Length == 0 ? null : s;
    }

    /// <summary>
    /// Reads the value of attribute <paramref name="attr"/> from a single tag, preferring a
    /// quoted value and falling back to an unquoted one. Returns <c>null</c> if absent.
    /// </summary>
    private static string? GetAttr(string tag, string attr)
    {
        // The lookbehind stops "content" from matching inside e.g. "data-content".
        var name = @"(?<![\w:-])" + Regex.Escape(attr);

        var m = Regex.Match(
            tag,
            name + @"\s*=\s*(?<q>[""'])(?<v>[\s\S]*?)\k<q>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);
        if (m.Success)
        {
            return m.Groups["v"].Value;
        }

        // Unquoted attribute values.
        m = Regex.Match(
            tag,
            name + @"\s*=\s*(?<v>[^\s>]+)",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        return m.Success ? m.Groups["v"].Value : null;
    }

    private static string DecodeHtmlEntities(string s) => System.Net.WebUtility.HtmlDecode(s);
}