Files
2026-03-21 11:55:27 +01:00

269 lines
9.0 KiB
C#

using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text.Json;
using System.Text.RegularExpressions;
namespace JobTrackerApi.Services.JobImport;
/// <summary>
/// Site-agnostic job importer: scans a page's <c>application/ld+json</c> script blocks for an
/// object typed <c>JobPosting</c> and maps its fields onto a <see cref="JobImportResult"/>.
/// </summary>
public sealed class UniversalJobParser
{
    /// <summary>Value written to <c>JobImportResult.Parser</c> by this parser.</summary>
    private const string ParserName = "universal";

    /// <summary>The <c>@type</c> value that identifies a job posting node.</summary>
    private const string JobPostingType = "JobPosting";

    /// <summary>Name of the JSON-LD property that holds a flat list of nodes.</summary>
    private const string GraphProperty = "@graph";

    /// <summary>Captures the body of every <c>&lt;script type="application/ld+json"&gt;</c> element.</summary>
    private static readonly Regex JsonLdScriptRegex =
        new(@"<script[^>]+type\s*=\s*[""']application/ld\+json[""'][^>]*>(?<json>[\s\S]*?)</script>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Hand-written JSON-LD frequently carries trailing commas or comments; accept both rather
    /// than rejecting an otherwise usable payload.
    /// </summary>
    private static readonly JsonDocumentOptions LenientJsonOptions = new()
    {
        AllowTrailingCommas = true,
        CommentHandling = JsonCommentHandling.Skip,
    };

    /// <summary>
    /// Extracts the first usable job posting from <paramref name="html"/>.
    /// </summary>
    /// <param name="html">Raw HTML of the page.</param>
    /// <param name="url">Address the HTML came from; copied to <c>SourceUrl</c> on the result.</param>
    /// <returns>
    /// A result with <c>Success = true</c> for the first JSON-LD job posting that has both a title
    /// and a description; otherwise a failed result whose <c>Error</c> explains why.
    /// </returns>
    public JobImportResult Parse(string html, string url)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return new JobImportResult { SourceUrl = url, Success = false, Parser = ParserName, Error = "Empty HTML." };
        }

        foreach (Match m in JsonLdScriptRegex.Matches(html))
        {
            var json = m.Groups["json"].Value.Trim();
            if (json.Length == 0) continue;

            // Some sites emit not-quite-valid JSON in the script; try each clean-up variant in turn.
            foreach (var candidate in SplitJsonLdPayload(json))
            {
                if (!TryParseJobPosting(candidate, url, out var result)) continue;
                return result with { Parser = ParserName, Success = true };
            }
        }

        return new JobImportResult { SourceUrl = url, Success = false, Parser = ParserName, Error = "No JobPosting schema found." };
    }

    /// <summary>
    /// Produces the payload variants worth parsing: the text as given, then (only when it
    /// differs) the text with surrounding whitespace and trailing semicolons removed.
    /// </summary>
    private static IEnumerable<string> SplitJsonLdPayload(string raw)
    {
        yield return raw;

        // Skip the second attempt when trimming changed nothing, so a payload that simply has
        // no job posting is not parsed twice.
        var trimmed = raw.Trim().TrimEnd(';');
        if (trimmed.Length != 0 && !string.Equals(trimmed, raw, StringComparison.Ordinal))
        {
            yield return trimmed;
        }
    }

    /// <summary>
    /// Parses <paramref name="json"/> and, if it contains a job posting, maps it to a result.
    /// </summary>
    /// <param name="json">One JSON-LD payload candidate.</param>
    /// <param name="url">Source address copied to the result.</param>
    /// <param name="result">
    /// The mapped posting. Its <c>Success</c> flag is true only when both title and description
    /// are non-blank.
    /// </param>
    /// <returns><c>true</c> when a posting with title and description was found; <c>false</c>
    /// when the JSON is invalid, has no job posting, or the posting lacks those fields.</returns>
    private static bool TryParseJobPosting(string json, string url, out JobImportResult result)
    {
        result = new JobImportResult { SourceUrl = url, Parser = ParserName, Success = false };
        try
        {
            using var doc = JsonDocument.Parse(json, LenientJsonOptions);
            var node = FindJobPostingNode(doc.RootElement);
            if (node is null) return false;

            var job = node.Value;
            var title = GetString(job, "title");
            // hiringOrganization is normally an object, but plain-text values occur in the wild.
            var company = GetString(job, "hiringOrganization", "name")
                          ?? GetString(job, "hiringOrganization", "legalName")
                          ?? GetString(job, "hiringOrganization");
            var location = ExtractLocation(job);
            var deadline = ParseDateTime(GetString(job, "validThrough"));
            var description = HtmlExtract.ToPlainText(GetString(job, "description"));

            result = new JobImportResult
            {
                SourceUrl = url,
                Title = title,
                Company = company,
                Location = location,
                Description = description,
                Deadline = deadline,
                Success = !string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(description),
                Parser = ParserName
            };
            return result.Success;
        }
        catch (JsonException)
        {
            // Malformed payload: expected for scraped pages, so report "not found" to the caller.
            return false;
        }
    }

    /// <summary>
    /// Depth-first search for the first object whose <c>@type</c> is (or includes) JobPosting.
    /// An <c>@graph</c> array is searched before the object's other properties.
    /// </summary>
    private static JsonElement? FindJobPostingNode(JsonElement root)
    {
        // Accept: { "@type":"JobPosting", ... }
        if (IsJobPosting(root)) return root;

        if (root.ValueKind == JsonValueKind.Array)
        {
            return FindInArray(root);
        }

        if (root.ValueKind != JsonValueKind.Object)
        {
            return null;
        }

        // Accept: { "@graph":[...] } -- looked at first so it wins over sibling properties.
        var graphSearched = false;
        if (root.TryGetProperty(GraphProperty, out var graph) && graph.ValueKind == JsonValueKind.Array)
        {
            graphSearched = true;
            var inGraph = FindInArray(graph);
            if (inGraph is not null) return inGraph;
        }

        foreach (var prop in root.EnumerateObject())
        {
            // The graph was already walked above; walking it again cannot find anything new.
            if (graphSearched && prop.NameEquals(GraphProperty)) continue;

            var found = FindJobPostingNode(prop.Value);
            if (found is not null) return found;
        }

        return null;
    }

    /// <summary>Runs <see cref="FindJobPostingNode"/> over each array element, in order.</summary>
    private static JsonElement? FindInArray(JsonElement array)
    {
        foreach (var el in array.EnumerateArray())
        {
            var found = FindJobPostingNode(el);
            if (found is not null) return found;
        }
        return null;
    }

    /// <summary>
    /// True when <paramref name="el"/> is an object whose <c>@type</c> is the string
    /// "JobPosting" or an array containing it (compared case-insensitively).
    /// </summary>
    private static bool IsJobPosting(JsonElement el)
    {
        if (el.ValueKind != JsonValueKind.Object) return false;
        if (!el.TryGetProperty("@type", out var typeEl)) return false;

        if (typeEl.ValueKind == JsonValueKind.String)
        {
            return string.Equals(typeEl.GetString(), JobPostingType, StringComparison.OrdinalIgnoreCase);
        }

        if (typeEl.ValueKind == JsonValueKind.Array)
        {
            foreach (var t in typeEl.EnumerateArray())
            {
                if (t.ValueKind == JsonValueKind.String &&
                    string.Equals(t.GetString(), JobPostingType, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
        }

        return false;
    }

    /// <summary>
    /// Follows <paramref name="path"/> through nested objects and returns the value found there
    /// as text.
    /// </summary>
    /// <returns>The string value, the textual form of a number, or <c>null</c> when any step is
    /// missing, a step is not an object, or the final value is of another kind.</returns>
    private static string? GetString(JsonElement el, params string[] path)
    {
        var cur = el;
        foreach (var segment in path)
        {
            if (cur.ValueKind != JsonValueKind.Object) return null;
            if (!cur.TryGetProperty(segment, out var next)) return null;
            cur = next;
        }

        return cur.ValueKind switch
        {
            JsonValueKind.String => cur.GetString(),
            JsonValueKind.Number => cur.ToString(),
            _ => null
        };
    }

    /// <summary>
    /// Builds a "city, region, country" string from the first address under <c>jobLocation</c>,
    /// omitting blank parts. Returns <c>null</c> when no address or no non-blank part exists.
    /// </summary>
    private static string? ExtractLocation(JsonElement job)
    {
        // jobLocation can be object or array; address fields vary.
        if (!job.TryGetProperty("jobLocation", out var jl)) return null;

        var addr = FindFirstAddress(jl);
        if (addr is null) return null;

        var city = GetString(addr.Value, "addressLocality");
        var region = GetString(addr.Value, "addressRegion");
        // addressCountry is either text or a nested Country object carrying a "name".
        var country = GetString(addr.Value, "addressCountry")
                      ?? GetString(addr.Value, "addressCountry", "name");

        var parts = new[] { city, region, country }.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
        return parts.Length == 0 ? null : string.Join(", ", parts);
    }

    /// <summary>
    /// Returns the first object-valued <c>address</c> property of <paramref name="jobLocation"/>,
    /// which may itself be a single object or an (arbitrarily nested) array of them.
    /// </summary>
    private static JsonElement? FindFirstAddress(JsonElement jobLocation)
    {
        if (jobLocation.ValueKind == JsonValueKind.Object)
        {
            if (jobLocation.TryGetProperty("address", out var a) && a.ValueKind == JsonValueKind.Object)
            {
                return a;
            }
            return null;
        }

        if (jobLocation.ValueKind == JsonValueKind.Array)
        {
            foreach (var el in jobLocation.EnumerateArray())
            {
                var addr = FindFirstAddress(el);
                if (addr is not null) return addr;
            }
        }

        return null;
    }

    /// <summary>
    /// Parses a date/time using the invariant culture and returns it in UTC.
    /// </summary>
    /// <returns>The UTC value, or <c>null</c> when <paramref name="raw"/> is blank or unparseable.</returns>
    private static DateTime? ParseDateTime(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw)) return null;

        // AssumeUniversal makes offset-less values come back as Kind=Utc too; without it they
        // were Kind=Unspecified while values with an offset were Kind=Utc.
        const DateTimeStyles styles = DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal;
        return DateTime.TryParse(raw, CultureInfo.InvariantCulture, styles, out var dt) ? dt : null;
    }
}
/// <summary>
/// Minimal regex-based helpers for pulling the title, meta tags and plain text out of raw HTML.
/// These are pattern matches, not a real HTML parser.
/// </summary>
internal static class HtmlExtract
{
    private static readonly Regex TitleRegex =
        new(@"<title[^>]*>(?<t>[\s\S]*?)</title>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex MetaTagRegex =
        new(@"<meta\s+[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex TagRegex =
        new(@"<[^>]+>", RegexOptions.Compiled);

    private static readonly Regex WsRegex =
        new(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Returns the entity-decoded, trimmed content of the first <c>&lt;title&gt;</c> element.
    /// </summary>
    /// <param name="html">Raw HTML to search.</param>
    /// <returns>The title text, or <c>null</c> when <paramref name="html"/> is null/empty or
    /// has no title element.</returns>
    public static string? ReadTitle(string html)
    {
        if (string.IsNullOrEmpty(html)) return null;

        var m = TitleRegex.Match(html);
        return m.Success ? DecodeHtmlEntities(m.Groups["t"].Value).Trim() : null;
    }

    /// <summary>
    /// Collects <c>&lt;meta&gt;</c> tags into a case-insensitive dictionary keyed by the tag's
    /// <c>property</c> attribute, or its <c>name</c> attribute when <c>property</c> is absent or
    /// blank. Values are the decoded, trimmed <c>content</c> attribute.
    /// </summary>
    /// <param name="html">Raw HTML to search.</param>
    /// <returns>The collected pairs; the first occurrence of a key wins. Empty when
    /// <paramref name="html"/> is null/empty or has no usable meta tags.</returns>
    public static Dictionary<string, string> ReadMeta(string html)
    {
        // Very small meta extractor: picks up OpenGraph + standard meta tags.
        var dict = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
        if (string.IsNullOrEmpty(html)) return dict;

        foreach (Match m in MetaTagRegex.Matches(html))
        {
            var tag = m.Value;

            var key = GetAttr(tag, "property");
            if (string.IsNullOrWhiteSpace(key)) key = GetAttr(tag, "name");

            var content = GetAttr(tag, "content");
            if (string.IsNullOrWhiteSpace(key) || string.IsNullOrWhiteSpace(content)) continue;

            // TryAdd keeps the first value seen for a key with a single lookup.
            dict.TryAdd(key, DecodeHtmlEntities(content).Trim());
        }

        return dict;
    }

    /// <summary>
    /// Converts HTML (or entity-encoded HTML) into single-spaced plain text: entities are
    /// decoded first, then tags are replaced by spaces and whitespace runs are collapsed.
    /// </summary>
    /// <returns>The plain text, or <c>null</c> when the input is blank or nothing remains.</returns>
    public static string? ToPlainText(string? htmlOrText)
    {
        if (string.IsNullOrWhiteSpace(htmlOrText)) return null;

        var s = DecodeHtmlEntities(htmlOrText);
        s = TagRegex.Replace(s, " ");
        s = WsRegex.Replace(s, " ").Trim();
        return s.Length == 0 ? null : s;
    }

    /// <summary>
    /// Reads the value of attribute <paramref name="attr"/> from a single tag's markup.
    /// Quoted values are preferred; an unquoted value is used as a fallback.
    /// </summary>
    /// <returns>The raw (undecoded) attribute value, or <c>null</c> when the attribute is absent.</returns>
    private static string? GetAttr(string tag, string attr)
    {
        // The lookbehind stops "name" from matching the tail of "data-name" or "og:name", and
        // Escape keeps the attribute name from being interpreted as regex syntax.
        var namePattern = @"(?<![-\w:])" + Regex.Escape(attr);

        var m = Regex.Match(tag, namePattern + @"\s*=\s*(?<q>[""'])(?<v>[\s\S]*?)(\k<q>)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        if (m.Success) return m.Groups["v"].Value;

        // Unquoted attribute values.
        m = Regex.Match(tag, namePattern + @"\s*=\s*(?<v>[^\s>]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        return m.Success ? m.Groups["v"].Value : null;
    }

    private static string DecodeHtmlEntities(string s)
        => System.Net.WebUtility.HtmlDecode(s);
}