// Viewer metadata captured with this listing: 269 lines, 9.0 KiB, C#.
using System;
|
|
using System.Collections.Generic;
|
|
using System.Globalization;
|
|
using System.Linq;
|
|
using System.Text.Json;
|
|
using System.Text.RegularExpressions;
|
|
|
|
namespace JobTrackerApi.Services.JobImport;
|
|
|
|
/// <summary>
/// Site-agnostic job importer. Scans a page for <c>application/ld+json</c> script blocks,
/// looks for a node whose <c>@type</c> is <c>JobPosting</c>, and maps its fields onto a
/// <see cref="JobImportResult"/>.
/// </summary>
public sealed class UniversalJobParser
{
    // Value written to JobImportResult.Parser by every result this class produces.
    private const string ParserName = "universal";

    // Captures the body of each <script type="application/ld+json"> element in the "json" group.
    private static readonly Regex JsonLdScriptRegex =
        new(@"<script[^>]+type\s*=\s*[""']application/ld\+json[""'][^>]*>(?<json>[\s\S]*?)</script>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Embedded JSON-LD is not always strict JSON; accept trailing commas and skip comments
    // instead of rejecting the whole payload.
    private static readonly JsonDocumentOptions LenientJson = new()
    {
        AllowTrailingCommas = true,
        CommentHandling = JsonCommentHandling.Skip,
    };

    /// <summary>
    /// Extracts the first usable JobPosting found in the JSON-LD blocks of <paramref name="html"/>.
    /// </summary>
    /// <param name="html">Raw page markup.</param>
    /// <param name="url">Address the markup came from; copied to <c>SourceUrl</c> on the result.</param>
    /// <returns>
    /// A result with <c>Success = true</c> when a posting with both a title and a description
    /// was found; otherwise a failed result whose <c>Error</c> is "Empty HTML." or
    /// "No JobPosting schema found.".
    /// </returns>
    public JobImportResult Parse(string html, string url)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return Failure(url, "Empty HTML.");
        }

        foreach (Match script in JsonLdScriptRegex.Matches(html))
        {
            var payload = script.Groups["json"].Value.Trim();
            if (payload.Length == 0)
            {
                continue;
            }

            // Some sites wrap the JSON in extra junk; try each cleaned-up variant in turn.
            foreach (var candidate in SplitJsonLdPayload(payload))
            {
                if (TryParseJobPosting(candidate, url, out var result))
                {
                    return result with { Parser = ParserName, Success = true };
                }
            }
        }

        return Failure(url, "No JobPosting schema found.");
    }

    // Builds the failed result returned when nothing could be imported.
    private static JobImportResult Failure(string url, string error)
    {
        return new JobImportResult { SourceUrl = url, Success = false, Parser = ParserName, Error = error };
    }

    /// <summary>
    /// Yields the payload variants worth parsing: the text as given, then the text with
    /// surrounding whitespace and trailing semicolons removed. The second variant is only
    /// produced when it actually differs, so an unparsable payload is not parsed twice.
    /// </summary>
    private static IEnumerable<string> SplitJsonLdPayload(string raw)
    {
        yield return raw;

        var trimmed = raw.Trim().TrimEnd(';');
        if (!string.Equals(trimmed, raw, StringComparison.Ordinal))
        {
            yield return trimmed;
        }
    }

    /// <summary>
    /// Parses <paramref name="json"/> and maps the first JobPosting node it contains.
    /// </summary>
    /// <param name="json">Candidate JSON-LD text.</param>
    /// <param name="url">Source address copied onto the result.</param>
    /// <param name="result">
    /// The mapped posting; a failed placeholder when the JSON is invalid or holds no JobPosting.
    /// </param>
    /// <returns>
    /// <c>true</c> only when the posting has both a title and a non-empty plain-text description.
    /// </returns>
    private static bool TryParseJobPosting(string json, string url, out JobImportResult result)
    {
        result = new JobImportResult { SourceUrl = url, Parser = ParserName, Success = false };

        try
        {
            using var doc = JsonDocument.Parse(json, LenientJson);

            var node = FindJobPostingNode(doc.RootElement);
            if (node is null)
            {
                return false;
            }

            var job = node.Value;
            var title = GetString(job, "title");

            // hiringOrganization is normally an object, but a bare name string is accepted too.
            var company = GetString(job, "hiringOrganization", "name")
                          ?? GetString(job, "hiringOrganization", "legalName")
                          ?? GetString(job, "hiringOrganization");

            var location = ExtractLocation(job);
            var deadline = ParseDateTime(GetString(job, "validThrough"));
            var description = HtmlExtract.ToPlainText(GetString(job, "description"));

            result = new JobImportResult
            {
                SourceUrl = url,
                Title = title,
                Company = company,
                Location = location,
                Description = description,
                Deadline = deadline,
                Success = !string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(description),
                Parser = ParserName
            };
            return result.Success;
        }
        catch (Exception ex) when (ex is JsonException or InvalidOperationException)
        {
            // Malformed JSON or an unreadable value: treat this candidate as "not a posting".
            // Other exception types are left to propagate rather than being hidden.
            return false;
        }
    }

    /// <summary>
    /// Depth-first search for the first element whose <c>@type</c> marks it as a JobPosting.
    /// Accepts a bare posting object, a <c>{ "@graph": [...] }</c> wrapper, arrays, and
    /// postings nested under arbitrary properties.
    /// </summary>
    /// <returns>The matching element, or <c>null</c> when none exists under <paramref name="root"/>.</returns>
    private static JsonElement? FindJobPostingNode(JsonElement root)
    {
        if (IsJobPosting(root))
        {
            return root;
        }

        if (root.ValueKind == JsonValueKind.Array)
        {
            foreach (var item in root.EnumerateArray())
            {
                var hit = FindJobPostingNode(item);
                if (hit is not null)
                {
                    return hit;
                }
            }

            return null;
        }

        if (root.ValueKind != JsonValueKind.Object)
        {
            return null;
        }

        // "@graph" is searched first so a posting listed there wins over other properties.
        var graphSearched = false;
        if (root.TryGetProperty("@graph", out var graph) && graph.ValueKind == JsonValueKind.Array)
        {
            graphSearched = true;
            var inGraph = FindJobPostingNode(graph);
            if (inGraph is not null)
            {
                return inGraph;
            }
        }

        foreach (var prop in root.EnumerateObject())
        {
            // Skip the graph that was just searched: walking it a second time doubles the
            // work at every nesting level, which is exponential for nested "@graph" wrappers.
            if (graphSearched && prop.NameEquals("@graph") && prop.Value.ValueKind == JsonValueKind.Array)
            {
                continue;
            }

            var nested = FindJobPostingNode(prop.Value);
            if (nested is not null)
            {
                return nested;
            }
        }

        return null;
    }

    /// <summary>
    /// True when <paramref name="el"/> is an object whose <c>@type</c> is the string
    /// "JobPosting" (case-insensitive) or an array containing that string.
    /// </summary>
    private static bool IsJobPosting(JsonElement el)
    {
        if (el.ValueKind != JsonValueKind.Object || !el.TryGetProperty("@type", out var typeEl))
        {
            return false;
        }

        return typeEl.ValueKind switch
        {
            JsonValueKind.String => IsJobPostingName(typeEl),
            JsonValueKind.Array => typeEl.EnumerateArray()
                .Any(t => t.ValueKind == JsonValueKind.String && IsJobPostingName(t)),
            _ => false,
        };
    }

    // Compares a string element against the schema.org type name, ignoring case.
    private static bool IsJobPostingName(JsonElement typeName)
    {
        return string.Equals(typeName.GetString(), "JobPosting", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Follows <paramref name="path"/> through nested objects and returns the value found there
    /// as text. Strings are returned as-is and numbers as their text form; a missing property,
    /// a non-object along the way, or any other value kind yields <c>null</c>.
    /// </summary>
    private static string? GetString(JsonElement el, params string[] path)
    {
        var current = el;
        foreach (var segment in path)
        {
            if (current.ValueKind != JsonValueKind.Object || !current.TryGetProperty(segment, out var next))
            {
                return null;
            }

            current = next;
        }

        return current.ValueKind switch
        {
            JsonValueKind.String => current.GetString(),
            JsonValueKind.Number => current.ToString(),
            _ => null
        };
    }

    /// <summary>
    /// Builds a "city, region, country" string from the posting's <c>jobLocation</c>, skipping
    /// blank parts. <c>jobLocation</c> may be an object or an array; <c>address</c> may be an
    /// object or plain text; <c>addressCountry</c> may be a string or an object with a <c>name</c>.
    /// </summary>
    /// <returns>The formatted location, or <c>null</c> when nothing usable is present.</returns>
    private static string? ExtractLocation(JsonElement job)
    {
        if (job.ValueKind != JsonValueKind.Object || !job.TryGetProperty("jobLocation", out var jobLocation))
        {
            return null;
        }

        var found = FindFirstAddress(jobLocation);
        if (found is null)
        {
            return null;
        }

        var address = found.Value;
        if (address.ValueKind == JsonValueKind.String)
        {
            // Free-text address: use it verbatim.
            var text = address.GetString();
            return string.IsNullOrWhiteSpace(text) ? null : text.Trim();
        }

        var city = GetString(address, "addressLocality");
        var region = GetString(address, "addressRegion");
        var country = GetString(address, "addressCountry")
                      ?? GetString(address, "addressCountry", "name");

        var parts = new[] { city, region, country }
            .Where(part => !string.IsNullOrWhiteSpace(part))
            .ToArray();

        return parts.Length == 0 ? null : string.Join(", ", parts);
    }

    /// <summary>
    /// Returns the first usable <c>address</c> under <paramref name="jobLocation"/>: an object,
    /// or a non-blank string. Arrays are searched element by element, in order.
    /// </summary>
    private static JsonElement? FindFirstAddress(JsonElement jobLocation)
    {
        switch (jobLocation.ValueKind)
        {
            case JsonValueKind.Object:
                if (jobLocation.TryGetProperty("address", out var address) && IsUsableAddress(address))
                {
                    return address;
                }

                return null;

            case JsonValueKind.Array:
                foreach (var place in jobLocation.EnumerateArray())
                {
                    var nested = FindFirstAddress(place);
                    if (nested is not null)
                    {
                        return nested;
                    }
                }

                return null;

            default:
                return null;
        }
    }

    // An address is usable when it is an object or a string with visible content.
    private static bool IsUsableAddress(JsonElement address)
    {
        return address.ValueKind == JsonValueKind.Object
               || (address.ValueKind == JsonValueKind.String && !string.IsNullOrWhiteSpace(address.GetString()));
    }

    /// <summary>
    /// Parses a date/time using the invariant culture and returns it in UTC. Values that carry
    /// an offset are converted; values without one are taken to already be UTC, so every
    /// successful result has <see cref="DateTimeKind.Utc"/>.
    /// </summary>
    /// <returns>The parsed instant, or <c>null</c> for blank or unparsable input.</returns>
    private static DateTime? ParseDateTime(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            return null;
        }

        const DateTimeStyles styles = DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal;
        return DateTime.TryParse(raw, CultureInfo.InvariantCulture, styles, out var parsed)
            ? parsed
            : null;
    }
}
|
|
|
|
/// <summary>
/// Small regex-based helpers for pulling the title, meta tags and plain text out of HTML.
/// </summary>
internal static class HtmlExtract
{
    private const RegexOptions AttrOptions = RegexOptions.IgnoreCase | RegexOptions.Compiled;

    // Left boundary for attribute names, so "name" does not match inside "data-name" or "xml:name".
    private const string AttrNameBoundary = @"(?<![\w:.-])";

    private static readonly Regex TitleRegex =
        new(@"<title[^>]*>(?<t>[\s\S]*?)</title>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex MetaTagRegex =
        new(@"<meta\s+[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex TagRegex =
        new(@"<[^>]+>", RegexOptions.Compiled);

    private static readonly Regex WsRegex =
        new(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Returns the entity-decoded, trimmed content of the first <c>title</c> element.
    /// </summary>
    /// <param name="html">Markup to search.</param>
    /// <returns>The title text, or <c>null</c> when the input is null/empty or has no title element.</returns>
    public static string? ReadTitle(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        var match = TitleRegex.Match(html);
        return match.Success
            ? DecodeHtmlEntities(match.Groups["t"].Value).Trim()
            : null;
    }

    /// <summary>
    /// Collects <c>meta</c> tags into a case-insensitive dictionary keyed by the tag's
    /// <c>property</c> attribute (preferred) or <c>name</c> attribute, with the decoded,
    /// trimmed <c>content</c> as the value. The first occurrence of a key wins; tags with a
    /// blank key or blank content are ignored.
    /// </summary>
    /// <param name="html">Markup to search.</param>
    /// <returns>The collected entries; empty when the input is null/empty or has no usable tags.</returns>
    public static Dictionary<string, string> ReadMeta(string html)
    {
        var entries = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
        if (string.IsNullOrEmpty(html))
        {
            return entries;
        }

        foreach (Match metaTag in MetaTagRegex.Matches(html))
        {
            var tag = metaTag.Value;
            var key = GetAttr(tag, "property") ?? GetAttr(tag, "name");
            var content = GetAttr(tag, "content");
            if (string.IsNullOrWhiteSpace(key) || string.IsNullOrWhiteSpace(content))
            {
                continue;
            }

            // TryAdd keeps the first value seen for a key, using a single lookup.
            entries.TryAdd(key, DecodeHtmlEntities(content).Trim());
        }

        return entries;
    }

    /// <summary>
    /// Converts HTML (or text that may contain entity-encoded HTML) to plain text. Entities are
    /// decoded before tags are stripped, so markup written as <c>&amp;lt;p&amp;gt;</c> is
    /// removed as well; whitespace runs are then collapsed to single spaces.
    /// </summary>
    /// <returns>The trimmed text, or <c>null</c> when the input is blank or nothing remains.</returns>
    public static string? ToPlainText(string? htmlOrText)
    {
        if (string.IsNullOrWhiteSpace(htmlOrText))
        {
            return null;
        }

        var decoded = DecodeHtmlEntities(htmlOrText);
        var withoutTags = TagRegex.Replace(decoded, " ");
        var collapsed = WsRegex.Replace(withoutTags, " ").Trim();

        return collapsed.Length == 0 ? null : collapsed;
    }

    /// <summary>
    /// Reads the value of attribute <paramref name="attr"/> from a single tag. A quoted value
    /// (single or double quotes) is preferred; an unquoted value is the fallback. The name is
    /// matched case-insensitively, as a whole attribute name, and is regex-escaped.
    /// </summary>
    /// <returns>The raw (still entity-encoded) value, or <c>null</c> when the attribute is absent.</returns>
    private static string? GetAttr(string tag, string attr)
    {
        var name = AttrNameBoundary + Regex.Escape(attr);

        var quoted = Regex.Match(tag, name + @"\s*=\s*(?<q>[""'])(?<v>[\s\S]*?)\k<q>", AttrOptions);
        if (quoted.Success)
        {
            return quoted.Groups["v"].Value;
        }

        // Unquoted attribute values run up to the next whitespace or the end of the tag.
        var bare = Regex.Match(tag, name + @"\s*=\s*(?<v>[^\s>]+)", AttrOptions);
        return bare.Success ? bare.Groups["v"].Value : null;
    }

    // Thin wrapper so every caller decodes entities the same way.
    private static string DecodeHtmlEntities(string s)
        => System.Net.WebUtility.HtmlDecode(s);
}
|
|
|