First Commit
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
namespace JobTrackerApi.Services.JobImport;
|
||||
|
||||
/// <summary>
/// Contract for a site-specific parser that turns already-fetched page markup
/// into a <see cref="JobImportResult"/>.
/// </summary>
public interface IJobSitePlugin
{
    /// <summary>
    /// Reports whether this plugin wants to parse the page located at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Address of the job posting page.</param>
    /// <returns><c>true</c> when the plugin should be given the page; otherwise <c>false</c>.</returns>
    bool CanHandle(string url);

    /// <summary>
    /// Builds an import result from the page markup.
    /// </summary>
    /// <param name="html">Markup of the page.</param>
    /// <param name="url">Address the markup belongs to.</param>
    /// <returns>The parsed result; its <c>Success</c> flag tells whether parsing worked.</returns>
    JobImportResult Parse(string html, string url);
}
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
using System;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport;
|
||||
|
||||
/// <summary>
/// Immutable outcome of a job import attempt: the extracted posting fields plus
/// status information about the parse itself.
/// </summary>
public sealed record JobImportResult
{
    /// <summary>Job title, when one was found.</summary>
    public string? Title { get; init; }

    /// <summary>Hiring company, when one was found.</summary>
    public string? Company { get; init; }

    /// <summary>Job location, when one was found.</summary>
    public string? Location { get; init; }

    /// <summary>Description text of the posting.</summary>
    public string? Description { get; init; }

    /// <summary>Translated form of <see cref="Description"/>, if a translation was produced.</summary>
    public string? TranslatedDescription { get; init; }

    /// <summary>Language code of the description; ISO-ish, e.g. "en", "no".</summary>
    public string? Language { get; init; }

    /// <summary>Tags attached to the posting; empty (never null) by default.</summary>
    public string[] Tags { get; init; } = Array.Empty<string>();

    /// <summary>URL the posting was imported from; empty string by default.</summary>
    public string SourceUrl { get; init; } = string.Empty;

    /// <summary>Application deadline, when one was found.</summary>
    public DateTime? Deadline { get; init; }

    /// <summary>Whether the import produced a usable result.</summary>
    public bool Success { get; init; }

    /// <summary>Name of the parser that produced this result, e.g. "universal", "finn", ...</summary>
    public string? Parser { get; init; }

    /// <summary>Error message describing why the import failed, if it did.</summary>
    public string? Error { get; init; }
}
|
||||
|
||||
@@ -0,0 +1,185 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Net;
|
||||
using System.Net.Http;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using JobTrackerApi.Services.JobImport.Translation;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport;
|
||||
|
||||
/// <summary>
/// Imports a job posting from a URL: validates the address, downloads the page,
/// parses it (universal parser first, then matching site plugins), and enriches a
/// successful result with detected language, skill tags and — for Norwegian text —
/// an English translation.
/// </summary>
public sealed class JobImportService
{
    /// <summary>Largest response body, in bytes, that will be read from a job page.</summary>
    private const int MaxHtmlBytes = 4_000_000;

    private readonly IHttpClientFactory _httpClientFactory;
    private readonly UniversalJobParser _universal;
    private readonly IEnumerable<IJobSitePlugin> _plugins;
    private readonly ITranslationService _translation;

    /// <summary>Creates the service from its collaborators.</summary>
    /// <param name="httpClientFactory">Factory for the named "jobimport" HTTP client.</param>
    /// <param name="universal">Parser tried first on every page.</param>
    /// <param name="plugins">Site plugins tried when the universal parser fails.</param>
    /// <param name="translation">Translator used for Norwegian descriptions.</param>
    public JobImportService(
        IHttpClientFactory httpClientFactory,
        UniversalJobParser universal,
        IEnumerable<IJobSitePlugin> plugins,
        ITranslationService translation)
    {
        _httpClientFactory = httpClientFactory;
        _universal = universal;
        _plugins = plugins;
        _translation = translation;
    }

    /// <summary>
    /// Fetches and parses the job posting at <paramref name="url"/> without persisting anything.
    /// </summary>
    /// <param name="url">Address supplied by the caller; validated before any request is made.</param>
    /// <param name="cancellationToken">Cancels the DNS lookup, download and translation.</param>
    /// <returns>
    /// A result whose <c>Success</c> flag is false (with <c>Error</c>/<c>Parser</c> describing the stage)
    /// when validation, fetching or parsing failed; otherwise the enriched posting.
    /// </returns>
    public async Task<JobImportResult> PreviewAsync(string url, CancellationToken cancellationToken)
    {
        if (!TryValidateUrl(url, out var normalized, out var error))
        {
            return new JobImportResult
            {
                SourceUrl = url ?? "",
                Success = false,
                Parser = "none",
                Error = error
            };
        }

        // A public-looking hostname can still point at an internal address; reject those as well.
        if (await ResolvesToPrivateAddressAsync(normalized, cancellationToken))
        {
            return new JobImportResult
            {
                SourceUrl = normalized,
                Success = false,
                Parser = "none",
                Error = "Private IP URLs are not allowed."
            };
        }

        var html = await FetchHtmlAsync(normalized, cancellationToken);
        if (html is null)
        {
            return new JobImportResult
            {
                SourceUrl = normalized,
                Success = false,
                Parser = "fetch",
                Error = "Failed to fetch HTML."
            };
        }

        var parsed = _universal.Parse(html, normalized);
        if (!parsed.Success)
        {
            foreach (var plugin in _plugins.Where(p => p.CanHandle(normalized)))
            {
                try
                {
                    // Keep the latest plugin result even when it failed, so the caller can see why.
                    parsed = plugin.Parse(html, normalized);
                    if (parsed.Success)
                    {
                        break;
                    }
                }
                catch (Exception ex)
                {
                    // A broken plugin must not abort the import; record it and try the next one.
                    parsed = new JobImportResult
                    {
                        SourceUrl = normalized,
                        Success = false,
                        Parser = plugin.GetType().Name,
                        Error = ex.Message
                    };
                }
            }
        }

        if (!parsed.Success)
        {
            return parsed with { SourceUrl = normalized };
        }

        var lang = LanguageDetector.Detect(parsed.Description);
        var tags = SkillTagger.Detect(parsed.Description);

        string? translated = null;
        if (string.Equals(lang, "no", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(parsed.Description))
        {
            translated = await _translation.TranslateToEnglishAsync(parsed.Description!, "no", cancellationToken);
        }

        return parsed with
        {
            SourceUrl = normalized,
            Language = lang,
            Tags = tags,
            TranslatedDescription = translated
        };
    }

    /// <summary>
    /// Downloads the page body as text, reading at most <see cref="MaxHtmlBytes"/> bytes.
    /// </summary>
    /// <returns>
    /// The decoded body, or <c>null</c> when the response is not a 2xx, is larger than the cap,
    /// or the request fails or times out.
    /// </returns>
    private async Task<string?> FetchHtmlAsync(string url, CancellationToken cancellationToken)
    {
        // Clients handed out by IHttpClientFactory do not need to be disposed.
        var client = _httpClientFactory.CreateClient("jobimport");

        using var req = new HttpRequestMessage(HttpMethod.Get, url);
        req.Headers.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) JobTracker/1.0");
        req.Headers.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        req.Headers.TryAddWithoutValidation("Accept-Language", "en-US,en;q=0.8,no;q=0.6,nb;q=0.6");

        try
        {
            using var res = await client.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, cancellationToken);

            // Anything outside 2xx is rejected; this includes 3xx, so redirect chains are not followed here.
            // NOTE(review): a 3xx is only visible if the "jobimport" client has auto-redirect disabled — confirm
            // its handler configuration, otherwise redirects bypass the URL validation done by the caller.
            if (!res.IsSuccessStatusCode)
            {
                return null;
            }

            // Cheap rejection when the server announces an oversized body up front.
            if (res.Content.Headers.ContentLength > MaxHtmlBytes)
            {
                return null;
            }

            // Stream the body so the cap is enforced while downloading, not after buffering everything.
            // The content type is intentionally not checked: many sites omit or mislabel it.
            await using var body = await res.Content.ReadAsStreamAsync(cancellationToken);
            using var buffer = new System.IO.MemoryStream();
            var chunk = new byte[81920];
            int read;
            while ((read = await body.ReadAsync(chunk.AsMemory(), cancellationToken)) > 0)
            {
                if (buffer.Length + read > MaxHtmlBytes)
                {
                    return null;
                }

                buffer.Write(chunk, 0, read);
            }

            var encoding = ResolveEncoding(res.Content.Headers.ContentType?.CharSet);
            return encoding.GetString(buffer.GetBuffer(), 0, (int)buffer.Length);
        }
        catch (HttpRequestException)
        {
            return null;
        }
        catch (System.IO.IOException)
        {
            // Connection dropped while the body was being read.
            return null;
        }
        catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested)
        {
            // HttpClient timeout rather than caller cancellation.
            return null;
        }
    }

    /// <summary>
    /// Maps the charset declared in the Content-Type header to an encoding, defaulting to UTF-8
    /// when none is declared or the name is not supported.
    /// </summary>
    private static System.Text.Encoding ResolveEncoding(string? charset)
    {
        var name = charset?.Trim().Trim('"', '\'');
        if (string.IsNullOrEmpty(name))
        {
            return System.Text.Encoding.UTF8;
        }

        try
        {
            return System.Text.Encoding.GetEncoding(name);
        }
        catch (Exception ex) when (ex is ArgumentException or NotSupportedException)
        {
            return System.Text.Encoding.UTF8;
        }
    }

    /// <summary>
    /// Checks that <paramref name="url"/> is an absolute http/https URL that does not literally
    /// point at a loopback or private address.
    /// </summary>
    /// <param name="url">Raw user input; may be null or blank.</param>
    /// <param name="normalized">The canonical URL string on success; empty on failure.</param>
    /// <param name="error">A human-readable reason on failure; empty on success.</param>
    /// <returns><c>true</c> when the URL may be fetched.</returns>
    private static bool TryValidateUrl(string? url, out string normalized, out string error)
    {
        normalized = "";
        error = "";

        if (string.IsNullOrWhiteSpace(url))
        {
            error = "URL is required.";
            return false;
        }

        if (!Uri.TryCreate(url.Trim(), UriKind.Absolute, out var uri))
        {
            error = "Invalid URL.";
            return false;
        }

        if (uri.Scheme is not ("http" or "https"))
        {
            error = "Only http/https URLs are supported.";
            return false;
        }

        if (uri.IsLoopback || string.Equals(uri.Host, "localhost", StringComparison.OrdinalIgnoreCase))
        {
            error = "Local URLs are not allowed.";
            return false;
        }

        // Block literal private IPs. Brackets are stripped so IPv6 literals parse regardless of host formatting.
        if (IPAddress.TryParse(uri.Host.Trim('[', ']'), out var ip) && IsPrivateIp(ip))
        {
            error = "Private IP URLs are not allowed.";
            return false;
        }

        normalized = uri.ToString();
        return true;
    }

    /// <summary>
    /// Resolves the URL's DNS name and reports whether any returned address is non-public.
    /// </summary>
    /// <remarks>
    /// Literal IP hosts are skipped because the URL validation already covers them. A failed lookup is
    /// not treated as a block: the subsequent fetch will fail by itself. This narrows, but does not
    /// eliminate, DNS-rebinding attacks, because the HTTP client resolves the name again.
    /// </remarks>
    private static async Task<bool> ResolvesToPrivateAddressAsync(string normalizedUrl, CancellationToken cancellationToken)
    {
        var uri = new Uri(normalizedUrl);
        if (uri.HostNameType != UriHostNameType.Dns)
        {
            return false;
        }

        try
        {
            var addresses = await Dns.GetHostAddressesAsync(uri.IdnHost, cancellationToken);
            return addresses.Any(IsPrivateIp);
        }
        catch (System.Net.Sockets.SocketException)
        {
            return false;
        }
        catch (ArgumentException)
        {
            return false;
        }
    }

    /// <summary>
    /// Reports whether <paramref name="ip"/> belongs to a loopback, unspecified, link-local,
    /// private or carrier-grade-NAT range (IPv4), or is loopback, unspecified, link-local,
    /// site-local or unique-local (IPv6). IPv4-mapped IPv6 addresses are judged by their IPv4 form.
    /// </summary>
    private static bool IsPrivateIp(IPAddress ip)
    {
        if (ip.IsIPv4MappedToIPv6)
        {
            ip = ip.MapToIPv4();
        }

        if (IPAddress.IsLoopback(ip))
        {
            return true;
        }

        var b = ip.GetAddressBytes();

        if (ip.AddressFamily == System.Net.Sockets.AddressFamily.InterNetwork)
        {
            return b[0] == 0 ||                                   // 0.0.0.0/8 "this network"
                   b[0] == 10 ||                                  // 10.0.0.0/8
                   (b[0] == 100 && (b[1] & 0xC0) == 0x40) ||      // 100.64.0.0/10 carrier-grade NAT
                   (b[0] == 172 && b[1] >= 16 && b[1] <= 31) ||   // 172.16.0.0/12
                   (b[0] == 192 && b[1] == 168) ||                // 192.168.0.0/16
                   (b[0] == 169 && b[1] == 254);                  // 169.254.0.0/16 link-local
        }

        if (ip.AddressFamily == System.Net.Sockets.AddressFamily.InterNetworkV6)
        {
            return ip.Equals(IPAddress.IPv6Any) ||
                   ip.IsIPv6LinkLocal ||
                   ip.IsIPv6SiteLocal ||
                   (b[0] & 0xFE) == 0xFC;                         // fc00::/7 unique local
        }

        return false;
    }
}
|
||||
@@ -0,0 +1,36 @@
|
||||
using System;
|
||||
using System.Globalization;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport;
|
||||
|
||||
/// <summary>
/// Lightweight heuristic that labels text as Norwegian ("no") or English ("en").
/// Good enough to tell the two apart for job ads; it is not a general language detector.
/// </summary>
public static class LanguageDetector
{
    /// <summary>
    /// Guesses the language of <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text to inspect; null or blank is treated as English.</param>
    /// <returns>"no" when the text looks Norwegian, otherwise "en".</returns>
    public static string Detect(string? text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return "en";
        }

        // Norwegian characters strongly indicate "no".
        foreach (var c in text)
        {
            if (char.ToLowerInvariant(c) is 'æ' or 'ø' or 'å')
            {
                return "no";
            }
        }

        // The probes below look for space-delimited words. Padding both ends lets a marker word
        // at the very start or end of the text count as well.
        var padded = " " + text.ToLower(CultureInfo.InvariantCulture) + " ";

        var hits = 0;

        // Strong markers (2 points each). A probe for "søker" would be pointless here:
        // any text containing 'ø' has already returned above.
        if (Has(padded, " stilling ") || Has(padded, " stillingen ")) hits += 2;
        if (Has(padded, " arbeidsoppgaver")) hits += 2;
        if (Has(padded, " kvalifikasjoner")) hits += 2;
        if (Has(padded, " vi tilbyr")) hits += 2;

        // Weak markers (1 point each): common short words.
        if (Has(padded, " krav ")) hits += 1;
        if (Has(padded, " og ")) hits += 1;
        if (Has(padded, " ikke ")) hits += 1;
        if (Has(padded, " du ") || Has(padded, " deg ")) hits += 1;

        return hits >= 4 ? "no" : "en";
    }

    /// <summary>Ordinal substring test.</summary>
    private static bool Has(string haystack, string needle)
        => haystack.Contains(needle, StringComparison.Ordinal);
}
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
using System;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport.Plugins;
|
||||
|
||||
/// <summary>
/// Plugin for finn.no pages. Reads the title and description from the page's meta tags
/// and derives the company name from the title text.
/// </summary>
public sealed class FinnPlugin : IJobSitePlugin
{
    /// <summary>
    /// Accepts absolute URLs whose host is finn.no or one of its subdomains.
    /// </summary>
    /// <remarks>
    /// The host is compared instead of searching the whole URL, so an unrelated address that merely
    /// contains "finn.no" in its path or query is not claimed by this plugin.
    /// </remarks>
    public bool CanHandle(string url)
    {
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            return false;
        }

        return uri.Host.Equals("finn.no", StringComparison.OrdinalIgnoreCase)
            || uri.Host.EndsWith(".finn.no", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Builds a result from the "og:title" / "og:description" / "job:location" meta tags, using the
    /// document title when "og:title" is absent.
    /// </summary>
    /// <param name="html">Markup of the page.</param>
    /// <param name="url">Address of the page; copied into the result.</param>
    /// <returns>A result that is successful only when both a title and a description were found.</returns>
    public JobImportResult Parse(string html, string url)
    {
        var meta = HtmlExtract.ReadMeta(html);
        var rawTitle = meta.TryGetValue("og:title", out var t) ? t : HtmlExtract.ReadTitle(html);
        var desc = meta.TryGetValue("og:description", out var d) ? d : null;

        // Strip the site suffix first; otherwise "Role - FINN.no" would yield "FINN.no" as the company.
        var title = CleanTitle(rawTitle);
        var company = ExtractCompanyFromTitle(title);

        return new JobImportResult
        {
            SourceUrl = url,
            Title = title,
            Company = company,
            Location = meta.TryGetValue("job:location", out var loc) ? loc : null,
            Description = HtmlExtract.ToPlainText(desc),
            Parser = "finn",
            Success = !string.IsNullOrWhiteSpace(rawTitle) && !string.IsNullOrWhiteSpace(desc),
        };
    }

    /// <summary>
    /// Removes the " - FINN.no" suffix from a title. Returns null for a blank title and the
    /// untouched title when nothing would be left after removal.
    /// </summary>
    private static string? CleanTitle(string? title)
    {
        if (string.IsNullOrWhiteSpace(title))
        {
            return null;
        }

        var cleaned = title.Replace(" - FINN.no", "", StringComparison.OrdinalIgnoreCase).Trim();
        return cleaned.Length == 0 ? title : cleaned;
    }

    /// <summary>
    /// Extracts the company from titles shaped like "Role - Company" or "Role hos Company".
    /// The last " - " wins over " hos "; returns null when neither pattern is present.
    /// </summary>
    private static string? ExtractCompanyFromTitle(string? title)
    {
        if (string.IsNullOrWhiteSpace(title))
        {
            return null;
        }

        var dash = title.LastIndexOf(" - ", StringComparison.Ordinal);
        if (dash > 0 && dash < title.Length - 3)
        {
            return title[(dash + 3)..].Trim();
        }

        var hos = title.LastIndexOf(" hos ", StringComparison.OrdinalIgnoreCase);
        if (hos > 0 && hos < title.Length - 5)
        {
            return title[(hos + 5)..].Trim();
        }

        return null;
    }
}
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
using System;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport.Plugins;
|
||||
|
||||
/// <summary>
/// Plugin for jobbnorge.no pages. Reads the title and description from the page's meta tags.
/// </summary>
public sealed class JobbnorgePlugin : IJobSitePlugin
{
    /// <summary>
    /// Accepts absolute URLs whose host is jobbnorge.no or one of its subdomains.
    /// </summary>
    /// <remarks>
    /// The host is compared instead of searching the whole URL, so an unrelated address that merely
    /// contains "jobbnorge.no" in its path or query is not claimed by this plugin.
    /// </remarks>
    public bool CanHandle(string url)
    {
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            return false;
        }

        return uri.Host.Equals("jobbnorge.no", StringComparison.OrdinalIgnoreCase)
            || uri.Host.EndsWith(".jobbnorge.no", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Builds a result from the "og:title" / "og:description" meta tags, using the document
    /// title when "og:title" is absent.
    /// </summary>
    /// <param name="html">Markup of the page.</param>
    /// <param name="url">Address of the page; copied into the result.</param>
    /// <returns>A result that is successful only when both a title and a description were found.</returns>
    public JobImportResult Parse(string html, string url)
    {
        var meta = HtmlExtract.ReadMeta(html);
        var title = meta.TryGetValue("og:title", out var t) ? t : HtmlExtract.ReadTitle(html);
        var desc = meta.TryGetValue("og:description", out var d) ? d : null;

        return new JobImportResult
        {
            SourceUrl = url,
            Title = title,
            Description = HtmlExtract.ToPlainText(desc),
            Parser = "jobbnorge",
            Success = !string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(desc),
        };
    }
}
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
using System;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport.Plugins;
|
||||
|
||||
/// <summary>
/// Plugin for LinkedIn job pages. LinkedIn heavily relies on JS; meta tags are often the best
/// available source without a headless browser.
/// </summary>
public sealed class LinkedInPlugin : IJobSitePlugin
{
    /// <summary>
    /// Accepts absolute URLs on linkedin.com (or a subdomain) whose path starts with "/jobs".
    /// </summary>
    /// <remarks>
    /// Host and path are compared separately instead of searching the whole URL, so an unrelated
    /// address that merely contains "linkedin.com/jobs" in its query is not claimed by this plugin.
    /// </remarks>
    public bool CanHandle(string url)
    {
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            return false;
        }

        var onLinkedIn = uri.Host.Equals("linkedin.com", StringComparison.OrdinalIgnoreCase)
            || uri.Host.EndsWith(".linkedin.com", StringComparison.OrdinalIgnoreCase);

        return onLinkedIn && uri.AbsolutePath.StartsWith("/jobs", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Builds a result from the "og:title" / "og:description" / "og:site_name" meta tags, using the
    /// document title when "og:title" is absent.
    /// </summary>
    /// <param name="html">Markup of the page.</param>
    /// <param name="url">Address of the page; copied into the result.</param>
    /// <returns>A result that is successful only when both a title and a description were found.</returns>
    public JobImportResult Parse(string html, string url)
    {
        var meta = HtmlExtract.ReadMeta(html);
        var title = meta.TryGetValue("og:title", out var t) ? t : HtmlExtract.ReadTitle(html);
        var desc = meta.TryGetValue("og:description", out var d) ? d : null;

        return new JobImportResult
        {
            SourceUrl = url,
            Title = title,
            // NOTE(review): og:site_name presumably names the site rather than the employer —
            // confirm this is the intended value for Company.
            Company = meta.TryGetValue("og:site_name", out var sn) ? sn : null,
            Description = HtmlExtract.ToPlainText(desc),
            Parser = "linkedin",
            Success = !string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(desc),
        };
    }
}
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
using System;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport.Plugins;
|
||||
|
||||
/// <summary>
/// Plugin for nav.no pages (including arbeidsplassen.nav.no). Reads the title, description and
/// site name from the page's meta tags.
/// </summary>
public sealed class NavPlugin : IJobSitePlugin
{
    /// <summary>
    /// Accepts absolute URLs whose host is nav.no or one of its subdomains, which covers
    /// arbeidsplassen.nav.no.
    /// </summary>
    /// <remarks>
    /// The host is compared instead of searching the whole URL, so an unrelated address that merely
    /// contains "nav.no" (for example inside another domain name or a query string) is not claimed.
    /// </remarks>
    public bool CanHandle(string url)
    {
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            return false;
        }

        return uri.Host.Equals("nav.no", StringComparison.OrdinalIgnoreCase)
            || uri.Host.EndsWith(".nav.no", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Builds a result from the "og:title" / "og:description" / "og:site_name" meta tags, using the
    /// document title when "og:title" is absent.
    /// </summary>
    /// <param name="html">Markup of the page.</param>
    /// <param name="url">Address of the page; copied into the result.</param>
    /// <returns>A result that is successful only when both a title and a description were found.</returns>
    public JobImportResult Parse(string html, string url)
    {
        var meta = HtmlExtract.ReadMeta(html);
        var title = meta.TryGetValue("og:title", out var t) ? t : HtmlExtract.ReadTitle(html);
        var desc = meta.TryGetValue("og:description", out var d) ? d : null;
        var siteName = meta.TryGetValue("og:site_name", out var sn) ? sn : null;

        return new JobImportResult
        {
            SourceUrl = url,
            Title = title,
            // Better than nothing; the universal parser often gets the company anyway.
            Company = siteName,
            Description = HtmlExtract.ToPlainText(desc),
            Parser = "nav",
            Success = !string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(desc),
        };
    }
}
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport;
|
||||
|
||||
/// <summary>
/// Detects a fixed set of technology keywords in a job description and reports them as tags.
/// </summary>
public static class SkillTagger
{
    private const RegexOptions Options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

    /// <summary>
    /// Tag name paired with the pattern that triggers it. Each tag appears once, so the
    /// detection result never contains duplicates.
    /// </summary>
    private static readonly (string Tag, Regex Pattern)[] Patterns =
    {
        // '#' is not a word character, so a trailing \b would demand a letter right after "C#"
        // and miss "C# developer". Lookarounds express "not glued to another word" correctly.
        ("C#", new Regex(@"(?<!\w)C#(?!\w)", Options)),

        // Likewise a leading \b before '.' only matches when a word character precedes the dot
        // (as in "ASP.NET"), which misses a standalone ".NET". No left anchor covers both forms.
        (".NET", new Regex(@"\.NET\b|\bDOTNET\b", Options)),

        ("Python", new Regex(@"\bPython\b", Options)),
        ("Docker", new Regex(@"\bDocker\b", Options)),
        ("Azure", new Regex(@"\bAzure\b", Options)),
        ("AWS", new Regex(@"\bAWS\b|\bAmazon Web Services\b", Options)),

        // "React.js" already contains a whole-word "React", so one alternative is enough.
        ("React", new Regex(@"\bReact\b", Options)),

        ("TypeScript", new Regex(@"\bTypeScript\b|\bTS\b", Options)),
        ("SQL", new Regex(@"\bSQL\b|\bPostgreSQL\b|\bMySQL\b|\bSQLite\b|\bMS\s*SQL\b|\bT-?SQL\b", Options)),
        ("Kubernetes", new Regex(@"\bKubernetes\b|\bK8s\b", Options)),
    };

    /// <summary>
    /// Returns the tags whose pattern occurs in <paramref name="description"/>.
    /// </summary>
    /// <param name="description">Text to scan; null or blank yields no tags.</param>
    /// <returns>Matching tags in the order they are declared; never null.</returns>
    public static string[] Detect(string? description)
    {
        if (string.IsNullOrWhiteSpace(description))
        {
            return Array.Empty<string>();
        }

        var tags = new List<string>(Patterns.Length);
        foreach (var (tag, pattern) in Patterns)
        {
            if (pattern.IsMatch(description))
            {
                tags.Add(tag);
            }
        }

        return tags.ToArray();
    }
}
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport.Translation;
|
||||
|
||||
/// <summary>
/// Abstraction over a service that translates text into English.
/// </summary>
public interface ITranslationService
{
    /// <summary>
    /// Translates <paramref name="text"/> from <paramref name="sourceLanguage"/> into English.
    /// </summary>
    /// <param name="text">Text to translate.</param>
    /// <param name="sourceLanguage">Language code of <paramref name="text"/>.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The English text, or <c>null</c> when no translation is available.</returns>
    Task<string?> TranslateToEnglishAsync(
        string text,
        string sourceLanguage,
        CancellationToken cancellationToken);
}
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
using System;
|
||||
using System.Net.Http;
|
||||
using System.Net.Http.Json;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport.Translation;
|
||||
|
||||
/// <summary>
/// <see cref="ITranslationService"/> backed by a LibreTranslate-style HTTP endpoint
/// (<c>POST {BaseUrl}/translate</c>). Translation is best-effort: every failure yields <c>null</c>.
/// </summary>
public sealed class LibreTranslateService : ITranslationService
{
    private const string BaseUrlKey = "Translation:LibreTranslate:BaseUrl";
    private const string ApiKeyKey = "Translation:LibreTranslate:ApiKey";

    private readonly IHttpClientFactory _httpClientFactory;
    private readonly string _baseUrl;
    private readonly string? _apiKey;

    /// <summary>
    /// Reads the endpoint settings from configuration.
    /// </summary>
    /// <param name="httpClientFactory">Factory used to create the HTTP client for each call.</param>
    /// <param name="cfg">
    /// Configuration holding "Translation:LibreTranslate:BaseUrl" (trimmed, trailing '/' removed)
    /// and the optional "Translation:LibreTranslate:ApiKey" (blank is treated as absent).
    /// </param>
    public LibreTranslateService(IHttpClientFactory httpClientFactory, IConfiguration cfg)
    {
        _httpClientFactory = httpClientFactory;
        _baseUrl = (cfg[BaseUrlKey] ?? "").Trim().TrimEnd('/');

        var apiKey = cfg[ApiKeyKey];
        _apiKey = string.IsNullOrWhiteSpace(apiKey) ? null : apiKey.Trim();
    }

    /// <summary>
    /// Sends <paramref name="text"/> to the translate endpoint with target language "en".
    /// </summary>
    /// <param name="text">Text to translate; blank input is not sent.</param>
    /// <param name="sourceLanguage">Language code passed to the endpoint as <c>source</c>.</param>
    /// <param name="cancellationToken">Cancels the HTTP call; caller cancellation still throws.</param>
    /// <returns>
    /// The translated text, or <c>null</c> when the input is blank, no base URL is configured,
    /// the request fails or times out, the response is not a success, or it has no usable text.
    /// </returns>
    public async Task<string?> TranslateToEnglishAsync(string text, string sourceLanguage, CancellationToken cancellationToken)
    {
        if (string.IsNullOrWhiteSpace(text) || string.IsNullOrWhiteSpace(_baseUrl))
        {
            return null;
        }

        // NOTE(review): accepted language codes depend on the configured server — confirm that the
        // codes callers pass (e.g. "no") are ones the instance recognises.
        var payload = new
        {
            q = text,
            source = sourceLanguage,
            target = "en",
            format = "text",
            api_key = _apiKey
        };

        try
        {
            using var client = _httpClientFactory.CreateClient();
            using var req = new HttpRequestMessage(HttpMethod.Post, $"{_baseUrl}/translate")
            {
                Content = JsonContent.Create(payload)
            };

            using var res = await client.SendAsync(req, cancellationToken);
            if (!res.IsSuccessStatusCode)
            {
                return null;
            }

            var body = await res.Content.ReadFromJsonAsync<LibreTranslateResponse>(cancellationToken: cancellationToken);
            var translated = body?.translatedText;
            return string.IsNullOrWhiteSpace(translated) ? null : translated;
        }
        catch (HttpRequestException)
        {
            // Endpoint unreachable: translation is optional, so report "no translation".
            return null;
        }
        catch (System.Text.Json.JsonException)
        {
            // Response body was not the expected JSON.
            return null;
        }
        catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested)
        {
            // HttpClient timeout rather than caller cancellation.
            return null;
        }
    }

    /// <summary>Shape of the endpoint's JSON response; only the translated text is read.</summary>
    private sealed record LibreTranslateResponse(string? translatedText);
}
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport.Translation;
|
||||
|
||||
/// <summary>
/// <see cref="ITranslationService"/> that never translates anything.
/// </summary>
public sealed class NoOpTranslationService : ITranslationService
{
    /// <summary>
    /// Ignores every argument and completes immediately.
    /// </summary>
    /// <returns>An already-completed task whose result is <c>null</c>.</returns>
    public Task<string?> TranslateToEnglishAsync(string text, string sourceLanguage, CancellationToken cancellationToken)
    {
        string? noTranslation = null;
        return Task.FromResult(noTranslation);
    }
}
|
||||
|
||||
@@ -0,0 +1,268 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
using System.Text.Json;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace JobTrackerApi.Services.JobImport;
|
||||
|
||||
/// <summary>
/// Site-independent parser that looks for a schema.org <c>JobPosting</c> object inside the page's
/// <c>application/ld+json</c> script blocks.
/// </summary>
public sealed class UniversalJobParser
{
    private static readonly Regex JsonLdScriptRegex =
        new(@"<script[^>]+type\s*=\s*[""']application/ld\+json[""'][^>]*>(?<json>[\s\S]*?)</script>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Scans every JSON-LD script block and returns the first posting that has both a title
    /// and a description.
    /// </summary>
    /// <param name="html">Markup of the page.</param>
    /// <param name="url">Address of the page; copied into the result.</param>
    /// <returns>
    /// A successful result with parser "universal", or a failed one whose <c>Error</c> is
    /// "Empty HTML." or "No JobPosting schema found.".
    /// </returns>
    public JobImportResult Parse(string html, string url)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return new JobImportResult { SourceUrl = url, Success = false, Parser = "universal", Error = "Empty HTML." };
        }

        foreach (Match script in JsonLdScriptRegex.Matches(html))
        {
            var json = script.Groups["json"].Value.Trim();
            if (json.Length == 0)
            {
                continue;
            }

            foreach (var candidate in SplitJsonLdPayload(json))
            {
                if (TryParseJobPosting(candidate, url, out var result))
                {
                    return result with { Parser = "universal", Success = true };
                }
            }
        }

        return new JobImportResult { SourceUrl = url, Success = false, Parser = "universal", Error = "No JobPosting schema found." };
    }

    /// <summary>
    /// Yields the payload as-is and, only when it differs, a variant with trailing ';' removed
    /// (a common piece of junk after otherwise valid JSON).
    /// </summary>
    private static IEnumerable<string> SplitJsonLdPayload(string raw)
    {
        yield return raw;

        var trimmed = raw.Trim().TrimEnd(';');
        if (trimmed.Length != raw.Length)
        {
            yield return trimmed;
        }
    }

    /// <summary>
    /// Parses <paramref name="json"/> and maps the first JobPosting node it contains.
    /// </summary>
    /// <returns>
    /// <c>true</c> when a posting with a non-blank title and description was found; <c>false</c> when the
    /// JSON is invalid, holds no posting, or the posting lacks one of those fields.
    /// </returns>
    private static bool TryParseJobPosting(string json, string url, out JobImportResult result)
    {
        result = new JobImportResult { SourceUrl = url, Parser = "universal", Success = false };

        try
        {
            using var doc = JsonDocument.Parse(json);
            var node = FindJobPostingNode(doc.RootElement);
            if (node is null)
            {
                return false;
            }

            var job = node.Value;
            var title = GetString(job, "title");
            var description = HtmlExtract.ToPlainText(GetString(job, "description"));

            // hiringOrganization is normally an object, but some pages give just the name as a string.
            var company = GetString(job, "hiringOrganization", "name")
                          ?? GetString(job, "hiringOrganization", "legalName")
                          ?? GetString(job, "hiringOrganization");

            result = new JobImportResult
            {
                SourceUrl = url,
                Title = title,
                Company = company,
                Location = ExtractLocation(job),
                Description = description,
                Deadline = ParseDateTime(GetString(job, "validThrough")),
                Success = !string.IsNullOrWhiteSpace(title) && !string.IsNullOrWhiteSpace(description),
                Parser = "universal"
            };
            return result.Success;
        }
        catch (Exception ex) when (ex is JsonException or InvalidOperationException)
        {
            // Malformed or unexpectedly shaped JSON-LD: treat as "no posting here".
            return false;
        }
    }

    /// <summary>
    /// Depth-first search for the first element whose "@type" is (or includes) "JobPosting".
    /// For objects, a top-level "@graph" array is searched before the remaining properties.
    /// </summary>
    private static JsonElement? FindJobPostingNode(JsonElement root)
    {
        // Accept: { "@type":"JobPosting", ... }
        if (IsJobPosting(root))
        {
            return root;
        }

        switch (root.ValueKind)
        {
            case JsonValueKind.Object:
            {
                var graphSearched = false;
                if (root.TryGetProperty("@graph", out var graph) && graph.ValueKind == JsonValueKind.Array)
                {
                    graphSearched = true;
                    if (FindInArray(graph) is { } inGraph)
                    {
                        return inGraph;
                    }
                }

                foreach (var prop in root.EnumerateObject())
                {
                    // The @graph array was just searched without success; do not walk it again.
                    if (graphSearched && prop.NameEquals("@graph"))
                    {
                        continue;
                    }

                    if (FindJobPostingNode(prop.Value) is { } nested)
                    {
                        return nested;
                    }
                }

                return null;
            }

            case JsonValueKind.Array:
                return FindInArray(root);

            default:
                return null;
        }
    }

    /// <summary>Searches each element of a JSON array for a JobPosting node.</summary>
    private static JsonElement? FindInArray(JsonElement array)
    {
        foreach (var item in array.EnumerateArray())
        {
            if (FindJobPostingNode(item) is { } found)
            {
                return found;
            }
        }

        return null;
    }

    /// <summary>
    /// True when <paramref name="el"/> is an object whose "@type" is the string "JobPosting"
    /// (case-insensitive) or an array containing that string.
    /// </summary>
    private static bool IsJobPosting(JsonElement el)
    {
        if (el.ValueKind != JsonValueKind.Object || !el.TryGetProperty("@type", out var typeEl))
        {
            return false;
        }

        return typeEl.ValueKind switch
        {
            JsonValueKind.String => IsJobPostingName(typeEl),
            JsonValueKind.Array => typeEl.EnumerateArray().Any(t => t.ValueKind == JsonValueKind.String && IsJobPostingName(t)),
            _ => false
        };
    }

    private static bool IsJobPostingName(JsonElement stringElement)
        => string.Equals(stringElement.GetString(), "JobPosting", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// Follows <paramref name="path"/> through nested objects and returns the value found there as text.
    /// </summary>
    /// <returns>
    /// The string value, the textual form of a number, or <c>null</c> when the path is missing or
    /// ends at any other kind of value.
    /// </returns>
    private static string? GetString(JsonElement el, params string[] path)
    {
        var current = el;
        foreach (var segment in path)
        {
            if (current.ValueKind != JsonValueKind.Object || !current.TryGetProperty(segment, out var next))
            {
                return null;
            }

            current = next;
        }

        return current.ValueKind switch
        {
            JsonValueKind.String => current.GetString(),
            JsonValueKind.Number => current.ToString(),
            _ => null
        };
    }

    /// <summary>
    /// Joins locality, region and country of the first address under "jobLocation" with ", ".
    /// </summary>
    /// <returns>The joined location, or <c>null</c> when no address or no non-blank part exists.</returns>
    private static string? ExtractLocation(JsonElement job)
    {
        // jobLocation can be object or array; address fields vary.
        if (!job.TryGetProperty("jobLocation", out var jobLocation))
        {
            return null;
        }

        var address = FindFirstAddress(jobLocation);
        if (address is null)
        {
            return null;
        }

        var city = GetString(address.Value, "addressLocality");
        var region = GetString(address.Value, "addressRegion");

        // addressCountry is either a plain string or an object carrying the value in "name".
        var country = GetString(address.Value, "addressCountry")
                      ?? GetString(address.Value, "addressCountry", "name");

        var parts = new[] { city, region, country }.Where(p => !string.IsNullOrWhiteSpace(p)).ToArray();
        return parts.Length == 0 ? null : string.Join(", ", parts);
    }

    /// <summary>
    /// Returns the first object-valued "address" found in a jobLocation object or, for arrays,
    /// in any of its elements.
    /// </summary>
    private static JsonElement? FindFirstAddress(JsonElement jobLocation)
    {
        if (jobLocation.ValueKind == JsonValueKind.Object)
        {
            return jobLocation.TryGetProperty("address", out var address) && address.ValueKind == JsonValueKind.Object
                ? address
                : null;
        }

        if (jobLocation.ValueKind == JsonValueKind.Array)
        {
            foreach (var item in jobLocation.EnumerateArray())
            {
                if (FindFirstAddress(item) is { } found)
                {
                    return found;
                }
            }
        }

        return null;
    }

    /// <summary>
    /// Parses a date/time using the invariant culture and returns it as UTC.
    /// </summary>
    /// <remarks>
    /// Values carrying an offset are converted to UTC; values without one are assumed to already be
    /// UTC, so the result always has <see cref="DateTimeKind.Utc"/> instead of a mix of kinds.
    /// </remarks>
    /// <returns>The UTC value, or <c>null</c> when the input is blank or not a recognisable date.</returns>
    private static DateTime? ParseDateTime(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            return null;
        }

        const DateTimeStyles styles = DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal;
        return DateTime.TryParse(raw, CultureInfo.InvariantCulture, styles, out var parsed) ? parsed : null;
    }
}
|
||||
|
||||
/// <summary>
/// Small regex-based helpers for pulling the title, meta tags and plain text out of HTML.
/// These are best-effort extractors, not a full HTML parser.
/// </summary>
internal static class HtmlExtract
{
    private static readonly Regex TitleRegex =
        new(@"<title[^>]*>(?<t>[\s\S]*?)</title>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex MetaTagRegex =
        new(@"<meta\s+[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex TagRegex =
        new(@"<[^>]+>", RegexOptions.Compiled);

    private static readonly Regex WsRegex =
        new(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Returns the entity-decoded, trimmed content of the first <c>&lt;title&gt;</c> element.
    /// </summary>
    /// <returns>The title text, or <c>null</c> when the markup has no title element.</returns>
    public static string? ReadTitle(string html)
    {
        var match = TitleRegex.Match(html);
        return match.Success ? DecodeHtmlEntities(match.Groups["t"].Value).Trim() : null;
    }

    /// <summary>
    /// Collects <c>&lt;meta&gt;</c> tags into a case-insensitive dictionary keyed by the tag's
    /// <c>property</c> attribute or, when that is absent, its <c>name</c> attribute.
    /// </summary>
    /// <remarks>
    /// Values are the entity-decoded, trimmed <c>content</c> attribute. Tags with a blank key or
    /// content are skipped, and the first occurrence of a key wins.
    /// </remarks>
    public static Dictionary<string, string> ReadMeta(string html)
    {
        var dict = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match m in MetaTagRegex.Matches(html))
        {
            var tag = m.Value;
            var key = GetAttr(tag, "property") ?? GetAttr(tag, "name");
            var content = GetAttr(tag, "content");
            if (string.IsNullOrWhiteSpace(key) || string.IsNullOrWhiteSpace(content))
            {
                continue;
            }

            dict.TryAdd(key, DecodeHtmlEntities(content).Trim());
        }

        return dict;
    }

    /// <summary>
    /// Converts HTML (or already-plain text) to a single line of plain text: entities are decoded,
    /// tags are replaced by spaces and runs of whitespace collapse to one space.
    /// </summary>
    /// <returns>The plain text, or <c>null</c> when the input is blank or nothing remains.</returns>
    public static string? ToPlainText(string? htmlOrText)
    {
        if (string.IsNullOrWhiteSpace(htmlOrText))
        {
            return null;
        }

        // Decoding happens before tag removal, so entity-encoded markup is stripped as well.
        var text = DecodeHtmlEntities(htmlOrText);
        text = TagRegex.Replace(text, " ");
        text = WsRegex.Replace(text, " ").Trim();
        return text.Length == 0 ? null : text;
    }

    /// <summary>
    /// Reads the value of attribute <paramref name="attr"/> from a single tag, accepting quoted
    /// values first and unquoted values as a fallback.
    /// </summary>
    /// <remarks>
    /// The name must not be preceded by a word character, ':' or '-', so asking for <c>name</c>
    /// does not pick up <c>data-name</c>. The name is escaped before being placed in the pattern.
    /// </remarks>
    /// <returns>The raw attribute value, or <c>null</c> when the attribute is absent.</returns>
    private static string? GetAttr(string tag, string attr)
    {
        var name = @"(?<![\w:-])" + Regex.Escape(attr);

        var m = Regex.Match(tag, name + @"\s*=\s*(?<q>[""'])(?<v>[\s\S]*?)(\k<q>)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        if (m.Success)
        {
            return m.Groups["v"].Value;
        }

        // Unquoted attribute values.
        m = Regex.Match(tag, name + @"\s*=\s*(?<v>[^\s>]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        return m.Success ? m.Groups["v"].Value : null;
    }

    /// <summary>Decodes HTML character references such as <c>&amp;amp;</c>.</summary>
    private static string DecodeHtmlEntities(string s)
        => System.Net.WebUtility.HtmlDecode(s);
}
|
||||
|
||||
Reference in New Issue
Block a user