218 lines
7.8 KiB
C#
218 lines
7.8 KiB
C#
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Net.Sockets;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using JobTrackerApi.Services.JobImport.Translation;
|
|
|
|
namespace JobTrackerApi.Services.JobImport;
|
|
|
|
/// <summary>
/// Builds a preview of a job posting from a user-supplied URL: validates the URL
/// against local/private network targets, downloads the page, parses it with the
/// universal parser (falling back to site plugins), then detects language and skill
/// tags and translates Norwegian descriptions to English.
/// </summary>
public sealed class JobImportService
{
    /// <summary>Name passed to <see cref="IHttpClientFactory.CreateClient(string)"/> for page downloads.</summary>
    private const string HttpClientName = "jobimport";

    /// <summary>Largest response body, in bytes, that will be accepted.</summary>
    private const long MaxHtmlBytes = 4_000_000;

    /// <summary>Size of the scratch buffer used while streaming the response body.</summary>
    private const int ReadChunkBytes = 81_920;

    /// <summary>Error reported for every URL that targets a blocked address.</summary>
    private const string PrivateNetworkError = "Local or private network URLs are not allowed.";

    private readonly IHttpClientFactory _httpClientFactory;
    private readonly UniversalJobParser _universal;
    private readonly IEnumerable<IJobSitePlugin> _plugins;
    private readonly ITranslationService _translation;
    private readonly IHostAddressResolver _hostAddressResolver;

    /// <summary>Creates the service from its collaborators; the arguments are stored as-is.</summary>
    /// <param name="httpClientFactory">Factory used to create the "jobimport" HTTP client.</param>
    /// <param name="universal">Parser tried first for every page.</param>
    /// <param name="plugins">Site-specific parsers tried, in enumeration order, when the universal parser fails.</param>
    /// <param name="translation">Service used to translate Norwegian descriptions to English.</param>
    /// <param name="hostAddressResolver">Resolver used to map host names to IP addresses during URL validation.</param>
    public JobImportService(
        IHttpClientFactory httpClientFactory,
        UniversalJobParser universal,
        IEnumerable<IJobSitePlugin> plugins,
        ITranslationService translation,
        IHostAddressResolver hostAddressResolver)
    {
        _httpClientFactory = httpClientFactory;
        _universal = universal;
        _plugins = plugins;
        _translation = translation;
        _hostAddressResolver = hostAddressResolver;
    }

    /// <summary>
    /// Downloads and parses the job posting at <paramref name="url"/> without persisting anything.
    /// </summary>
    /// <param name="url">Absolute http/https URL of the posting. Null, blank or disallowed values yield a failed result.</param>
    /// <param name="cancellationToken">Token observed by host resolution, the download and the translation call.</param>
    /// <returns>
    /// A result whose <c>Success</c> flag tells whether parsing worked. Failures carry an <c>Error</c>
    /// and a <c>Parser</c> of "none" (URL rejected), "fetch" (download failed) or the name reported
    /// by the last parser that was tried.
    /// </returns>
    /// <exception cref="HttpRequestException">The request or the body download failed at the transport level.</exception>
    /// <exception cref="OperationCanceledException">The operation was cancelled.</exception>
    public async Task<JobImportResult> PreviewAsync(string url, CancellationToken cancellationToken)
    {
        var validation = await ValidateUrlAsync(url, cancellationToken);
        if (!validation.Allowed)
        {
            return new JobImportResult
            {
                SourceUrl = url ?? "",
                Success = false,
                Parser = "none",
                Error = validation.Error
            };
        }

        var normalized = validation.Normalized;

        var html = await FetchHtmlAsync(normalized, cancellationToken);
        if (html is null)
        {
            return new JobImportResult
            {
                SourceUrl = normalized,
                Success = false,
                Parser = "fetch",
                Error = "Failed to fetch HTML."
            };
        }

        var parsed = _universal.Parse(html, normalized);
        if (!parsed.Success)
        {
            parsed = ParseWithPlugins(html, normalized, parsed);
        }

        if (!parsed.Success)
        {
            return parsed with { SourceUrl = normalized };
        }

        var description = parsed.Description;
        var language = LanguageDetector.Detect(description);
        var tags = SkillTagger.Detect(description);

        // Only descriptions detected as "no" are translated; everything else keeps a null translation.
        string? translated = null;
        if (string.Equals(language, "no", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(description))
        {
            translated = await _translation.TranslateToEnglishAsync(description, "no", cancellationToken);
        }

        return parsed with
        {
            SourceUrl = normalized,
            Language = language,
            Tags = tags,
            TranslatedDescription = translated
        };
    }

    /// <summary>
    /// Tries each plugin that claims the URL, in order, stopping at the first success.
    /// </summary>
    /// <param name="html">Downloaded page markup.</param>
    /// <param name="url">Normalized URL of the page.</param>
    /// <param name="fallback">Result returned unchanged when no plugin handles the URL.</param>
    /// <returns>The first successful plugin result, otherwise the last failure that was produced.</returns>
    private JobImportResult ParseWithPlugins(string html, string url, JobImportResult fallback)
    {
        var outcome = fallback;

        foreach (var plugin in _plugins)
        {
            try
            {
                // CanHandle runs inside the try so one misbehaving plugin cannot abort the whole import.
                if (!plugin.CanHandle(url))
                {
                    continue;
                }

                // A failed attempt is kept as well: the last failure is what gets reported for debugging.
                outcome = plugin.Parse(html, url);
                if (outcome.Success)
                {
                    break;
                }
            }
            catch (Exception ex)
            {
                outcome = new JobImportResult
                {
                    SourceUrl = url,
                    Success = false,
                    Parser = plugin.GetType().Name,
                    Error = ex.Message
                };
            }
        }

        return outcome;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its text.
    /// </summary>
    /// <param name="url">Already-validated absolute URL.</param>
    /// <param name="cancellationToken">Token observed while sending the request and reading the body.</param>
    /// <returns>
    /// The decoded body, or <c>null</c> when the status code is not 2xx or the body exceeds
    /// <see cref="MaxHtmlBytes"/>.
    /// </returns>
    /// <exception cref="HttpRequestException">The request or the body download failed.</exception>
    private async Task<string?> FetchHtmlAsync(string url, CancellationToken cancellationToken)
    {
        using var client = _httpClientFactory.CreateClient(HttpClientName);
        using var request = new HttpRequestMessage(HttpMethod.Get, url);
        request.Headers.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) JobTracker/1.0");
        request.Headers.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        request.Headers.TryAddWithoutValidation("Accept-Language", "en-US,en;q=0.8,no;q=0.6,nb;q=0.6");

        using var response = await client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken);

        // Anything outside 2xx is a failed fetch; this includes 3xx responses, so redirects are never
        // followed by this method itself.
        // NOTE(review): whether the "jobimport" handler auto-follows redirects is configured elsewhere;
        // if it does, redirect targets are not re-validated here - confirm.
        if (!response.IsSuccessStatusCode)
        {
            return null;
        }

        // The media type is deliberately not enforced: the body is read whatever Content-Type says.

        // Reject early when the server announces an oversized body.
        if (response.Content.Headers.ContentLength is > MaxHtmlBytes)
        {
            return null;
        }

        try
        {
            // Stream the body so the cap bounds what is actually downloaded, even when
            // Content-Length is missing or wrong.
            await using var body = await response.Content.ReadAsStreamAsync(cancellationToken);
            using var buffer = new MemoryStream();
            var chunk = new byte[ReadChunkBytes];

            int read;
            while ((read = await body.ReadAsync(chunk.AsMemory(), cancellationToken)) > 0)
            {
                if (buffer.Length + read > MaxHtmlBytes)
                {
                    return null;
                }

                buffer.Write(chunk, 0, read);
            }

            var encoding = ResolveEncoding(response.Content.Headers.ContentType?.CharSet);
            return encoding.GetString(buffer.GetBuffer(), 0, (int)buffer.Length);
        }
        catch (IOException ex)
        {
            // Keep reporting transport failures during the body read as HttpRequestException.
            throw new HttpRequestException("Error while copying content to a stream.", ex);
        }
    }

    /// <summary>
    /// Maps a Content-Type charset to an <see cref="Encoding"/>, defaulting to UTF-8.
    /// </summary>
    /// <param name="charset">Charset from the response header; may be null, empty or quoted.</param>
    /// <returns>The named encoding, or UTF-8 when the name is missing or not available.</returns>
    private static Encoding ResolveEncoding(string? charset)
    {
        var name = charset?.Trim().Trim('"', '\'');
        if (string.IsNullOrEmpty(name))
        {
            return Encoding.UTF8;
        }

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            // Unknown or unregistered code page: fall back to UTF-8.
            return Encoding.UTF8;
        }
    }

    /// <summary>
    /// Checks that <paramref name="url"/> is an absolute http/https URL that does not point at a
    /// loopback, private or otherwise blocked address.
    /// </summary>
    /// <param name="url">Raw user input; may be null or blank.</param>
    /// <param name="cancellationToken">Token passed to the host resolver.</param>
    /// <returns>An allowed result carrying the normalized URL, or a rejection carrying the error text.</returns>
    /// <exception cref="OperationCanceledException">The caller cancelled during host resolution.</exception>
    private async Task<UrlValidationResult> ValidateUrlAsync(string? url, CancellationToken cancellationToken)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            return UrlValidationResult.Reject("URL is required.");
        }

        if (!Uri.TryCreate(url.Trim(), UriKind.Absolute, out var uri))
        {
            return UrlValidationResult.Reject("Invalid URL.");
        }

        if (uri.Scheme is not ("http" or "https"))
        {
            return UrlValidationResult.Reject("Only http/https URLs are supported.");
        }

        if (uri.IsLoopback || string.Equals(uri.Host, "localhost", StringComparison.OrdinalIgnoreCase))
        {
            return UrlValidationResult.Reject(PrivateNetworkError);
        }

        // IP literal: judge the address directly, no resolution needed.
        if (IPAddress.TryParse(uri.Host, out var literal))
        {
            return IsBlockedAddress(literal)
                ? UrlValidationResult.Reject(PrivateNetworkError)
                : UrlValidationResult.Allow(uri.ToString());
        }

        IPAddress[] addresses;
        try
        {
            addresses = await _hostAddressResolver.ResolveAsync(uri.Host, cancellationToken);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Caller-requested cancellation is not a resolution failure; let it propagate.
            throw;
        }
        catch (Exception)
        {
            return UrlValidationResult.Reject("Host resolution failed.");
        }

        // Every resolved address must be acceptable; a single blocked one rejects the host.
        // NOTE(review): the HTTP client resolves the host again when connecting, so this check
        // does not by itself pin the address that is finally contacted - confirm handler setup.
        if (addresses is not { Length: > 0 } || addresses.Any(IsBlockedAddress))
        {
            return UrlValidationResult.Reject(PrivateNetworkError);
        }

        return UrlValidationResult.Allow(uri.ToString());
    }

    /// <summary>
    /// Tells whether <paramref name="ip"/> belongs to a range that must not be contacted.
    /// </summary>
    /// <param name="ip">Address to classify; IPv4-mapped IPv6 addresses are judged as their IPv4 form.</param>
    /// <returns><c>true</c> when the address is in one of the blocked ranges checked below.</returns>
    private static bool IsBlockedAddress(IPAddress ip)
    {
        // ::ffff:a.b.c.d reaches the same host as a.b.c.d, so apply the IPv4 rules to it.
        if (ip.IsIPv4MappedToIPv6)
        {
            ip = ip.MapToIPv4();
        }

        if (IPAddress.IsLoopback(ip))
        {
            return true;
        }

        if (ip.Equals(IPAddress.Any) || ip.Equals(IPAddress.IPv6Any))
        {
            return true;
        }

        if (ip.Equals(IPAddress.None) || ip.Equals(IPAddress.IPv6None))
        {
            return true;
        }

        if (ip.IsIPv6LinkLocal || ip.IsIPv6SiteLocal || ip.IsIPv6Multicast || ip.IsIPv6Teredo)
        {
            return true;
        }

        switch (ip.AddressFamily)
        {
            case AddressFamily.InterNetwork:
            {
                var octets = ip.GetAddressBytes();
                return octets[0] switch
                {
                    0 or 10 or 127 => true,                      // 0.0.0.0/8, 10.0.0.0/8, 127.0.0.0/8
                    100 => octets[1] is >= 64 and <= 127,        // 100.64.0.0/10
                    169 => octets[1] == 254,                     // 169.254.0.0/16
                    172 => octets[1] is >= 16 and <= 31,         // 172.16.0.0/12
                    192 => octets[1] == 168,                     // 192.168.0.0/16
                    198 => octets[1] is 18 or 19,                // 198.18.0.0/15
                    >= 224 => true,                              // 224.0.0.0/4 multicast and 240.0.0.0/4 reserved
                    _ => false
                };
            }

            case AddressFamily.InterNetworkV6:
            {
                var octets = ip.GetAddressBytes();
                return (octets[0] & 0xfe) == 0xfc; // fc00::/7 unique local addresses
            }

            default:
                return false;
        }
    }

    /// <summary>Outcome of URL validation: the normalized URL when allowed, the error text when rejected.</summary>
    /// <param name="Allowed">Whether the URL may be fetched.</param>
    /// <param name="Normalized">Normalized URL; empty when rejected.</param>
    /// <param name="Error">Rejection reason; empty when allowed.</param>
    private sealed record UrlValidationResult(bool Allowed, string Normalized, string Error)
    {
        /// <summary>Creates an allowed result for <paramref name="normalized"/>.</summary>
        public static UrlValidationResult Allow(string normalized) => new(true, normalized, string.Empty);

        /// <summary>Creates a rejected result carrying <paramref name="error"/>.</summary>
        public static UrlValidationResult Reject(string error) => new(false, string.Empty, error);
    }
}
|