using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Net.Sockets;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using JobTrackerApi.Services.JobImport.Translation;

namespace JobTrackerApi.Services.JobImport;

/// <summary>
/// Builds a preview of a job posting from a URL: validates the URL against local/private
/// network targets, downloads the page, runs the universal parser (falling back to
/// site plugins), then enriches the result with detected language, skill tags and,
/// when the detected language is "no", an English translation.
/// </summary>
/// <remarks>
/// NOTE(review): in this copy of the file the generic type arguments on the <c>Task</c>
/// return types and on the plugin <c>IEnumerable</c> appear to be missing. The declarations
/// below are reproduced as-is; confirm them against the repository.
/// </remarks>
public sealed class JobImportService
{
    /// <summary>Largest response body, in bytes, that will be accepted.</summary>
    private const int MaxHtmlBytes = 4_000_000;

    /// <summary>Error returned whenever a URL targets a blocked address.</summary>
    private const string PrivateNetworkError = "Local or private network URLs are not allowed.";

    private readonly IHttpClientFactory _httpClientFactory;
    private readonly UniversalJobParser _universal;
    private readonly IEnumerable _plugins;
    private readonly ITranslationService _translation;
    private readonly IHostAddressResolver _hostAddressResolver;

    /// <summary>Creates the service from its injected collaborators.</summary>
    public JobImportService(
        IHttpClientFactory httpClientFactory,
        UniversalJobParser universal,
        IEnumerable plugins,
        ITranslationService translation,
        IHostAddressResolver hostAddressResolver)
    {
        _httpClientFactory = httpClientFactory;
        _universal = universal;
        _plugins = plugins;
        _translation = translation;
        _hostAddressResolver = hostAddressResolver;
    }

    /// <summary>
    /// Fetches and parses the job posting at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Absolute http/https URL of the posting.</param>
    /// <param name="cancellationToken">Cancels DNS resolution, download and translation.</param>
    /// <returns>
    /// A result whose <c>Success</c> flag is false (with <c>Parser</c> set to "none", "fetch"
    /// or the failing parser) when validation, download or parsing fails; otherwise the parsed
    /// posting enriched with language, tags and optional translation.
    /// </returns>
    public async Task PreviewAsync(string url, CancellationToken cancellationToken)
    {
        var validation = await ValidateUrlAsync(url, cancellationToken);
        if (!validation.Allowed)
        {
            return new JobImportResult
            {
                SourceUrl = url ?? "",
                Success = false,
                Parser = "none",
                Error = validation.Error
            };
        }

        var normalized = validation.Normalized;
        var html = await FetchHtmlAsync(normalized, cancellationToken);
        if (html is null)
        {
            return new JobImportResult
            {
                SourceUrl = normalized,
                Success = false,
                Parser = "fetch",
                Error = "Failed to fetch HTML."
            };
        }

        var parsed = _universal.Parse(html, normalized);
        if (!parsed.Success)
        {
            // Universal parser failed: try each plugin that claims this URL, first success wins.
            foreach (var plugin in _plugins.Where(candidate => candidate.CanHandle(normalized)))
            {
                try
                {
                    var attempt = plugin.Parse(html, normalized);
                    parsed = attempt; // on failure this keeps the last failure for debugging
                    if (attempt.Success)
                    {
                        break;
                    }
                }
                catch (Exception ex)
                {
                    // A throwing plugin must not abort the import; record it and keep going.
                    parsed = new JobImportResult
                    {
                        SourceUrl = normalized,
                        Success = false,
                        Parser = plugin.GetType().Name,
                        Error = ex.Message
                    };
                }
            }
        }

        if (!parsed.Success)
        {
            return parsed with { SourceUrl = normalized };
        }

        var lang = LanguageDetector.Detect(parsed.Description);
        var tags = SkillTagger.Detect(parsed.Description);

        string? translated = null;
        if (string.Equals(lang, "no", StringComparison.OrdinalIgnoreCase)
            && !string.IsNullOrWhiteSpace(parsed.Description))
        {
            translated = await _translation.TranslateToEnglishAsync(parsed.Description!, "no", cancellationToken);
        }

        return parsed with
        {
            SourceUrl = normalized,
            Language = lang,
            Tags = tags,
            TranslatedDescription = translated
        };
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and returns its body as text.
    /// </summary>
    /// <returns>
    /// The decoded body, or <c>null</c> when the response is a redirect or non-success status,
    /// the body exceeds <see cref="MaxHtmlBytes"/>, or the transfer fails or times out.
    /// </returns>
    /// <remarks>
    /// Caller-requested cancellation still propagates as <see cref="OperationCanceledException"/>.
    /// </remarks>
    private async Task FetchHtmlAsync(string url, CancellationToken cancellationToken)
    {
        using var client = _httpClientFactory.CreateClient("jobimport");
        using var req = new HttpRequestMessage(HttpMethod.Get, url);
        req.Headers.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) JobTracker/1.0");
        req.Headers.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        req.Headers.TryAddWithoutValidation("Accept-Language", "en-US,en;q=0.8,no;q=0.6,nb;q=0.6");

        try
        {
            using var res = await client.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, cancellationToken);

            // Redirects are not followed here (avoids redirect chains to non-html).
            // NOTE(review): this presumes the "jobimport" client is registered with automatic
            // redirects disabled; otherwise the handler follows them before this check and the
            // redirect target is never run through ValidateUrlAsync. Confirm the registration.
            var status = (int)res.StatusCode;
            if (status >= 300 && status < 400)
            {
                return null;
            }

            if (!res.IsSuccessStatusCode)
            {
                return null;
            }

            // The media type is deliberately not enforced: many sites omit or mislabel it.

            // Reject early when the server declares an oversized body...
            if (res.Content.Headers.ContentLength > MaxHtmlBytes)
            {
                return null;
            }

            // ...and enforce the cap while streaming, so an undeclared or understated length
            // can never make us buffer more than MaxHtmlBytes.
            await using var body = await res.Content.ReadAsStreamAsync(cancellationToken);
            using var buffer = new MemoryStream();
            var chunk = new byte[81920];
            int read;
            while ((read = await body.ReadAsync(chunk.AsMemory(), cancellationToken)) > 0)
            {
                if (buffer.Length + read > MaxHtmlBytes)
                {
                    return null;
                }

                buffer.Write(chunk, 0, read);
            }

            var encoding = ResolveEncoding(res.Content.Headers.ContentType?.CharSet);
            return encoding.GetString(buffer.GetBuffer(), 0, (int)buffer.Length);
        }
        catch (HttpRequestException)
        {
            return null; // connection/DNS/TLS failure: reported by the caller as a fetch failure
        }
        catch (IOException)
        {
            return null; // connection dropped while reading the body
        }
        catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested)
        {
            return null; // not cancelled by the caller, so treated as a client-side timeout
        }
    }

    /// <summary>
    /// Maps a Content-Type charset to an <see cref="Encoding"/>, defaulting to UTF-8 when the
    /// charset is missing or not supported by the runtime.
    /// </summary>
    private static Encoding ResolveEncoding(string? charset)
    {
        // Header values are sometimes quoted, e.g. charset="utf-8".
        var name = charset?.Trim().Trim('"', '\'');
        if (string.IsNullOrEmpty(name))
        {
            return Encoding.UTF8;
        }

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return Encoding.UTF8;
        }
    }

    /// <summary>
    /// Checks that <paramref name="url"/> is an absolute http/https URL whose host is neither
    /// a blocked IP literal nor a name resolving to any blocked address.
    /// </summary>
    /// <returns>An allow result carrying the normalized URL, or a reject result with an error.</returns>
    /// <remarks>
    /// NOTE(review): the host is resolved here for validation only; if the HTTP client resolves
    /// it again when fetching, the addresses checked here are not necessarily the ones connected
    /// to. Confirm whether the "jobimport" handler pins the validated address.
    /// </remarks>
    private async Task ValidateUrlAsync(string? url, CancellationToken cancellationToken)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            return UrlValidationResult.Reject("URL is required.");
        }

        if (!Uri.TryCreate(url.Trim(), UriKind.Absolute, out var uri))
        {
            return UrlValidationResult.Reject("Invalid URL.");
        }

        if (uri.Scheme is not ("http" or "https"))
        {
            return UrlValidationResult.Reject("Only http/https URLs are supported.");
        }

        if (uri.IsLoopback || string.Equals(uri.Host, "localhost", StringComparison.OrdinalIgnoreCase))
        {
            return UrlValidationResult.Reject(PrivateNetworkError);
        }

        // IP literal: no DNS lookup needed, judge the address directly.
        if (IPAddress.TryParse(uri.Host, out var ip))
        {
            return IsBlockedAddress(ip)
                ? UrlValidationResult.Reject(PrivateNetworkError)
                : UrlValidationResult.Allow(uri.ToString());
        }

        IPAddress[] addresses;
        try
        {
            addresses = await _hostAddressResolver.ResolveAsync(uri.Host, cancellationToken);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Caller cancelled: do not misreport that as a DNS failure.
            throw;
        }
        catch (Exception)
        {
            return UrlValidationResult.Reject("Host resolution failed.");
        }

        // Reject when nothing resolved, or when ANY resolved address is blocked.
        if (addresses is null || addresses.Length == 0 || addresses.Any(IsBlockedAddress))
        {
            return UrlValidationResult.Reject(PrivateNetworkError);
        }

        return UrlValidationResult.Allow(uri.ToString());
    }

    /// <summary>
    /// Returns true for addresses that must never be fetched: loopback, unspecified, broadcast,
    /// link-local, site-local, multicast, Teredo, the IPv4 ranges listed below, and IPv6
    /// unique-local addresses.
    /// </summary>
    private static bool IsBlockedAddress(IPAddress ip)
    {
        // ::ffff:a.b.c.d reaches the same host as a.b.c.d, so unwrap it before the IPv4 checks;
        // otherwise e.g. ::ffff:10.0.0.1 would slip past the ranges below.
        if (ip.IsIPv4MappedToIPv6)
        {
            ip = ip.MapToIPv4();
        }

        if (IPAddress.IsLoopback(ip))
        {
            return true;
        }

        if (ip.Equals(IPAddress.Any) || ip.Equals(IPAddress.IPv6Any))
        {
            return true;
        }

        if (ip.Equals(IPAddress.None) || ip.Equals(IPAddress.IPv6None))
        {
            return true;
        }

        if (ip.IsIPv6LinkLocal || ip.IsIPv6SiteLocal || ip.IsIPv6Multicast || ip.IsIPv6Teredo)
        {
            return true;
        }

        if (ip.AddressFamily == AddressFamily.InterNetwork)
        {
            var b = ip.GetAddressBytes();
            return b[0] == 10                                   // 10.0.0.0/8
                || b[0] == 0                                    // 0.0.0.0/8
                || b[0] == 127                                  // 127.0.0.0/8
                || (b[0] == 100 && b[1] >= 64 && b[1] <= 127)   // 100.64.0.0/10
                || (b[0] == 169 && b[1] == 254)                 // 169.254.0.0/16
                || (b[0] == 172 && b[1] >= 16 && b[1] <= 31)    // 172.16.0.0/12
                || (b[0] == 192 && b[1] == 168)                 // 192.168.0.0/16
                || (b[0] == 198 && (b[1] == 18 || b[1] == 19))  // 198.18.0.0/15
                || b[0] >= 224;                                 // 224.0.0.0/4 multicast and 240.0.0.0/4 reserved
        }

        if (ip.AddressFamily == AddressFamily.InterNetworkV6)
        {
            var bytes = ip.GetAddressBytes();
            return (bytes[0] & 0xfe) == 0xfc; // fc00::/7 unique local addresses
        }

        return false;
    }

    /// <summary>Outcome of URL validation: allowed with a normalized URL, or rejected with an error.</summary>
    private sealed record UrlValidationResult(bool Allowed, string Normalized, string Error)
    {
        /// <summary>Creates an allowed result carrying the normalized URL.</summary>
        public static UrlValidationResult Allow(string normalized) => new(true, normalized, string.Empty);

        /// <summary>Creates a rejected result carrying the error message.</summary>
        public static UrlValidationResult Reject(string error) => new(false, string.Empty, error);
    }
}