First Commit

This commit is contained in:
cesnimda
2026-03-21 11:55:27 +01:00
commit 2e8a29b4d0
1757 changed files with 166084 additions and 0 deletions
@@ -0,0 +1,185 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using JobTrackerApi.Services.JobImport.Translation;
namespace JobTrackerApi.Services.JobImport;
/// <summary>
/// Fetches a job posting from a caller-supplied URL, parses it with the universal parser
/// (falling back to site-specific plugins), and enriches the result with a detected language,
/// skill tags and, for text detected as "no" (Norwegian), an English translation.
/// </summary>
public sealed class JobImportService
{
    /// <summary>Largest response body, in bytes, that will be downloaded and parsed.</summary>
    private const int MaxHtmlBytes = 4_000_000;

    private readonly IHttpClientFactory _httpClientFactory;
    private readonly UniversalJobParser _universal;
    private readonly IEnumerable<IJobSitePlugin> _plugins;
    private readonly ITranslationService _translation;

    public JobImportService(
        IHttpClientFactory httpClientFactory,
        UniversalJobParser universal,
        IEnumerable<IJobSitePlugin> plugins,
        ITranslationService translation)
    {
        _httpClientFactory = httpClientFactory;
        _universal = universal;
        _plugins = plugins;
        _translation = translation;
    }

    /// <summary>
    /// Validates <paramref name="url"/>, downloads the page and parses it into a <see cref="JobImportResult"/>.
    /// </summary>
    /// <param name="url">Absolute http/https URL of the job posting.</param>
    /// <param name="cancellationToken">Cancels the download and the translation call.</param>
    /// <returns>
    /// A result with <c>Success = false</c> when the URL is rejected (<c>Parser = "none"</c>),
    /// the page cannot be fetched (<c>Parser = "fetch"</c>) or no parser succeeds; otherwise the
    /// parsed result with <c>SourceUrl</c>, <c>Language</c>, <c>Tags</c> and <c>TranslatedDescription</c> filled in.
    /// </returns>
    public async Task<JobImportResult> PreviewAsync(string url, CancellationToken cancellationToken)
    {
        if (!TryValidateUrl(url, out var normalized, out var error))
        {
            return new JobImportResult
            {
                SourceUrl = url ?? "",
                Success = false,
                Parser = "none",
                Error = error
            };
        }

        var html = await FetchHtmlAsync(normalized, cancellationToken);
        if (html is null)
        {
            return new JobImportResult
            {
                SourceUrl = normalized,
                Success = false,
                Parser = "fetch",
                Error = "Failed to fetch HTML."
            };
        }

        var parsed = _universal.Parse(html, normalized);
        if (!parsed.Success)
        {
            parsed = ParseWithPlugins(html, normalized, parsed);
        }

        if (!parsed.Success)
        {
            return parsed with { SourceUrl = normalized };
        }

        var lang = LanguageDetector.Detect(parsed.Description);
        var tags = SkillTagger.Detect(parsed.Description);

        string? translated = null;
        if (string.Equals(lang, "no", StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(parsed.Description))
        {
            translated = await _translation.TranslateToEnglishAsync(parsed.Description!, "no", cancellationToken);
        }

        return parsed with
        {
            SourceUrl = normalized,
            Language = lang,
            Tags = tags,
            TranslatedDescription = translated
        };
    }

    /// <summary>
    /// Runs every plugin that claims the URL, in registration order, until one succeeds.
    /// </summary>
    /// <returns>
    /// The first successful result; otherwise the most recent failure (kept for debugging),
    /// or <paramref name="fallback"/> when no plugin handled the URL.
    /// </returns>
    private JobImportResult ParseWithPlugins(string html, string url, JobImportResult fallback)
    {
        var latest = fallback;
        foreach (var plugin in _plugins)
        {
            try
            {
                // CanHandle sits inside the try so a misbehaving plugin cannot abort the whole import.
                if (!plugin.CanHandle(url))
                {
                    continue;
                }

                var attempt = plugin.Parse(html, url);
                if (attempt.Success)
                {
                    return attempt;
                }

                latest = attempt; // keep last failure for debugging
            }
            catch (Exception ex)
            {
                latest = new JobImportResult
                {
                    SourceUrl = url,
                    Success = false,
                    Parser = plugin.GetType().Name,
                    Error = ex.Message
                };
            }
        }

        return latest;
    }

    /// <summary>
    /// Downloads the response body for <paramref name="url"/> as text.
    /// </summary>
    /// <returns>
    /// The decoded body, or <c>null</c> when the request fails, times out, returns a non-2xx status
    /// or exceeds <see cref="MaxHtmlBytes"/>. Cancellation requested through
    /// <paramref name="cancellationToken"/> still propagates as an exception.
    /// </returns>
    private async Task<string?> FetchHtmlAsync(string url, CancellationToken cancellationToken)
    {
        using var client = _httpClientFactory.CreateClient("jobimport");
        using var req = new HttpRequestMessage(HttpMethod.Get, url);
        req.Headers.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) JobTracker/1.0");
        req.Headers.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        req.Headers.TryAddWithoutValidation("Accept-Language", "en-US,en;q=0.8,no;q=0.6,nb;q=0.6");

        try
        {
            using var res = await client.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, cancellationToken);

            // Anything outside 2xx is a failure. That includes 3xx: redirects are not followed here,
            // to avoid redirect chains to non-html.
            if (!res.IsSuccessStatusCode)
            {
                return null;
            }

            // Content-Type is deliberately not enforced: many sites omit it. Best-effort.

            // Reject early when the server announces an oversized body.
            if (res.Content.Headers.ContentLength > MaxHtmlBytes)
            {
                return null;
            }

            // Stream the body so the cap is enforced while downloading, not after buffering everything.
            await using var body = await res.Content.ReadAsStreamAsync(cancellationToken);
            using var buffer = new System.IO.MemoryStream();
            var chunk = new byte[81920];
            int read;
            while ((read = await body.ReadAsync(chunk, cancellationToken)) > 0)
            {
                if (buffer.Length + read > MaxHtmlBytes)
                {
                    return null;
                }

                buffer.Write(chunk, 0, read);
            }

            var encoding = ResolveEncoding(res.Content.Headers.ContentType?.CharSet);
            return encoding.GetString(buffer.GetBuffer(), 0, (int)buffer.Length);
        }
        catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested)
        {
            // Not cancelled by the caller, so this is a client-side timeout: report as a failed fetch.
            return null;
        }
        catch (Exception ex) when (ex is HttpRequestException or System.IO.IOException)
        {
            // DNS/connect/TLS failures and connections dropped mid-body are ordinary fetch failures.
            return null;
        }
    }

    /// <summary>
    /// Maps the charset declared in the Content-Type header to an encoding, defaulting to UTF-8
    /// when the charset is missing or not supported on this platform.
    /// </summary>
    private static System.Text.Encoding ResolveEncoding(string? charset)
    {
        if (string.IsNullOrWhiteSpace(charset))
        {
            return System.Text.Encoding.UTF8;
        }

        try
        {
            // The header value may be quoted, e.g. charset="utf-8".
            return System.Text.Encoding.GetEncoding(charset.Trim().Trim('"', '\''));
        }
        catch (ArgumentException)
        {
            return System.Text.Encoding.UTF8;
        }
    }

    /// <summary>
    /// Checks that <paramref name="url"/> is an absolute http/https URL that does not point at
    /// localhost or a literal private/reserved IP address.
    /// </summary>
    /// <param name="url">Raw user input; may be null or padded with whitespace.</param>
    /// <param name="normalized">The canonical URL string on success; empty otherwise.</param>
    /// <param name="error">A user-facing reason on failure; empty otherwise.</param>
    /// <returns><c>true</c> when the URL is acceptable.</returns>
    /// <remarks>
    /// Only literal IP hosts are inspected. A DNS name that resolves to a private address is not
    /// detected by this method.
    /// </remarks>
    private static bool TryValidateUrl(string? url, out string normalized, out string error)
    {
        normalized = "";
        error = "";

        if (string.IsNullOrWhiteSpace(url))
        {
            error = "URL is required.";
            return false;
        }

        if (!Uri.TryCreate(url.Trim(), UriKind.Absolute, out var uri))
        {
            error = "Invalid URL.";
            return false;
        }

        if (uri.Scheme is not ("http" or "https"))
        {
            error = "Only http/https URLs are supported.";
            return false;
        }

        if (uri.IsLoopback || string.Equals(uri.Host, "localhost", StringComparison.OrdinalIgnoreCase))
        {
            error = "Local URLs are not allowed.";
            return false;
        }

        // Block literal private IPs. DnsSafeHost drops the square brackets that Host keeps around IPv6 literals.
        if (IPAddress.TryParse(uri.DnsSafeHost, out var ip) && IsPrivateIp(ip))
        {
            error = "Private IP URLs are not allowed.";
            return false;
        }

        normalized = uri.ToString();
        return true;
    }

    /// <summary>
    /// Returns <c>true</c> for addresses that must not be fetched on behalf of a user:
    /// loopback, unspecified, RFC 1918 private ranges, link-local, carrier-grade NAT (100.64.0.0/10),
    /// and the IPv6 link-local, site-local and unique-local (fc00::/7) ranges.
    /// IPv4-mapped IPv6 addresses are judged by their embedded IPv4 address.
    /// </summary>
    private static bool IsPrivateIp(IPAddress ip)
    {
        // ::ffff:a.b.c.d would otherwise bypass every IPv4 range check below.
        if (ip.IsIPv4MappedToIPv6)
        {
            ip = ip.MapToIPv4();
        }

        if (IPAddress.IsLoopback(ip))
        {
            return true;
        }

        var b = ip.GetAddressBytes();

        if (ip.AddressFamily == System.Net.Sockets.AddressFamily.InterNetwork)
        {
            return b[0] == 0 ||                                  // 0.0.0.0/8 ("this network")
                   b[0] == 10 ||                                 // 10.0.0.0/8
                   (b[0] == 100 && (b[1] & 0xC0) == 0x40) ||     // 100.64.0.0/10 (shared address space)
                   (b[0] == 172 && b[1] >= 16 && b[1] <= 31) ||  // 172.16.0.0/12
                   (b[0] == 192 && b[1] == 168) ||               // 192.168.0.0/16
                   (b[0] == 169 && b[1] == 254);                 // 169.254.0.0/16 (link-local)
        }

        if (ip.AddressFamily == System.Net.Sockets.AddressFamily.InterNetworkV6)
        {
            return ip.Equals(IPAddress.IPv6Any) ||               // :: (unspecified)
                   ip.IsIPv6LinkLocal ||
                   ip.IsIPv6SiteLocal ||
                   (b[0] & 0xFE) == 0xFC;                        // fc00::/7 (unique local)
        }

        return false;
    }
}