Evolve summarizer into AI service with OCR support

This commit is contained in:
cesnimda
2026-03-23 20:12:34 +01:00
parent 90fdd8e1a5
commit 653f713a78
20 changed files with 475 additions and 129 deletions
@@ -44,7 +44,7 @@ public sealed class AdminSystemController : ControllerBase
DatabaseStatusDto Database,
RuntimeStatusDto Runtime,
AuthStatusDto Auth,
SummarizerMetrics Summarizer
AiServiceMetrics Ai
);
private static string? NormalizeBuildMetadata(string? value)
@@ -62,6 +62,7 @@ public sealed class AdminSystemController : ControllerBase
return trimmed;
}
[HttpPost("ai/probe")]
[HttpPost("summarizer/probe")]
public async Task<IActionResult> RunSummarizerProbe(CancellationToken cancellationToken)
{
@@ -79,7 +80,7 @@ public sealed class AdminSystemController : ControllerBase
var jobs = await _db.JobApplications.AsNoTracking().ToListAsync(cancellationToken);
var companies = await _db.Companies.AsNoTracking().CountAsync(cancellationToken);
var summarizer = await _summarizer.GetMetricsAsync(cancellationToken);
var ai = await _summarizer.GetMetricsAsync(cancellationToken);
var version = NormalizeBuildMetadata(_cfg["App:Version"]);
if (string.IsNullOrWhiteSpace(version))
@@ -180,7 +181,7 @@ public sealed class AdminSystemController : ControllerBase
GoogleConfigured: !string.IsNullOrWhiteSpace((_cfg["Auth:GoogleClientId"] ?? string.Empty).Trim()),
GmailConfigured: gmailConfigured
),
Summarizer: summarizer
Ai: ai
));
}
}
@@ -1838,8 +1838,9 @@ Candidate master CV:
return NoContent();
}
[HttpGet("ai-metrics")]
[HttpGet("summarizer-metrics")]
public async Task<ActionResult<SummarizerMetrics>> GetSummarizerMetrics(CancellationToken cancellationToken)
public async Task<ActionResult<AiServiceMetrics>> GetSummarizerMetrics(CancellationToken cancellationToken)
{
var metrics = await _summarizer.GetMetricsAsync(cancellationToken);
return Ok(metrics);
@@ -1,5 +1,6 @@
using System.Text;
using System.Text.RegularExpressions;
using JobTrackerApi.Services;
using JobTrackerApi.Models;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Identity;
@@ -18,15 +19,21 @@ public sealed class ProfileCvController : ControllerBase
".md",
".pdf",
".docx",
".png",
".jpg",
".jpeg",
".webp",
};
private const long MaxFileSizeBytes = 5 * 1024 * 1024;
private readonly UserManager<ApplicationUser> _users;
private readonly ISummarizerService _aiService;
public ProfileCvController(UserManager<ApplicationUser> users)
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService)
{
_users = users;
_aiService = aiService;
}
[HttpPost("upload")]
@@ -41,10 +48,34 @@ public sealed class ProfileCvController : ControllerBase
var extension = Path.GetExtension(file.FileName ?? string.Empty);
if (!AllowedExtensions.Contains(extension))
{
return BadRequest("Only .txt, .md, .pdf, and .docx CV imports are supported right now.");
return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
}
var text = (await ExtractTextAsync(file, extension)).Trim();
string text;
var canUseAiExtraction = string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".png", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".jpg", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".jpeg", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".webp", StringComparison.OrdinalIgnoreCase);
if (canUseAiExtraction)
{
await using var uploadStream = file.OpenReadStream();
var extracted = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, HttpContext.RequestAborted);
text = extracted?.Text?.Trim() ?? string.Empty;
}
else
{
text = string.Empty;
}
if (string.IsNullOrWhiteSpace(text))
{
text = (await ExtractTextAsync(file, extension)).Trim();
}
if (string.IsNullOrWhiteSpace(text))
{
return BadRequest("The uploaded CV file could not be read or was empty.");
+5 -3
View File
@@ -116,10 +116,12 @@ builder.Services.AddHttpClient("jobimport")
AutomaticDecompression = DecompressionMethods.All
});
// Local summarizer service (FastAPI). Default URL can be overridden via configuration `Summarizer:BaseUrl`.
builder.Services.AddHttpClient("summarizer", client =>
// Local AI service (FastAPI). Supports summarization and OCR/text extraction.
builder.Services.AddHttpClient("ai-service", client =>
{
var baseUrl = builder.Configuration["Summarizer:BaseUrl"] ?? "http://127.0.0.1:8001";
var baseUrl = builder.Configuration["Ai:BaseUrl"]
?? builder.Configuration["Summarizer:BaseUrl"]
?? "http://127.0.0.1:8001";
client.BaseAddress = new Uri(baseUrl);
client.Timeout = TimeSpan.FromSeconds(30);
});
+130 -45
View File
@@ -13,12 +13,14 @@ using Microsoft.Extensions.Logging;
namespace JobTrackerApi.Services
{
public sealed record SummarizerMetrics(
public sealed record AiServiceMetrics(
bool Healthy,
string? Model,
string? Device,
bool? GpuAvailable,
string? GpuName,
bool? OcrAvailable,
string? OcrLanguages,
double? HealthLatencyMs,
double? ProbeLatencyMs,
DateTimeOffset? LastProbeAt,
@@ -30,17 +32,36 @@ namespace JobTrackerApi.Services
int CacheMisses,
int Failures,
double? AverageLatencyMs,
int OcrRequests,
int OcrFailures,
double? AverageOcrLatencyMs,
DateTimeOffset? LastOcrSuccessAt,
DateTimeOffset? LastOcrFailureAt,
DateTimeOffset? LastSuccessAt,
DateTimeOffset? LastFailureAt,
string? LastError
);
public interface ISummarizerService
/// <summary>
/// Result of a text-extraction call to the AI service, as built by
/// <c>SummarizerService.ExtractTextAsync</c> from the service's JSON response.
/// </summary>
/// <param name="Text">Extracted text (JSON <c>text</c>); null when the response has no such property.</param>
/// <param name="OcrUsed">Value of the JSON <c>ocr_used</c> flag; false when the flag is missing or not a boolean.</param>
/// <param name="ContentType">JSON <c>content_type</c>; when absent, the content type supplied by the caller.</param>
/// <param name="PageCount">JSON <c>page_count</c> when it is a number; otherwise null.</param>
/// <param name="Characters">JSON <c>characters</c> when it is a number; otherwise the length of <paramref name="Text"/> (0 when null).</param>
/// <param name="FileName">JSON <c>file_name</c>; when absent, the file name supplied by the caller.</param>
public sealed record AiTextExtractionResult(
string? Text,
bool OcrUsed,
string? ContentType,
int? PageCount,
int Characters,
string? FileName
);
/// <summary>
/// Client-side contract for the local AI service: summarization, text extraction,
/// a latency probe, and a metrics snapshot.
/// </summary>
public interface IAiService
{
/// <summary>Requests a summary of <paramref name="text"/> bounded by the given length limits; the result may be null.</summary>
Task<string?> SummarizeAsync(string text, int maxLength = 150, int minLength = 30);
/// <summary>Requests a summary of <paramref name="text"/> guided by <paramref name="instruction"/>; the result may be null.</summary>
Task<string?> SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40);
/// <summary>Uploads <paramref name="stream"/> for text extraction; the result may be null when extraction does not succeed.</summary>
Task<AiTextExtractionResult?> ExtractTextAsync(Stream stream, string fileName, string? contentType = null, CancellationToken cancellationToken = default);
/// <summary>Runs a probe request against the AI service.</summary>
Task RunProbeAsync(CancellationToken cancellationToken = default);
// NOTE(review): the SummarizerMetrics signature below looks like the pre-rename line shown by the diff
// (the record is renamed to AiServiceMetrics elsewhere in this change) — confirm it is absent from the final file.
Task<SummarizerMetrics> GetMetricsAsync(CancellationToken cancellationToken = default);
/// <summary>Returns a snapshot of the AI service's health information and request counters.</summary>
Task<AiServiceMetrics> GetMetricsAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Adds no members of its own; everything is inherited from <see cref="IAiService"/>.
/// Consumers in this change still resolve the service through this interface name.
/// </summary>
public interface ISummarizerService : IAiService
{
}
public class SummarizerService : ISummarizerService
@@ -60,6 +81,11 @@ namespace JobTrackerApi.Services
private DateTimeOffset? _lastProbeSuccessAt;
private DateTimeOffset? _lastProbeFailureAt;
private int _probeFailures;
private int _ocrRequests;
private int _ocrFailures;
private long _totalOcrLatencyTicks;
private DateTimeOffset? _lastOcrSuccessAt;
private DateTimeOffset? _lastOcrFailureAt;
private string? _lastError;
public SummarizerService(IHttpClientFactory httpFactory, IMemoryCache cache)
@@ -78,22 +104,18 @@ namespace JobTrackerApi.Services
/// <summary>
/// Summarizes <paramref name="text"/> via the shared summarization core.
/// </summary>
/// <param name="text">Text to summarize.</param>
/// <param name="maxLength">Upper length bound forwarded to the summarization core.</param>
/// <param name="minLength">Lower length bound forwarded to the summarization core.</param>
/// <returns>Null when <paramref name="text"/> is null or whitespace; otherwise the core's result.</returns>
public async Task<string?> SummarizeAsync(string text, int maxLength = 150, int minLength = 30)
{
    return string.IsNullOrWhiteSpace(text)
        ? null
        : await SummarizeCoreAsync(text, maxLength, minLength);
}
/// <summary>
/// Summarizes <paramref name="text"/> with <paramref name="instruction"/> prepended as guidance.
/// </summary>
/// <param name="instruction">Instruction placed before the text; trimmed before use.</param>
/// <param name="text">Text to summarize; trimmed before use.</param>
/// <param name="maxLength">Upper length bound forwarded to the summarization core.</param>
/// <param name="minLength">Lower length bound forwarded to the summarization core.</param>
/// <returns>A completed task holding null when either input is null or whitespace; otherwise the core's task.</returns>
public Task<string?> SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40)
{
    var hasInstruction = !string.IsNullOrWhiteSpace(instruction);
    var hasText = !string.IsNullOrWhiteSpace(text);
    if (!(hasInstruction && hasText))
    {
        return Task.FromResult<string?>(null);
    }

    // Instruction and text are separated by one blank line.
    var trimmedInstruction = instruction.Trim();
    var trimmedText = text.Trim();
    var prompt = $"{trimmedInstruction}\n\n{trimmedText}";
    return SummarizeCoreAsync(prompt, maxLength, minLength);
}
private async Task<string?> SummarizeCoreAsync(string text, int maxLength, int minLength)
{
// Use a deterministic content hash instead of string.GetHashCode() so cache keys
// are collision-resistant and stable across process restarts.
var key = BuildCacheKey(text, maxLength, minLength);
Interlocked.Increment(ref _requests);
@@ -110,7 +132,7 @@ namespace JobTrackerApi.Services
Interlocked.Increment(ref _cacheMisses);
var client = _httpFactory.CreateClient("summarizer");
var client = _httpFactory.CreateClient("ai-service");
var payload = JsonSerializer.Serialize(new { text, max_length = maxLength, min_length = minLength });
using var content = new StringContent(payload, Encoding.UTF8, "application/json");
var sw = Stopwatch.StartNew();
@@ -152,10 +174,74 @@ namespace JobTrackerApi.Services
}
}
/// <summary>
/// Uploads <paramref name="stream"/> to the AI service's <c>/extract-text</c> endpoint as a
/// multipart form and maps the JSON response to an <see cref="AiTextExtractionResult"/>.
/// OCR request, failure, and latency counters are updated on every call.
/// </summary>
/// <param name="stream">Content to upload; must not be null.</param>
/// <param name="fileName">File name sent with the upload; a blank value is replaced with "document".</param>
/// <param name="contentType">Optional media type for the uploaded part; skipped when blank or unparseable.</param>
/// <param name="cancellationToken">Token passed to the HTTP request and to response parsing.</param>
/// <returns>
/// The extraction result, or null when the service answers with a non-success status or when any
/// exception (including cancellation) is thrown while sending or parsing.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
public async Task<AiTextExtractionResult?> ExtractTextAsync(Stream stream, string fileName, string? contentType = null, CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(stream);
    if (string.IsNullOrWhiteSpace(fileName))
    {
        fileName = "document";
    }

    Interlocked.Increment(ref _ocrRequests);
    var client = _httpFactory.CreateClient("ai-service");
    var sw = Stopwatch.StartNew();
    var latencyRecorded = false;

    try
    {
        using var form = new MultipartFormDataContent();
        using var fileContent = new StreamContent(stream);

        // TryParse instead of the constructor: the constructor throws FormatException for values
        // that carry parameters (e.g. "text/plain; charset=utf-8"), which would fail the whole
        // extraction. An unparseable value is simply not sent.
        if (!string.IsNullOrWhiteSpace(contentType)
            && System.Net.Http.Headers.MediaTypeHeaderValue.TryParse(contentType, out var parsedContentType))
        {
            fileContent.Headers.ContentType = parsedContentType;
        }

        form.Add(fileContent, "file", fileName);

        using var response = await client.PostAsync("/extract-text", form, cancellationToken);
        RecordLatency();

        if (!response.IsSuccessStatusCode)
        {
            RecordFailure($"AI extraction returned {(int)response.StatusCode}.");
            return null;
        }

        await using var responseStream = await response.Content.ReadAsStreamAsync(cancellationToken);
        using var doc = await JsonDocument.ParseAsync(responseStream, cancellationToken: cancellationToken);
        var root = doc.RootElement;

        var text = root.TryGetProperty("text", out var textEl) ? textEl.GetString() : null;
        var ocrUsed = root.TryGetProperty("ocr_used", out var ocrEl)
            && ocrEl.ValueKind is JsonValueKind.True or JsonValueKind.False
            && ocrEl.GetBoolean();
        var detectedContentType = root.TryGetProperty("content_type", out var contentTypeEl)
            ? contentTypeEl.GetString()
            : contentType;
        int? pageCount = root.TryGetProperty("page_count", out var pageCountEl) && pageCountEl.ValueKind == JsonValueKind.Number
            ? pageCountEl.GetInt32()
            : null;
        var characters = root.TryGetProperty("characters", out var charactersEl) && charactersEl.ValueKind == JsonValueKind.Number
            ? charactersEl.GetInt32()
            : (text?.Length ?? 0);
        var returnedFileName = root.TryGetProperty("file_name", out var fileNameEl)
            ? fileNameEl.GetString()
            : fileName;

        lock (_metricsLock)
        {
            _lastOcrSuccessAt = DateTimeOffset.UtcNow;
            _lastError = null;
        }

        return new AiTextExtractionResult(text, ocrUsed, detectedContentType, pageCount, characters, returnedFileName);
    }
    catch (Exception ex)
    {
        // Best-effort: every failure is recorded in the metrics and reported to the caller as null.
        RecordLatency();
        RecordFailure(ex.Message);
        return null;
    }

    // Adds this call's elapsed time to the OCR latency total exactly once, even when an exception
    // is thrown after the response has already been timed.
    void RecordLatency()
    {
        if (latencyRecorded)
        {
            return;
        }

        latencyRecorded = true;
        sw.Stop();
        // Elapsed.Ticks (TimeSpan ticks) rather than ElapsedTicks (Stopwatch ticks): the total is
        // later converted with TimeSpan.FromTicks when metrics are built.
        Interlocked.Add(ref _totalOcrLatencyTicks, sw.Elapsed.Ticks);
    }

    // Counts one OCR failure and stores its time and message for the metrics snapshot.
    void RecordFailure(string message)
    {
        Interlocked.Increment(ref _ocrFailures);
        lock (_metricsLock)
        {
            _lastOcrFailureAt = DateTimeOffset.UtcNow;
            _lastError = message;
        }
    }
}
public async Task RunProbeAsync(CancellationToken cancellationToken = default)
{
const string probeText = "Summarizer latency probe for job tracker telemetry.";
var client = _httpFactory.CreateClient("summarizer");
const string probeText = "AI service latency probe for Jobbjakt telemetry.";
var client = _httpFactory.CreateClient("ai-service");
var payload = JsonSerializer.Serialize(new { text = probeText, max_length = 48, min_length = 12 });
using var content = new StringContent(payload, Encoding.UTF8, "application/json");
var sw = Stopwatch.StartNew();
@@ -215,13 +301,15 @@ namespace JobTrackerApi.Services
}
}
public async Task<SummarizerMetrics> GetMetricsAsync(CancellationToken cancellationToken = default)
public async Task<AiServiceMetrics> GetMetricsAsync(CancellationToken cancellationToken = default)
{
var client = _httpFactory.CreateClient("summarizer");
var client = _httpFactory.CreateClient("ai-service");
string? model = null;
string? device = null;
bool? gpuAvailable = null;
string? gpuName = null;
bool? ocrAvailable = null;
string? ocrLanguages = null;
double? healthLatencyMs = null;
var healthy = false;
string? healthError = null;
@@ -238,25 +326,12 @@ namespace JobTrackerApi.Services
{
using var stream = await res.Content.ReadAsStreamAsync(cancellationToken);
using var doc = await JsonDocument.ParseAsync(stream, cancellationToken: cancellationToken);
if (doc.RootElement.TryGetProperty("model", out var modelEl))
{
model = modelEl.GetString();
}
if (doc.RootElement.TryGetProperty("device", out var deviceEl))
{
device = deviceEl.GetString();
}
if (doc.RootElement.TryGetProperty("gpu_available", out var gpuAvailableEl) && gpuAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False)
{
gpuAvailable = gpuAvailableEl.GetBoolean();
}
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl))
{
gpuName = gpuNameEl.GetString();
}
if (doc.RootElement.TryGetProperty("model", out var modelEl)) model = modelEl.GetString();
if (doc.RootElement.TryGetProperty("device", out var deviceEl)) device = deviceEl.GetString();
if (doc.RootElement.TryGetProperty("gpu_available", out var gpuAvailableEl) && gpuAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) gpuAvailable = gpuAvailableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString();
if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString();
}
else
{
@@ -273,6 +348,9 @@ namespace JobTrackerApi.Services
var cacheMisses = Volatile.Read(ref _cacheMisses);
var failures = Volatile.Read(ref _failures);
var totalLatencyTicks = Volatile.Read(ref _totalLatencyTicks);
var ocrRequests = Volatile.Read(ref _ocrRequests);
var ocrFailures = Volatile.Read(ref _ocrFailures);
var totalOcrLatencyTicks = Volatile.Read(ref _totalOcrLatencyTicks);
DateTimeOffset? lastSuccessAt;
DateTimeOffset? lastFailureAt;
@@ -280,6 +358,8 @@ namespace JobTrackerApi.Services
DateTimeOffset? lastProbeAt;
DateTimeOffset? lastProbeSuccessAt;
DateTimeOffset? lastProbeFailureAt;
DateTimeOffset? lastOcrSuccessAt;
DateTimeOffset? lastOcrFailureAt;
string? lastError;
lock (_metricsLock)
{
@@ -289,6 +369,8 @@ namespace JobTrackerApi.Services
lastProbeAt = _lastProbeAt;
lastProbeSuccessAt = _lastProbeSuccessAt;
lastProbeFailureAt = _lastProbeFailureAt;
lastOcrSuccessAt = _lastOcrSuccessAt;
lastOcrFailureAt = _lastOcrFailureAt;
lastError = _lastError;
}
@@ -297,16 +379,17 @@ namespace JobTrackerApi.Services
lastError = healthError;
}
double? averageLatencyMs = requests > 0
? Math.Round(TimeSpan.FromTicks(totalLatencyTicks).TotalMilliseconds / requests, 1)
: null;
double? averageLatencyMs = requests > 0 ? Math.Round(TimeSpan.FromTicks(totalLatencyTicks).TotalMilliseconds / requests, 1) : null;
double? averageOcrLatencyMs = ocrRequests > 0 ? Math.Round(TimeSpan.FromTicks(totalOcrLatencyTicks).TotalMilliseconds / ocrRequests, 1) : null;
return new SummarizerMetrics(
return new AiServiceMetrics(
Healthy: healthy,
Model: model,
Device: device,
GpuAvailable: gpuAvailable,
GpuName: gpuName,
OcrAvailable: ocrAvailable,
OcrLanguages: ocrLanguages,
HealthLatencyMs: healthLatencyMs,
ProbeLatencyMs: probeLatencyMs,
LastProbeAt: lastProbeAt,
@@ -318,6 +401,11 @@ namespace JobTrackerApi.Services
CacheMisses: cacheMisses,
Failures: failures,
AverageLatencyMs: averageLatencyMs,
OcrRequests: ocrRequests,
OcrFailures: ocrFailures,
AverageOcrLatencyMs: averageOcrLatencyMs,
LastOcrSuccessAt: lastOcrSuccessAt,
LastOcrFailureAt: lastOcrFailureAt,
LastSuccessAt: lastSuccessAt,
LastFailureAt: lastFailureAt,
LastError: lastError
@@ -340,14 +428,11 @@ namespace JobTrackerApi.Services
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
var enabled = _cfg.GetValue("Summarizer:ProbeEnabled", true);
if (!enabled)
{
return;
}
var enabled = _cfg.GetValue("Ai:ProbeEnabled", _cfg.GetValue("Summarizer:ProbeEnabled", true));
if (!enabled) return;
var intervalSeconds = Math.Clamp(_cfg.GetValue("Summarizer:ProbeIntervalSeconds", 300), 30, 3600);
var initialDelaySeconds = Math.Clamp(_cfg.GetValue("Summarizer:ProbeInitialDelaySeconds", 15), 0, 600);
var intervalSeconds = Math.Clamp(_cfg.GetValue("Ai:ProbeIntervalSeconds", _cfg.GetValue("Summarizer:ProbeIntervalSeconds", 300)), 30, 3600);
var initialDelaySeconds = Math.Clamp(_cfg.GetValue("Ai:ProbeInitialDelaySeconds", _cfg.GetValue("Summarizer:ProbeInitialDelaySeconds", 15)), 0, 600);
if (initialDelaySeconds > 0)
{
@@ -360,8 +445,8 @@ namespace JobTrackerApi.Services
try
{
using var scope = _scopeFactory.CreateScope();
var summarizer = scope.ServiceProvider.GetRequiredService<ISummarizerService>();
await summarizer.RunProbeAsync(stoppingToken);
var aiService = scope.ServiceProvider.GetRequiredService<ISummarizerService>();
await aiService.RunProbeAsync(stoppingToken);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
@@ -369,7 +454,7 @@ namespace JobTrackerApi.Services
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Summarizer latency probe failed.");
_logger.LogWarning(ex, "AI service latency probe failed.");
}
}
while (await timer.WaitForNextTickAsync(stoppingToken));