Evolve summarizer into AI service with OCR support

This commit is contained in:
cesnimda
2026-03-23 20:12:34 +01:00
parent 90fdd8e1a5
commit 653f713a78
20 changed files with 475 additions and 129 deletions
+1 -1
View File
@@ -8,7 +8,7 @@ AUTH_GOOGLE_CLIENT_ID=CHANGE_ME_GOOGLE_CLIENT_ID
GOOGLE_GMAIL_CLIENT_SECRET=CHANGE_ME_GOOGLE_OAUTH_CLIENT_SECRET GOOGLE_GMAIL_CLIENT_SECRET=CHANGE_ME_GOOGLE_OAUTH_CLIENT_SECRET
# Optional. If omitted, the backend uses https://<your-domain>/api/gmail/oauth/callback # Optional. If omitted, the backend uses https://<your-domain>/api/gmail/oauth/callback
GOOGLE_GMAIL_REDIRECT_URI= GOOGLE_GMAIL_REDIRECT_URI=
SUMMARIZER_BASE_URL=http://summarizer:8001 AI_SERVICE_BASE_URL=http://ai-service:8001
# Optional: only needed if you want the UI to call a non-default API base URL. # Optional: only needed if you want the UI to call a non-default API base URL.
# In production the UI defaults to `/api`. # In production the UI defaults to `/api`.
+2 -2
View File
@@ -37,7 +37,7 @@ jobs:
- name: Test frontend - name: Test frontend
working-directory: job-tracker-ui working-directory: job-tracker-ui
run: npm test -- --watchAll=false --runInBand App.test.tsx confirm.test.tsx prompt.test.tsx dialog-flow.test.tsx confirm-flow.test.tsx attachments.test.tsx job-details-generated-drafts.test.tsx run: npm test -- --watchAll=false --runInBand App.test.tsx confirm.test.tsx prompt.test.tsx dialog-flow.test.tsx confirm-flow.test.tsx attachments.test.tsx job-details-generated-drafts.test.tsx admin-system-page.test.tsx
- name: Build frontend - name: Build frontend
working-directory: job-tracker-ui working-directory: job-tracker-ui
@@ -76,7 +76,7 @@ jobs:
APP_BUILD_STAMP="$(date -u +'%Y-%m-%d %H:%M UTC')" \ APP_BUILD_STAMP="$(date -u +'%Y-%m-%d %H:%M UTC')" \
./deploy/deploy.sh ./deploy/deploy.sh
docker compose ps docker compose ps
docker compose exec -T summarizer python -c "import time, urllib.request; deadline=time.time()+60; last=None docker compose exec -T ai-service python -c "import time, urllib.request; deadline=time.time()+60; last=None
for _ in range(30): for _ in range(30):
try: try:
urllib.request.urlopen('http://127.0.0.1:8001/health', timeout=5).read() urllib.request.urlopen('http://127.0.0.1:8001/health', timeout=5).read()
@@ -44,7 +44,7 @@ public sealed class AdminSystemController : ControllerBase
DatabaseStatusDto Database, DatabaseStatusDto Database,
RuntimeStatusDto Runtime, RuntimeStatusDto Runtime,
AuthStatusDto Auth, AuthStatusDto Auth,
SummarizerMetrics Summarizer AiServiceMetrics Ai
); );
private static string? NormalizeBuildMetadata(string? value) private static string? NormalizeBuildMetadata(string? value)
@@ -62,6 +62,7 @@ public sealed class AdminSystemController : ControllerBase
return trimmed; return trimmed;
} }
[HttpPost("ai/probe")]
[HttpPost("summarizer/probe")] [HttpPost("summarizer/probe")]
public async Task<IActionResult> RunSummarizerProbe(CancellationToken cancellationToken) public async Task<IActionResult> RunSummarizerProbe(CancellationToken cancellationToken)
{ {
@@ -79,7 +80,7 @@ public sealed class AdminSystemController : ControllerBase
var jobs = await _db.JobApplications.AsNoTracking().ToListAsync(cancellationToken); var jobs = await _db.JobApplications.AsNoTracking().ToListAsync(cancellationToken);
var companies = await _db.Companies.AsNoTracking().CountAsync(cancellationToken); var companies = await _db.Companies.AsNoTracking().CountAsync(cancellationToken);
var summarizer = await _summarizer.GetMetricsAsync(cancellationToken); var ai = await _summarizer.GetMetricsAsync(cancellationToken);
var version = NormalizeBuildMetadata(_cfg["App:Version"]); var version = NormalizeBuildMetadata(_cfg["App:Version"]);
if (string.IsNullOrWhiteSpace(version)) if (string.IsNullOrWhiteSpace(version))
@@ -180,7 +181,7 @@ public sealed class AdminSystemController : ControllerBase
GoogleConfigured: !string.IsNullOrWhiteSpace((_cfg["Auth:GoogleClientId"] ?? string.Empty).Trim()), GoogleConfigured: !string.IsNullOrWhiteSpace((_cfg["Auth:GoogleClientId"] ?? string.Empty).Trim()),
GmailConfigured: gmailConfigured GmailConfigured: gmailConfigured
), ),
Summarizer: summarizer Ai: ai
)); ));
} }
} }
@@ -1838,8 +1838,9 @@ Candidate master CV:
return NoContent(); return NoContent();
} }
[HttpGet("ai-metrics")]
[HttpGet("summarizer-metrics")] [HttpGet("summarizer-metrics")]
public async Task<ActionResult<SummarizerMetrics>> GetSummarizerMetrics(CancellationToken cancellationToken) public async Task<ActionResult<AiServiceMetrics>> GetSummarizerMetrics(CancellationToken cancellationToken)
{ {
var metrics = await _summarizer.GetMetricsAsync(cancellationToken); var metrics = await _summarizer.GetMetricsAsync(cancellationToken);
return Ok(metrics); return Ok(metrics);
@@ -1,5 +1,6 @@
using System.Text; using System.Text;
using System.Text.RegularExpressions; using System.Text.RegularExpressions;
using JobTrackerApi.Services;
using JobTrackerApi.Models; using JobTrackerApi.Models;
using Microsoft.AspNetCore.Authorization; using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Identity; using Microsoft.AspNetCore.Identity;
@@ -18,15 +19,21 @@ public sealed class ProfileCvController : ControllerBase
".md", ".md",
".pdf", ".pdf",
".docx", ".docx",
".png",
".jpg",
".jpeg",
".webp",
}; };
private const long MaxFileSizeBytes = 5 * 1024 * 1024; private const long MaxFileSizeBytes = 5 * 1024 * 1024;
private readonly UserManager<ApplicationUser> _users; private readonly UserManager<ApplicationUser> _users;
private readonly ISummarizerService _aiService;
public ProfileCvController(UserManager<ApplicationUser> users) public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService)
{ {
_users = users; _users = users;
_aiService = aiService;
} }
[HttpPost("upload")] [HttpPost("upload")]
@@ -41,10 +48,34 @@ public sealed class ProfileCvController : ControllerBase
var extension = Path.GetExtension(file.FileName ?? string.Empty); var extension = Path.GetExtension(file.FileName ?? string.Empty);
if (!AllowedExtensions.Contains(extension)) if (!AllowedExtensions.Contains(extension))
{ {
return BadRequest("Only .txt, .md, .pdf, and .docx CV imports are supported right now."); return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
} }
var text = (await ExtractTextAsync(file, extension)).Trim(); string text;
var canUseAiExtraction = string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".png", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".jpg", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".jpeg", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".webp", StringComparison.OrdinalIgnoreCase);
if (canUseAiExtraction)
{
await using var uploadStream = file.OpenReadStream();
var extracted = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, HttpContext.RequestAborted);
text = extracted?.Text?.Trim() ?? string.Empty;
}
else
{
text = string.Empty;
}
if (string.IsNullOrWhiteSpace(text))
{
text = (await ExtractTextAsync(file, extension)).Trim();
}
if (string.IsNullOrWhiteSpace(text)) if (string.IsNullOrWhiteSpace(text))
{ {
return BadRequest("The uploaded CV file could not be read or was empty."); return BadRequest("The uploaded CV file could not be read or was empty.");
+5 -3
View File
@@ -116,10 +116,12 @@ builder.Services.AddHttpClient("jobimport")
AutomaticDecompression = DecompressionMethods.All AutomaticDecompression = DecompressionMethods.All
}); });
// Local summarizer service (FastAPI). Default URL can be overridden via configuration `Summarizer:BaseUrl`. // Local AI service (FastAPI). Supports summarization and OCR/text extraction.
builder.Services.AddHttpClient("summarizer", client => builder.Services.AddHttpClient("ai-service", client =>
{ {
var baseUrl = builder.Configuration["Summarizer:BaseUrl"] ?? "http://127.0.0.1:8001"; var baseUrl = builder.Configuration["Ai:BaseUrl"]
?? builder.Configuration["Summarizer:BaseUrl"]
?? "http://127.0.0.1:8001";
client.BaseAddress = new Uri(baseUrl); client.BaseAddress = new Uri(baseUrl);
client.Timeout = TimeSpan.FromSeconds(30); client.Timeout = TimeSpan.FromSeconds(30);
}); });
+130 -45
View File
@@ -13,12 +13,14 @@ using Microsoft.Extensions.Logging;
namespace JobTrackerApi.Services namespace JobTrackerApi.Services
{ {
public sealed record SummarizerMetrics( public sealed record AiServiceMetrics(
bool Healthy, bool Healthy,
string? Model, string? Model,
string? Device, string? Device,
bool? GpuAvailable, bool? GpuAvailable,
string? GpuName, string? GpuName,
bool? OcrAvailable,
string? OcrLanguages,
double? HealthLatencyMs, double? HealthLatencyMs,
double? ProbeLatencyMs, double? ProbeLatencyMs,
DateTimeOffset? LastProbeAt, DateTimeOffset? LastProbeAt,
@@ -30,17 +32,36 @@ namespace JobTrackerApi.Services
int CacheMisses, int CacheMisses,
int Failures, int Failures,
double? AverageLatencyMs, double? AverageLatencyMs,
int OcrRequests,
int OcrFailures,
double? AverageOcrLatencyMs,
DateTimeOffset? LastOcrSuccessAt,
DateTimeOffset? LastOcrFailureAt,
DateTimeOffset? LastSuccessAt, DateTimeOffset? LastSuccessAt,
DateTimeOffset? LastFailureAt, DateTimeOffset? LastFailureAt,
string? LastError string? LastError
); );
public interface ISummarizerService public sealed record AiTextExtractionResult(
string? Text,
bool OcrUsed,
string? ContentType,
int? PageCount,
int Characters,
string? FileName
);
public interface IAiService
{ {
Task<string?> SummarizeAsync(string text, int maxLength = 150, int minLength = 30); Task<string?> SummarizeAsync(string text, int maxLength = 150, int minLength = 30);
Task<string?> SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40); Task<string?> SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40);
Task<AiTextExtractionResult?> ExtractTextAsync(Stream stream, string fileName, string? contentType = null, CancellationToken cancellationToken = default);
Task RunProbeAsync(CancellationToken cancellationToken = default); Task RunProbeAsync(CancellationToken cancellationToken = default);
Task<SummarizerMetrics> GetMetricsAsync(CancellationToken cancellationToken = default); Task<AiServiceMetrics> GetMetricsAsync(CancellationToken cancellationToken = default);
}
public interface ISummarizerService : IAiService
{
} }
public class SummarizerService : ISummarizerService public class SummarizerService : ISummarizerService
@@ -60,6 +81,11 @@ namespace JobTrackerApi.Services
private DateTimeOffset? _lastProbeSuccessAt; private DateTimeOffset? _lastProbeSuccessAt;
private DateTimeOffset? _lastProbeFailureAt; private DateTimeOffset? _lastProbeFailureAt;
private int _probeFailures; private int _probeFailures;
private int _ocrRequests;
private int _ocrFailures;
private long _totalOcrLatencyTicks;
private DateTimeOffset? _lastOcrSuccessAt;
private DateTimeOffset? _lastOcrFailureAt;
private string? _lastError; private string? _lastError;
public SummarizerService(IHttpClientFactory httpFactory, IMemoryCache cache) public SummarizerService(IHttpClientFactory httpFactory, IMemoryCache cache)
@@ -78,22 +104,18 @@ namespace JobTrackerApi.Services
public async Task<string?> SummarizeAsync(string text, int maxLength = 150, int minLength = 30) public async Task<string?> SummarizeAsync(string text, int maxLength = 150, int minLength = 30)
{ {
if (string.IsNullOrWhiteSpace(text)) return null; if (string.IsNullOrWhiteSpace(text)) return null;
return await SummarizeCoreAsync(text, maxLength, minLength); return await SummarizeCoreAsync(text, maxLength, minLength);
} }
public Task<string?> SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40) public Task<string?> SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40)
{ {
if (string.IsNullOrWhiteSpace(instruction) || string.IsNullOrWhiteSpace(text)) return Task.FromResult<string?>(null); if (string.IsNullOrWhiteSpace(instruction) || string.IsNullOrWhiteSpace(text)) return Task.FromResult<string?>(null);
var composed = $"{instruction.Trim()}\n\n{text.Trim()}"; var composed = $"{instruction.Trim()}\n\n{text.Trim()}";
return SummarizeCoreAsync(composed, maxLength, minLength); return SummarizeCoreAsync(composed, maxLength, minLength);
} }
private async Task<string?> SummarizeCoreAsync(string text, int maxLength, int minLength) private async Task<string?> SummarizeCoreAsync(string text, int maxLength, int minLength)
{ {
// Use a deterministic content hash instead of string.GetHashCode() so cache keys
// are collision-resistant and stable across process restarts.
var key = BuildCacheKey(text, maxLength, minLength); var key = BuildCacheKey(text, maxLength, minLength);
Interlocked.Increment(ref _requests); Interlocked.Increment(ref _requests);
@@ -110,7 +132,7 @@ namespace JobTrackerApi.Services
Interlocked.Increment(ref _cacheMisses); Interlocked.Increment(ref _cacheMisses);
var client = _httpFactory.CreateClient("summarizer"); var client = _httpFactory.CreateClient("ai-service");
var payload = JsonSerializer.Serialize(new { text, max_length = maxLength, min_length = minLength }); var payload = JsonSerializer.Serialize(new { text, max_length = maxLength, min_length = minLength });
using var content = new StringContent(payload, Encoding.UTF8, "application/json"); using var content = new StringContent(payload, Encoding.UTF8, "application/json");
var sw = Stopwatch.StartNew(); var sw = Stopwatch.StartNew();
@@ -152,10 +174,74 @@ namespace JobTrackerApi.Services
} }
} }
/// <summary>
/// Uploads a document to the AI service's <c>/extract-text</c> endpoint as multipart form data
/// and maps the JSON response onto an <see cref="AiTextExtractionResult"/>.
/// </summary>
/// <param name="stream">Readable stream with the document bytes; sent as the <c>file</c> form part.</param>
/// <param name="fileName">File name sent with the form part; blank values are replaced with <c>"document"</c>.</param>
/// <param name="contentType">Optional media type for the form part. Values that cannot be parsed are omitted rather than failing the upload.</param>
/// <param name="cancellationToken">Token passed to the HTTP call and to response parsing.</param>
/// <returns>
/// The extraction result, or <c>null</c> when the service answers with a non-success status
/// or any exception occurs (best-effort: failures are recorded in the OCR metrics, not thrown).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
public async Task<AiTextExtractionResult?> ExtractTextAsync(Stream stream, string fileName, string? contentType = null, CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(stream);
    if (string.IsNullOrWhiteSpace(fileName)) fileName = "document";

    Interlocked.Increment(ref _ocrRequests);
    var client = _httpFactory.CreateClient("ai-service");
    var sw = Stopwatch.StartNew();
    var latencyRecorded = false;

    // Adds the elapsed time to the running total exactly once per call. Without the guard, an
    // exception thrown after the response arrived (e.g. while parsing JSON) was counted twice.
    // Uses Elapsed.Ticks (TimeSpan ticks, 100 ns) rather than Stopwatch.ElapsedTicks, because the
    // total is later converted with TimeSpan.FromTicks and Stopwatch ticks use a different unit
    // whenever Stopwatch.Frequency is not 10 MHz.
    void RecordLatency()
    {
        if (latencyRecorded) return;
        latencyRecorded = true;
        sw.Stop();
        Interlocked.Add(ref _totalOcrLatencyTicks, sw.Elapsed.Ticks);
    }

    // Reads an optional string property: missing or non-string values yield the fallback,
    // an explicit JSON null yields null.
    static string? ReadString(JsonElement root, string name, string? fallback)
    {
        if (!root.TryGetProperty(name, out var element)) return fallback;
        return element.ValueKind switch
        {
            JsonValueKind.String => element.GetString(),
            JsonValueKind.Null => null,
            _ => fallback,
        };
    }

    // Reads an optional integer property; non-numeric, fractional or out-of-range values yield null
    // instead of throwing and discarding the whole extraction.
    static int? ReadInt32(JsonElement root, string name)
    {
        return root.TryGetProperty(name, out var element)
            && element.ValueKind == JsonValueKind.Number
            && element.TryGetInt32(out var value)
            ? (int?)value
            : null;
    }

    try
    {
        using var form = new MultipartFormDataContent();
        using var fileContent = new StreamContent(stream);

        // TryParse accepts values with parameters (e.g. "text/plain; charset=utf-8") and simply
        // skips the header for malformed input; the constructor would throw in both cases.
        if (!string.IsNullOrWhiteSpace(contentType)
            && System.Net.Http.Headers.MediaTypeHeaderValue.TryParse(contentType, out var parsedContentType))
        {
            fileContent.Headers.ContentType = parsedContentType;
        }

        form.Add(fileContent, "file", fileName);

        using var response = await client.PostAsync("/extract-text", form, cancellationToken);
        RecordLatency();

        if (!response.IsSuccessStatusCode)
        {
            Interlocked.Increment(ref _ocrFailures);
            lock (_metricsLock)
            {
                _lastOcrFailureAt = DateTimeOffset.UtcNow;
                _lastError = $"AI extraction returned {(int)response.StatusCode}.";
            }
            return null;
        }

        await using var responseStream = await response.Content.ReadAsStreamAsync(cancellationToken);
        using var doc = await JsonDocument.ParseAsync(responseStream, cancellationToken: cancellationToken);
        var root = doc.RootElement;

        var text = ReadString(root, "text", null);
        var ocrUsed = root.TryGetProperty("ocr_used", out var ocrEl) && ocrEl.ValueKind == JsonValueKind.True;
        var detectedContentType = ReadString(root, "content_type", contentType);
        var pageCount = ReadInt32(root, "page_count");
        // Fall back to the length of the returned text when the service omits a usable count.
        var characters = ReadInt32(root, "characters") ?? (text?.Length ?? 0);
        var returnedFileName = ReadString(root, "file_name", fileName);

        lock (_metricsLock)
        {
            _lastOcrSuccessAt = DateTimeOffset.UtcNow;
            _lastError = null;
        }

        return new AiTextExtractionResult(text, ocrUsed, detectedContentType, pageCount, characters, returnedFileName);
    }
    catch (Exception ex)
    {
        // Best-effort contract: record the failure and return null instead of propagating.
        RecordLatency();
        Interlocked.Increment(ref _ocrFailures);
        lock (_metricsLock)
        {
            _lastOcrFailureAt = DateTimeOffset.UtcNow;
            _lastError = ex.Message;
        }
        return null;
    }
}
public async Task RunProbeAsync(CancellationToken cancellationToken = default) public async Task RunProbeAsync(CancellationToken cancellationToken = default)
{ {
const string probeText = "Summarizer latency probe for job tracker telemetry."; const string probeText = "AI service latency probe for Jobbjakt telemetry.";
var client = _httpFactory.CreateClient("summarizer"); var client = _httpFactory.CreateClient("ai-service");
var payload = JsonSerializer.Serialize(new { text = probeText, max_length = 48, min_length = 12 }); var payload = JsonSerializer.Serialize(new { text = probeText, max_length = 48, min_length = 12 });
using var content = new StringContent(payload, Encoding.UTF8, "application/json"); using var content = new StringContent(payload, Encoding.UTF8, "application/json");
var sw = Stopwatch.StartNew(); var sw = Stopwatch.StartNew();
@@ -215,13 +301,15 @@ namespace JobTrackerApi.Services
} }
} }
public async Task<SummarizerMetrics> GetMetricsAsync(CancellationToken cancellationToken = default) public async Task<AiServiceMetrics> GetMetricsAsync(CancellationToken cancellationToken = default)
{ {
var client = _httpFactory.CreateClient("summarizer"); var client = _httpFactory.CreateClient("ai-service");
string? model = null; string? model = null;
string? device = null; string? device = null;
bool? gpuAvailable = null; bool? gpuAvailable = null;
string? gpuName = null; string? gpuName = null;
bool? ocrAvailable = null;
string? ocrLanguages = null;
double? healthLatencyMs = null; double? healthLatencyMs = null;
var healthy = false; var healthy = false;
string? healthError = null; string? healthError = null;
@@ -238,25 +326,12 @@ namespace JobTrackerApi.Services
{ {
using var stream = await res.Content.ReadAsStreamAsync(cancellationToken); using var stream = await res.Content.ReadAsStreamAsync(cancellationToken);
using var doc = await JsonDocument.ParseAsync(stream, cancellationToken: cancellationToken); using var doc = await JsonDocument.ParseAsync(stream, cancellationToken: cancellationToken);
if (doc.RootElement.TryGetProperty("model", out var modelEl)) if (doc.RootElement.TryGetProperty("model", out var modelEl)) model = modelEl.GetString();
{ if (doc.RootElement.TryGetProperty("device", out var deviceEl)) device = deviceEl.GetString();
model = modelEl.GetString(); if (doc.RootElement.TryGetProperty("gpu_available", out var gpuAvailableEl) && gpuAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) gpuAvailable = gpuAvailableEl.GetBoolean();
} if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString();
if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("device", out var deviceEl)) if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString();
{
device = deviceEl.GetString();
}
if (doc.RootElement.TryGetProperty("gpu_available", out var gpuAvailableEl) && gpuAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False)
{
gpuAvailable = gpuAvailableEl.GetBoolean();
}
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl))
{
gpuName = gpuNameEl.GetString();
}
} }
else else
{ {
@@ -273,6 +348,9 @@ namespace JobTrackerApi.Services
var cacheMisses = Volatile.Read(ref _cacheMisses); var cacheMisses = Volatile.Read(ref _cacheMisses);
var failures = Volatile.Read(ref _failures); var failures = Volatile.Read(ref _failures);
var totalLatencyTicks = Volatile.Read(ref _totalLatencyTicks); var totalLatencyTicks = Volatile.Read(ref _totalLatencyTicks);
var ocrRequests = Volatile.Read(ref _ocrRequests);
var ocrFailures = Volatile.Read(ref _ocrFailures);
var totalOcrLatencyTicks = Volatile.Read(ref _totalOcrLatencyTicks);
DateTimeOffset? lastSuccessAt; DateTimeOffset? lastSuccessAt;
DateTimeOffset? lastFailureAt; DateTimeOffset? lastFailureAt;
@@ -280,6 +358,8 @@ namespace JobTrackerApi.Services
DateTimeOffset? lastProbeAt; DateTimeOffset? lastProbeAt;
DateTimeOffset? lastProbeSuccessAt; DateTimeOffset? lastProbeSuccessAt;
DateTimeOffset? lastProbeFailureAt; DateTimeOffset? lastProbeFailureAt;
DateTimeOffset? lastOcrSuccessAt;
DateTimeOffset? lastOcrFailureAt;
string? lastError; string? lastError;
lock (_metricsLock) lock (_metricsLock)
{ {
@@ -289,6 +369,8 @@ namespace JobTrackerApi.Services
lastProbeAt = _lastProbeAt; lastProbeAt = _lastProbeAt;
lastProbeSuccessAt = _lastProbeSuccessAt; lastProbeSuccessAt = _lastProbeSuccessAt;
lastProbeFailureAt = _lastProbeFailureAt; lastProbeFailureAt = _lastProbeFailureAt;
lastOcrSuccessAt = _lastOcrSuccessAt;
lastOcrFailureAt = _lastOcrFailureAt;
lastError = _lastError; lastError = _lastError;
} }
@@ -297,16 +379,17 @@ namespace JobTrackerApi.Services
lastError = healthError; lastError = healthError;
} }
double? averageLatencyMs = requests > 0 double? averageLatencyMs = requests > 0 ? Math.Round(TimeSpan.FromTicks(totalLatencyTicks).TotalMilliseconds / requests, 1) : null;
? Math.Round(TimeSpan.FromTicks(totalLatencyTicks).TotalMilliseconds / requests, 1) double? averageOcrLatencyMs = ocrRequests > 0 ? Math.Round(TimeSpan.FromTicks(totalOcrLatencyTicks).TotalMilliseconds / ocrRequests, 1) : null;
: null;
return new SummarizerMetrics( return new AiServiceMetrics(
Healthy: healthy, Healthy: healthy,
Model: model, Model: model,
Device: device, Device: device,
GpuAvailable: gpuAvailable, GpuAvailable: gpuAvailable,
GpuName: gpuName, GpuName: gpuName,
OcrAvailable: ocrAvailable,
OcrLanguages: ocrLanguages,
HealthLatencyMs: healthLatencyMs, HealthLatencyMs: healthLatencyMs,
ProbeLatencyMs: probeLatencyMs, ProbeLatencyMs: probeLatencyMs,
LastProbeAt: lastProbeAt, LastProbeAt: lastProbeAt,
@@ -318,6 +401,11 @@ namespace JobTrackerApi.Services
CacheMisses: cacheMisses, CacheMisses: cacheMisses,
Failures: failures, Failures: failures,
AverageLatencyMs: averageLatencyMs, AverageLatencyMs: averageLatencyMs,
OcrRequests: ocrRequests,
OcrFailures: ocrFailures,
AverageOcrLatencyMs: averageOcrLatencyMs,
LastOcrSuccessAt: lastOcrSuccessAt,
LastOcrFailureAt: lastOcrFailureAt,
LastSuccessAt: lastSuccessAt, LastSuccessAt: lastSuccessAt,
LastFailureAt: lastFailureAt, LastFailureAt: lastFailureAt,
LastError: lastError LastError: lastError
@@ -340,14 +428,11 @@ namespace JobTrackerApi.Services
protected override async Task ExecuteAsync(CancellationToken stoppingToken) protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{ {
var enabled = _cfg.GetValue("Summarizer:ProbeEnabled", true); var enabled = _cfg.GetValue("Ai:ProbeEnabled", _cfg.GetValue("Summarizer:ProbeEnabled", true));
if (!enabled) if (!enabled) return;
{
return;
}
var intervalSeconds = Math.Clamp(_cfg.GetValue("Summarizer:ProbeIntervalSeconds", 300), 30, 3600); var intervalSeconds = Math.Clamp(_cfg.GetValue("Ai:ProbeIntervalSeconds", _cfg.GetValue("Summarizer:ProbeIntervalSeconds", 300)), 30, 3600);
var initialDelaySeconds = Math.Clamp(_cfg.GetValue("Summarizer:ProbeInitialDelaySeconds", 15), 0, 600); var initialDelaySeconds = Math.Clamp(_cfg.GetValue("Ai:ProbeInitialDelaySeconds", _cfg.GetValue("Summarizer:ProbeInitialDelaySeconds", 15)), 0, 600);
if (initialDelaySeconds > 0) if (initialDelaySeconds > 0)
{ {
@@ -360,8 +445,8 @@ namespace JobTrackerApi.Services
try try
{ {
using var scope = _scopeFactory.CreateScope(); using var scope = _scopeFactory.CreateScope();
var summarizer = scope.ServiceProvider.GetRequiredService<ISummarizerService>(); var aiService = scope.ServiceProvider.GetRequiredService<ISummarizerService>();
await summarizer.RunProbeAsync(stoppingToken); await aiService.RunProbeAsync(stoppingToken);
} }
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{ {
@@ -369,7 +454,7 @@ namespace JobTrackerApi.Services
} }
catch (Exception ex) catch (Exception ex)
{ {
_logger.LogWarning(ex, "Summarizer latency probe failed."); _logger.LogWarning(ex, "AI service latency probe failed.");
} }
} }
while (await timer.WaitForNextTickAsync(stoppingToken)); while (await timer.WaitForNextTickAsync(stoppingToken));
+8 -8
View File
@@ -12,7 +12,7 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re
- History/event trail per application (created, status changes, follow-up set, delete/restore) - History/event trail per application (created, status changes, follow-up set, delete/restore)
- Export jobs to JSON/CSV + daily scheduled JSON export - Export jobs to JSON/CSV + daily scheduled JSON export
- Optional “job import” preview from supported job sites (plugins) + optional translation to English - Optional “job import” preview from supported job sites (plugins) + optional translation to English
- Optional local summarizer service for short/full descriptions - Optional local AI service for short/full descriptions
- Optional Google sign-in (Google ID tokens) to protect the API - Optional Google sign-in (Google ID tokens) to protect the API
## Architecture ## Architecture
@@ -21,11 +21,11 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re
- `JobTrackerApi/`: ASP.NET Core API (defaults to `http://localhost:5202`) - `JobTrackerApi/`: ASP.NET Core API (defaults to `http://localhost:5202`)
- SQLite DB file: defaults to `JobTrackerApi/jobtracker.db` unless `Data:Root` / connection string overrides it - SQLite DB file: defaults to `JobTrackerApi/jobtracker.db` unless `Data:Root` / connection string overrides it
- Attachments: stored on disk under `DataRoot/Attachments/<jobId>/...` - Attachments: stored on disk under `DataRoot/Attachments/<jobId>/...`
- Optional local summarizer service: `tools/summarizer/` (FastAPI) used by the API via `Summarizer:BaseUrl` - Optional local AI service: `tools/summarizer/` (FastAPI) used by the API via `Ai:BaseUrl`
## Quickstart (Docker) ## Quickstart (Docker)
This runs: frontend (nginx), backend API, and the summarizer service. This runs: frontend (nginx), backend API, and the AI service.
1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`). 1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`).
@@ -43,7 +43,7 @@ docker compose up --build
- .NET SDK `9.x` (API targets `net9.0`) - .NET SDK `9.x` (API targets `net9.0`)
- Node.js (for the UI) - Node.js (for the UI)
- (Optional) Python 3.x if running the summarizer without Docker - (Optional) Python 3.x if running the AI service without Docker
### 1) Run the API ### 1) Run the API
@@ -65,14 +65,14 @@ npm start
The UI defaults to calling `http://localhost:5202/api` when running on localhost (see `job-tracker-ui/src/api.ts`). The UI defaults to calling `http://localhost:5202/api` when running on localhost (see `job-tracker-ui/src/api.ts`).
### 3) (Optional) Run the summarizer ### 3) (Optional) Run the AI service
The API calls a local FastAPI service to generate summaries. If it's not running, the app still works (summary generation may be empty / best-effort). The API calls a local FastAPI service to generate summaries. If it's not running, the app still works (summary generation may be empty / best-effort).
With Docker (recommended): With Docker (recommended):
```bash ```bash
docker compose up --build summarizer docker compose up --build ai-service
``` ```
Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`). Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`).
@@ -87,7 +87,7 @@ Common keys:
- `Data:Root`: folder for the SQLite DB + exports (defaults to API content root) - `Data:Root`: folder for the SQLite DB + exports (defaults to API content root)
- `Data:AttachmentsRoot`: override attachments folder (defaults to `<Data:Root>/Attachments`) - `Data:AttachmentsRoot`: override attachments folder (defaults to `<Data:Root>/Attachments`)
- `Cors:Origins`: list of allowed origins (defaults to `http://localhost:3000`; use `"*"` to allow all) - `Cors:Origins`: list of allowed origins (defaults to `http://localhost:3000`; use `"*"` to allow all)
- `Summarizer:BaseUrl`: summarizer base URL (default `http://127.0.0.1:8001`) - `Ai:BaseUrl`: AI service base URL (default `http://127.0.0.1:8001`)
- `Exports:DailyEnabled`: enable/disable daily export background job - `Exports:DailyEnabled`: enable/disable daily export background job
- `Exports:DailyFolder`: export destination (relative to `Data:Root` if not absolute) - `Exports:DailyFolder`: export destination (relative to `Data:Root` if not absolute)
- `Exports:DailyHourLocal`: local hour (0–23) when the daily export runs - `Exports:DailyHourLocal`: local hour (0–23) when the daily export runs
@@ -109,7 +109,7 @@ Common keys:
- `Email:SmtpUser`: SMTP username (often your Gmail address) - `Email:SmtpUser`: SMTP username (often your Gmail address)
- `Email:SmtpPassword`: SMTP password (for Gmail: use an App Password) - `Email:SmtpPassword`: SMTP password (for Gmail: use an App Password)
- `Email:From`: from address (default: `Email:SmtpUser`) - `Email:From`: from address (default: `Email:SmtpUser`)
- `Email:FromName`: from name (default: `Job Tracker`) - `Email:FromName`: from name (default: `Jobbjakt`)
### UI settings ### UI settings
+4 -2
View File
@@ -51,7 +51,9 @@ AUTH_JWT_KEY=replace_with_long_random_secret
AUTH_ADMIN_EMAIL=you@example.com AUTH_ADMIN_EMAIL=you@example.com
AUTH_ADMIN_PASSWORD=replace_with_strong_password AUTH_ADMIN_PASSWORD=replace_with_strong_password
APP_PUBLIC_BASE_URL=https://your-domain.example APP_PUBLIC_BASE_URL=https://your-domain.example
SUMMARIZER_BASE_URL=http://summarizer:8001 AI_SERVICE_BASE_URL=http://ai-service:8001
# Optional backward-compatible alias if older config still references the previous name:
SUMMARIZER_BASE_URL=http://ai-service:8001
``` ```
## Database recommendation ## Database recommendation
@@ -89,5 +91,5 @@ If this app is going to be a real production service on Ubuntu:
- confirm reverse proxy routes to the frontend correctly - confirm reverse proxy routes to the frontend correctly
- confirm API auth/login works with production config - confirm API auth/login works with production config
- confirm backend can connect to MariaDB - confirm backend can connect to MariaDB
- confirm summarizer container is reachable from backend - confirm AI service container is reachable from backend
- confirm reminder and admin/system pages load - confirm reminder and admin/system pages load
+3 -2
View File
@@ -23,7 +23,8 @@ services:
- Auth__GoogleClientId=${AUTH_GOOGLE_CLIENT_ID} - Auth__GoogleClientId=${AUTH_GOOGLE_CLIENT_ID}
- Google__GmailClientSecret=${GOOGLE_GMAIL_CLIENT_SECRET} - Google__GmailClientSecret=${GOOGLE_GMAIL_CLIENT_SECRET}
- Google__GmailRedirectUri=${GOOGLE_GMAIL_REDIRECT_URI} - Google__GmailRedirectUri=${GOOGLE_GMAIL_REDIRECT_URI}
- Summarizer__BaseUrl=${SUMMARIZER_BASE_URL:-http://summarizer:8001} - Ai__BaseUrl=${AI_SERVICE_BASE_URL:-http://ai-service:8001}
- Summarizer__BaseUrl=${SUMMARIZER_BASE_URL:-http://ai-service:8001}
# Email (SMTP) # Email (SMTP)
# Build metadata should be resolved before deployment. Examples: # Build metadata should be resolved before deployment. Examples:
# APP_VERSION=1.0.0 # APP_VERSION=1.0.0
@@ -66,7 +67,7 @@ services:
- shared_services - shared_services
restart: unless-stopped restart: unless-stopped
summarizer: ai-service:
build: build:
context: ./tools/summarizer context: ./tools/summarizer
dockerfile: Dockerfile dockerfile: Dockerfile
+11 -1
View File
@@ -2,6 +2,16 @@
Last updated: 2026-03-23 Last updated: 2026-03-23
## AI Service / OCR
- [x] Reframe user-facing "summarizer" status and docs toward an AI service
- [x] Add self-hosted OCR/text extraction endpoint to the local AI service
- [x] Add backend AI-service text extraction integration for profile CV uploads
- [x] Add OCR support for supported image CV uploads (`png`, `jpg`, `jpeg`, `webp`)
- [x] Add AI service latency/OCR telemetry to the system page
- [x] Add frontend test coverage for AI service status rendering
- [ ] Extend AI extraction to job attachment ingestion
- [ ] Consider full internal service/class rename from `Summarizer*` to `AiService*`
## Build / UI Issues ## Build / UI Issues
- [x] Fix visible build error text appearing on page load/footer - [x] Fix visible build error text appearing on page load/footer
- [x] Resolve naming inconsistency: `jobtrack` → `Jobbjakt` - [x] Resolve naming inconsistency: `jobtrack` → `Jobbjakt`
@@ -32,7 +42,7 @@ Last updated: 2026-03-23
- [x] Add zoom in/out support for image cropping - [x] Add zoom in/out support for image cropping
- [x] Use square cropped avatar output - [x] Use square cropped avatar output
- [x] Add CV upload support - [x] Add CV upload support
- [ ] Verify/complete OCR/text extraction for uploaded CV PDFs - [x] Verify/complete OCR/text extraction for uploaded CV PDFs
## Settings & System ## Settings & System
- [x] Restore missing follow-up days settings - [x] Restore missing follow-up days settings
@@ -0,0 +1,75 @@
import React from 'react';
import { render, screen, waitFor } from '@testing-library/react';
import AdminSystemPage from './pages/AdminSystemPage';
import { I18nProvider } from './i18n/I18nProvider';
import { api } from './api';
const mockedApi = api as jest.Mocked<typeof api>;
// Verifies that the admin system page renders the AI-service section
// (health card, probe latency, OCR chip and OCR telemetry) from the
// `/admin/system` payload.
describe('AdminSystemPage', () => {
  it('renders AI service health, latency, and OCR readiness', async () => {
    // Serve a fully populated status payload for the system endpoint and an
    // empty object for any other GET the page may issue.
    mockedApi.get.mockImplementation((url: string) => {
      if (url === '/admin/system') {
        return Promise.resolve({
          data: {
            environment: 'Production',
            contentRoot: '/app',
            version: '1.2.3',
            commitSha: 'abc1234',
            buildStamp: '2026-03-23 11:00 UTC',
            storage: { dataRoot: '/data', dbPath: '/data/jobtracker.db', dbExists: true, dbSizeBytes: 2048, companyCount: 3, jobCount: 7, deletedCount: 1 },
            email: { enabled: true, host: 'smtp.example.test', port: 587, enableSsl: true, from: 'noreply@example.test', fromName: 'Jobbjakt' },
            database: { provider: 'mariadb', looksConfigured: true, canConnect: true, target: 'server=db', usesFileStorage: false, warning: null },
            runtime: { framework: '.NET 9', osDescription: 'Linux', processArchitecture: 'X64', machineName: 'app-01' },
            auth: { required: true, hasJwtKey: true, googleConfigured: true, gmailConfigured: true },
            ai: {
              healthy: true,
              model: 'distilbart',
              device: 'cpu',
              gpuAvailable: false,
              gpuName: null,
              ocrAvailable: true,
              ocrLanguages: 'eng',
              healthLatencyMs: 12.4,
              probeLatencyMs: 25.8,
              lastProbeAt: '2026-03-23T10:00:00Z',
              lastProbeSuccessAt: '2026-03-23T10:00:00Z',
              lastProbeFailureAt: null,
              probeFailures: 0,
              requests: 18,
              cacheHits: 9,
              cacheMisses: 9,
              failures: 0,
              averageLatencyMs: 42.2,
              ocrRequests: 5,
              ocrFailures: 0,
              averageOcrLatencyMs: 88.4,
              lastOcrSuccessAt: '2026-03-23T10:05:00Z',
              lastOcrFailureAt: null,
              lastSuccessAt: '2026-03-23T10:04:00Z',
              lastFailureAt: null,
              lastError: null,
            },
          },
        } as any);
      }
      return Promise.resolve({ data: {} } as any);
    });

    render(
      <I18nProvider>
        <AdminSystemPage />
      </I18nProvider>,
    );

    // The status is loaded asynchronously; wait for the AI card title.
    await waitFor(() => {
      expect(screen.getByText('AI service')).toBeTruthy();
    });

    // The dot is escaped so the pattern matches the literal "25.8" only,
    // not any character between "25" and "8".
    expect(screen.getByText(/25\.8 ms probe/i)).toBeTruthy();
    expect(screen.getByText('OCR eng')).toBeTruthy();
    expect(screen.getByText('OCR avg latency')).toBeTruthy();
    expect(screen.getByText('88.4 ms')).toBeTruthy();
  });
});
+12 -12
View File
@@ -171,7 +171,7 @@ export const translations = {
profileHeadline: "Profile headline", profileHeadline: "Profile headline",
profileHeadlineHelp: "Stored only in this browser to personalize your workspace.", profileHeadlineHelp: "Stored only in this browser to personalize your workspace.",
profileMasterCv: "Master CV", profileMasterCv: "Master CV",
profileMasterCvBody: "Upload a PDF, DOCX, plain text file, or markdown file. The app extracts text where supported and populates your master CV text for tailoring and outreach.", profileMasterCvBody: "Upload a PDF, DOCX, plain text file, markdown file, or image scan. The AI service extracts text where possible and falls back to OCR for supported scanned files.",
profileUploadCv: "Upload CV", profileUploadCv: "Upload CV",
profileUploading: "Uploading...", profileUploading: "Uploading...",
profileCopyCvText: "Copy CV text", profileCopyCvText: "Copy CV text",
@@ -179,7 +179,7 @@ export const translations = {
profileCvUploadFailed: "Failed to upload CV.", profileCvUploadFailed: "Failed to upload CV.",
profileCvTextLabel: "Profile CV / master resume text", profileCvTextLabel: "Profile CV / master resume text",
profileCvTextHelp: "Keep this updated and specific. Include recent roles, tools, achievements, measurable outcomes, and the work you want to be hired for next. If extraction misses something, edit it here manually.", profileCvTextHelp: "Keep this updated and specific. Include recent roles, tools, achievements, measurable outcomes, and the work you want to be hired for next. If extraction misses something, edit it here manually.",
profileCvPreferredUploads: "Supported uploads: PDF, DOCX, TXT, MD.", profileCvPreferredUploads: "Supported uploads: PDF, DOCX, TXT, MD, PNG, JPG, JPEG, WEBP.",
profileSaveChanges: "Save changes", profileSaveChanges: "Save changes",
profileUpdated: "Profile updated.", profileUpdated: "Profile updated.",
profileUpdateFailed: "Failed to update profile.", profileUpdateFailed: "Failed to update profile.",
@@ -272,7 +272,7 @@ export const translations = {
adminUsersCreated: "User created.", adminUsersCreated: "User created.",
adminUsersCreateFailed: "Failed to create user.", adminUsersCreateFailed: "Failed to create user.",
adminSystemTitle: "System status", adminSystemTitle: "System status",
adminSystemSubtitle: "Production diagnostics for runtime, database, auth, email, and summarizer health.", adminSystemSubtitle: "Production diagnostics for runtime, database, auth, email, AI service health, and OCR readiness.",
adminSystemRunProbe: "Run probe now", adminSystemRunProbe: "Run probe now",
adminSystemRunningProbe: "Running probe...", adminSystemRunningProbe: "Running probe...",
adminSystemRefresh: "Refresh", adminSystemRefresh: "Refresh",
@@ -284,13 +284,13 @@ export const translations = {
adminSystemSmtp: "SMTP", adminSystemSmtp: "SMTP",
adminSystemEnabled: "Enabled", adminSystemEnabled: "Enabled",
adminSystemDisabled: "Disabled", adminSystemDisabled: "Disabled",
adminSystemSummarizer: "Summarizer", adminSystemSummarizer: "AI service",
adminSystemHealthy: "Healthy", adminSystemHealthy: "Healthy",
adminSystemNoLatencyData: "No latency data", adminSystemNoLatencyData: "No latency data",
adminSystemDatabaseStorage: "Database and storage", adminSystemDatabaseStorage: "Database and storage",
adminSystemRuntimeAuth: "Runtime and auth", adminSystemRuntimeAuth: "Runtime and auth",
adminSystemEmailConfig: "Email configuration", adminSystemEmailConfig: "Email configuration",
adminSystemSummarizerRuntime: "Summarizer runtime", adminSystemSummarizerRuntime: "AI runtime",
adminSystemSmtpTest: "SMTP test email", adminSystemSmtpTest: "SMTP test email",
adminSystemSmtpTestBody: "Send a quick delivery check using the configured SMTP settings. Leave the recipient blank to use your admin email.", adminSystemSmtpTestBody: "Send a quick delivery check using the configured SMTP settings. Leave the recipient blank to use your admin email.",
adminSystemRecipientEmail: "Recipient email", adminSystemRecipientEmail: "Recipient email",
@@ -299,7 +299,7 @@ export const translations = {
adminSystemMessage: "Message", adminSystemMessage: "Message",
adminSystemSendTestEmail: "Send test email", adminSystemSendTestEmail: "Send test email",
adminSystemSending: "Sending...", adminSystemSending: "Sending...",
adminSystemSummarizerTelemetry: "Summarizer telemetry", adminSystemSummarizerTelemetry: "AI service telemetry",
adminSystemDatabaseConnected: "Database connected", adminSystemDatabaseConnected: "Database connected",
adminSystemDatabaseIssue: "Database issue", adminSystemDatabaseIssue: "Database issue",
adminSystemAuthEnforced: "Auth enforced", adminSystemAuthEnforced: "Auth enforced",
@@ -591,7 +591,7 @@ export const translations = {
profileHeadline: "Profiloverskrift", profileHeadline: "Profiloverskrift",
profileHeadlineHelp: "Lagres bare i denne nettleseren for å gjøre arbeidsområdet mer personlig.", profileHeadlineHelp: "Lagres bare i denne nettleseren for å gjøre arbeidsområdet mer personlig.",
profileMasterCv: "Hoved-CV", profileMasterCv: "Hoved-CV",
profileMasterCvBody: "Last opp en PDF, DOCX, ren tekstfil eller markdown-fil. Appen henter ut tekst der det støttes og fyller inn hoved-CV-en din for tilpasning og kontakt.", profileMasterCvBody: "Last opp en PDF, DOCX, ren tekstfil, markdown-fil eller et bildeskann. AI-tjenesten henter ut tekst der det er mulig og faller tilbake til OCR for støttede skannede filer.",
profileUploadCv: "Last opp CV", profileUploadCv: "Last opp CV",
profileUploading: "Laster opp...", profileUploading: "Laster opp...",
profileCopyCvText: "Kopier CV-tekst", profileCopyCvText: "Kopier CV-tekst",
@@ -599,7 +599,7 @@ export const translations = {
profileCvUploadFailed: "Kunne ikke laste opp CV.", profileCvUploadFailed: "Kunne ikke laste opp CV.",
profileCvTextLabel: "Profil-CV / hovedtekst for CV", profileCvTextLabel: "Profil-CV / hovedtekst for CV",
profileCvTextHelp: "Hold denne oppdatert og konkret. Ta med nylige roller, verktøy, prestasjoner, målbare resultater og arbeidet du vil bli ansatt for neste gang. Hvis tekstuttrekket mangler noe, kan du redigere manuelt her.", profileCvTextHelp: "Hold denne oppdatert og konkret. Ta med nylige roller, verktøy, prestasjoner, målbare resultater og arbeidet du vil bli ansatt for neste gang. Hvis tekstuttrekket mangler noe, kan du redigere manuelt her.",
profileCvPreferredUploads: "Støttede opplastinger: PDF, DOCX, TXT, MD.", profileCvPreferredUploads: "Støttede opplastinger: PDF, DOCX, TXT, MD, PNG, JPG, JPEG, WEBP.",
profileSaveChanges: "Lagre endringer", profileSaveChanges: "Lagre endringer",
profileUpdated: "Profil oppdatert.", profileUpdated: "Profil oppdatert.",
profileUpdateFailed: "Kunne ikke oppdatere profil.", profileUpdateFailed: "Kunne ikke oppdatere profil.",
@@ -692,7 +692,7 @@ export const translations = {
adminUsersCreated: "Bruker opprettet.", adminUsersCreated: "Bruker opprettet.",
adminUsersCreateFailed: "Kunne ikke opprette bruker.", adminUsersCreateFailed: "Kunne ikke opprette bruker.",
adminSystemTitle: "Systemstatus", adminSystemTitle: "Systemstatus",
adminSystemSubtitle: "Produksjonsdiagnostikk for kjøretid, database, autentisering, e-post og oppsummeringshelse.", adminSystemSubtitle: "Produksjonsdiagnostikk for kjøretid, database, autentisering, e-post, AI-tjenestehelse og OCR-beredskap.",
adminSystemRunProbe: "Kjør probe nå", adminSystemRunProbe: "Kjør probe nå",
adminSystemRunningProbe: "Kjører probe...", adminSystemRunningProbe: "Kjører probe...",
adminSystemRefresh: "Oppdater", adminSystemRefresh: "Oppdater",
@@ -704,13 +704,13 @@ export const translations = {
adminSystemSmtp: "SMTP", adminSystemSmtp: "SMTP",
adminSystemEnabled: "Aktivert", adminSystemEnabled: "Aktivert",
adminSystemDisabled: "Deaktivert", adminSystemDisabled: "Deaktivert",
adminSystemSummarizer: "Oppsummerer", adminSystemSummarizer: "AI-tjeneste",
adminSystemHealthy: "Frisk", adminSystemHealthy: "Frisk",
adminSystemNoLatencyData: "Ingen latensdata", adminSystemNoLatencyData: "Ingen latensdata",
adminSystemDatabaseStorage: "Database og lagring", adminSystemDatabaseStorage: "Database og lagring",
adminSystemRuntimeAuth: "Kjøretid og autentisering", adminSystemRuntimeAuth: "Kjøretid og autentisering",
adminSystemEmailConfig: "E-postkonfigurasjon", adminSystemEmailConfig: "E-postkonfigurasjon",
adminSystemSummarizerRuntime: "Oppsummeringskjøretid", adminSystemSummarizerRuntime: "AI-kjøretid",
adminSystemSmtpTest: "SMTP-test e-post", adminSystemSmtpTest: "SMTP-test e-post",
adminSystemSmtpTestBody: "Send en rask leveringssjekk med de konfigurerte SMTP-innstillingene. La mottakeren stå tom for å bruke admin-eposten din.", adminSystemSmtpTestBody: "Send en rask leveringssjekk med de konfigurerte SMTP-innstillingene. La mottakeren stå tom for å bruke admin-eposten din.",
adminSystemRecipientEmail: "Mottaker e-post", adminSystemRecipientEmail: "Mottaker e-post",
@@ -719,7 +719,7 @@ export const translations = {
adminSystemMessage: "Melding", adminSystemMessage: "Melding",
adminSystemSendTestEmail: "Send test-e-post", adminSystemSendTestEmail: "Send test-e-post",
adminSystemSending: "Sender...", adminSystemSending: "Sender...",
adminSystemSummarizerTelemetry: "Oppsummeringstelemetri", adminSystemSummarizerTelemetry: "AI-tjenestetelemetri",
adminSystemDatabaseConnected: "Database tilkoblet", adminSystemDatabaseConnected: "Database tilkoblet",
adminSystemDatabaseIssue: "Databaseproblem", adminSystemDatabaseIssue: "Databaseproblem",
adminSystemAuthEnforced: "Autentisering påkrevd", adminSystemAuthEnforced: "Autentisering påkrevd",
+41 -31
View File
@@ -14,12 +14,14 @@ import {
import { api, getApiErrorMessage } from "../api"; import { api, getApiErrorMessage } from "../api";
import { useI18n } from "../i18n/I18nProvider"; import { useI18n } from "../i18n/I18nProvider";
type SummarizerMetrics = { type AiServiceMetrics = {
healthy: boolean; healthy: boolean;
model?: string | null; model?: string | null;
device?: string | null; device?: string | null;
gpuAvailable?: boolean; gpuAvailable?: boolean;
gpuName?: string | null; gpuName?: string | null;
ocrAvailable?: boolean | null;
ocrLanguages?: string | null;
healthLatencyMs?: number | null; healthLatencyMs?: number | null;
probeLatencyMs?: number | null; probeLatencyMs?: number | null;
lastProbeAt?: string | null; lastProbeAt?: string | null;
@@ -31,6 +33,11 @@ type SummarizerMetrics = {
cacheMisses: number; cacheMisses: number;
failures: number; failures: number;
averageLatencyMs?: number | null; averageLatencyMs?: number | null;
ocrRequests: number;
ocrFailures: number;
averageOcrLatencyMs?: number | null;
lastOcrSuccessAt?: string | null;
lastOcrFailureAt?: string | null;
lastSuccessAt?: string | null; lastSuccessAt?: string | null;
lastFailureAt?: string | null; lastFailureAt?: string | null;
lastError?: string | null; lastError?: string | null;
@@ -79,7 +86,7 @@ type SystemStatus = {
googleConfigured: boolean; googleConfigured: boolean;
gmailConfigured: boolean; gmailConfigured: boolean;
}; };
summarizer: SummarizerMetrics; ai: AiServiceMetrics;
}; };
function formatBytes(bytes?: number | null) { function formatBytes(bytes?: number | null) {
@@ -148,10 +155,10 @@ export default function AdminSystemPage() {
return "success" as const; return "success" as const;
}, [status]); }, [status]);
const summarizerTone = useMemo(() => { const aiTone = useMemo(() => {
if (!status) return "default" as const; if (!status) return "default" as const;
if (!status.summarizer.healthy) return "error" as const; if (!status.ai.healthy) return "error" as const;
if (status.summarizer.probeFailures > 0 || status.summarizer.failures > 0) return "warning" as const; if (status.ai.probeFailures > 0 || status.ai.failures > 0 || (status.ai.ocrFailures ?? 0) > 0) return "warning" as const;
return "success" as const; return "success" as const;
}, [status]); }, [status]);
@@ -184,10 +191,10 @@ export default function AdminSystemPage() {
setRunningProbe(true); setRunningProbe(true);
setError(null); setError(null);
try { try {
await api.post("/admin/system/summarizer/probe"); await api.post("/admin/system/ai/probe");
await load(); await load();
} catch (e: any) { } catch (e: any) {
setError(getApiErrorMessage(e, "Failed to run summarizer probe.")); setError(getApiErrorMessage(e, "Failed to run AI service probe."));
} finally { } finally {
setRunningProbe(false); setRunningProbe(false);
} }
@@ -204,7 +211,7 @@ export default function AdminSystemPage() {
{error ? <Alert severity="error">{error}</Alert> : null} {error ? <Alert severity="error">{error}</Alert> : null}
{status?.database.warning ? <Alert severity={status.database.canConnect ? "warning" : "error"}>{status.database.warning}</Alert> : null} {status?.database.warning ? <Alert severity={status.database.canConnect ? "warning" : "error"}>{status.database.warning}</Alert> : null}
{status?.summarizer.lastError ? <Alert severity={status.summarizer.healthy ? "warning" : "error"}>{status.summarizer.lastError}</Alert> : null} {status?.ai.lastError ? <Alert severity={status.ai.healthy ? "warning" : "error"}>{status.ai.lastError}</Alert> : null}
<Box sx={{ display: "grid", gridTemplateColumns: { xs: "1fr", md: "repeat(4, 1fr)" }, gap: 2 }}> <Box sx={{ display: "grid", gridTemplateColumns: { xs: "1fr", md: "repeat(4, 1fr)" }, gap: 2 }}>
<SummaryCard <SummaryCard
@@ -226,13 +233,13 @@ export default function AdminSystemPage() {
/> />
<SummaryCard <SummaryCard
title={t("adminSystemSummarizer")} title={t("adminSystemSummarizer")}
value={status?.summarizer.healthy ? t("adminSystemHealthy") : t("adminSystemOffline")} value={status?.ai.healthy ? t("adminSystemHealthy") : t("adminSystemOffline")}
subtitle={status?.summarizer.probeLatencyMs != null subtitle={status?.ai.probeLatencyMs != null
? `${status.summarizer.probeLatencyMs} ms probe · ${status.summarizer.device || "unknown device"}` ? `${status.ai.probeLatencyMs} ms probe · ${status.ai.device || "unknown device"}`
: status?.summarizer.healthLatencyMs != null : status?.ai.healthLatencyMs != null
? `${status.summarizer.healthLatencyMs} ms health · ${status.summarizer.device || "unknown device"}` ? `${status.ai.healthLatencyMs} ms health · ${status.ai.device || "unknown device"}`
: t("adminSystemNoLatencyData")} : t("adminSystemNoLatencyData")}
tone={summarizerTone} tone={aiTone}
/> />
</Box> </Box>
@@ -288,15 +295,15 @@ export default function AdminSystemPage() {
<Paper sx={{ p: 2, borderRadius: 3 }}> <Paper sx={{ p: 2, borderRadius: 3 }}>
<Typography variant="h6" sx={{ fontWeight: 900, mb: 1 }}>{t("adminSystemSummarizerRuntime")}</Typography> <Typography variant="h6" sx={{ fontWeight: 900, mb: 1 }}>{t("adminSystemSummarizerRuntime")}</Typography>
<Stack spacing={0.75}> <Stack spacing={0.75}>
<DetailRow label="Model" value={status?.summarizer.model || "-"} /> <DetailRow label="Model" value={status?.ai.model || "-"} />
<DetailRow label="Device" value={status?.summarizer.device || "-"} /> <DetailRow label="Device" value={status?.ai.device || "-"} />
<DetailRow label="GPU available" value={status?.summarizer.gpuAvailable ? "Yes" : "No"} /> <DetailRow label="GPU available" value={status?.ai.gpuAvailable ? "Yes" : "No"} />
<DetailRow label="GPU name" value={status?.summarizer.gpuName || "-"} /> <DetailRow label="GPU name" value={status?.ai.gpuName || "-"} />
<DetailRow label="Health latency" value={status?.summarizer.healthLatencyMs != null ? `${status.summarizer.healthLatencyMs} ms` : "-"} /> <DetailRow label="Health latency" value={status?.ai.healthLatencyMs != null ? `${status.ai.healthLatencyMs} ms` : "-"} />
<DetailRow label="Probe latency" value={status?.summarizer.probeLatencyMs != null ? `${status.summarizer.probeLatencyMs} ms` : "-"} /> <DetailRow label="Probe latency" value={status?.ai.probeLatencyMs != null ? `${status.ai.probeLatencyMs} ms` : "-"} />
<DetailRow label="Last probe" value={formatDate(status?.summarizer.lastProbeAt)} /> <DetailRow label="Last probe" value={formatDate(status?.ai.lastProbeAt)} />
<DetailRow label="Last successful probe" value={formatDate(status?.summarizer.lastProbeSuccessAt)} /> <DetailRow label="Last successful probe" value={formatDate(status?.ai.lastProbeSuccessAt)} />
<DetailRow label="Last summarization success" value={formatDate(status?.summarizer.lastSuccessAt)} /> <DetailRow label="Last summarization success" value={formatDate(status?.ai.lastSuccessAt)} />
</Stack> </Stack>
</Paper> </Paper>
</Box> </Box>
@@ -320,20 +327,23 @@ export default function AdminSystemPage() {
<Paper sx={{ p: 2, borderRadius: 3 }}> <Paper sx={{ p: 2, borderRadius: 3 }}>
<Typography variant="h6" sx={{ fontWeight: 900, mb: 1 }}>{t("adminSystemSummarizerTelemetry")}</Typography> <Typography variant="h6" sx={{ fontWeight: 900, mb: 1 }}>{t("adminSystemSummarizerTelemetry")}</Typography>
<Box sx={{ display: "grid", gridTemplateColumns: { xs: "1fr 1fr", md: "repeat(6, 1fr)" }, gap: 2 }}> <Box sx={{ display: "grid", gridTemplateColumns: { xs: "1fr 1fr", md: "repeat(8, 1fr)" }, gap: 2 }}>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Requests</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.requests ?? 0}</Typography></Box> <Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Requests</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.requests ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache hits</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.cacheHits ?? 0}</Typography></Box> <Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache hits</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.cacheHits ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache misses</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.cacheMisses ?? 0}</Typography></Box> <Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache misses</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.cacheMisses ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.failures ?? 0}</Typography></Box> <Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.failures ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Probe failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.probeFailures ?? 0}</Typography></Box> <Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Probe failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.probeFailures ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Avg latency</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.averageLatencyMs != null ? `${status.summarizer.averageLatencyMs} ms` : "-"}</Typography></Box> <Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Avg latency</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.averageLatencyMs != null ? `${status.ai.averageLatencyMs} ms` : "-"}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>OCR requests</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.ocrRequests ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>OCR avg latency</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.averageOcrLatencyMs != null ? `${status.ai.averageOcrLatencyMs} ms` : "-"}</Typography></Box>
</Box> </Box>
<Box sx={{ display: "flex", gap: 1, flexWrap: "wrap", mt: 2 }}> <Box sx={{ display: "flex", gap: 1, flexWrap: "wrap", mt: 2 }}>
<Chip label={status?.database.canConnect ? t("adminSystemDatabaseConnected") : t("adminSystemDatabaseIssue")} color={status?.database.canConnect ? "success" : "error"} size="small" /> <Chip label={status?.database.canConnect ? t("adminSystemDatabaseConnected") : t("adminSystemDatabaseIssue")} color={status?.database.canConnect ? "success" : "error"} size="small" />
<Chip label={status?.auth.required ? t("adminSystemAuthEnforced") : t("adminSystemAuthOptional")} color={status?.auth.required ? "success" : "warning"} size="small" /> <Chip label={status?.auth.required ? t("adminSystemAuthEnforced") : t("adminSystemAuthOptional")} color={status?.auth.required ? "success" : "warning"} size="small" />
<Chip label={status?.auth.googleConfigured ? t("adminSystemGoogleReady") : t("adminSystemGoogleOff")} variant="outlined" size="small" /> <Chip label={status?.auth.googleConfigured ? t("adminSystemGoogleReady") : t("adminSystemGoogleOff")} variant="outlined" size="small" />
<Chip label={status?.auth.gmailConfigured ? t("adminSystemGmailReady") : t("adminSystemGmailIncomplete")} variant="outlined" size="small" /> <Chip label={status?.auth.gmailConfigured ? t("adminSystemGmailReady") : t("adminSystemGmailIncomplete")} variant="outlined" size="small" />
<Chip label={status?.summarizer.gpuAvailable ? t("adminSystemGpuVisible") : t("adminSystemCpuMode")} color={status?.summarizer.gpuAvailable ? "success" : "default"} size="small" /> <Chip label={status?.ai.gpuAvailable ? t("adminSystemGpuVisible") : t("adminSystemCpuMode")} color={status?.ai.gpuAvailable ? "success" : "default"} size="small" />
<Chip label={status?.ai.ocrAvailable ? `OCR ${status.ai.ocrLanguages || "enabled"}` : "OCR unavailable"} variant="outlined" size="small" />
</Box> </Box>
</Paper> </Paper>
</Box> </Box>
+1 -1
View File
@@ -29,7 +29,7 @@ type MeResponse = {
} | null; } | null;
}; };
const CV_UPLOAD_ACCEPT = ".pdf,.docx,.txt,.md,application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,text/plain,text/markdown"; const CV_UPLOAD_ACCEPT = ".pdf,.docx,.txt,.md,image/png,image/jpeg,image/webp,application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,text/plain,text/markdown";
const AVATAR_UPLOAD_ACCEPT = "image/png,image/jpeg,image/webp"; const AVATAR_UPLOAD_ACCEPT = "image/png,image/jpeg,image/webp";
function initialsFrom(values: Array<string | undefined>) { function initialsFrom(values: Array<string | undefined>) {
+5
View File
@@ -9,6 +9,11 @@ jest.mock('./api', () => ({
delete: jest.fn(() => Promise.resolve({ data: {} })), delete: jest.fn(() => Promise.resolve({ data: {} })),
interceptors: { request: { use: jest.fn() }, response: { use: jest.fn() } }, interceptors: { request: { use: jest.fn() }, response: { use: jest.fn() } },
}, },
getApiErrorMessage: jest.fn((error: any, fallback?: string) => {
if (typeof error?.response?.data === 'string' && error.response.data.trim()) return error.response.data;
if (typeof error?.message === 'string' && error.message.trim()) return error.message;
return fallback || 'Request failed.';
}),
})); }));
jest.mock('./components/GoogleAuthCard', () => () => null); jest.mock('./components/GoogleAuthCard', () => () => null);
+3
View File
@@ -5,6 +5,9 @@ ENV PIP_NO_CACHE_DIR=1 \
TRANSFORMERS_NO_TF=1 \ TRANSFORMERS_NO_TF=1 \
HF_HUB_DISABLE_TELEMETRY=1 HF_HUB_DISABLE_TELEMETRY=1
WORKDIR /app WORKDIR /app
RUN apt-get update \
&& apt-get install -y --no-install-recommends tesseract-ocr tesseract-ocr-eng \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt ./ COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel \ RUN python -m pip install --upgrade pip setuptools wheel \
&& python -m pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt && python -m pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
+22 -11
View File
@@ -1,16 +1,22 @@
# Local Hugging Face Summarizer # Local AI Service
This small service runs a Hugging Face summarization model locally and exposes a simple HTTP API. This service runs a local Hugging Face summarization model and also exposes document text extraction with OCR for supported PDFs and images.
Install (recommended: virtualenv) ## Capabilities
- job/role summarization
- PDF text extraction
- OCR fallback for scanned PDFs
- OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`)
- DOCX / TXT / MD extraction
Windows (CPU PyTorch wheel may be required): ## Install
Windows:
```powershell ```powershell
python -m venv .venv python -m venv .venv
.\.venv\Scripts\Activate.ps1 .\.venv\Scripts\Activate.ps1
pip install -r requirements.txt pip install -r requirements.txt
# If torch wheel installation is needed, follow instructions at https://pytorch.org
python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1 python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
``` ```
@@ -23,10 +29,15 @@ pip install -r requirements.txt
python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1 python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
``` ```
API ## Docker
- `GET /health` — health check The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can be processed inside the container.
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }` returns `{ "summary": "...", "cached": false }`
Notes ## API
- Model will be downloaded on first run and can be several hundred MB. - `GET /health` — health check and runtime capabilities
- For lower memory usage, consider `sshleifer/tiny-distilbart-cnn-6-6` or `t5-small`. - `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`
- `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata
## Notes
- Model weights are downloaded on first run.
- OCR quality depends on scan quality and language support.
- Default OCR language is English (`eng`).
+107 -3
View File
@@ -1,16 +1,25 @@
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, File, HTTPException, UploadFile
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from cachetools import TTLCache from cachetools import TTLCache
from PIL import Image
from pypdf import PdfReader
from docx import Document
import fitz
import hashlib import hashlib
import io
import re import re
import torch import torch
import pytesseract
app = FastAPI(title="Local Summarizer") app = FastAPI(title="Local AI Service")
MODEL_NAME = "sshleifer/distilbart-cnn-12-6" MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
MAX_INPUT_CHARS = 20000 MAX_INPUT_CHARS = 20000
MAX_CONTEXT_CHARS = 2200 MAX_CONTEXT_CHARS = 2200
MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
OCR_LANGUAGES = "eng"
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
def _load_runtime(): def _load_runtime():
@@ -48,6 +57,8 @@ async def health():
"device": str(device), "device": str(device),
"gpu_available": GPU_AVAILABLE, "gpu_available": GPU_AVAILABLE,
"gpu_name": GPU_NAME, "gpu_name": GPU_NAME,
"ocr_available": True,
"ocr_languages": OCR_LANGUAGES,
} }
@@ -68,7 +79,6 @@ _TECH_PRIORITY = [
"aws", "azure", "gcp", "terraform", "graphql", "rest", "git", "aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
] ]
_MUST_HAVE_HINTS = [ _MUST_HAVE_HINTS = [
"must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for", "must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
] ]
@@ -339,3 +349,97 @@ async def summarize(req: SummarizeRequest):
out = "\n".join(lines).strip() out = "\n".join(lines).strip()
cache[key] = out cache[key] = out
return {"summary": out, "cached": False} return {"summary": out, "cached": False}
def _normalize_text(value: str) -> str:
value = value.replace("\x00", " ")
return re.sub(r"\s+", " ", value).strip()
def _ocr_image(image: Image.Image) -> str:
    """Run Tesseract over ``image`` and return the normalized text.

    Images whose mode is neither ``RGB`` nor ``L`` are converted to ``RGB``
    before being handed to pytesseract. Recognition uses the module-level
    ``OCR_LANGUAGES`` setting.
    """
    needs_conversion = image.mode not in ("RGB", "L")
    prepared = image.convert("RGB") if needs_conversion else image
    recognized = pytesseract.image_to_string(prepared, lang=OCR_LANGUAGES)
    return _normalize_text(recognized)
def _extract_pdf_text(data: bytes) -> tuple[str, bool, int]:
    """Extract text from a PDF, falling back to OCR when the text layer is thin.

    The embedded text layer is read with pypdf first. If that yields at least
    80 normalized characters it is returned directly. Otherwise every page is
    rendered with PyMuPDF at 2x scale and run through OCR.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        A ``(text, ocr_used, page_count)`` tuple, where ``ocr_used`` is True
        only when the OCR fallback produced the text.

    Raises:
        Exception: Errors from PyMuPDF, Pillow or pytesseract during the OCR
            fallback are not caught here and propagate to the caller.
    """
    # Below this many characters the text layer is treated as missing.
    min_text_layer_chars = 80

    page_count = 0
    extracted_pages = []
    try:
        reader = PdfReader(io.BytesIO(data))
        page_count = len(reader.pages)
        for page in reader.pages:
            extracted_pages.append(page.extract_text() or "")
    except Exception:
        # Deliberate best effort: any pypdf failure just means we fall through
        # to the OCR path below with no text-layer content.
        extracted_pages = []

    text = _normalize_text("\n".join(extracted_pages))
    if len(text) >= min_text_layer_chars:
        return text, False, page_count

    doc = fitz.open(stream=data, filetype="pdf")
    try:
        page_count = doc.page_count
        ocr_pages = []
        for page in doc:
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                ocr_pages.append(_ocr_image(image))
    finally:
        # Release the document even when rendering or OCR raises midway.
        doc.close()
    return _normalize_text("\n".join(ocr_pages)), True, page_count
def _extract_docx_text(data: bytes) -> str:
    """Return the normalized text of all non-blank paragraphs in a .docx file.

    Only the document's ``paragraphs`` collection is read; each paragraph is
    stripped and blank ones are dropped before joining.
    """
    docx_file = Document(io.BytesIO(data))
    kept = []
    for paragraph in docx_file.paragraphs:
        raw = paragraph.text
        if not raw:
            continue
        stripped = raw.strip()
        if stripped:
            kept.append(stripped)
    return _normalize_text("\n".join(kept))
def _extract_plain_text(data: bytes) -> str:
    """Decode raw bytes as UTF-8, dropping undecodable bytes, and normalize."""
    decoded = str(data, "utf-8", errors="ignore")
    return _normalize_text(decoded)
@app.post("/extract-text")
async def extract_text(file: UploadFile = File(...)):
    """Extract readable text from an uploaded document or image.

    The handler is chosen from the lower-cased file extension: plain text
    (.txt, .md), .docx, .pdf (with OCR fallback) or an image in
    IMAGE_EXTENSIONS (always OCR).

    Args:
        file: The uploaded file.

    Returns:
        A dict with ``text``, ``ocr_used``, ``content_type``, ``page_count``
        (None for plain text and .docx), ``characters`` and ``file_name``.

    Raises:
        HTTPException: 400 for an empty, oversized or unsupported upload;
            500 when extraction itself fails; 422 when no text was found.
    """
    filename = file.filename or "document"
    extension = ("." + filename.rsplit(".", 1)[1].lower()) if "." in filename else ""

    # Read one byte past the limit: enough to detect an oversized upload
    # without pulling an arbitrarily large file into memory first.
    data = await file.read(MAX_EXTRACT_FILE_BYTES + 1)
    if not data:
        raise HTTPException(status_code=400, detail="The uploaded file was empty.")
    if len(data) > MAX_EXTRACT_FILE_BYTES:
        raise HTTPException(status_code=400, detail="The uploaded file is too large for AI extraction.")

    try:
        if extension in {".txt", ".md"}:
            text = _extract_plain_text(data)
            ocr_used = False
            page_count = None
        elif extension == ".docx":
            text = _extract_docx_text(data)
            ocr_used = False
            page_count = None
        elif extension == ".pdf":
            text, ocr_used, page_count = _extract_pdf_text(data)
        elif extension in IMAGE_EXTENSIONS:
            # Close the decoded image as soon as OCR has finished with it.
            with Image.open(io.BytesIO(data)) as image:
                text = _ocr_image(image)
            ocr_used = True
            page_count = 1
        else:
            raise HTTPException(status_code=400, detail="This file type is not supported for AI extraction.")
    except HTTPException:
        # Our own client-facing errors pass through untouched.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"AI extraction failed: {exc}") from exc

    if not text:
        raise HTTPException(status_code=422, detail="AI extraction did not find readable text in the uploaded file.")

    return {
        "text": text,
        "ocr_used": ocr_used,
        "content_type": file.content_type,
        "page_count": page_count,
        "characters": len(text),
        "file_name": filename,
    }
+5
View File
@@ -4,3 +4,8 @@ transformers==4.48.3
cachetools==5.5.2 cachetools==5.5.2
pydantic==2.10.6 pydantic==2.10.6
torch==2.6.0 torch==2.6.0
pillow==11.1.0
pytesseract==0.3.13
pypdf==5.4.0
pymupdf==1.25.5
python-docx==1.1.2