From 653f713a78e7375a335cbd4bc42dfc935cd26f3e Mon Sep 17 00:00:00 2001 From: cesnimda Date: Mon, 23 Mar 2026 20:12:34 +0100 Subject: [PATCH] Evolve summarizer into AI service with OCR support --- .env.example | 2 +- .gitea/workflows/ci-deploy.yml | 4 +- .../Controllers/AdminSystemController.cs | 7 +- .../Controllers/JobApplicationsController.cs | 3 +- .../Controllers/ProfileCvController.cs | 37 +++- JobTrackerApi/Program.cs | 8 +- JobTrackerApi/Services/SummarizerService.cs | 175 +++++++++++++----- README.md | 16 +- deploy/README.md | 6 +- docker-compose.yml | 5 +- docs/jobbjakt-cleanup-tracker.md | 12 +- job-tracker-ui/src/admin-system-page.test.tsx | 75 ++++++++ job-tracker-ui/src/i18n/translations.ts | 24 +-- job-tracker-ui/src/pages/AdminSystemPage.tsx | 72 +++---- job-tracker-ui/src/pages/ProfilePage.tsx | 2 +- job-tracker-ui/src/setupTests.ts | 5 + tools/summarizer/Dockerfile | 3 + tools/summarizer/README.md | 33 ++-- tools/summarizer/app.py | 110 ++++++++++- tools/summarizer/requirements.txt | 5 + 20 files changed, 475 insertions(+), 129 deletions(-) create mode 100644 job-tracker-ui/src/admin-system-page.test.tsx diff --git a/.env.example b/.env.example index 981637b..aaffc13 100644 --- a/.env.example +++ b/.env.example @@ -8,7 +8,7 @@ AUTH_GOOGLE_CLIENT_ID=CHANGE_ME_GOOGLE_CLIENT_ID GOOGLE_GMAIL_CLIENT_SECRET=CHANGE_ME_GOOGLE_OAUTH_CLIENT_SECRET # Optional. If omitted, the backend uses https:///api/gmail/oauth/callback GOOGLE_GMAIL_REDIRECT_URI= -SUMMARIZER_BASE_URL=http://summarizer:8001 +AI_SERVICE_BASE_URL=http://ai-service:8001 # Optional: only needed if you want the UI to call a non-default API base URL. # In production the UI defaults to `/api`. 
diff --git a/.gitea/workflows/ci-deploy.yml b/.gitea/workflows/ci-deploy.yml index e8bbdb3..045ace2 100644 --- a/.gitea/workflows/ci-deploy.yml +++ b/.gitea/workflows/ci-deploy.yml @@ -37,7 +37,7 @@ jobs: - name: Test frontend working-directory: job-tracker-ui - run: npm test -- --watchAll=false --runInBand App.test.tsx confirm.test.tsx prompt.test.tsx dialog-flow.test.tsx confirm-flow.test.tsx attachments.test.tsx job-details-generated-drafts.test.tsx + run: npm test -- --watchAll=false --runInBand App.test.tsx confirm.test.tsx prompt.test.tsx dialog-flow.test.tsx confirm-flow.test.tsx attachments.test.tsx job-details-generated-drafts.test.tsx admin-system-page.test.tsx - name: Build frontend working-directory: job-tracker-ui @@ -76,7 +76,7 @@ jobs: APP_BUILD_STAMP="$(date -u +'%Y-%m-%d %H:%M UTC')" \ ./deploy/deploy.sh docker compose ps - docker compose exec -T summarizer python -c "import time, urllib.request; deadline=time.time()+60; last=None + docker compose exec -T ai-service python -c "import time, urllib.request; deadline=time.time()+60; last=None for _ in range(30): try: urllib.request.urlopen('http://127.0.0.1:8001/health', timeout=5).read() diff --git a/JobTrackerApi/Controllers/AdminSystemController.cs b/JobTrackerApi/Controllers/AdminSystemController.cs index 82c3279..eb932d5 100644 --- a/JobTrackerApi/Controllers/AdminSystemController.cs +++ b/JobTrackerApi/Controllers/AdminSystemController.cs @@ -44,7 +44,7 @@ public sealed class AdminSystemController : ControllerBase DatabaseStatusDto Database, RuntimeStatusDto Runtime, AuthStatusDto Auth, - SummarizerMetrics Summarizer + AiServiceMetrics Ai ); private static string? NormalizeBuildMetadata(string? 
value) @@ -62,6 +62,7 @@ public sealed class AdminSystemController : ControllerBase return trimmed; } + [HttpPost("ai/probe")] [HttpPost("summarizer/probe")] public async Task RunSummarizerProbe(CancellationToken cancellationToken) { @@ -79,7 +80,7 @@ public sealed class AdminSystemController : ControllerBase var jobs = await _db.JobApplications.AsNoTracking().ToListAsync(cancellationToken); var companies = await _db.Companies.AsNoTracking().CountAsync(cancellationToken); - var summarizer = await _summarizer.GetMetricsAsync(cancellationToken); + var ai = await _summarizer.GetMetricsAsync(cancellationToken); var version = NormalizeBuildMetadata(_cfg["App:Version"]); if (string.IsNullOrWhiteSpace(version)) @@ -180,7 +181,7 @@ public sealed class AdminSystemController : ControllerBase GoogleConfigured: !string.IsNullOrWhiteSpace((_cfg["Auth:GoogleClientId"] ?? string.Empty).Trim()), GmailConfigured: gmailConfigured ), - Summarizer: summarizer + Ai: ai )); } } diff --git a/JobTrackerApi/Controllers/JobApplicationsController.cs b/JobTrackerApi/Controllers/JobApplicationsController.cs index 97c6bb2..3b21178 100644 --- a/JobTrackerApi/Controllers/JobApplicationsController.cs +++ b/JobTrackerApi/Controllers/JobApplicationsController.cs @@ -1838,8 +1838,9 @@ Candidate master CV: return NoContent(); } + [HttpGet("ai-metrics")] [HttpGet("summarizer-metrics")] - public async Task> GetSummarizerMetrics(CancellationToken cancellationToken) + public async Task> GetSummarizerMetrics(CancellationToken cancellationToken) { var metrics = await _summarizer.GetMetricsAsync(cancellationToken); return Ok(metrics); diff --git a/JobTrackerApi/Controllers/ProfileCvController.cs b/JobTrackerApi/Controllers/ProfileCvController.cs index 674c755..5acbd5e 100644 --- a/JobTrackerApi/Controllers/ProfileCvController.cs +++ b/JobTrackerApi/Controllers/ProfileCvController.cs @@ -1,5 +1,6 @@ using System.Text; using System.Text.RegularExpressions; +using JobTrackerApi.Services; using 
JobTrackerApi.Models; using Microsoft.AspNetCore.Authorization; using Microsoft.AspNetCore.Identity; @@ -18,15 +19,21 @@ public sealed class ProfileCvController : ControllerBase ".md", ".pdf", ".docx", + ".png", + ".jpg", + ".jpeg", + ".webp", }; private const long MaxFileSizeBytes = 5 * 1024 * 1024; private readonly UserManager _users; + private readonly ISummarizerService _aiService; - public ProfileCvController(UserManager users) + public ProfileCvController(UserManager users, ISummarizerService aiService) { _users = users; + _aiService = aiService; } [HttpPost("upload")] @@ -41,10 +48,34 @@ public sealed class ProfileCvController : ControllerBase var extension = Path.GetExtension(file.FileName ?? string.Empty); if (!AllowedExtensions.Contains(extension)) { - return BadRequest("Only .txt, .md, .pdf, and .docx CV imports are supported right now."); + return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now."); } - var text = (await ExtractTextAsync(file, extension)).Trim(); + string text; + var canUseAiExtraction = string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase) + || string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase) + || string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase) + || string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase) + || string.Equals(extension, ".png", StringComparison.OrdinalIgnoreCase) + || string.Equals(extension, ".jpg", StringComparison.OrdinalIgnoreCase) + || string.Equals(extension, ".jpeg", StringComparison.OrdinalIgnoreCase) + || string.Equals(extension, ".webp", StringComparison.OrdinalIgnoreCase); + + if (canUseAiExtraction) + { + await using var uploadStream = file.OpenReadStream(); + var extracted = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, HttpContext.RequestAborted); + text = extracted?.Text?.Trim() ?? 
string.Empty; + } + else + { + text = string.Empty; + } + + if (string.IsNullOrWhiteSpace(text)) + { + text = (await ExtractTextAsync(file, extension)).Trim(); + } if (string.IsNullOrWhiteSpace(text)) { return BadRequest("The uploaded CV file could not be read or was empty."); diff --git a/JobTrackerApi/Program.cs b/JobTrackerApi/Program.cs index 60a0fb8..b046fe5 100644 --- a/JobTrackerApi/Program.cs +++ b/JobTrackerApi/Program.cs @@ -116,10 +116,12 @@ builder.Services.AddHttpClient("jobimport") AutomaticDecompression = DecompressionMethods.All }); -// Local summarizer service (FastAPI). Default URL can be overridden via configuration `Summarizer:BaseUrl`. -builder.Services.AddHttpClient("summarizer", client => +// Local AI service (FastAPI). Supports summarization and OCR/text extraction. +builder.Services.AddHttpClient("ai-service", client => { - var baseUrl = builder.Configuration["Summarizer:BaseUrl"] ?? "http://127.0.0.1:8001"; + var baseUrl = builder.Configuration["Ai:BaseUrl"] + ?? builder.Configuration["Summarizer:BaseUrl"] + ?? "http://127.0.0.1:8001"; client.BaseAddress = new Uri(baseUrl); client.Timeout = TimeSpan.FromSeconds(30); }); diff --git a/JobTrackerApi/Services/SummarizerService.cs b/JobTrackerApi/Services/SummarizerService.cs index c72abe7..e9294e5 100644 --- a/JobTrackerApi/Services/SummarizerService.cs +++ b/JobTrackerApi/Services/SummarizerService.cs @@ -13,12 +13,14 @@ using Microsoft.Extensions.Logging; namespace JobTrackerApi.Services { - public sealed record SummarizerMetrics( + public sealed record AiServiceMetrics( bool Healthy, string? Model, string? Device, bool? GpuAvailable, string? GpuName, + bool? OcrAvailable, + string? OcrLanguages, double? HealthLatencyMs, double? ProbeLatencyMs, DateTimeOffset? LastProbeAt, @@ -30,17 +32,36 @@ namespace JobTrackerApi.Services int CacheMisses, int Failures, double? AverageLatencyMs, + int OcrRequests, + int OcrFailures, + double? AverageOcrLatencyMs, + DateTimeOffset? 
LastOcrSuccessAt, + DateTimeOffset? LastOcrFailureAt, DateTimeOffset? LastSuccessAt, DateTimeOffset? LastFailureAt, string? LastError ); - public interface ISummarizerService + public sealed record AiTextExtractionResult( + string? Text, + bool OcrUsed, + string? ContentType, + int? PageCount, + int Characters, + string? FileName + ); + + public interface IAiService { Task SummarizeAsync(string text, int maxLength = 150, int minLength = 30); Task SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40); + Task ExtractTextAsync(Stream stream, string fileName, string? contentType = null, CancellationToken cancellationToken = default); Task RunProbeAsync(CancellationToken cancellationToken = default); - Task GetMetricsAsync(CancellationToken cancellationToken = default); + Task GetMetricsAsync(CancellationToken cancellationToken = default); + } + + public interface ISummarizerService : IAiService + { } public class SummarizerService : ISummarizerService @@ -60,6 +81,11 @@ namespace JobTrackerApi.Services private DateTimeOffset? _lastProbeSuccessAt; private DateTimeOffset? _lastProbeFailureAt; private int _probeFailures; + private int _ocrRequests; + private int _ocrFailures; + private long _totalOcrLatencyTicks; + private DateTimeOffset? _lastOcrSuccessAt; + private DateTimeOffset? _lastOcrFailureAt; private string? 
_lastError; public SummarizerService(IHttpClientFactory httpFactory, IMemoryCache cache) @@ -78,22 +104,18 @@ namespace JobTrackerApi.Services public async Task SummarizeAsync(string text, int maxLength = 150, int minLength = 30) { if (string.IsNullOrWhiteSpace(text)) return null; - return await SummarizeCoreAsync(text, maxLength, minLength); } public Task SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40) { if (string.IsNullOrWhiteSpace(instruction) || string.IsNullOrWhiteSpace(text)) return Task.FromResult(null); - var composed = $"{instruction.Trim()}\n\n{text.Trim()}"; return SummarizeCoreAsync(composed, maxLength, minLength); } private async Task SummarizeCoreAsync(string text, int maxLength, int minLength) { - // Use a deterministic content hash instead of string.GetHashCode() so cache keys - // are collision-resistant and stable across process restarts. var key = BuildCacheKey(text, maxLength, minLength); Interlocked.Increment(ref _requests); @@ -110,7 +132,7 @@ namespace JobTrackerApi.Services Interlocked.Increment(ref _cacheMisses); - var client = _httpFactory.CreateClient("summarizer"); + var client = _httpFactory.CreateClient("ai-service"); var payload = JsonSerializer.Serialize(new { text, max_length = maxLength, min_length = minLength }); using var content = new StringContent(payload, Encoding.UTF8, "application/json"); var sw = Stopwatch.StartNew(); @@ -152,10 +174,74 @@ namespace JobTrackerApi.Services } } + public async Task ExtractTextAsync(Stream stream, string fileName, string? 
contentType = null, CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(stream); + if (string.IsNullOrWhiteSpace(fileName)) fileName = "document"; + + Interlocked.Increment(ref _ocrRequests); + var client = _httpFactory.CreateClient("ai-service"); + var sw = Stopwatch.StartNew(); + + try + { + using var form = new MultipartFormDataContent(); + using var fileContent = new StreamContent(stream); + if (!string.IsNullOrWhiteSpace(contentType)) + { + fileContent.Headers.ContentType = new System.Net.Http.Headers.MediaTypeHeaderValue(contentType); + } + form.Add(fileContent, "file", fileName); + + using var response = await client.PostAsync("/extract-text", form, cancellationToken); + sw.Stop(); + Interlocked.Add(ref _totalOcrLatencyTicks, sw.ElapsedTicks); + if (!response.IsSuccessStatusCode) + { + Interlocked.Increment(ref _ocrFailures); + lock (_metricsLock) + { + _lastOcrFailureAt = DateTimeOffset.UtcNow; + _lastError = $"AI extraction returned {(int)response.StatusCode}."; + } + return null; + } + + await using var responseStream = await response.Content.ReadAsStreamAsync(cancellationToken); + using var doc = await JsonDocument.ParseAsync(responseStream, cancellationToken: cancellationToken); + var text = doc.RootElement.TryGetProperty("text", out var textEl) ? textEl.GetString() : null; + var ocrUsed = doc.RootElement.TryGetProperty("ocr_used", out var ocrEl) && ocrEl.ValueKind is JsonValueKind.True or JsonValueKind.False && ocrEl.GetBoolean(); + var detectedContentType = doc.RootElement.TryGetProperty("content_type", out var contentTypeEl) ? contentTypeEl.GetString() : contentType; + int? pageCount = doc.RootElement.TryGetProperty("page_count", out var pageCountEl) && pageCountEl.ValueKind == JsonValueKind.Number ? pageCountEl.GetInt32() : null; + var characters = doc.RootElement.TryGetProperty("characters", out var charactersEl) && charactersEl.ValueKind == JsonValueKind.Number ? charactersEl.GetInt32() : (text?.Length ?? 
0); + var returnedFileName = doc.RootElement.TryGetProperty("file_name", out var fileNameEl) ? fileNameEl.GetString() : fileName; + + lock (_metricsLock) + { + _lastOcrSuccessAt = DateTimeOffset.UtcNow; + _lastError = null; + } + + return new AiTextExtractionResult(text, ocrUsed, detectedContentType, pageCount, characters, returnedFileName); + } + catch (Exception ex) + { + sw.Stop(); + Interlocked.Add(ref _totalOcrLatencyTicks, sw.ElapsedTicks); + Interlocked.Increment(ref _ocrFailures); + lock (_metricsLock) + { + _lastOcrFailureAt = DateTimeOffset.UtcNow; + _lastError = ex.Message; + } + return null; + } + } + public async Task RunProbeAsync(CancellationToken cancellationToken = default) { - const string probeText = "Summarizer latency probe for job tracker telemetry."; - var client = _httpFactory.CreateClient("summarizer"); + const string probeText = "AI service latency probe for Jobbjakt telemetry."; + var client = _httpFactory.CreateClient("ai-service"); var payload = JsonSerializer.Serialize(new { text = probeText, max_length = 48, min_length = 12 }); using var content = new StringContent(payload, Encoding.UTF8, "application/json"); var sw = Stopwatch.StartNew(); @@ -215,13 +301,15 @@ namespace JobTrackerApi.Services } } - public async Task GetMetricsAsync(CancellationToken cancellationToken = default) + public async Task GetMetricsAsync(CancellationToken cancellationToken = default) { - var client = _httpFactory.CreateClient("summarizer"); + var client = _httpFactory.CreateClient("ai-service"); string? model = null; string? device = null; bool? gpuAvailable = null; string? gpuName = null; + bool? ocrAvailable = null; + string? ocrLanguages = null; double? healthLatencyMs = null; var healthy = false; string? 
healthError = null; @@ -238,25 +326,12 @@ namespace JobTrackerApi.Services { using var stream = await res.Content.ReadAsStreamAsync(cancellationToken); using var doc = await JsonDocument.ParseAsync(stream, cancellationToken: cancellationToken); - if (doc.RootElement.TryGetProperty("model", out var modelEl)) - { - model = modelEl.GetString(); - } - - if (doc.RootElement.TryGetProperty("device", out var deviceEl)) - { - device = deviceEl.GetString(); - } - - if (doc.RootElement.TryGetProperty("gpu_available", out var gpuAvailableEl) && gpuAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) - { - gpuAvailable = gpuAvailableEl.GetBoolean(); - } - - if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) - { - gpuName = gpuNameEl.GetString(); - } + if (doc.RootElement.TryGetProperty("model", out var modelEl)) model = modelEl.GetString(); + if (doc.RootElement.TryGetProperty("device", out var deviceEl)) device = deviceEl.GetString(); + if (doc.RootElement.TryGetProperty("gpu_available", out var gpuAvailableEl) && gpuAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) gpuAvailable = gpuAvailableEl.GetBoolean(); + if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString(); + if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean(); + if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString(); } else { @@ -273,6 +348,9 @@ namespace JobTrackerApi.Services var cacheMisses = Volatile.Read(ref _cacheMisses); var failures = Volatile.Read(ref _failures); var totalLatencyTicks = Volatile.Read(ref _totalLatencyTicks); + var ocrRequests = Volatile.Read(ref _ocrRequests); + var ocrFailures = Volatile.Read(ref _ocrFailures); + var totalOcrLatencyTicks = Volatile.Read(ref _totalOcrLatencyTicks); DateTimeOffset? 
lastSuccessAt; DateTimeOffset? lastFailureAt; @@ -280,6 +358,8 @@ namespace JobTrackerApi.Services DateTimeOffset? lastProbeAt; DateTimeOffset? lastProbeSuccessAt; DateTimeOffset? lastProbeFailureAt; + DateTimeOffset? lastOcrSuccessAt; + DateTimeOffset? lastOcrFailureAt; string? lastError; lock (_metricsLock) { @@ -289,6 +369,8 @@ namespace JobTrackerApi.Services lastProbeAt = _lastProbeAt; lastProbeSuccessAt = _lastProbeSuccessAt; lastProbeFailureAt = _lastProbeFailureAt; + lastOcrSuccessAt = _lastOcrSuccessAt; + lastOcrFailureAt = _lastOcrFailureAt; lastError = _lastError; } @@ -297,16 +379,17 @@ namespace JobTrackerApi.Services lastError = healthError; } - double? averageLatencyMs = requests > 0 - ? Math.Round(TimeSpan.FromTicks(totalLatencyTicks).TotalMilliseconds / requests, 1) - : null; + double? averageLatencyMs = requests > 0 ? Math.Round(TimeSpan.FromTicks(totalLatencyTicks).TotalMilliseconds / requests, 1) : null; + double? averageOcrLatencyMs = ocrRequests > 0 ? Math.Round(TimeSpan.FromTicks(totalOcrLatencyTicks).TotalMilliseconds / ocrRequests, 1) : null; - return new SummarizerMetrics( + return new AiServiceMetrics( Healthy: healthy, Model: model, Device: device, GpuAvailable: gpuAvailable, GpuName: gpuName, + OcrAvailable: ocrAvailable, + OcrLanguages: ocrLanguages, HealthLatencyMs: healthLatencyMs, ProbeLatencyMs: probeLatencyMs, LastProbeAt: lastProbeAt, @@ -318,6 +401,11 @@ namespace JobTrackerApi.Services CacheMisses: cacheMisses, Failures: failures, AverageLatencyMs: averageLatencyMs, + OcrRequests: ocrRequests, + OcrFailures: ocrFailures, + AverageOcrLatencyMs: averageOcrLatencyMs, + LastOcrSuccessAt: lastOcrSuccessAt, + LastOcrFailureAt: lastOcrFailureAt, LastSuccessAt: lastSuccessAt, LastFailureAt: lastFailureAt, LastError: lastError @@ -340,14 +428,11 @@ namespace JobTrackerApi.Services protected override async Task ExecuteAsync(CancellationToken stoppingToken) { - var enabled = _cfg.GetValue("Summarizer:ProbeEnabled", true); - if (!enabled) 
- { - return; - } + var enabled = _cfg.GetValue("Ai:ProbeEnabled", _cfg.GetValue("Summarizer:ProbeEnabled", true)); + if (!enabled) return; - var intervalSeconds = Math.Clamp(_cfg.GetValue("Summarizer:ProbeIntervalSeconds", 300), 30, 3600); - var initialDelaySeconds = Math.Clamp(_cfg.GetValue("Summarizer:ProbeInitialDelaySeconds", 15), 0, 600); + var intervalSeconds = Math.Clamp(_cfg.GetValue("Ai:ProbeIntervalSeconds", _cfg.GetValue("Summarizer:ProbeIntervalSeconds", 300)), 30, 3600); + var initialDelaySeconds = Math.Clamp(_cfg.GetValue("Ai:ProbeInitialDelaySeconds", _cfg.GetValue("Summarizer:ProbeInitialDelaySeconds", 15)), 0, 600); if (initialDelaySeconds > 0) { @@ -360,8 +445,8 @@ namespace JobTrackerApi.Services try { using var scope = _scopeFactory.CreateScope(); - var summarizer = scope.ServiceProvider.GetRequiredService(); - await summarizer.RunProbeAsync(stoppingToken); + var aiService = scope.ServiceProvider.GetRequiredService(); + await aiService.RunProbeAsync(stoppingToken); } catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) { @@ -369,7 +454,7 @@ namespace JobTrackerApi.Services } catch (Exception ex) { - _logger.LogWarning(ex, "Summarizer latency probe failed."); + _logger.LogWarning(ex, "AI service latency probe failed."); } } while (await timer.WaitForNextTickAsync(stoppingToken)); diff --git a/README.md b/README.md index 8c5aba7..976a939 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re - History/event trail per application (created, status changes, follow-up set, delete/restore) - Export jobs to JSON/CSV + daily scheduled JSON export - Optional “job import” preview from supported job sites (plugins) + optional translation to English -- Optional local summarizer service for short/full descriptions +- Optional local AI service for short/full descriptions - Optional Google sign-in (Google ID tokens) to protect the API ## Architecture 
@@ -21,11 +21,11 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re - `JobTrackerApi/`: ASP.NET Core API (defaults to `http://localhost:5202`) - SQLite DB file: defaults to `JobTrackerApi/jobtracker.db` unless `Data:Root` / connection string overrides it - Attachments: stored on disk under `DataRoot/Attachments//...` -- Optional local summarizer service: `tools/summarizer/` (FastAPI) used by the API via `Summarizer:BaseUrl` +- Optional local AI service: `tools/summarizer/` (FastAPI) used by the API via `Ai:BaseUrl` ## Quickstart (Docker) -This runs: frontend (nginx), backend API, and the summarizer service. +This runs: frontend (nginx), backend API, and the AI service. 1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`). @@ -43,7 +43,7 @@ docker compose up --build - .NET SDK `9.x` (API targets `net9.0`) - Node.js (for the UI) -- (Optional) Python 3.x if running the summarizer without Docker +- (Optional) Python 3.x if running the AI service without Docker ### 1) Run the API @@ -65,14 +65,14 @@ npm start The UI defaults to calling `http://localhost:5202/api` when running on localhost (see `job-tracker-ui/src/api.ts`). -### 3) (Optional) Run the summarizer +### 3) (Optional) Run the AI service The API calls a local FastAPI service to generate summaries. If it’s not running, the app still works (summary generation may be empty / best-effort). With Docker (recommended): ```bash -docker compose up --build summarizer +docker compose up --build ai-service ``` Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`). 
@@ -87,7 +87,7 @@ Common keys: - `Data:Root`: folder for the SQLite DB + exports (defaults to API content root) - `Data:AttachmentsRoot`: override attachments folder (defaults to `/Attachments`) - `Cors:Origins`: list of allowed origins (defaults to `http://localhost:3000`; use `"*"` to allow all) -- `Summarizer:BaseUrl`: summarizer base URL (default `http://127.0.0.1:8001`) +- `Ai:BaseUrl`: AI service base URL (default `http://127.0.0.1:8001`) - `Exports:DailyEnabled`: enable/disable daily export background job - `Exports:DailyFolder`: export destination (relative to `Data:Root` if not absolute) - `Exports:DailyHourLocal`: local hour (0–23) when the daily export runs @@ -109,7 +109,7 @@ Common keys: - `Email:SmtpUser`: SMTP username (often your Gmail address) - `Email:SmtpPassword`: SMTP password (for Gmail: use an App Password) - `Email:From`: from address (default: `Email:SmtpUser`) -- `Email:FromName`: from name (default: `Job Tracker`) +- `Email:FromName`: from name (default: `Jobbjakt`) ### UI settings diff --git a/deploy/README.md b/deploy/README.md index 2f2c271..cd187f3 100644 --- a/deploy/README.md +++ b/deploy/README.md @@ -51,7 +51,9 @@ AUTH_JWT_KEY=replace_with_long_random_secret AUTH_ADMIN_EMAIL=you@example.com AUTH_ADMIN_PASSWORD=replace_with_strong_password APP_PUBLIC_BASE_URL=https://your-domain.example -SUMMARIZER_BASE_URL=http://summarizer:8001 +AI_SERVICE_BASE_URL=http://ai-service:8001 +# Optional backward-compatible alias if older config still references the previous name: +SUMMARIZER_BASE_URL=http://ai-service:8001 ``` ## Database recommendation @@ -89,5 +91,5 @@ If this app is going to be a real production service on Ubuntu: - confirm reverse proxy routes to the frontend correctly - confirm API auth/login works with production config - confirm backend can connect to MariaDB -- confirm summarizer container is reachable from backend +- confirm AI service container is reachable from backend - confirm reminder and admin/system pages load diff 
--git a/docker-compose.yml b/docker-compose.yml index 57a6d89..4336c73 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,7 +23,8 @@ services: - Auth__GoogleClientId=${AUTH_GOOGLE_CLIENT_ID} - Google__GmailClientSecret=${GOOGLE_GMAIL_CLIENT_SECRET} - Google__GmailRedirectUri=${GOOGLE_GMAIL_REDIRECT_URI} - - Summarizer__BaseUrl=${SUMMARIZER_BASE_URL:-http://summarizer:8001} + - Ai__BaseUrl=${AI_SERVICE_BASE_URL:-http://ai-service:8001} + - Summarizer__BaseUrl=${SUMMARIZER_BASE_URL:-http://ai-service:8001} # Email (SMTP) # Build metadata should be resolved before deployment. Examples: # APP_VERSION=1.0.0 @@ -66,7 +67,7 @@ services: - shared_services restart: unless-stopped - summarizer: + ai-service: build: context: ./tools/summarizer dockerfile: Dockerfile diff --git a/docs/jobbjakt-cleanup-tracker.md b/docs/jobbjakt-cleanup-tracker.md index 4d85e32..6c401f6 100644 --- a/docs/jobbjakt-cleanup-tracker.md +++ b/docs/jobbjakt-cleanup-tracker.md @@ -2,6 +2,16 @@ Last updated: 2026-03-23 +## AI Service / OCR +- [x] Reframe user-facing "summarizer" status and docs toward an AI service +- [x] Add self-hosted OCR/text extraction endpoint to the local AI service +- [x] Add backend AI-service text extraction integration for profile CV uploads +- [x] Add OCR support for supported image CV uploads (`png`, `jpg`, `jpeg`, `webp`) +- [x] Add AI service latency/OCR telemetry to the system page +- [x] Add frontend test coverage for AI service status rendering +- [ ] Extend AI extraction to job attachment ingestion +- [ ] Consider full internal service/class rename from `Summarizer*` to `AiService*` + ## Build / UI Issues - [x] Fix visible build error text appearing on page load/footer - [x] Resolve naming inconsistency: `jobtrack` → `Jobbjakt` @@ -32,7 +42,7 @@ Last updated: 2026-03-23 - [x] Add zoom in/out support for image cropping - [x] Use square cropped avatar output - [x] Add CV upload support -- [ ] Verify/complete OCR/text extraction for uploaded CV PDFs +- [x] 
Verify/complete OCR/text extraction for uploaded CV PDFs ## Settings & System - [x] Restore missing follow-up days settings diff --git a/job-tracker-ui/src/admin-system-page.test.tsx b/job-tracker-ui/src/admin-system-page.test.tsx new file mode 100644 index 0000000..891a634 --- /dev/null +++ b/job-tracker-ui/src/admin-system-page.test.tsx @@ -0,0 +1,75 @@ +import React from 'react'; +import { render, screen, waitFor } from '@testing-library/react'; + +import AdminSystemPage from './pages/AdminSystemPage'; +import { I18nProvider } from './i18n/I18nProvider'; +import { api } from './api'; + +const mockedApi = api as jest.Mocked; + +describe('AdminSystemPage', () => { + it('renders AI service health, latency, and OCR readiness', async () => { + mockedApi.get.mockImplementation((url: string) => { + if (url === '/admin/system') { + return Promise.resolve({ + data: { + environment: 'Production', + contentRoot: '/app', + version: '1.2.3', + commitSha: 'abc1234', + buildStamp: '2026-03-23 11:00 UTC', + storage: { dataRoot: '/data', dbPath: '/data/jobtracker.db', dbExists: true, dbSizeBytes: 2048, companyCount: 3, jobCount: 7, deletedCount: 1 }, + email: { enabled: true, host: 'smtp.example.test', port: 587, enableSsl: true, from: 'noreply@example.test', fromName: 'Jobbjakt' }, + database: { provider: 'mariadb', looksConfigured: true, canConnect: true, target: 'server=db', usesFileStorage: false, warning: null }, + runtime: { framework: '.NET 9', osDescription: 'Linux', processArchitecture: 'X64', machineName: 'app-01' }, + auth: { required: true, hasJwtKey: true, googleConfigured: true, gmailConfigured: true }, + ai: { + healthy: true, + model: 'distilbart', + device: 'cpu', + gpuAvailable: false, + gpuName: null, + ocrAvailable: true, + ocrLanguages: 'eng', + healthLatencyMs: 12.4, + probeLatencyMs: 25.8, + lastProbeAt: '2026-03-23T10:00:00Z', + lastProbeSuccessAt: '2026-03-23T10:00:00Z', + lastProbeFailureAt: null, + probeFailures: 0, + requests: 18, + cacheHits: 9, + 
cacheMisses: 9, + failures: 0, + averageLatencyMs: 42.2, + ocrRequests: 5, + ocrFailures: 0, + averageOcrLatencyMs: 88.4, + lastOcrSuccessAt: '2026-03-23T10:05:00Z', + lastOcrFailureAt: null, + lastSuccessAt: '2026-03-23T10:04:00Z', + lastFailureAt: null, + lastError: null, + }, + }, + } as any); + } + return Promise.resolve({ data: {} } as any); + }); + + render( + + + , + ); + + await waitFor(() => { + expect(screen.getByText('AI service')).toBeTruthy(); + }); + + expect(screen.getByText(/25.8 ms probe/i)).toBeTruthy(); + expect(screen.getByText('OCR eng')).toBeTruthy(); + expect(screen.getByText('OCR avg latency')).toBeTruthy(); + expect(screen.getByText('88.4 ms')).toBeTruthy(); + }); +}); diff --git a/job-tracker-ui/src/i18n/translations.ts b/job-tracker-ui/src/i18n/translations.ts index 6d3f4ec..4f860e4 100644 --- a/job-tracker-ui/src/i18n/translations.ts +++ b/job-tracker-ui/src/i18n/translations.ts @@ -171,7 +171,7 @@ export const translations = { profileHeadline: "Profile headline", profileHeadlineHelp: "Stored only in this browser to personalize your workspace.", profileMasterCv: "Master CV", - profileMasterCvBody: "Upload a PDF, DOCX, plain text file, or markdown file. The app extracts text where supported and populates your master CV text for tailoring and outreach.", + profileMasterCvBody: "Upload a PDF, DOCX, plain text file, markdown file, or image scan. The AI service extracts text where possible and falls back to OCR for supported scanned files.", profileUploadCv: "Upload CV", profileUploading: "Uploading...", profileCopyCvText: "Copy CV text", @@ -179,7 +179,7 @@ export const translations = { profileCvUploadFailed: "Failed to upload CV.", profileCvTextLabel: "Profile CV / master resume text", profileCvTextHelp: "Keep this updated and specific. Include recent roles, tools, achievements, measurable outcomes, and the work you want to be hired for next. 
If extraction misses something, edit it here manually.", - profileCvPreferredUploads: "Supported uploads: PDF, DOCX, TXT, MD.", + profileCvPreferredUploads: "Supported uploads: PDF, DOCX, TXT, MD, PNG, JPG, JPEG, WEBP.", profileSaveChanges: "Save changes", profileUpdated: "Profile updated.", profileUpdateFailed: "Failed to update profile.", @@ -272,7 +272,7 @@ export const translations = { adminUsersCreated: "User created.", adminUsersCreateFailed: "Failed to create user.", adminSystemTitle: "System status", - adminSystemSubtitle: "Production diagnostics for runtime, database, auth, email, and summarizer health.", + adminSystemSubtitle: "Production diagnostics for runtime, database, auth, email, AI service health, and OCR readiness.", adminSystemRunProbe: "Run probe now", adminSystemRunningProbe: "Running probe...", adminSystemRefresh: "Refresh", @@ -284,13 +284,13 @@ export const translations = { adminSystemSmtp: "SMTP", adminSystemEnabled: "Enabled", adminSystemDisabled: "Disabled", - adminSystemSummarizer: "Summarizer", + adminSystemSummarizer: "AI service", adminSystemHealthy: "Healthy", adminSystemNoLatencyData: "No latency data", adminSystemDatabaseStorage: "Database and storage", adminSystemRuntimeAuth: "Runtime and auth", adminSystemEmailConfig: "Email configuration", - adminSystemSummarizerRuntime: "Summarizer runtime", + adminSystemSummarizerRuntime: "AI runtime", adminSystemSmtpTest: "SMTP test email", adminSystemSmtpTestBody: "Send a quick delivery check using the configured SMTP settings. 
Leave the recipient blank to use your admin email.", adminSystemRecipientEmail: "Recipient email", @@ -299,7 +299,7 @@ export const translations = { adminSystemMessage: "Message", adminSystemSendTestEmail: "Send test email", adminSystemSending: "Sending...", - adminSystemSummarizerTelemetry: "Summarizer telemetry", + adminSystemSummarizerTelemetry: "AI service telemetry", adminSystemDatabaseConnected: "Database connected", adminSystemDatabaseIssue: "Database issue", adminSystemAuthEnforced: "Auth enforced", @@ -591,7 +591,7 @@ export const translations = { profileHeadline: "Profiloverskrift", profileHeadlineHelp: "Lagres bare i denne nettleseren for å gjøre arbeidsområdet mer personlig.", profileMasterCv: "Hoved-CV", - profileMasterCvBody: "Last opp en PDF, DOCX, ren tekstfil eller markdown-fil. Appen henter ut tekst der det støttes og fyller inn hoved-CV-en din for tilpasning og kontakt.", + profileMasterCvBody: "Last opp en PDF, DOCX, ren tekstfil, markdown-fil eller et bildeskann. AI-tjenesten henter ut tekst der det er mulig og faller tilbake til OCR for støttede skannede filer.", profileUploadCv: "Last opp CV", profileUploading: "Laster opp...", profileCopyCvText: "Kopier CV-tekst", @@ -599,7 +599,7 @@ export const translations = { profileCvUploadFailed: "Kunne ikke laste opp CV.", profileCvTextLabel: "Profil-CV / hovedtekst for CV", profileCvTextHelp: "Hold denne oppdatert og konkret. Ta med nylige roller, verktøy, prestasjoner, målbare resultater og arbeidet du vil bli ansatt for neste gang. 
Hvis tekstuttrekket mangler noe, kan du redigere manuelt her.", - profileCvPreferredUploads: "Støttede opplastinger: PDF, DOCX, TXT, MD.", + profileCvPreferredUploads: "Støttede opplastinger: PDF, DOCX, TXT, MD, PNG, JPG, JPEG, WEBP.", profileSaveChanges: "Lagre endringer", profileUpdated: "Profil oppdatert.", profileUpdateFailed: "Kunne ikke oppdatere profil.", @@ -692,7 +692,7 @@ export const translations = { adminUsersCreated: "Bruker opprettet.", adminUsersCreateFailed: "Kunne ikke opprette bruker.", adminSystemTitle: "Systemstatus", - adminSystemSubtitle: "Produksjonsdiagnostikk for kjøretid, database, autentisering, e-post og oppsummeringshelse.", + adminSystemSubtitle: "Produksjonsdiagnostikk for kjøretid, database, autentisering, e-post, AI-tjenestehelse og OCR-beredskap.", adminSystemRunProbe: "Kjør probe nå", adminSystemRunningProbe: "Kjører probe...", adminSystemRefresh: "Oppdater", @@ -704,13 +704,13 @@ export const translations = { adminSystemSmtp: "SMTP", adminSystemEnabled: "Aktivert", adminSystemDisabled: "Deaktivert", - adminSystemSummarizer: "Oppsummerer", + adminSystemSummarizer: "AI-tjeneste", adminSystemHealthy: "Frisk", adminSystemNoLatencyData: "Ingen latensdata", adminSystemDatabaseStorage: "Database og lagring", adminSystemRuntimeAuth: "Kjøretid og autentisering", adminSystemEmailConfig: "E-postkonfigurasjon", - adminSystemSummarizerRuntime: "Oppsummeringskjøretid", + adminSystemSummarizerRuntime: "AI-kjøretid", adminSystemSmtpTest: "SMTP-test e-post", adminSystemSmtpTestBody: "Send en rask leveringssjekk med de konfigurerte SMTP-innstillingene. 
La mottakeren stå tom for å bruke admin-eposten din.", adminSystemRecipientEmail: "Mottaker e-post", @@ -719,7 +719,7 @@ export const translations = { adminSystemMessage: "Melding", adminSystemSendTestEmail: "Send test-e-post", adminSystemSending: "Sender...", - adminSystemSummarizerTelemetry: "Oppsummeringstelemetri", + adminSystemSummarizerTelemetry: "AI-tjenestetelemetri", adminSystemDatabaseConnected: "Database tilkoblet", adminSystemDatabaseIssue: "Databaseproblem", adminSystemAuthEnforced: "Autentisering påkrevd", diff --git a/job-tracker-ui/src/pages/AdminSystemPage.tsx b/job-tracker-ui/src/pages/AdminSystemPage.tsx index 68fd011..d8c7052 100644 --- a/job-tracker-ui/src/pages/AdminSystemPage.tsx +++ b/job-tracker-ui/src/pages/AdminSystemPage.tsx @@ -14,12 +14,14 @@ import { import { api, getApiErrorMessage } from "../api"; import { useI18n } from "../i18n/I18nProvider"; -type SummarizerMetrics = { +type AiServiceMetrics = { healthy: boolean; model?: string | null; device?: string | null; gpuAvailable?: boolean; gpuName?: string | null; + ocrAvailable?: boolean | null; + ocrLanguages?: string | null; healthLatencyMs?: number | null; probeLatencyMs?: number | null; lastProbeAt?: string | null; @@ -31,6 +33,11 @@ type SummarizerMetrics = { cacheMisses: number; failures: number; averageLatencyMs?: number | null; + ocrRequests: number; + ocrFailures: number; + averageOcrLatencyMs?: number | null; + lastOcrSuccessAt?: string | null; + lastOcrFailureAt?: string | null; lastSuccessAt?: string | null; lastFailureAt?: string | null; lastError?: string | null; @@ -79,7 +86,7 @@ type SystemStatus = { googleConfigured: boolean; gmailConfigured: boolean; }; - summarizer: SummarizerMetrics; + ai: AiServiceMetrics; }; function formatBytes(bytes?: number | null) { @@ -148,10 +155,10 @@ export default function AdminSystemPage() { return "success" as const; }, [status]); - const summarizerTone = useMemo(() => { + const aiTone = useMemo(() => { if (!status) return "default" as 
const; - if (!status.summarizer.healthy) return "error" as const; - if (status.summarizer.probeFailures > 0 || status.summarizer.failures > 0) return "warning" as const; + if (!status.ai.healthy) return "error" as const; + if (status.ai.probeFailures > 0 || status.ai.failures > 0 || (status.ai.ocrFailures ?? 0) > 0) return "warning" as const; return "success" as const; }, [status]); @@ -184,10 +191,10 @@ export default function AdminSystemPage() { setRunningProbe(true); setError(null); try { - await api.post("/admin/system/summarizer/probe"); + await api.post("/admin/system/ai/probe"); await load(); } catch (e: any) { - setError(getApiErrorMessage(e, "Failed to run summarizer probe.")); + setError(getApiErrorMessage(e, "Failed to run AI service probe.")); } finally { setRunningProbe(false); } @@ -204,7 +211,7 @@ export default function AdminSystemPage() { {error ? {error} : null} {status?.database.warning ? {status.database.warning} : null} - {status?.summarizer.lastError ? {status.summarizer.lastError} : null} + {status?.ai.lastError ? {status.ai.lastError} : null} @@ -288,15 +295,15 @@ export default function AdminSystemPage() { {t("adminSystemSummarizerRuntime")} - - - - - - - - - + + + + + + + + + @@ -320,20 +327,23 @@ export default function AdminSystemPage() { {t("adminSystemSummarizerTelemetry")} - - Requests{status?.summarizer.requests ?? 0} - Cache hits{status?.summarizer.cacheHits ?? 0} - Cache misses{status?.summarizer.cacheMisses ?? 0} - Failures{status?.summarizer.failures ?? 0} - Probe failures{status?.summarizer.probeFailures ?? 0} - Avg latency{status?.summarizer.averageLatencyMs != null ? `${status.summarizer.averageLatencyMs} ms` : "-"} + + Requests{status?.ai.requests ?? 0} + Cache hits{status?.ai.cacheHits ?? 0} + Cache misses{status?.ai.cacheMisses ?? 0} + Failures{status?.ai.failures ?? 0} + Probe failures{status?.ai.probeFailures ?? 0} + Avg latency{status?.ai.averageLatencyMs != null ? 
`${status.ai.averageLatencyMs} ms` : "-"} + OCR requests{status?.ai.ocrRequests ?? 0} + OCR avg latency{status?.ai.averageOcrLatencyMs != null ? `${status.ai.averageOcrLatencyMs} ms` : "-"} - + + diff --git a/job-tracker-ui/src/pages/ProfilePage.tsx b/job-tracker-ui/src/pages/ProfilePage.tsx index 8eb6397..9a0e1a8 100644 --- a/job-tracker-ui/src/pages/ProfilePage.tsx +++ b/job-tracker-ui/src/pages/ProfilePage.tsx @@ -29,7 +29,7 @@ type MeResponse = { } | null; }; -const CV_UPLOAD_ACCEPT = ".pdf,.docx,.txt,.md,application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,text/plain,text/markdown"; +const CV_UPLOAD_ACCEPT = ".pdf,.docx,.txt,.md,image/png,image/jpeg,image/webp,application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,text/plain,text/markdown"; const AVATAR_UPLOAD_ACCEPT = "image/png,image/jpeg,image/webp"; function initialsFrom(values: Array) { diff --git a/job-tracker-ui/src/setupTests.ts b/job-tracker-ui/src/setupTests.ts index 0c520f7..8609961 100644 --- a/job-tracker-ui/src/setupTests.ts +++ b/job-tracker-ui/src/setupTests.ts @@ -9,6 +9,11 @@ jest.mock('./api', () => ({ delete: jest.fn(() => Promise.resolve({ data: {} })), interceptors: { request: { use: jest.fn() }, response: { use: jest.fn() } }, }, + getApiErrorMessage: jest.fn((error: any, fallback?: string) => { + if (typeof error?.response?.data === 'string' && error.response.data.trim()) return error.response.data; + if (typeof error?.message === 'string' && error.message.trim()) return error.message; + return fallback || 'Request failed.'; + }), })); jest.mock('./components/GoogleAuthCard', () => () => null); diff --git a/tools/summarizer/Dockerfile b/tools/summarizer/Dockerfile index f3f3a6e..8e57b27 100644 --- a/tools/summarizer/Dockerfile +++ b/tools/summarizer/Dockerfile @@ -5,6 +5,9 @@ ENV PIP_NO_CACHE_DIR=1 \ TRANSFORMERS_NO_TF=1 \ HF_HUB_DISABLE_TELEMETRY=1 WORKDIR /app +RUN apt-get update \ + && apt-get install -y 
--no-install-recommends tesseract-ocr tesseract-ocr-eng \ + && rm -rf /var/lib/apt/lists/* COPY requirements.txt ./ RUN python -m pip install --upgrade pip setuptools wheel \ && python -m pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt diff --git a/tools/summarizer/README.md b/tools/summarizer/README.md index 5e0ce7f..8b26dc2 100644 --- a/tools/summarizer/README.md +++ b/tools/summarizer/README.md @@ -1,16 +1,22 @@ -# Local Hugging Face Summarizer +# Local AI Service -This small service runs a Hugging Face summarization model locally and exposes a simple HTTP API. +This service runs a local Hugging Face summarization model and also exposes document text extraction with OCR for supported PDFs and images. -Install (recommended: virtualenv) +## Capabilities +- job/role summarization +- PDF text extraction +- OCR fallback for scanned PDFs +- OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`) +- DOCX / TXT / MD extraction -Windows (CPU PyTorch wheel may be required): +## Install + +Windows: ```powershell python -m venv .venv .\.venv\Scripts\Activate.ps1 pip install -r requirements.txt -# If torch wheel installation is needed, follow instructions at https://pytorch.org python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1 ``` @@ -23,10 +29,15 @@ pip install -r requirements.txt python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1 ``` -API -- `GET /health` — health check -- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }` returns `{ "summary": "...", "cached": false }` +## Docker +The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can be processed inside the container. -Notes -- Model will be downloaded on first run and can be several hundred MB. -- For lower memory usage, consider `sshleifer/tiny-distilbart-cnn-6-6` or `t5-small`. 
+## API
+- `GET /health` — health check and runtime capabilities
+- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`
+- `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata
+
+## Notes
+- Model weights are downloaded on first run.
+- OCR quality depends on scan quality and language support.
+- Default OCR language is English (`eng`).
diff --git a/tools/summarizer/app.py b/tools/summarizer/app.py
index fe907a5..da0b264 100644
--- a/tools/summarizer/app.py
+++ b/tools/summarizer/app.py
@@ -1,16 +1,25 @@
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, File, HTTPException, UploadFile
 from pydantic import BaseModel, Field
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from cachetools import TTLCache
+from PIL import Image
+from pypdf import PdfReader
+from docx import Document
+import fitz
 import hashlib
+import io
 import re
 import torch
+import pytesseract
 
-app = FastAPI(title="Local Summarizer")
+app = FastAPI(title="Local AI Service")
 
 MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
 MAX_INPUT_CHARS = 20000
 MAX_CONTEXT_CHARS = 2200
+MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
+OCR_LANGUAGES = "eng"
+IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
 
 
 def _load_runtime():
@@ -48,6 +57,8 @@ async def health():
         "device": str(device),
         "gpu_available": GPU_AVAILABLE,
         "gpu_name": GPU_NAME,
+        "ocr_available": True,
+        "ocr_languages": OCR_LANGUAGES,
     }
 
 
@@ -68,7 +79,6 @@ _TECH_PRIORITY = [
     "aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
 ]
 
-
 _MUST_HAVE_HINTS = [
     "must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
 ]
@@ -339,3 +349,124 @@ async def summarize(req: SummarizeRequest):
     out = "\n".join(lines).strip()
     cache[key] = out
     return {"summary": out, "cached": False}
+
+
+def _normalize_text(value: str) -> str:
+    """Flatten text to a single line.
+
+    NUL bytes become spaces, every whitespace run (newlines included) collapses
+    to one space, and leading/trailing whitespace is stripped.
+    """
+    value = value.replace("\x00", " ")
+    return re.sub(r"\s+", " ", value).strip()
+
+
+def _ocr_image(image: Image.Image) -> str:
+    """Run Tesseract (languages from OCR_LANGUAGES) on a PIL image; return normalized text."""
+    # Any mode other than RGB or greyscale is converted to RGB before OCR.
+    if image.mode not in ("RGB", "L"):
+        image = image.convert("RGB")
+    text = pytesseract.image_to_string(image, lang=OCR_LANGUAGES)
+    return _normalize_text(text)
+
+
+def _extract_pdf_text(data: bytes) -> tuple[str, bool, int]:
+    """Extract text from a PDF, using OCR when the embedded text layer is too short.
+
+    Returns ``(text, ocr_used, page_count)``. pypdf failures are swallowed so a PDF
+    it cannot parse still gets an OCR attempt; PyMuPDF and Tesseract errors propagate.
+    """
+    page_count = 0
+    extracted_pages = []
+    try:
+        reader = PdfReader(io.BytesIO(data))
+        page_count = len(reader.pages)
+        for page in reader.pages:
+            extracted_pages.append(page.extract_text() or "")
+    except Exception:
+        # Deliberate best effort: discard partial output and fall through to OCR.
+        extracted_pages = []
+
+    text = _normalize_text("\n".join(extracted_pages))
+    # Under 80 characters is treated as "no usable text layer" and triggers OCR.
+    if len(text) >= 80:
+        return text, False, page_count
+
+    ocr_pages = []
+    # Context managers release the MuPDF document and each rendered page image
+    # even when rendering or Tesseract raises part-way through.
+    with fitz.open(stream=data, filetype="pdf") as doc:
+        page_count = doc.page_count
+        for page in doc:
+            # Matrix(2, 2) renders at twice the base scale, presumably to help OCR.
+            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
+            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
+                ocr_pages.append(_ocr_image(image))
+    return _normalize_text("\n".join(ocr_pages)), True, page_count
+
+
+def _extract_docx_text(data: bytes) -> str:
+    """Return the normalized text of every non-empty paragraph in a DOCX file."""
+    document = Document(io.BytesIO(data))
+    parts = [p.text.strip() for p in document.paragraphs if p.text and p.text.strip()]
+    return _normalize_text("\n".join(parts))
+
+
+def _extract_plain_text(data: bytes) -> str:
+    """Decode bytes as UTF-8, dropping undecodable bytes, and normalize whitespace."""
+    return _normalize_text(data.decode("utf-8", errors="ignore"))
+
+
+@app.post("/extract-text")
+async def extract_text(file: UploadFile = File(...)):
+    """Extract text from an uploaded TXT, MD, DOCX, PDF, or image file.
+
+    The extractor is chosen from the file-name extension. Responds with 400 for
+    empty, oversized, or unsupported uploads, 500 when extraction raises, and
+    422 when extraction yields no text.
+    """
+    filename = file.filename or "document"
+    extension = "." + filename.rsplit(".", 1)[1].lower() if "." in filename else ""
+    # Read at most one byte past the limit: enough to detect an oversized upload
+    # without copying an arbitrarily large file into memory.
+    data = await file.read(MAX_EXTRACT_FILE_BYTES + 1)
+    if not data:
+        raise HTTPException(status_code=400, detail="The uploaded file was empty.")
+    if len(data) > MAX_EXTRACT_FILE_BYTES:
+        raise HTTPException(status_code=400, detail="The uploaded file is too large for AI extraction.")
+
+    try:
+        if extension in {".txt", ".md"}:
+            text = _extract_plain_text(data)
+            ocr_used = False
+            page_count = None
+        elif extension == ".docx":
+            text = _extract_docx_text(data)
+            ocr_used = False
+            page_count = None
+        elif extension == ".pdf":
+            text, ocr_used, page_count = _extract_pdf_text(data)
+        elif extension in IMAGE_EXTENSIONS:
+            # Close the decoded image whether or not OCR succeeds.
+            with Image.open(io.BytesIO(data)) as image:
+                text = _ocr_image(image)
+            ocr_used = True
+            page_count = 1
+        else:
+            raise HTTPException(status_code=400, detail="This file type is not supported for AI extraction.")
+    except HTTPException:
+        raise
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"AI extraction failed: {exc}") from exc
+
+    if not text:
+        raise HTTPException(status_code=422, detail="AI extraction did not find readable text in the uploaded file.")
+
+    return {
+        "text": text,
+        "ocr_used": ocr_used,
+        "content_type": file.content_type,
+        "page_count": page_count,
+        "characters": len(text),
+        "file_name": filename,
+    }
diff --git a/tools/summarizer/requirements.txt b/tools/summarizer/requirements.txt
index f11de3e..00fdce0 100644
--- a/tools/summarizer/requirements.txt
+++ b/tools/summarizer/requirements.txt
@@ -4,3 +4,8 @@ transformers==4.48.3
 cachetools==5.5.2
 pydantic==2.10.6
 torch==2.6.0
+pillow==11.1.0
+pytesseract==0.3.13
+pypdf==5.4.0
+pymupdf==1.25.5
+python-docx==1.1.2