Evolve summarizer into AI service with OCR support

This commit is contained in:
cesnimda
2026-03-23 20:12:34 +01:00
parent 90fdd8e1a5
commit 653f713a78
20 changed files with 475 additions and 129 deletions
+1 -1
View File
@@ -8,7 +8,7 @@ AUTH_GOOGLE_CLIENT_ID=CHANGE_ME_GOOGLE_CLIENT_ID
GOOGLE_GMAIL_CLIENT_SECRET=CHANGE_ME_GOOGLE_OAUTH_CLIENT_SECRET
# Optional. If omitted, the backend uses https://<your-domain>/api/gmail/oauth/callback
GOOGLE_GMAIL_REDIRECT_URI=
SUMMARIZER_BASE_URL=http://summarizer:8001
AI_SERVICE_BASE_URL=http://ai-service:8001
# Optional: only needed if you want the UI to call a non-default API base URL.
# In production the UI defaults to `/api`.
+2 -2
View File
@@ -37,7 +37,7 @@ jobs:
- name: Test frontend
working-directory: job-tracker-ui
run: npm test -- --watchAll=false --runInBand App.test.tsx confirm.test.tsx prompt.test.tsx dialog-flow.test.tsx confirm-flow.test.tsx attachments.test.tsx job-details-generated-drafts.test.tsx
run: npm test -- --watchAll=false --runInBand App.test.tsx confirm.test.tsx prompt.test.tsx dialog-flow.test.tsx confirm-flow.test.tsx attachments.test.tsx job-details-generated-drafts.test.tsx admin-system-page.test.tsx
- name: Build frontend
working-directory: job-tracker-ui
@@ -76,7 +76,7 @@ jobs:
APP_BUILD_STAMP="$(date -u +'%Y-%m-%d %H:%M UTC')" \
./deploy/deploy.sh
docker compose ps
docker compose exec -T summarizer python -c "import time, urllib.request; deadline=time.time()+60; last=None
docker compose exec -T ai-service python -c "import time, urllib.request; deadline=time.time()+60; last=None
for _ in range(30):
try:
urllib.request.urlopen('http://127.0.0.1:8001/health', timeout=5).read()
@@ -44,7 +44,7 @@ public sealed class AdminSystemController : ControllerBase
DatabaseStatusDto Database,
RuntimeStatusDto Runtime,
AuthStatusDto Auth,
SummarizerMetrics Summarizer
AiServiceMetrics Ai
);
private static string? NormalizeBuildMetadata(string? value)
@@ -62,6 +62,7 @@ public sealed class AdminSystemController : ControllerBase
return trimmed;
}
[HttpPost("ai/probe")]
[HttpPost("summarizer/probe")]
public async Task<IActionResult> RunSummarizerProbe(CancellationToken cancellationToken)
{
@@ -79,7 +80,7 @@ public sealed class AdminSystemController : ControllerBase
var jobs = await _db.JobApplications.AsNoTracking().ToListAsync(cancellationToken);
var companies = await _db.Companies.AsNoTracking().CountAsync(cancellationToken);
var summarizer = await _summarizer.GetMetricsAsync(cancellationToken);
var ai = await _summarizer.GetMetricsAsync(cancellationToken);
var version = NormalizeBuildMetadata(_cfg["App:Version"]);
if (string.IsNullOrWhiteSpace(version))
@@ -180,7 +181,7 @@ public sealed class AdminSystemController : ControllerBase
GoogleConfigured: !string.IsNullOrWhiteSpace((_cfg["Auth:GoogleClientId"] ?? string.Empty).Trim()),
GmailConfigured: gmailConfigured
),
Summarizer: summarizer
Ai: ai
));
}
}
@@ -1838,8 +1838,9 @@ Candidate master CV:
return NoContent();
}
[HttpGet("ai-metrics")]
[HttpGet("summarizer-metrics")]
public async Task<ActionResult<SummarizerMetrics>> GetSummarizerMetrics(CancellationToken cancellationToken)
public async Task<ActionResult<AiServiceMetrics>> GetSummarizerMetrics(CancellationToken cancellationToken)
{
var metrics = await _summarizer.GetMetricsAsync(cancellationToken);
return Ok(metrics);
@@ -1,5 +1,6 @@
using System.Text;
using System.Text.RegularExpressions;
using JobTrackerApi.Services;
using JobTrackerApi.Models;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Identity;
@@ -18,15 +19,21 @@ public sealed class ProfileCvController : ControllerBase
".md",
".pdf",
".docx",
".png",
".jpg",
".jpeg",
".webp",
};
private const long MaxFileSizeBytes = 5 * 1024 * 1024;
private readonly UserManager<ApplicationUser> _users;
private readonly ISummarizerService _aiService;
public ProfileCvController(UserManager<ApplicationUser> users)
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService)
{
_users = users;
_aiService = aiService;
}
[HttpPost("upload")]
@@ -41,10 +48,34 @@ public sealed class ProfileCvController : ControllerBase
var extension = Path.GetExtension(file.FileName ?? string.Empty);
if (!AllowedExtensions.Contains(extension))
{
return BadRequest("Only .txt, .md, .pdf, and .docx CV imports are supported right now.");
return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
}
var text = (await ExtractTextAsync(file, extension)).Trim();
string text;
var canUseAiExtraction = string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".png", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".jpg", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".jpeg", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".webp", StringComparison.OrdinalIgnoreCase);
if (canUseAiExtraction)
{
await using var uploadStream = file.OpenReadStream();
var extracted = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, HttpContext.RequestAborted);
text = extracted?.Text?.Trim() ?? string.Empty;
}
else
{
text = string.Empty;
}
if (string.IsNullOrWhiteSpace(text))
{
text = (await ExtractTextAsync(file, extension)).Trim();
}
if (string.IsNullOrWhiteSpace(text))
{
return BadRequest("The uploaded CV file could not be read or was empty.");
+5 -3
View File
@@ -116,10 +116,12 @@ builder.Services.AddHttpClient("jobimport")
AutomaticDecompression = DecompressionMethods.All
});
// Local summarizer service (FastAPI). Default URL can be overridden via configuration `Summarizer:BaseUrl`.
builder.Services.AddHttpClient("summarizer", client =>
// Local AI service (FastAPI). Supports summarization and OCR/text extraction.
builder.Services.AddHttpClient("ai-service", client =>
{
var baseUrl = builder.Configuration["Summarizer:BaseUrl"] ?? "http://127.0.0.1:8001";
var baseUrl = builder.Configuration["Ai:BaseUrl"]
?? builder.Configuration["Summarizer:BaseUrl"]
?? "http://127.0.0.1:8001";
client.BaseAddress = new Uri(baseUrl);
client.Timeout = TimeSpan.FromSeconds(30);
});
+130 -45
View File
@@ -13,12 +13,14 @@ using Microsoft.Extensions.Logging;
namespace JobTrackerApi.Services
{
public sealed record SummarizerMetrics(
public sealed record AiServiceMetrics(
bool Healthy,
string? Model,
string? Device,
bool? GpuAvailable,
string? GpuName,
bool? OcrAvailable,
string? OcrLanguages,
double? HealthLatencyMs,
double? ProbeLatencyMs,
DateTimeOffset? LastProbeAt,
@@ -30,17 +32,36 @@ namespace JobTrackerApi.Services
int CacheMisses,
int Failures,
double? AverageLatencyMs,
int OcrRequests,
int OcrFailures,
double? AverageOcrLatencyMs,
DateTimeOffset? LastOcrSuccessAt,
DateTimeOffset? LastOcrFailureAt,
DateTimeOffset? LastSuccessAt,
DateTimeOffset? LastFailureAt,
string? LastError
);
public interface ISummarizerService
/// <summary>
/// Outcome of a text-extraction call, as assembled by <c>SummarizerService.ExtractTextAsync</c>
/// from the JSON reply of the AI service's <c>/extract-text</c> endpoint.
/// </summary>
/// <param name="Text">Value of the reply's <c>text</c> property; <c>null</c> when the property is absent.</param>
/// <param name="OcrUsed">Value of the reply's <c>ocr_used</c> flag; <c>false</c> when the property is absent or not a boolean.</param>
/// <param name="ContentType">The reply's <c>content_type</c> when present, otherwise the content type that was supplied with the upload.</param>
/// <param name="PageCount">The reply's <c>page_count</c> when it is a JSON number, otherwise <c>null</c>.</param>
/// <param name="Characters">The reply's <c>characters</c> when it is a JSON number, otherwise the length of <paramref name="Text"/> (0 when it is <c>null</c>).</param>
/// <param name="FileName">The reply's <c>file_name</c> when present, otherwise the file name that was sent with the upload.</param>
public sealed record AiTextExtractionResult(
string? Text,
bool OcrUsed,
string? ContentType,
int? PageCount,
int Characters,
string? FileName
);
public interface IAiService
{
Task<string?> SummarizeAsync(string text, int maxLength = 150, int minLength = 30);
Task<string?> SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40);
Task<AiTextExtractionResult?> ExtractTextAsync(Stream stream, string fileName, string? contentType = null, CancellationToken cancellationToken = default);
Task RunProbeAsync(CancellationToken cancellationToken = default);
Task<SummarizerMetrics> GetMetricsAsync(CancellationToken cancellationToken = default);
Task<AiServiceMetrics> GetMetricsAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Legacy-named contract that inherits every member of <see cref="IAiService"/> and declares
/// none of its own. Code in this file still injects and resolves the service under this name.
/// </summary>
public interface ISummarizerService : IAiService
{
}
public class SummarizerService : ISummarizerService
@@ -60,6 +81,11 @@ namespace JobTrackerApi.Services
private DateTimeOffset? _lastProbeSuccessAt;
private DateTimeOffset? _lastProbeFailureAt;
private int _probeFailures;
private int _ocrRequests;
private int _ocrFailures;
private long _totalOcrLatencyTicks;
private DateTimeOffset? _lastOcrSuccessAt;
private DateTimeOffset? _lastOcrFailureAt;
private string? _lastError;
public SummarizerService(IHttpClientFactory httpFactory, IMemoryCache cache)
@@ -78,22 +104,18 @@ namespace JobTrackerApi.Services
/// <summary>
/// Summarizes <paramref name="text"/> by delegating to <c>SummarizeCoreAsync</c>.
/// </summary>
/// <param name="text">Text to summarize.</param>
/// <param name="maxLength">Forwarded unchanged to the core summarization call.</param>
/// <param name="minLength">Forwarded unchanged to the core summarization call.</param>
/// <returns><c>null</c> for null, empty, or whitespace-only input; otherwise whatever the core call returns.</returns>
public async Task<string?> SummarizeAsync(string text, int maxLength = 150, int minLength = 30)
{
    // Blank input never reaches the core call.
    if (!string.IsNullOrWhiteSpace(text))
    {
        var summary = await SummarizeCoreAsync(text, maxLength, minLength);
        return summary;
    }

    return null;
}
/// <summary>
/// Summarizes <paramref name="text"/> under a caller-supplied instruction. The trimmed instruction
/// and trimmed text are joined with a blank line and handed to <c>SummarizeCoreAsync</c>.
/// </summary>
/// <param name="instruction">Instruction placed in front of the text.</param>
/// <param name="text">Section text to summarize.</param>
/// <param name="maxLength">Forwarded unchanged to the core summarization call.</param>
/// <param name="minLength">Forwarded unchanged to the core summarization call.</param>
/// <returns>A completed task holding <c>null</c> when either input is blank; otherwise the core call's task.</returns>
public Task<string?> SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40)
{
    var hasInstruction = !string.IsNullOrWhiteSpace(instruction);
    var hasText = !string.IsNullOrWhiteSpace(text);
    if (!(hasInstruction && hasText))
    {
        return Task.FromResult<string?>(null);
    }

    var prompt = instruction.Trim() + "\n\n" + text.Trim();
    return SummarizeCoreAsync(prompt, maxLength, minLength);
}
private async Task<string?> SummarizeCoreAsync(string text, int maxLength, int minLength)
{
// Use a deterministic content hash instead of string.GetHashCode() so cache keys
// are collision-resistant and stable across process restarts.
var key = BuildCacheKey(text, maxLength, minLength);
Interlocked.Increment(ref _requests);
@@ -110,7 +132,7 @@ namespace JobTrackerApi.Services
Interlocked.Increment(ref _cacheMisses);
var client = _httpFactory.CreateClient("summarizer");
var client = _httpFactory.CreateClient("ai-service");
var payload = JsonSerializer.Serialize(new { text, max_length = maxLength, min_length = minLength });
using var content = new StringContent(payload, Encoding.UTF8, "application/json");
var sw = Stopwatch.StartNew();
@@ -152,10 +174,74 @@ namespace JobTrackerApi.Services
}
}
/// <summary>
/// Uploads a document to the AI service's <c>/extract-text</c> endpoint as multipart form data
/// and maps the JSON reply to an <see cref="AiTextExtractionResult"/>. OCR request, failure and
/// latency counters are updated on every call.
/// </summary>
/// <param name="stream">Document content; sent as the multipart part named <c>file</c>.</param>
/// <param name="fileName">File name sent with the upload; a blank value is replaced by <c>document</c>.</param>
/// <param name="contentType">Optional media type for the file part. Values that cannot be parsed as a media type are ignored.</param>
/// <param name="cancellationToken">Token forwarded to the HTTP call and to response parsing.</param>
/// <returns>
/// The extraction result, or <c>null</c> when the service answers with a non-success status code
/// or any exception is raised while sending or parsing (including exceptions raised on cancellation).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
public async Task<AiTextExtractionResult?> ExtractTextAsync(Stream stream, string fileName, string? contentType = null, CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(stream);
    if (string.IsNullOrWhiteSpace(fileName)) fileName = "document";

    Interlocked.Increment(ref _ocrRequests);
    var client = _httpFactory.CreateClient("ai-service");
    var sw = Stopwatch.StartNew();
    try
    {
        using var form = new MultipartFormDataContent();
        using var fileContent = new StreamContent(stream);

        // The MediaTypeHeaderValue constructor rejects values that carry parameters
        // (e.g. "text/plain; charset=utf-8") or are otherwise malformed. Upload content
        // types are client-controlled, so parse leniently and simply omit the header
        // when the value is unusable instead of failing the whole extraction.
        if (!string.IsNullOrWhiteSpace(contentType)
            && System.Net.Http.Headers.MediaTypeHeaderValue.TryParse(contentType, out var mediaType))
        {
            fileContent.Headers.ContentType = mediaType;
        }

        form.Add(fileContent, "file", fileName);

        using var response = await client.PostAsync("/extract-text", form, cancellationToken);
        sw.Stop();
        Interlocked.Add(ref _totalOcrLatencyTicks, sw.ElapsedTicks);

        if (!response.IsSuccessStatusCode)
        {
            Interlocked.Increment(ref _ocrFailures);
            lock (_metricsLock)
            {
                _lastOcrFailureAt = DateTimeOffset.UtcNow;
                _lastError = $"AI extraction returned {(int)response.StatusCode}.";
            }
            return null;
        }

        await using var responseStream = await response.Content.ReadAsStreamAsync(cancellationToken);
        using var doc = await JsonDocument.ParseAsync(responseStream, cancellationToken: cancellationToken);
        var root = doc.RootElement;

        // Every field is optional in the reply; fall back to request values or neutral defaults.
        var text = root.TryGetProperty("text", out var textEl) ? textEl.GetString() : null;
        // True only when the property exists and is the JSON literal `true`.
        var ocrUsed = root.TryGetProperty("ocr_used", out var ocrEl) && ocrEl.ValueKind == JsonValueKind.True;
        var detectedContentType = root.TryGetProperty("content_type", out var contentTypeEl) ? contentTypeEl.GetString() : contentType;
        int? pageCount = root.TryGetProperty("page_count", out var pageCountEl) && pageCountEl.ValueKind == JsonValueKind.Number ? pageCountEl.GetInt32() : null;
        var characters = root.TryGetProperty("characters", out var charactersEl) && charactersEl.ValueKind == JsonValueKind.Number ? charactersEl.GetInt32() : (text?.Length ?? 0);
        var returnedFileName = root.TryGetProperty("file_name", out var fileNameEl) ? fileNameEl.GetString() : fileName;

        lock (_metricsLock)
        {
            _lastOcrSuccessAt = DateTimeOffset.UtcNow;
            _lastError = null;
        }

        return new AiTextExtractionResult(text, ocrUsed, detectedContentType, pageCount, characters, returnedFileName);
    }
    catch (Exception ex)
    {
        // Best-effort by design: any failure is recorded in the metrics and reported as null
        // to the caller. Stopping an already-stopped stopwatch is a no-op, so the elapsed
        // time is still the time up to the failure (or up to the response, if it arrived).
        sw.Stop();
        Interlocked.Add(ref _totalOcrLatencyTicks, sw.ElapsedTicks);
        Interlocked.Increment(ref _ocrFailures);
        lock (_metricsLock)
        {
            _lastOcrFailureAt = DateTimeOffset.UtcNow;
            _lastError = ex.Message;
        }
        return null;
    }
}
public async Task RunProbeAsync(CancellationToken cancellationToken = default)
{
const string probeText = "Summarizer latency probe for job tracker telemetry.";
var client = _httpFactory.CreateClient("summarizer");
const string probeText = "AI service latency probe for Jobbjakt telemetry.";
var client = _httpFactory.CreateClient("ai-service");
var payload = JsonSerializer.Serialize(new { text = probeText, max_length = 48, min_length = 12 });
using var content = new StringContent(payload, Encoding.UTF8, "application/json");
var sw = Stopwatch.StartNew();
@@ -215,13 +301,15 @@ namespace JobTrackerApi.Services
}
}
public async Task<SummarizerMetrics> GetMetricsAsync(CancellationToken cancellationToken = default)
public async Task<AiServiceMetrics> GetMetricsAsync(CancellationToken cancellationToken = default)
{
var client = _httpFactory.CreateClient("summarizer");
var client = _httpFactory.CreateClient("ai-service");
string? model = null;
string? device = null;
bool? gpuAvailable = null;
string? gpuName = null;
bool? ocrAvailable = null;
string? ocrLanguages = null;
double? healthLatencyMs = null;
var healthy = false;
string? healthError = null;
@@ -238,25 +326,12 @@ namespace JobTrackerApi.Services
{
using var stream = await res.Content.ReadAsStreamAsync(cancellationToken);
using var doc = await JsonDocument.ParseAsync(stream, cancellationToken: cancellationToken);
if (doc.RootElement.TryGetProperty("model", out var modelEl))
{
model = modelEl.GetString();
}
if (doc.RootElement.TryGetProperty("device", out var deviceEl))
{
device = deviceEl.GetString();
}
if (doc.RootElement.TryGetProperty("gpu_available", out var gpuAvailableEl) && gpuAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False)
{
gpuAvailable = gpuAvailableEl.GetBoolean();
}
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl))
{
gpuName = gpuNameEl.GetString();
}
if (doc.RootElement.TryGetProperty("model", out var modelEl)) model = modelEl.GetString();
if (doc.RootElement.TryGetProperty("device", out var deviceEl)) device = deviceEl.GetString();
if (doc.RootElement.TryGetProperty("gpu_available", out var gpuAvailableEl) && gpuAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) gpuAvailable = gpuAvailableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString();
if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean();
if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString();
}
else
{
@@ -273,6 +348,9 @@ namespace JobTrackerApi.Services
var cacheMisses = Volatile.Read(ref _cacheMisses);
var failures = Volatile.Read(ref _failures);
var totalLatencyTicks = Volatile.Read(ref _totalLatencyTicks);
var ocrRequests = Volatile.Read(ref _ocrRequests);
var ocrFailures = Volatile.Read(ref _ocrFailures);
var totalOcrLatencyTicks = Volatile.Read(ref _totalOcrLatencyTicks);
DateTimeOffset? lastSuccessAt;
DateTimeOffset? lastFailureAt;
@@ -280,6 +358,8 @@ namespace JobTrackerApi.Services
DateTimeOffset? lastProbeAt;
DateTimeOffset? lastProbeSuccessAt;
DateTimeOffset? lastProbeFailureAt;
DateTimeOffset? lastOcrSuccessAt;
DateTimeOffset? lastOcrFailureAt;
string? lastError;
lock (_metricsLock)
{
@@ -289,6 +369,8 @@ namespace JobTrackerApi.Services
lastProbeAt = _lastProbeAt;
lastProbeSuccessAt = _lastProbeSuccessAt;
lastProbeFailureAt = _lastProbeFailureAt;
lastOcrSuccessAt = _lastOcrSuccessAt;
lastOcrFailureAt = _lastOcrFailureAt;
lastError = _lastError;
}
@@ -297,16 +379,17 @@ namespace JobTrackerApi.Services
lastError = healthError;
}
double? averageLatencyMs = requests > 0
? Math.Round(TimeSpan.FromTicks(totalLatencyTicks).TotalMilliseconds / requests, 1)
: null;
double? averageLatencyMs = requests > 0 ? Math.Round(TimeSpan.FromTicks(totalLatencyTicks).TotalMilliseconds / requests, 1) : null;
double? averageOcrLatencyMs = ocrRequests > 0 ? Math.Round(TimeSpan.FromTicks(totalOcrLatencyTicks).TotalMilliseconds / ocrRequests, 1) : null;
return new SummarizerMetrics(
return new AiServiceMetrics(
Healthy: healthy,
Model: model,
Device: device,
GpuAvailable: gpuAvailable,
GpuName: gpuName,
OcrAvailable: ocrAvailable,
OcrLanguages: ocrLanguages,
HealthLatencyMs: healthLatencyMs,
ProbeLatencyMs: probeLatencyMs,
LastProbeAt: lastProbeAt,
@@ -318,6 +401,11 @@ namespace JobTrackerApi.Services
CacheMisses: cacheMisses,
Failures: failures,
AverageLatencyMs: averageLatencyMs,
OcrRequests: ocrRequests,
OcrFailures: ocrFailures,
AverageOcrLatencyMs: averageOcrLatencyMs,
LastOcrSuccessAt: lastOcrSuccessAt,
LastOcrFailureAt: lastOcrFailureAt,
LastSuccessAt: lastSuccessAt,
LastFailureAt: lastFailureAt,
LastError: lastError
@@ -340,14 +428,11 @@ namespace JobTrackerApi.Services
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
var enabled = _cfg.GetValue("Summarizer:ProbeEnabled", true);
if (!enabled)
{
return;
}
var enabled = _cfg.GetValue("Ai:ProbeEnabled", _cfg.GetValue("Summarizer:ProbeEnabled", true));
if (!enabled) return;
var intervalSeconds = Math.Clamp(_cfg.GetValue("Summarizer:ProbeIntervalSeconds", 300), 30, 3600);
var initialDelaySeconds = Math.Clamp(_cfg.GetValue("Summarizer:ProbeInitialDelaySeconds", 15), 0, 600);
var intervalSeconds = Math.Clamp(_cfg.GetValue("Ai:ProbeIntervalSeconds", _cfg.GetValue("Summarizer:ProbeIntervalSeconds", 300)), 30, 3600);
var initialDelaySeconds = Math.Clamp(_cfg.GetValue("Ai:ProbeInitialDelaySeconds", _cfg.GetValue("Summarizer:ProbeInitialDelaySeconds", 15)), 0, 600);
if (initialDelaySeconds > 0)
{
@@ -360,8 +445,8 @@ namespace JobTrackerApi.Services
try
{
using var scope = _scopeFactory.CreateScope();
var summarizer = scope.ServiceProvider.GetRequiredService<ISummarizerService>();
await summarizer.RunProbeAsync(stoppingToken);
var aiService = scope.ServiceProvider.GetRequiredService<ISummarizerService>();
await aiService.RunProbeAsync(stoppingToken);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
@@ -369,7 +454,7 @@ namespace JobTrackerApi.Services
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Summarizer latency probe failed.");
_logger.LogWarning(ex, "AI service latency probe failed.");
}
}
while (await timer.WaitForNextTickAsync(stoppingToken));
+8 -8
View File
@@ -12,7 +12,7 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re
- History/event trail per application (created, status changes, follow-up set, delete/restore)
- Export jobs to JSON/CSV + daily scheduled JSON export
- Optional “job import” preview from supported job sites (plugins) + optional translation to English
- Optional local summarizer service for short/full descriptions
- Optional local AI service for short/full descriptions
- Optional Google sign-in (Google ID tokens) to protect the API
## Architecture
@@ -21,11 +21,11 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re
- `JobTrackerApi/`: ASP.NET Core API (defaults to `http://localhost:5202`)
- SQLite DB file: defaults to `JobTrackerApi/jobtracker.db` unless `Data:Root` / connection string overrides it
- Attachments: stored on disk under `DataRoot/Attachments/<jobId>/...`
- Optional local summarizer service: `tools/summarizer/` (FastAPI) used by the API via `Summarizer:BaseUrl`
- Optional local AI service: `tools/summarizer/` (FastAPI) used by the API via `Ai:BaseUrl`
## Quickstart (Docker)
This runs: frontend (nginx), backend API, and the summarizer service.
This runs: frontend (nginx), backend API, and the AI service.
1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`).
@@ -43,7 +43,7 @@ docker compose up --build
- .NET SDK `9.x` (API targets `net9.0`)
- Node.js (for the UI)
- (Optional) Python 3.x if running the summarizer without Docker
- (Optional) Python 3.x if running the AI service without Docker
### 1) Run the API
@@ -65,14 +65,14 @@ npm start
The UI defaults to calling `http://localhost:5202/api` when running on localhost (see `job-tracker-ui/src/api.ts`).
### 3) (Optional) Run the summarizer
### 3) (Optional) Run the AI service
The API calls a local FastAPI service to generate summaries. If it's not running, the app still works (summary generation may be empty / best-effort).
With Docker (recommended):
```bash
docker compose up --build summarizer
docker compose up --build ai-service
```
Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`).
@@ -87,7 +87,7 @@ Common keys:
- `Data:Root`: folder for the SQLite DB + exports (defaults to API content root)
- `Data:AttachmentsRoot`: override attachments folder (defaults to `<Data:Root>/Attachments`)
- `Cors:Origins`: list of allowed origins (defaults to `http://localhost:3000`; use `"*"` to allow all)
- `Summarizer:BaseUrl`: summarizer base URL (default `http://127.0.0.1:8001`)
- `Ai:BaseUrl`: AI service base URL (default `http://127.0.0.1:8001`)
- `Exports:DailyEnabled`: enable/disable daily export background job
- `Exports:DailyFolder`: export destination (relative to `Data:Root` if not absolute)
- `Exports:DailyHourLocal`: local hour (0–23) when the daily export runs
@@ -109,7 +109,7 @@ Common keys:
- `Email:SmtpUser`: SMTP username (often your Gmail address)
- `Email:SmtpPassword`: SMTP password (for Gmail: use an App Password)
- `Email:From`: from address (default: `Email:SmtpUser`)
- `Email:FromName`: from name (default: `Job Tracker`)
- `Email:FromName`: from name (default: `Jobbjakt`)
### UI settings
+4 -2
View File
@@ -51,7 +51,9 @@ AUTH_JWT_KEY=replace_with_long_random_secret
AUTH_ADMIN_EMAIL=you@example.com
AUTH_ADMIN_PASSWORD=replace_with_strong_password
APP_PUBLIC_BASE_URL=https://your-domain.example
SUMMARIZER_BASE_URL=http://summarizer:8001
AI_SERVICE_BASE_URL=http://ai-service:8001
# Optional backward-compatible alias if older config still references the previous name:
SUMMARIZER_BASE_URL=http://ai-service:8001
```
## Database recommendation
@@ -89,5 +91,5 @@ If this app is going to be a real production service on Ubuntu:
- confirm reverse proxy routes to the frontend correctly
- confirm API auth/login works with production config
- confirm backend can connect to MariaDB
- confirm summarizer container is reachable from backend
- confirm AI service container is reachable from backend
- confirm reminder and admin/system pages load
+3 -2
View File
@@ -23,7 +23,8 @@ services:
- Auth__GoogleClientId=${AUTH_GOOGLE_CLIENT_ID}
- Google__GmailClientSecret=${GOOGLE_GMAIL_CLIENT_SECRET}
- Google__GmailRedirectUri=${GOOGLE_GMAIL_REDIRECT_URI}
- Summarizer__BaseUrl=${SUMMARIZER_BASE_URL:-http://summarizer:8001}
- Ai__BaseUrl=${AI_SERVICE_BASE_URL:-http://ai-service:8001}
- Summarizer__BaseUrl=${SUMMARIZER_BASE_URL:-http://ai-service:8001}
# Email (SMTP)
# Build metadata should be resolved before deployment. Examples:
# APP_VERSION=1.0.0
@@ -66,7 +67,7 @@ services:
- shared_services
restart: unless-stopped
summarizer:
ai-service:
build:
context: ./tools/summarizer
dockerfile: Dockerfile
+11 -1
View File
@@ -2,6 +2,16 @@
Last updated: 2026-03-23
## AI Service / OCR
- [x] Reframe user-facing "summarizer" status and docs toward an AI service
- [x] Add self-hosted OCR/text extraction endpoint to the local AI service
- [x] Add backend AI-service text extraction integration for profile CV uploads
- [x] Add OCR support for supported image CV uploads (`png`, `jpg`, `jpeg`, `webp`)
- [x] Add AI service latency/OCR telemetry to the system page
- [x] Add frontend test coverage for AI service status rendering
- [ ] Extend AI extraction to job attachment ingestion
- [ ] Consider full internal service/class rename from `Summarizer*` to `AiService*`
## Build / UI Issues
- [x] Fix visible build error text appearing on page load/footer
- [x] Resolve naming inconsistency: `jobtrack` → `Jobbjakt`
@@ -32,7 +42,7 @@ Last updated: 2026-03-23
- [x] Add zoom in/out support for image cropping
- [x] Use square cropped avatar output
- [x] Add CV upload support
- [ ] Verify/complete OCR/text extraction for uploaded CV PDFs
- [x] Verify/complete OCR/text extraction for uploaded CV PDFs
## Settings & System
- [x] Restore missing follow-up days settings
@@ -0,0 +1,75 @@
import React from 'react';
import { render, screen, waitFor } from '@testing-library/react';
import AdminSystemPage from './pages/AdminSystemPage';
import { I18nProvider } from './i18n/I18nProvider';
import { api } from './api';
const mockedApi = api as jest.Mocked<typeof api>;
// Rendering test for the admin system page: feeds a canned `/admin/system` payload through the
// mocked API client and checks that the AI service card shows health, probe latency and OCR data.
describe('AdminSystemPage', () => {
  it('renders AI service health, latency, and OCR readiness', async () => {
    // NOTE(review): `mockedApi.get.mockImplementation` only works if `./api` is replaced by a
    // Jest mock elsewhere (jest.mock call, manual mock, or test setup) — confirm.
    mockedApi.get.mockImplementation((url: string) => {
      if (url === '/admin/system') {
        return Promise.resolve({
          data: {
            environment: 'Production',
            contentRoot: '/app',
            version: '1.2.3',
            commitSha: 'abc1234',
            buildStamp: '2026-03-23 11:00 UTC',
            storage: { dataRoot: '/data', dbPath: '/data/jobtracker.db', dbExists: true, dbSizeBytes: 2048, companyCount: 3, jobCount: 7, deletedCount: 1 },
            email: { enabled: true, host: 'smtp.example.test', port: 587, enableSsl: true, from: 'noreply@example.test', fromName: 'Jobbjakt' },
            database: { provider: 'mariadb', looksConfigured: true, canConnect: true, target: 'server=db', usesFileStorage: false, warning: null },
            runtime: { framework: '.NET 9', osDescription: 'Linux', processArchitecture: 'X64', machineName: 'app-01' },
            auth: { required: true, hasJwtKey: true, googleConfigured: true, gmailConfigured: true },
            // Payload key is `ai`, matching the `Ai` member of the backend status response.
            ai: {
              healthy: true,
              model: 'distilbart',
              device: 'cpu',
              gpuAvailable: false,
              gpuName: null,
              ocrAvailable: true,
              ocrLanguages: 'eng',
              healthLatencyMs: 12.4,
              probeLatencyMs: 25.8,
              lastProbeAt: '2026-03-23T10:00:00Z',
              lastProbeSuccessAt: '2026-03-23T10:00:00Z',
              lastProbeFailureAt: null,
              probeFailures: 0,
              requests: 18,
              cacheHits: 9,
              cacheMisses: 9,
              failures: 0,
              averageLatencyMs: 42.2,
              ocrRequests: 5,
              ocrFailures: 0,
              averageOcrLatencyMs: 88.4,
              lastOcrSuccessAt: '2026-03-23T10:05:00Z',
              lastOcrFailureAt: null,
              lastSuccessAt: '2026-03-23T10:04:00Z',
              lastFailureAt: null,
              lastError: null,
            },
          },
        } as any);
      }
      // Any other endpoint the page may call gets an empty payload.
      return Promise.resolve({ data: {} } as any);
    });

    render(
      <I18nProvider>
        <AdminSystemPage />
      </I18nProvider>,
    );

    // The page loads its data asynchronously, so wait for the AI service label to appear.
    await waitFor(() => {
      expect(screen.getByText('AI service')).toBeTruthy();
    });

    // Dot escaped so the pattern matches the literal "25.8" rather than any character.
    expect(screen.getByText(/25\.8 ms probe/i)).toBeTruthy();
    expect(screen.getByText('OCR eng')).toBeTruthy();
    expect(screen.getByText('OCR avg latency')).toBeTruthy();
    expect(screen.getByText('88.4 ms')).toBeTruthy();
  });
});
+12 -12
View File
@@ -171,7 +171,7 @@ export const translations = {
profileHeadline: "Profile headline",
profileHeadlineHelp: "Stored only in this browser to personalize your workspace.",
profileMasterCv: "Master CV",
profileMasterCvBody: "Upload a PDF, DOCX, plain text file, or markdown file. The app extracts text where supported and populates your master CV text for tailoring and outreach.",
profileMasterCvBody: "Upload a PDF, DOCX, plain text file, markdown file, or image scan. The AI service extracts text where possible and falls back to OCR for supported scanned files.",
profileUploadCv: "Upload CV",
profileUploading: "Uploading...",
profileCopyCvText: "Copy CV text",
@@ -179,7 +179,7 @@ export const translations = {
profileCvUploadFailed: "Failed to upload CV.",
profileCvTextLabel: "Profile CV / master resume text",
profileCvTextHelp: "Keep this updated and specific. Include recent roles, tools, achievements, measurable outcomes, and the work you want to be hired for next. If extraction misses something, edit it here manually.",
profileCvPreferredUploads: "Supported uploads: PDF, DOCX, TXT, MD.",
profileCvPreferredUploads: "Supported uploads: PDF, DOCX, TXT, MD, PNG, JPG, JPEG, WEBP.",
profileSaveChanges: "Save changes",
profileUpdated: "Profile updated.",
profileUpdateFailed: "Failed to update profile.",
@@ -272,7 +272,7 @@ export const translations = {
adminUsersCreated: "User created.",
adminUsersCreateFailed: "Failed to create user.",
adminSystemTitle: "System status",
adminSystemSubtitle: "Production diagnostics for runtime, database, auth, email, and summarizer health.",
adminSystemSubtitle: "Production diagnostics for runtime, database, auth, email, AI service health, and OCR readiness.",
adminSystemRunProbe: "Run probe now",
adminSystemRunningProbe: "Running probe...",
adminSystemRefresh: "Refresh",
@@ -284,13 +284,13 @@ export const translations = {
adminSystemSmtp: "SMTP",
adminSystemEnabled: "Enabled",
adminSystemDisabled: "Disabled",
adminSystemSummarizer: "Summarizer",
adminSystemSummarizer: "AI service",
adminSystemHealthy: "Healthy",
adminSystemNoLatencyData: "No latency data",
adminSystemDatabaseStorage: "Database and storage",
adminSystemRuntimeAuth: "Runtime and auth",
adminSystemEmailConfig: "Email configuration",
adminSystemSummarizerRuntime: "Summarizer runtime",
adminSystemSummarizerRuntime: "AI runtime",
adminSystemSmtpTest: "SMTP test email",
adminSystemSmtpTestBody: "Send a quick delivery check using the configured SMTP settings. Leave the recipient blank to use your admin email.",
adminSystemRecipientEmail: "Recipient email",
@@ -299,7 +299,7 @@ export const translations = {
adminSystemMessage: "Message",
adminSystemSendTestEmail: "Send test email",
adminSystemSending: "Sending...",
adminSystemSummarizerTelemetry: "Summarizer telemetry",
adminSystemSummarizerTelemetry: "AI service telemetry",
adminSystemDatabaseConnected: "Database connected",
adminSystemDatabaseIssue: "Database issue",
adminSystemAuthEnforced: "Auth enforced",
@@ -591,7 +591,7 @@ export const translations = {
profileHeadline: "Profiloverskrift",
profileHeadlineHelp: "Lagres bare i denne nettleseren for å gjøre arbeidsområdet mer personlig.",
profileMasterCv: "Hoved-CV",
profileMasterCvBody: "Last opp en PDF, DOCX, ren tekstfil eller markdown-fil. Appen henter ut tekst der det støttes og fyller inn hoved-CV-en din for tilpasning og kontakt.",
profileMasterCvBody: "Last opp en PDF, DOCX, ren tekstfil, markdown-fil eller et bildeskann. AI-tjenesten henter ut tekst der det er mulig og faller tilbake til OCR for støttede skannede filer.",
profileUploadCv: "Last opp CV",
profileUploading: "Laster opp...",
profileCopyCvText: "Kopier CV-tekst",
@@ -599,7 +599,7 @@ export const translations = {
profileCvUploadFailed: "Kunne ikke laste opp CV.",
profileCvTextLabel: "Profil-CV / hovedtekst for CV",
profileCvTextHelp: "Hold denne oppdatert og konkret. Ta med nylige roller, verktøy, prestasjoner, målbare resultater og arbeidet du vil bli ansatt for neste gang. Hvis tekstuttrekket mangler noe, kan du redigere manuelt her.",
profileCvPreferredUploads: "Støttede opplastinger: PDF, DOCX, TXT, MD.",
profileCvPreferredUploads: "Støttede opplastinger: PDF, DOCX, TXT, MD, PNG, JPG, JPEG, WEBP.",
profileSaveChanges: "Lagre endringer",
profileUpdated: "Profil oppdatert.",
profileUpdateFailed: "Kunne ikke oppdatere profil.",
@@ -692,7 +692,7 @@ export const translations = {
adminUsersCreated: "Bruker opprettet.",
adminUsersCreateFailed: "Kunne ikke opprette bruker.",
adminSystemTitle: "Systemstatus",
adminSystemSubtitle: "Produksjonsdiagnostikk for kjøretid, database, autentisering, e-post og oppsummeringshelse.",
adminSystemSubtitle: "Produksjonsdiagnostikk for kjøretid, database, autentisering, e-post, AI-tjenestehelse og OCR-beredskap.",
adminSystemRunProbe: "Kjør probe nå",
adminSystemRunningProbe: "Kjører probe...",
adminSystemRefresh: "Oppdater",
@@ -704,13 +704,13 @@ export const translations = {
adminSystemSmtp: "SMTP",
adminSystemEnabled: "Aktivert",
adminSystemDisabled: "Deaktivert",
adminSystemSummarizer: "Oppsummerer",
adminSystemSummarizer: "AI-tjeneste",
adminSystemHealthy: "Frisk",
adminSystemNoLatencyData: "Ingen latensdata",
adminSystemDatabaseStorage: "Database og lagring",
adminSystemRuntimeAuth: "Kjøretid og autentisering",
adminSystemEmailConfig: "E-postkonfigurasjon",
adminSystemSummarizerRuntime: "Oppsummeringskjøretid",
adminSystemSummarizerRuntime: "AI-kjøretid",
adminSystemSmtpTest: "SMTP-test e-post",
adminSystemSmtpTestBody: "Send en rask leveringssjekk med de konfigurerte SMTP-innstillingene. La mottakeren stå tom for å bruke admin-eposten din.",
adminSystemRecipientEmail: "Mottaker e-post",
@@ -719,7 +719,7 @@ export const translations = {
adminSystemMessage: "Melding",
adminSystemSendTestEmail: "Send test-e-post",
adminSystemSending: "Sender...",
adminSystemSummarizerTelemetry: "Oppsummeringstelemetri",
adminSystemSummarizerTelemetry: "AI-tjenestetelemetri",
adminSystemDatabaseConnected: "Database tilkoblet",
adminSystemDatabaseIssue: "Databaseproblem",
adminSystemAuthEnforced: "Autentisering påkrevd",
+41 -31
View File
@@ -14,12 +14,14 @@ import {
import { api, getApiErrorMessage } from "../api";
import { useI18n } from "../i18n/I18nProvider";
type SummarizerMetrics = {
type AiServiceMetrics = {
healthy: boolean;
model?: string | null;
device?: string | null;
gpuAvailable?: boolean;
gpuName?: string | null;
ocrAvailable?: boolean | null;
ocrLanguages?: string | null;
healthLatencyMs?: number | null;
probeLatencyMs?: number | null;
lastProbeAt?: string | null;
@@ -31,6 +33,11 @@ type SummarizerMetrics = {
cacheMisses: number;
failures: number;
averageLatencyMs?: number | null;
ocrRequests: number;
ocrFailures: number;
averageOcrLatencyMs?: number | null;
lastOcrSuccessAt?: string | null;
lastOcrFailureAt?: string | null;
lastSuccessAt?: string | null;
lastFailureAt?: string | null;
lastError?: string | null;
@@ -79,7 +86,7 @@ type SystemStatus = {
googleConfigured: boolean;
gmailConfigured: boolean;
};
summarizer: SummarizerMetrics;
ai: AiServiceMetrics;
};
function formatBytes(bytes?: number | null) {
@@ -148,10 +155,10 @@ export default function AdminSystemPage() {
return "success" as const;
}, [status]);
const summarizerTone = useMemo(() => {
const aiTone = useMemo(() => {
if (!status) return "default" as const;
if (!status.summarizer.healthy) return "error" as const;
if (status.summarizer.probeFailures > 0 || status.summarizer.failures > 0) return "warning" as const;
if (!status.ai.healthy) return "error" as const;
if (status.ai.probeFailures > 0 || status.ai.failures > 0 || (status.ai.ocrFailures ?? 0) > 0) return "warning" as const;
return "success" as const;
}, [status]);
@@ -184,10 +191,10 @@ export default function AdminSystemPage() {
setRunningProbe(true);
setError(null);
try {
await api.post("/admin/system/summarizer/probe");
await api.post("/admin/system/ai/probe");
await load();
} catch (e: any) {
setError(getApiErrorMessage(e, "Failed to run summarizer probe."));
setError(getApiErrorMessage(e, "Failed to run AI service probe."));
} finally {
setRunningProbe(false);
}
@@ -204,7 +211,7 @@ export default function AdminSystemPage() {
{error ? <Alert severity="error">{error}</Alert> : null}
{status?.database.warning ? <Alert severity={status.database.canConnect ? "warning" : "error"}>{status.database.warning}</Alert> : null}
{status?.summarizer.lastError ? <Alert severity={status.summarizer.healthy ? "warning" : "error"}>{status.summarizer.lastError}</Alert> : null}
{status?.ai.lastError ? <Alert severity={status.ai.healthy ? "warning" : "error"}>{status.ai.lastError}</Alert> : null}
<Box sx={{ display: "grid", gridTemplateColumns: { xs: "1fr", md: "repeat(4, 1fr)" }, gap: 2 }}>
<SummaryCard
@@ -226,13 +233,13 @@ export default function AdminSystemPage() {
/>
<SummaryCard
title={t("adminSystemSummarizer")}
value={status?.summarizer.healthy ? t("adminSystemHealthy") : t("adminSystemOffline")}
subtitle={status?.summarizer.probeLatencyMs != null
? `${status.summarizer.probeLatencyMs} ms probe · ${status.summarizer.device || "unknown device"}`
: status?.summarizer.healthLatencyMs != null
? `${status.summarizer.healthLatencyMs} ms health · ${status.summarizer.device || "unknown device"}`
value={status?.ai.healthy ? t("adminSystemHealthy") : t("adminSystemOffline")}
subtitle={status?.ai.probeLatencyMs != null
? `${status.ai.probeLatencyMs} ms probe · ${status.ai.device || "unknown device"}`
: status?.ai.healthLatencyMs != null
? `${status.ai.healthLatencyMs} ms health · ${status.ai.device || "unknown device"}`
: t("adminSystemNoLatencyData")}
tone={summarizerTone}
tone={aiTone}
/>
</Box>
@@ -288,15 +295,15 @@ export default function AdminSystemPage() {
<Paper sx={{ p: 2, borderRadius: 3 }}>
<Typography variant="h6" sx={{ fontWeight: 900, mb: 1 }}>{t("adminSystemSummarizerRuntime")}</Typography>
<Stack spacing={0.75}>
<DetailRow label="Model" value={status?.summarizer.model || "-"} />
<DetailRow label="Device" value={status?.summarizer.device || "-"} />
<DetailRow label="GPU available" value={status?.summarizer.gpuAvailable ? "Yes" : "No"} />
<DetailRow label="GPU name" value={status?.summarizer.gpuName || "-"} />
<DetailRow label="Health latency" value={status?.summarizer.healthLatencyMs != null ? `${status.summarizer.healthLatencyMs} ms` : "-"} />
<DetailRow label="Probe latency" value={status?.summarizer.probeLatencyMs != null ? `${status.summarizer.probeLatencyMs} ms` : "-"} />
<DetailRow label="Last probe" value={formatDate(status?.summarizer.lastProbeAt)} />
<DetailRow label="Last successful probe" value={formatDate(status?.summarizer.lastProbeSuccessAt)} />
<DetailRow label="Last summarization success" value={formatDate(status?.summarizer.lastSuccessAt)} />
<DetailRow label="Model" value={status?.ai.model || "-"} />
<DetailRow label="Device" value={status?.ai.device || "-"} />
<DetailRow label="GPU available" value={status?.ai.gpuAvailable ? "Yes" : "No"} />
<DetailRow label="GPU name" value={status?.ai.gpuName || "-"} />
<DetailRow label="Health latency" value={status?.ai.healthLatencyMs != null ? `${status.ai.healthLatencyMs} ms` : "-"} />
<DetailRow label="Probe latency" value={status?.ai.probeLatencyMs != null ? `${status.ai.probeLatencyMs} ms` : "-"} />
<DetailRow label="Last probe" value={formatDate(status?.ai.lastProbeAt)} />
<DetailRow label="Last successful probe" value={formatDate(status?.ai.lastProbeSuccessAt)} />
<DetailRow label="Last summarization success" value={formatDate(status?.ai.lastSuccessAt)} />
</Stack>
</Paper>
</Box>
@@ -320,20 +327,23 @@ export default function AdminSystemPage() {
<Paper sx={{ p: 2, borderRadius: 3 }}>
<Typography variant="h6" sx={{ fontWeight: 900, mb: 1 }}>{t("adminSystemSummarizerTelemetry")}</Typography>
<Box sx={{ display: "grid", gridTemplateColumns: { xs: "1fr 1fr", md: "repeat(6, 1fr)" }, gap: 2 }}>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Requests</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.requests ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache hits</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.cacheHits ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache misses</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.cacheMisses ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.failures ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Probe failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.probeFailures ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Avg latency</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.averageLatencyMs != null ? `${status.summarizer.averageLatencyMs} ms` : "-"}</Typography></Box>
<Box sx={{ display: "grid", gridTemplateColumns: { xs: "1fr 1fr", md: "repeat(8, 1fr)" }, gap: 2 }}>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Requests</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.requests ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache hits</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.cacheHits ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache misses</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.cacheMisses ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.failures ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Probe failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.probeFailures ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Avg latency</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.averageLatencyMs != null ? `${status.ai.averageLatencyMs} ms` : "-"}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>OCR requests</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.ocrRequests ?? 0}</Typography></Box>
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>OCR avg latency</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.averageOcrLatencyMs != null ? `${status.ai.averageOcrLatencyMs} ms` : "-"}</Typography></Box>
</Box>
<Box sx={{ display: "flex", gap: 1, flexWrap: "wrap", mt: 2 }}>
<Chip label={status?.database.canConnect ? t("adminSystemDatabaseConnected") : t("adminSystemDatabaseIssue")} color={status?.database.canConnect ? "success" : "error"} size="small" />
<Chip label={status?.auth.required ? t("adminSystemAuthEnforced") : t("adminSystemAuthOptional")} color={status?.auth.required ? "success" : "warning"} size="small" />
<Chip label={status?.auth.googleConfigured ? t("adminSystemGoogleReady") : t("adminSystemGoogleOff")} variant="outlined" size="small" />
<Chip label={status?.auth.gmailConfigured ? t("adminSystemGmailReady") : t("adminSystemGmailIncomplete")} variant="outlined" size="small" />
<Chip label={status?.summarizer.gpuAvailable ? t("adminSystemGpuVisible") : t("adminSystemCpuMode")} color={status?.summarizer.gpuAvailable ? "success" : "default"} size="small" />
<Chip label={status?.ai.gpuAvailable ? t("adminSystemGpuVisible") : t("adminSystemCpuMode")} color={status?.ai.gpuAvailable ? "success" : "default"} size="small" />
<Chip label={status?.ai.ocrAvailable ? `OCR ${status.ai.ocrLanguages || "enabled"}` : "OCR unavailable"} variant="outlined" size="small" />
</Box>
</Paper>
</Box>
+1 -1
View File
@@ -29,7 +29,7 @@ type MeResponse = {
} | null;
};
const CV_UPLOAD_ACCEPT = ".pdf,.docx,.txt,.md,application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,text/plain,text/markdown";
const CV_UPLOAD_ACCEPT = ".pdf,.docx,.txt,.md,image/png,image/jpeg,image/webp,application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,text/plain,text/markdown";
const AVATAR_UPLOAD_ACCEPT = "image/png,image/jpeg,image/webp";
function initialsFrom(values: Array<string | undefined>) {
+5
View File
@@ -9,6 +9,11 @@ jest.mock('./api', () => ({
delete: jest.fn(() => Promise.resolve({ data: {} })),
interceptors: { request: { use: jest.fn() }, response: { use: jest.fn() } },
},
getApiErrorMessage: jest.fn((error: any, fallback?: string) => {
if (typeof error?.response?.data === 'string' && error.response.data.trim()) return error.response.data;
if (typeof error?.message === 'string' && error.message.trim()) return error.message;
return fallback || 'Request failed.';
}),
}));
jest.mock('./components/GoogleAuthCard', () => () => null);
+3
View File
@@ -5,6 +5,9 @@ ENV PIP_NO_CACHE_DIR=1 \
TRANSFORMERS_NO_TF=1 \
HF_HUB_DISABLE_TELEMETRY=1
WORKDIR /app
RUN apt-get update \
&& apt-get install -y --no-install-recommends tesseract-ocr tesseract-ocr-eng \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel \
&& python -m pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
+22 -11
View File
@@ -1,16 +1,22 @@
# Local Hugging Face Summarizer
# Local AI Service
This small service runs a Hugging Face summarization model locally and exposes a simple HTTP API.
This service runs a local Hugging Face summarization model and also exposes document text extraction with OCR for supported PDFs and images.
Install (recommended: virtualenv)
## Capabilities
- job/role summarization
- PDF text extraction
- OCR fallback for scanned PDFs
- OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`)
- DOCX / TXT / MD extraction
Windows (CPU PyTorch wheel may be required):
## Install
Windows:
```powershell
python -m venv .venv
.\.venv\Scripts\Activate.ps1
pip install -r requirements.txt
# If torch wheel installation is needed, follow instructions at https://pytorch.org
python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
```
@@ -23,10 +29,15 @@ pip install -r requirements.txt
python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
```
API
- `GET /health` — health check
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }` returns `{ "summary": "...", "cached": false }`
## Docker
The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can be processed inside the container.
Notes
- Model will be downloaded on first run and can be several hundred MB.
- For lower memory usage, consider `sshleifer/tiny-distilbart-cnn-6-6` or `t5-small`.
## API
- `GET /health` — health check and runtime capabilities
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`, returns `{ "summary": "...", "cached": false }`
- `POST /extract-text` — multipart file upload (form field `file`, max 8 MiB); returns JSON with `text`, `ocr_used`, `content_type`, `page_count`, `characters` and `file_name`
## Notes
- Model weights are downloaded on first run.
- OCR quality depends on scan quality and language support.
- Default OCR language is English (`eng`).
+107 -3
View File
@@ -1,16 +1,25 @@
from fastapi import FastAPI, HTTPException
from fastapi import FastAPI, File, HTTPException, UploadFile
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from cachetools import TTLCache
from PIL import Image
from pypdf import PdfReader
from docx import Document
import fitz
import hashlib
import io
import re
import torch
import pytesseract
app = FastAPI(title="Local Summarizer")
app = FastAPI(title="Local AI Service")
# Hugging Face model identifier for the summarizer.
# NOTE(review): the loading code is outside this view — confirm where it is consumed.
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
# Presumably character caps applied to summarization input/context;
# their use is not visible in this section — TODO confirm.
MAX_INPUT_CHARS = 20000
MAX_CONTEXT_CHARS = 2200
# Largest upload (8 MiB) that POST /extract-text accepts; bigger files are rejected with HTTP 400.
MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
# Language code passed as `lang` to pytesseract and reported by GET /health.
OCR_LANGUAGES = "eng"
# Lower-cased file extensions that POST /extract-text sends straight to OCR.
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
def _load_runtime():
@@ -48,6 +57,8 @@ async def health():
"device": str(device),
"gpu_available": GPU_AVAILABLE,
"gpu_name": GPU_NAME,
"ocr_available": True,
"ocr_languages": OCR_LANGUAGES,
}
@@ -68,7 +79,6 @@ _TECH_PRIORITY = [
"aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
]
_MUST_HAVE_HINTS = [
"must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
]
@@ -339,3 +349,97 @@ async def summarize(req: SummarizeRequest):
out = "\n".join(lines).strip()
cache[key] = out
return {"summary": out, "cached": False}
def _normalize_text(value: str) -> str:
value = value.replace("\x00", " ")
return re.sub(r"\s+", " ", value).strip()
def _ocr_image(image: Image.Image) -> str:
    """Run Tesseract on ``image`` and return the normalized text.

    Images whose mode is neither ``RGB`` nor ``L`` are converted to ``RGB``
    before being handed to pytesseract. Recognition uses the module-level
    ``OCR_LANGUAGES`` setting.
    """
    needs_conversion = image.mode != "RGB" and image.mode != "L"
    ocr_input = image.convert("RGB") if needs_conversion else image
    recognized = pytesseract.image_to_string(ocr_input, lang=OCR_LANGUAGES)
    return _normalize_text(recognized)
def _extract_pdf_text(data: bytes) -> tuple[str, bool, int]:
    """Extract text from a PDF, falling back to OCR when little text is found.

    The embedded text layer is read with pypdf first. If that yields at least
    80 characters after normalization it is returned as-is. Otherwise every
    page is rendered with PyMuPDF and passed through ``_ocr_image``.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        A ``(text, ocr_used, page_count)`` tuple. ``ocr_used`` is ``True`` only
        when the OCR path produced the text. ``page_count`` comes from pypdf
        on the text path and from PyMuPDF on the OCR path.

    Raises:
        Exception: Errors from PyMuPDF, Pillow or pytesseract on the OCR path
            propagate to the caller. Only the pypdf pass is best-effort.
    """
    page_count = 0
    extracted_pages = []
    try:
        reader = PdfReader(io.BytesIO(data))
        page_count = len(reader.pages)
        for page in reader.pages:
            extracted_pages.append(page.extract_text() or "")
    except Exception:
        # Deliberately best-effort: a PDF that pypdf cannot parse may still be
        # renderable, so drop any partial output and let the OCR path decide.
        extracted_pages = []

    text = _normalize_text("\n".join(extracted_pages))
    if len(text) >= 80:
        return text, False, page_count

    doc = fitz.open(stream=data, filetype="pdf")
    try:
        page_count = doc.page_count
        ocr_pages = []
        for page in doc:
            # Matrix(2, 2) scales the render 2x in both directions; alpha is
            # disabled so the pixmap has no transparency channel.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                ocr_pages.append(_ocr_image(image))
    finally:
        # Release the document even when rendering or OCR raises.
        doc.close()
    return _normalize_text("\n".join(ocr_pages)), True, page_count
def _extract_docx_text(data: bytes) -> str:
    """Return the normalized text of the paragraphs in a DOCX file.

    Only ``document.paragraphs`` is visited; paragraphs that are empty or
    whitespace-only are skipped.
    NOTE(review): text held elsewhere in the document (e.g. tables) is not
    read by this loop — confirm whether that matters for CV uploads.
    """
    document = Document(io.BytesIO(data))
    kept = []
    for paragraph in document.paragraphs:
        raw = paragraph.text
        if not raw:
            continue
        trimmed = raw.strip()
        if trimmed:
            kept.append(trimmed)
    return _normalize_text("\n".join(kept))
def _extract_plain_text(data: bytes) -> str:
    """Decode a plain-text or Markdown upload and normalize its whitespace.

    The bytes are decoded as UTF-8 and undecodable bytes are dropped. The
    ``utf-8-sig`` codec is used so that a leading byte-order mark is removed
    instead of surviving as a stray U+FEFF at the start of the text (it is
    not whitespace, so normalization would not strip it).

    Args:
        data: Raw file contents.

    Returns:
        The decoded text with whitespace collapsed by ``_normalize_text``.
    """
    return _normalize_text(data.decode("utf-8-sig", errors="ignore"))
@app.post("/extract-text")
async def extract_text(file: UploadFile = File(...)):
    """Extract text from an uploaded document, using OCR where needed.

    The handler is chosen from the lower-cased file extension:
    ``.txt``/``.md`` are decoded as text, ``.docx`` is read paragraph by
    paragraph, ``.pdf`` uses the text layer with an OCR fallback, and the
    extensions in ``IMAGE_EXTENSIONS`` are always OCR'd.

    Args:
        file: The multipart upload.

    Returns:
        A dict with ``text``, ``ocr_used``, ``content_type``, ``page_count``
        (``None`` for text and DOCX files), ``characters`` and ``file_name``.

    Raises:
        HTTPException: 400 for an empty, oversized or unsupported file, 422
            when no readable text was found, 500 when extraction itself fails.
    """
    filename = file.filename or "document"
    extension = "." + filename.rsplit(".", 1)[1].lower() if "." in filename else ""

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory.
    data = await file.read(MAX_EXTRACT_FILE_BYTES + 1)
    if not data:
        raise HTTPException(status_code=400, detail="The uploaded file was empty.")
    if len(data) > MAX_EXTRACT_FILE_BYTES:
        raise HTTPException(status_code=400, detail="The uploaded file is too large for AI extraction.")

    # NOTE(review): extraction below runs synchronously inside this coroutine,
    # so long OCR jobs hold up other requests — confirm this is acceptable.
    try:
        if extension in {".txt", ".md"}:
            text = _extract_plain_text(data)
            ocr_used = False
            page_count = None
        elif extension == ".docx":
            text = _extract_docx_text(data)
            ocr_used = False
            page_count = None
        elif extension == ".pdf":
            text, ocr_used, page_count = _extract_pdf_text(data)
        elif extension in IMAGE_EXTENSIONS:
            # Close the decoded image as soon as OCR is done with it.
            with Image.open(io.BytesIO(data)) as image:
                text = _ocr_image(image)
            ocr_used = True
            page_count = 1
        else:
            raise HTTPException(status_code=400, detail="This file type is not supported for AI extraction.")
    except HTTPException:
        # Our own client errors pass through unchanged.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"AI extraction failed: {exc}") from exc

    if not text:
        raise HTTPException(status_code=422, detail="AI extraction did not find readable text in the uploaded file.")

    return {
        "text": text,
        "ocr_used": ocr_used,
        "content_type": file.content_type,
        "page_count": page_count,
        "characters": len(text),
        "file_name": filename,
    }
+5
View File
@@ -4,3 +4,8 @@ transformers==4.48.3
cachetools==5.5.2
pydantic==2.10.6
torch==2.6.0
pillow==11.1.0
pytesseract==0.3.13
pypdf==5.4.0
pymupdf==1.25.5
python-docx==1.1.2