Evolve summarizer into AI service with OCR support
This commit is contained in:
+1
-1
@@ -8,7 +8,7 @@ AUTH_GOOGLE_CLIENT_ID=CHANGE_ME_GOOGLE_CLIENT_ID
|
||||
GOOGLE_GMAIL_CLIENT_SECRET=CHANGE_ME_GOOGLE_OAUTH_CLIENT_SECRET
|
||||
# Optional. If omitted, the backend uses https://<your-domain>/api/gmail/oauth/callback
|
||||
GOOGLE_GMAIL_REDIRECT_URI=
|
||||
SUMMARIZER_BASE_URL=http://summarizer:8001
|
||||
AI_SERVICE_BASE_URL=http://ai-service:8001
|
||||
|
||||
# Optional: only needed if you want the UI to call a non-default API base URL.
|
||||
# In production the UI defaults to `/api`.
|
||||
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
|
||||
- name: Test frontend
|
||||
working-directory: job-tracker-ui
|
||||
run: npm test -- --watchAll=false --runInBand App.test.tsx confirm.test.tsx prompt.test.tsx dialog-flow.test.tsx confirm-flow.test.tsx attachments.test.tsx job-details-generated-drafts.test.tsx
|
||||
run: npm test -- --watchAll=false --runInBand App.test.tsx confirm.test.tsx prompt.test.tsx dialog-flow.test.tsx confirm-flow.test.tsx attachments.test.tsx job-details-generated-drafts.test.tsx admin-system-page.test.tsx
|
||||
|
||||
- name: Build frontend
|
||||
working-directory: job-tracker-ui
|
||||
@@ -76,7 +76,7 @@ jobs:
|
||||
APP_BUILD_STAMP="$(date -u +'%Y-%m-%d %H:%M UTC')" \
|
||||
./deploy/deploy.sh
|
||||
docker compose ps
|
||||
docker compose exec -T summarizer python -c "import time, urllib.request; deadline=time.time()+60; last=None
|
||||
docker compose exec -T ai-service python -c "import time, urllib.request; deadline=time.time()+60; last=None
|
||||
for _ in range(30):
|
||||
try:
|
||||
urllib.request.urlopen('http://127.0.0.1:8001/health', timeout=5).read()
|
||||
|
||||
@@ -44,7 +44,7 @@ public sealed class AdminSystemController : ControllerBase
|
||||
DatabaseStatusDto Database,
|
||||
RuntimeStatusDto Runtime,
|
||||
AuthStatusDto Auth,
|
||||
SummarizerMetrics Summarizer
|
||||
AiServiceMetrics Ai
|
||||
);
|
||||
|
||||
private static string? NormalizeBuildMetadata(string? value)
|
||||
@@ -62,6 +62,7 @@ public sealed class AdminSystemController : ControllerBase
|
||||
return trimmed;
|
||||
}
|
||||
|
||||
[HttpPost("ai/probe")]
|
||||
[HttpPost("summarizer/probe")]
|
||||
public async Task<IActionResult> RunSummarizerProbe(CancellationToken cancellationToken)
|
||||
{
|
||||
@@ -79,7 +80,7 @@ public sealed class AdminSystemController : ControllerBase
|
||||
|
||||
var jobs = await _db.JobApplications.AsNoTracking().ToListAsync(cancellationToken);
|
||||
var companies = await _db.Companies.AsNoTracking().CountAsync(cancellationToken);
|
||||
var summarizer = await _summarizer.GetMetricsAsync(cancellationToken);
|
||||
var ai = await _summarizer.GetMetricsAsync(cancellationToken);
|
||||
|
||||
var version = NormalizeBuildMetadata(_cfg["App:Version"]);
|
||||
if (string.IsNullOrWhiteSpace(version))
|
||||
@@ -180,7 +181,7 @@ public sealed class AdminSystemController : ControllerBase
|
||||
GoogleConfigured: !string.IsNullOrWhiteSpace((_cfg["Auth:GoogleClientId"] ?? string.Empty).Trim()),
|
||||
GmailConfigured: gmailConfigured
|
||||
),
|
||||
Summarizer: summarizer
|
||||
Ai: ai
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1838,8 +1838,9 @@ Candidate master CV:
|
||||
return NoContent();
|
||||
}
|
||||
|
||||
[HttpGet("ai-metrics")]
|
||||
[HttpGet("summarizer-metrics")]
|
||||
public async Task<ActionResult<SummarizerMetrics>> GetSummarizerMetrics(CancellationToken cancellationToken)
|
||||
public async Task<ActionResult<AiServiceMetrics>> GetSummarizerMetrics(CancellationToken cancellationToken)
|
||||
{
|
||||
var metrics = await _summarizer.GetMetricsAsync(cancellationToken);
|
||||
return Ok(metrics);
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using JobTrackerApi.Services;
|
||||
using JobTrackerApi.Models;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.Identity;
|
||||
@@ -18,15 +19,21 @@ public sealed class ProfileCvController : ControllerBase
|
||||
".md",
|
||||
".pdf",
|
||||
".docx",
|
||||
".png",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".webp",
|
||||
};
|
||||
|
||||
private const long MaxFileSizeBytes = 5 * 1024 * 1024;
|
||||
|
||||
private readonly UserManager<ApplicationUser> _users;
|
||||
private readonly ISummarizerService _aiService;
|
||||
|
||||
public ProfileCvController(UserManager<ApplicationUser> users)
|
||||
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService)
|
||||
{
|
||||
_users = users;
|
||||
_aiService = aiService;
|
||||
}
|
||||
|
||||
[HttpPost("upload")]
|
||||
@@ -41,10 +48,34 @@ public sealed class ProfileCvController : ControllerBase
|
||||
var extension = Path.GetExtension(file.FileName ?? string.Empty);
|
||||
if (!AllowedExtensions.Contains(extension))
|
||||
{
|
||||
return BadRequest("Only .txt, .md, .pdf, and .docx CV imports are supported right now.");
|
||||
return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
|
||||
}
|
||||
|
||||
var text = (await ExtractTextAsync(file, extension)).Trim();
|
||||
string text;
|
||||
var canUseAiExtraction = string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".png", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".jpg", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".jpeg", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".webp", StringComparison.OrdinalIgnoreCase);
|
||||
|
||||
if (canUseAiExtraction)
|
||||
{
|
||||
await using var uploadStream = file.OpenReadStream();
|
||||
var extracted = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, HttpContext.RequestAborted);
|
||||
text = extracted?.Text?.Trim() ?? string.Empty;
|
||||
}
|
||||
else
|
||||
{
|
||||
text = string.Empty;
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(text))
|
||||
{
|
||||
text = (await ExtractTextAsync(file, extension)).Trim();
|
||||
}
|
||||
if (string.IsNullOrWhiteSpace(text))
|
||||
{
|
||||
return BadRequest("The uploaded CV file could not be read or was empty.");
|
||||
|
||||
@@ -116,10 +116,12 @@ builder.Services.AddHttpClient("jobimport")
|
||||
AutomaticDecompression = DecompressionMethods.All
|
||||
});
|
||||
|
||||
// Local summarizer service (FastAPI). Default URL can be overridden via configuration `Summarizer:BaseUrl`.
|
||||
builder.Services.AddHttpClient("summarizer", client =>
|
||||
// Local AI service (FastAPI). Supports summarization and OCR/text extraction.
|
||||
builder.Services.AddHttpClient("ai-service", client =>
|
||||
{
|
||||
var baseUrl = builder.Configuration["Summarizer:BaseUrl"] ?? "http://127.0.0.1:8001";
|
||||
var baseUrl = builder.Configuration["Ai:BaseUrl"]
|
||||
?? builder.Configuration["Summarizer:BaseUrl"]
|
||||
?? "http://127.0.0.1:8001";
|
||||
client.BaseAddress = new Uri(baseUrl);
|
||||
client.Timeout = TimeSpan.FromSeconds(30);
|
||||
});
|
||||
|
||||
@@ -13,12 +13,14 @@ using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace JobTrackerApi.Services
|
||||
{
|
||||
public sealed record SummarizerMetrics(
|
||||
public sealed record AiServiceMetrics(
|
||||
bool Healthy,
|
||||
string? Model,
|
||||
string? Device,
|
||||
bool? GpuAvailable,
|
||||
string? GpuName,
|
||||
bool? OcrAvailable,
|
||||
string? OcrLanguages,
|
||||
double? HealthLatencyMs,
|
||||
double? ProbeLatencyMs,
|
||||
DateTimeOffset? LastProbeAt,
|
||||
@@ -30,17 +32,36 @@ namespace JobTrackerApi.Services
|
||||
int CacheMisses,
|
||||
int Failures,
|
||||
double? AverageLatencyMs,
|
||||
int OcrRequests,
|
||||
int OcrFailures,
|
||||
double? AverageOcrLatencyMs,
|
||||
DateTimeOffset? LastOcrSuccessAt,
|
||||
DateTimeOffset? LastOcrFailureAt,
|
||||
DateTimeOffset? LastSuccessAt,
|
||||
DateTimeOffset? LastFailureAt,
|
||||
string? LastError
|
||||
);
|
||||
|
||||
public interface ISummarizerService
|
||||
/// <summary>
/// Outcome of a text-extraction call against the AI service.
/// </summary>
/// <param name="Text">Extracted text, or <c>null</c> when the service returned none.</param>
/// <param name="OcrUsed">Whether the service reported that OCR was used.</param>
/// <param name="ContentType">Content type reported by the service, or the one sent with the request when the response omits it.</param>
/// <param name="PageCount">Page count reported by the service, when present.</param>
/// <param name="Characters">Character count reported by the service, or the length of <paramref name="Text"/> when the response omits it.</param>
/// <param name="FileName">File name echoed by the service, or the name sent with the request when the response omits it.</param>
public sealed record AiTextExtractionResult(string? Text, bool OcrUsed, string? ContentType, int? PageCount, int Characters, string? FileName);
|
||||
|
||||
public interface IAiService
|
||||
{
|
||||
Task<string?> SummarizeAsync(string text, int maxLength = 150, int minLength = 30);
|
||||
Task<string?> SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40);
|
||||
Task<AiTextExtractionResult?> ExtractTextAsync(Stream stream, string fileName, string? contentType = null, CancellationToken cancellationToken = default);
|
||||
Task RunProbeAsync(CancellationToken cancellationToken = default);
|
||||
Task<SummarizerMetrics> GetMetricsAsync(CancellationToken cancellationToken = default);
|
||||
Task<AiServiceMetrics> GetMetricsAsync(CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
/// Legacy-named contract that adds no members of its own beyond <see cref="IAiService"/>.
/// </summary>
public interface ISummarizerService : IAiService { }
|
||||
|
||||
public class SummarizerService : ISummarizerService
|
||||
@@ -60,6 +81,11 @@ namespace JobTrackerApi.Services
|
||||
private DateTimeOffset? _lastProbeSuccessAt;
|
||||
private DateTimeOffset? _lastProbeFailureAt;
|
||||
private int _probeFailures;
|
||||
private int _ocrRequests;
|
||||
private int _ocrFailures;
|
||||
private long _totalOcrLatencyTicks;
|
||||
private DateTimeOffset? _lastOcrSuccessAt;
|
||||
private DateTimeOffset? _lastOcrFailureAt;
|
||||
private string? _lastError;
|
||||
|
||||
public SummarizerService(IHttpClientFactory httpFactory, IMemoryCache cache)
|
||||
@@ -78,22 +104,18 @@ namespace JobTrackerApi.Services
|
||||
/// <summary>
/// Summarizes <paramref name="text"/> within the given length bounds.
/// </summary>
/// <param name="text">Text to summarize.</param>
/// <param name="maxLength">Upper length bound forwarded to the summarization request.</param>
/// <param name="minLength">Lower length bound forwarded to the summarization request.</param>
/// <returns><c>null</c> when <paramref name="text"/> is null, empty or whitespace; otherwise the result of the core summarization call.</returns>
public async Task<string?> SummarizeAsync(string text, int maxLength = 150, int minLength = 30)
{
    var hasContent = !string.IsNullOrWhiteSpace(text);
    return hasContent
        ? await SummarizeCoreAsync(text, maxLength, minLength)
        : null;
}
|
||||
|
||||
/// <summary>
/// Summarizes <paramref name="text"/> with an instruction placed in front of it.
/// </summary>
/// <param name="instruction">Instruction text; trimmed and placed before the content.</param>
/// <param name="text">Content to summarize; trimmed and placed after a blank line.</param>
/// <param name="maxLength">Upper length bound forwarded to the summarization request.</param>
/// <param name="minLength">Lower length bound forwarded to the summarization request.</param>
/// <returns>A completed task holding <c>null</c> when either input is blank; otherwise the core summarization task.</returns>
public Task<string?> SummarizeSectionAsync(string instruction, string text, int maxLength = 180, int minLength = 40)
{
    var missingInput = string.IsNullOrWhiteSpace(instruction) || string.IsNullOrWhiteSpace(text);
    if (missingInput)
    {
        return Task.FromResult<string?>(null);
    }

    // Instruction first, a blank line, then the content.
    var prompt = string.Concat(instruction.Trim(), "\n\n", text.Trim());
    return SummarizeCoreAsync(prompt, maxLength, minLength);
}
|
||||
|
||||
private async Task<string?> SummarizeCoreAsync(string text, int maxLength, int minLength)
|
||||
{
|
||||
// Use a deterministic content hash instead of string.GetHashCode() so cache keys
|
||||
// are collision-resistant and stable across process restarts.
|
||||
var key = BuildCacheKey(text, maxLength, minLength);
|
||||
Interlocked.Increment(ref _requests);
|
||||
|
||||
@@ -110,7 +132,7 @@ namespace JobTrackerApi.Services
|
||||
|
||||
Interlocked.Increment(ref _cacheMisses);
|
||||
|
||||
var client = _httpFactory.CreateClient("summarizer");
|
||||
var client = _httpFactory.CreateClient("ai-service");
|
||||
var payload = JsonSerializer.Serialize(new { text, max_length = maxLength, min_length = minLength });
|
||||
using var content = new StringContent(payload, Encoding.UTF8, "application/json");
|
||||
var sw = Stopwatch.StartNew();
|
||||
@@ -152,10 +174,74 @@ namespace JobTrackerApi.Services
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Uploads a file to the AI service's <c>/extract-text</c> endpoint and returns the parsed extraction result.
/// </summary>
/// <remarks>
/// Best-effort: a non-success status code or any exception raised while sending or parsing is recorded
/// in the OCR metrics and reported as a <c>null</c> result instead of being thrown.
/// The supplied stream is wrapped in a <see cref="StreamContent"/> that is disposed before returning,
/// which also disposes the stream.
/// </remarks>
/// <param name="stream">Readable stream holding the file content.</param>
/// <param name="fileName">File name sent with the multipart part; blank values become <c>"document"</c>.</param>
/// <param name="contentType">Optional content type of the file; ignored when it cannot be parsed as a media type.</param>
/// <param name="cancellationToken">Token passed to the HTTP call and to response parsing.</param>
/// <returns>The extraction result, or <c>null</c> when the call failed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
public async Task<AiTextExtractionResult?> ExtractTextAsync(Stream stream, string fileName, string? contentType = null, CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(stream);
    if (string.IsNullOrWhiteSpace(fileName))
    {
        fileName = "document";
    }

    Interlocked.Increment(ref _ocrRequests);
    var client = _httpFactory.CreateClient("ai-service");
    var sw = Stopwatch.StartNew();
    var latencyRecorded = false;

    // Adds the elapsed time to the running total exactly once per call, so a failure that
    // happens after the response arrived (e.g. malformed JSON) is not counted twice.
    void RecordLatencyOnce()
    {
        if (latencyRecorded)
        {
            return;
        }

        latencyRecorded = true;
        sw.Stop();
        // Elapsed.Ticks is in TimeSpan ticks (100 ns), the unit TimeSpan.FromTicks expects when
        // the average is computed; Stopwatch.ElapsedTicks depends on Stopwatch.Frequency.
        Interlocked.Add(ref _totalOcrLatencyTicks, sw.Elapsed.Ticks);
    }

    // Reads a string property: a JSON string yields its value, JSON null yields null,
    // and a missing or differently typed property yields the fallback.
    static string? ReadString(JsonElement parent, string name, string? fallback)
    {
        if (!parent.TryGetProperty(name, out var element))
        {
            return fallback;
        }

        return element.ValueKind switch
        {
            JsonValueKind.String => element.GetString(),
            JsonValueKind.Null => null,
            _ => fallback,
        };
    }

    try
    {
        using var form = new MultipartFormDataContent();
        using var fileContent = new StreamContent(stream);

        // TryParse accepts media types with parameters (e.g. "text/plain; charset=utf-8"),
        // which the MediaTypeHeaderValue constructor rejects with a FormatException.
        if (System.Net.Http.Headers.MediaTypeHeaderValue.TryParse(contentType, out var mediaType))
        {
            fileContent.Headers.ContentType = mediaType;
        }

        form.Add(fileContent, "file", fileName);

        using var response = await client.PostAsync("/extract-text", form, cancellationToken);
        RecordLatencyOnce();

        if (!response.IsSuccessStatusCode)
        {
            RecordOcrFailure($"AI extraction returned {(int)response.StatusCode}.");
            return null;
        }

        await using var responseStream = await response.Content.ReadAsStreamAsync(cancellationToken);
        using var doc = await JsonDocument.ParseAsync(responseStream, cancellationToken: cancellationToken);
        var root = doc.RootElement;

        var text = ReadString(root, "text", null);
        var ocrUsed = root.TryGetProperty("ocr_used", out var ocrEl) && ocrEl.ValueKind == JsonValueKind.True;
        var detectedContentType = ReadString(root, "content_type", contentType);
        int? pageCount = root.TryGetProperty("page_count", out var pageCountEl)
            && pageCountEl.ValueKind == JsonValueKind.Number
            && pageCountEl.TryGetInt32(out var pages)
                ? pages
                : null;
        var characters = root.TryGetProperty("characters", out var charactersEl)
            && charactersEl.ValueKind == JsonValueKind.Number
            && charactersEl.TryGetInt32(out var reportedCharacters)
                ? reportedCharacters
                : (text?.Length ?? 0);
        var returnedFileName = ReadString(root, "file_name", fileName);

        lock (_metricsLock)
        {
            _lastOcrSuccessAt = DateTimeOffset.UtcNow;
            _lastError = null;
        }

        return new AiTextExtractionResult(text, ocrUsed, detectedContentType, pageCount, characters, returnedFileName);
    }
    catch (Exception ex)
    {
        // Deliberately broad: extraction is best-effort and callers only see a null result.
        RecordLatencyOnce();
        RecordOcrFailure(ex.Message);
        return null;
    }
}

/// <summary>
/// Counts one failed extraction and stores its timestamp and message for the metrics snapshot.
/// </summary>
private void RecordOcrFailure(string message)
{
    Interlocked.Increment(ref _ocrFailures);
    lock (_metricsLock)
    {
        _lastOcrFailureAt = DateTimeOffset.UtcNow;
        _lastError = message;
    }
}
|
||||
|
||||
public async Task RunProbeAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
const string probeText = "Summarizer latency probe for job tracker telemetry.";
|
||||
var client = _httpFactory.CreateClient("summarizer");
|
||||
const string probeText = "AI service latency probe for Jobbjakt telemetry.";
|
||||
var client = _httpFactory.CreateClient("ai-service");
|
||||
var payload = JsonSerializer.Serialize(new { text = probeText, max_length = 48, min_length = 12 });
|
||||
using var content = new StringContent(payload, Encoding.UTF8, "application/json");
|
||||
var sw = Stopwatch.StartNew();
|
||||
@@ -215,13 +301,15 @@ namespace JobTrackerApi.Services
|
||||
}
|
||||
}
|
||||
|
||||
public async Task<SummarizerMetrics> GetMetricsAsync(CancellationToken cancellationToken = default)
|
||||
public async Task<AiServiceMetrics> GetMetricsAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var client = _httpFactory.CreateClient("summarizer");
|
||||
var client = _httpFactory.CreateClient("ai-service");
|
||||
string? model = null;
|
||||
string? device = null;
|
||||
bool? gpuAvailable = null;
|
||||
string? gpuName = null;
|
||||
bool? ocrAvailable = null;
|
||||
string? ocrLanguages = null;
|
||||
double? healthLatencyMs = null;
|
||||
var healthy = false;
|
||||
string? healthError = null;
|
||||
@@ -238,25 +326,12 @@ namespace JobTrackerApi.Services
|
||||
{
|
||||
using var stream = await res.Content.ReadAsStreamAsync(cancellationToken);
|
||||
using var doc = await JsonDocument.ParseAsync(stream, cancellationToken: cancellationToken);
|
||||
if (doc.RootElement.TryGetProperty("model", out var modelEl))
|
||||
{
|
||||
model = modelEl.GetString();
|
||||
}
|
||||
|
||||
if (doc.RootElement.TryGetProperty("device", out var deviceEl))
|
||||
{
|
||||
device = deviceEl.GetString();
|
||||
}
|
||||
|
||||
if (doc.RootElement.TryGetProperty("gpu_available", out var gpuAvailableEl) && gpuAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False)
|
||||
{
|
||||
gpuAvailable = gpuAvailableEl.GetBoolean();
|
||||
}
|
||||
|
||||
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl))
|
||||
{
|
||||
gpuName = gpuNameEl.GetString();
|
||||
}
|
||||
if (doc.RootElement.TryGetProperty("model", out var modelEl)) model = modelEl.GetString();
|
||||
if (doc.RootElement.TryGetProperty("device", out var deviceEl)) device = deviceEl.GetString();
|
||||
if (doc.RootElement.TryGetProperty("gpu_available", out var gpuAvailableEl) && gpuAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) gpuAvailable = gpuAvailableEl.GetBoolean();
|
||||
if (doc.RootElement.TryGetProperty("gpu_name", out var gpuNameEl)) gpuName = gpuNameEl.GetString();
|
||||
if (doc.RootElement.TryGetProperty("ocr_available", out var ocrAvailableEl) && ocrAvailableEl.ValueKind is JsonValueKind.True or JsonValueKind.False) ocrAvailable = ocrAvailableEl.GetBoolean();
|
||||
if (doc.RootElement.TryGetProperty("ocr_languages", out var ocrLanguagesEl)) ocrLanguages = ocrLanguagesEl.GetString();
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -273,6 +348,9 @@ namespace JobTrackerApi.Services
|
||||
var cacheMisses = Volatile.Read(ref _cacheMisses);
|
||||
var failures = Volatile.Read(ref _failures);
|
||||
var totalLatencyTicks = Volatile.Read(ref _totalLatencyTicks);
|
||||
var ocrRequests = Volatile.Read(ref _ocrRequests);
|
||||
var ocrFailures = Volatile.Read(ref _ocrFailures);
|
||||
var totalOcrLatencyTicks = Volatile.Read(ref _totalOcrLatencyTicks);
|
||||
|
||||
DateTimeOffset? lastSuccessAt;
|
||||
DateTimeOffset? lastFailureAt;
|
||||
@@ -280,6 +358,8 @@ namespace JobTrackerApi.Services
|
||||
DateTimeOffset? lastProbeAt;
|
||||
DateTimeOffset? lastProbeSuccessAt;
|
||||
DateTimeOffset? lastProbeFailureAt;
|
||||
DateTimeOffset? lastOcrSuccessAt;
|
||||
DateTimeOffset? lastOcrFailureAt;
|
||||
string? lastError;
|
||||
lock (_metricsLock)
|
||||
{
|
||||
@@ -289,6 +369,8 @@ namespace JobTrackerApi.Services
|
||||
lastProbeAt = _lastProbeAt;
|
||||
lastProbeSuccessAt = _lastProbeSuccessAt;
|
||||
lastProbeFailureAt = _lastProbeFailureAt;
|
||||
lastOcrSuccessAt = _lastOcrSuccessAt;
|
||||
lastOcrFailureAt = _lastOcrFailureAt;
|
||||
lastError = _lastError;
|
||||
}
|
||||
|
||||
@@ -297,16 +379,17 @@ namespace JobTrackerApi.Services
|
||||
lastError = healthError;
|
||||
}
|
||||
|
||||
double? averageLatencyMs = requests > 0
|
||||
? Math.Round(TimeSpan.FromTicks(totalLatencyTicks).TotalMilliseconds / requests, 1)
|
||||
: null;
|
||||
double? averageLatencyMs = requests > 0 ? Math.Round(TimeSpan.FromTicks(totalLatencyTicks).TotalMilliseconds / requests, 1) : null;
|
||||
double? averageOcrLatencyMs = ocrRequests > 0 ? Math.Round(TimeSpan.FromTicks(totalOcrLatencyTicks).TotalMilliseconds / ocrRequests, 1) : null;
|
||||
|
||||
return new SummarizerMetrics(
|
||||
return new AiServiceMetrics(
|
||||
Healthy: healthy,
|
||||
Model: model,
|
||||
Device: device,
|
||||
GpuAvailable: gpuAvailable,
|
||||
GpuName: gpuName,
|
||||
OcrAvailable: ocrAvailable,
|
||||
OcrLanguages: ocrLanguages,
|
||||
HealthLatencyMs: healthLatencyMs,
|
||||
ProbeLatencyMs: probeLatencyMs,
|
||||
LastProbeAt: lastProbeAt,
|
||||
@@ -318,6 +401,11 @@ namespace JobTrackerApi.Services
|
||||
CacheMisses: cacheMisses,
|
||||
Failures: failures,
|
||||
AverageLatencyMs: averageLatencyMs,
|
||||
OcrRequests: ocrRequests,
|
||||
OcrFailures: ocrFailures,
|
||||
AverageOcrLatencyMs: averageOcrLatencyMs,
|
||||
LastOcrSuccessAt: lastOcrSuccessAt,
|
||||
LastOcrFailureAt: lastOcrFailureAt,
|
||||
LastSuccessAt: lastSuccessAt,
|
||||
LastFailureAt: lastFailureAt,
|
||||
LastError: lastError
|
||||
@@ -340,14 +428,11 @@ namespace JobTrackerApi.Services
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
var enabled = _cfg.GetValue("Summarizer:ProbeEnabled", true);
|
||||
if (!enabled)
|
||||
{
|
||||
return;
|
||||
}
|
||||
var enabled = _cfg.GetValue("Ai:ProbeEnabled", _cfg.GetValue("Summarizer:ProbeEnabled", true));
|
||||
if (!enabled) return;
|
||||
|
||||
var intervalSeconds = Math.Clamp(_cfg.GetValue("Summarizer:ProbeIntervalSeconds", 300), 30, 3600);
|
||||
var initialDelaySeconds = Math.Clamp(_cfg.GetValue("Summarizer:ProbeInitialDelaySeconds", 15), 0, 600);
|
||||
var intervalSeconds = Math.Clamp(_cfg.GetValue("Ai:ProbeIntervalSeconds", _cfg.GetValue("Summarizer:ProbeIntervalSeconds", 300)), 30, 3600);
|
||||
var initialDelaySeconds = Math.Clamp(_cfg.GetValue("Ai:ProbeInitialDelaySeconds", _cfg.GetValue("Summarizer:ProbeInitialDelaySeconds", 15)), 0, 600);
|
||||
|
||||
if (initialDelaySeconds > 0)
|
||||
{
|
||||
@@ -360,8 +445,8 @@ namespace JobTrackerApi.Services
|
||||
try
|
||||
{
|
||||
using var scope = _scopeFactory.CreateScope();
|
||||
var summarizer = scope.ServiceProvider.GetRequiredService<ISummarizerService>();
|
||||
await summarizer.RunProbeAsync(stoppingToken);
|
||||
var aiService = scope.ServiceProvider.GetRequiredService<ISummarizerService>();
|
||||
await aiService.RunProbeAsync(stoppingToken);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
@@ -369,7 +454,7 @@ namespace JobTrackerApi.Services
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Summarizer latency probe failed.");
|
||||
_logger.LogWarning(ex, "AI service latency probe failed.");
|
||||
}
|
||||
}
|
||||
while (await timer.WaitForNextTickAsync(stoppingToken));
|
||||
|
||||
@@ -12,7 +12,7 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re
|
||||
- History/event trail per application (created, status changes, follow-up set, delete/restore)
|
||||
- Export jobs to JSON/CSV + daily scheduled JSON export
|
||||
- Optional “job import” preview from supported job sites (plugins) + optional translation to English
|
||||
- Optional local summarizer service for short/full descriptions
|
||||
- Optional local AI service for short/full description summaries and OCR/text extraction of uploaded CVs
|
||||
- Optional Google sign-in (Google ID tokens) to protect the API
|
||||
|
||||
## Architecture
|
||||
@@ -21,11 +21,11 @@ Job Tracker is a simple, self-hosted app for tracking job applications with a Re
|
||||
- `JobTrackerApi/`: ASP.NET Core API (defaults to `http://localhost:5202`)
|
||||
- SQLite DB file: defaults to `JobTrackerApi/jobtracker.db` unless `Data:Root` / connection string overrides it
|
||||
- Attachments: stored on disk under `DataRoot/Attachments/<jobId>/...`
|
||||
- Optional local summarizer service: `tools/summarizer/` (FastAPI) used by the API via `Summarizer:BaseUrl`
|
||||
- Optional local AI service: `tools/summarizer/` (FastAPI) used by the API via `Ai:BaseUrl`
|
||||
|
||||
## Quickstart (Docker)
|
||||
|
||||
This runs: frontend (nginx), backend API, and the summarizer service.
|
||||
This runs: frontend (nginx), backend API, and the AI service.
|
||||
|
||||
1) Create a `.env` file next to `docker-compose.yml` (you can start from `.env.example`).
|
||||
|
||||
@@ -43,7 +43,7 @@ docker compose up --build
|
||||
|
||||
- .NET SDK `9.x` (API targets `net9.0`)
|
||||
- Node.js (for the UI)
|
||||
- (Optional) Python 3.x if running the summarizer without Docker
|
||||
- (Optional) Python 3.x if running the AI service without Docker
|
||||
|
||||
### 1) Run the API
|
||||
|
||||
@@ -65,14 +65,14 @@ npm start
|
||||
|
||||
The UI defaults to calling `http://localhost:5202/api` when running on localhost (see `job-tracker-ui/src/api.ts`).
|
||||
|
||||
### 3) (Optional) Run the summarizer
|
||||
### 3) (Optional) Run the AI service
|
||||
|
||||
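The API calls a local FastAPI service to generate summaries and to extract text (including OCR) from uploaded CVs. If it’s not running, the app still works (summary generation may be empty / best-effort, and CV uploads fall back to the API’s built-in text extraction).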
|
||||
|
||||
With Docker (recommended):
|
||||
|
||||
```bash
|
||||
docker compose up --build summarizer
|
||||
docker compose up --build ai-service
|
||||
```
|
||||
|
||||
Or run directly from `tools/summarizer/` (see `tools/summarizer/README.md`).
|
||||
@@ -87,7 +87,7 @@ Common keys:
|
||||
- `Data:Root`: folder for the SQLite DB + exports (defaults to API content root)
|
||||
- `Data:AttachmentsRoot`: override attachments folder (defaults to `<Data:Root>/Attachments`)
|
||||
- `Cors:Origins`: list of allowed origins (defaults to `http://localhost:3000`; use `"*"` to allow all)
|
||||
- `Summarizer:BaseUrl`: summarizer base URL (default `http://127.0.0.1:8001`)
|
||||
- `Ai:BaseUrl`: AI service base URL (falls back to the legacy `Summarizer:BaseUrl` key, then to the default `http://127.0.0.1:8001`)
|
||||
- `Exports:DailyEnabled`: enable/disable daily export background job
|
||||
- `Exports:DailyFolder`: export destination (relative to `Data:Root` if not absolute)
|
||||
- `Exports:DailyHourLocal`: local hour (0–23) when the daily export runs
|
||||
@@ -109,7 +109,7 @@ Common keys:
|
||||
- `Email:SmtpUser`: SMTP username (often your Gmail address)
|
||||
- `Email:SmtpPassword`: SMTP password (for Gmail: use an App Password)
|
||||
- `Email:From`: from address (default: `Email:SmtpUser`)
|
||||
- `Email:FromName`: from name (default: `Job Tracker`)
|
||||
- `Email:FromName`: from name (default: `Jobbjakt`)
|
||||
|
||||
### UI settings
|
||||
|
||||
|
||||
+4
-2
@@ -51,7 +51,9 @@ AUTH_JWT_KEY=replace_with_long_random_secret
|
||||
AUTH_ADMIN_EMAIL=you@example.com
|
||||
AUTH_ADMIN_PASSWORD=replace_with_strong_password
|
||||
APP_PUBLIC_BASE_URL=https://your-domain.example
|
||||
SUMMARIZER_BASE_URL=http://summarizer:8001
|
||||
AI_SERVICE_BASE_URL=http://ai-service:8001
|
||||
# Optional backward-compatible alias if older config still references the previous name:
|
||||
SUMMARIZER_BASE_URL=http://ai-service:8001
|
||||
```
|
||||
|
||||
## Database recommendation
|
||||
@@ -89,5 +91,5 @@ If this app is going to be a real production service on Ubuntu:
|
||||
- confirm reverse proxy routes to the frontend correctly
|
||||
- confirm API auth/login works with production config
|
||||
- confirm backend can connect to MariaDB
|
||||
- confirm summarizer container is reachable from backend
|
||||
- confirm AI service container is reachable from backend
|
||||
- confirm reminder and admin/system pages load
|
||||
|
||||
+3
-2
@@ -23,7 +23,8 @@ services:
|
||||
- Auth__GoogleClientId=${AUTH_GOOGLE_CLIENT_ID}
|
||||
- Google__GmailClientSecret=${GOOGLE_GMAIL_CLIENT_SECRET}
|
||||
- Google__GmailRedirectUri=${GOOGLE_GMAIL_REDIRECT_URI}
|
||||
- Summarizer__BaseUrl=${SUMMARIZER_BASE_URL:-http://summarizer:8001}
|
||||
- Ai__BaseUrl=${AI_SERVICE_BASE_URL:-http://ai-service:8001}
|
||||
- Summarizer__BaseUrl=${SUMMARIZER_BASE_URL:-http://ai-service:8001}
|
||||
# Email (SMTP)
|
||||
# Build metadata should be resolved before deployment. Examples:
|
||||
# APP_VERSION=1.0.0
|
||||
@@ -66,7 +67,7 @@ services:
|
||||
- shared_services
|
||||
restart: unless-stopped
|
||||
|
||||
summarizer:
|
||||
ai-service:
|
||||
build:
|
||||
context: ./tools/summarizer
|
||||
dockerfile: Dockerfile
|
||||
|
||||
@@ -2,6 +2,16 @@
|
||||
|
||||
Last updated: 2026-03-23
|
||||
|
||||
## AI Service / OCR
|
||||
- [x] Reframe user-facing "summarizer" status and docs toward an AI service
|
||||
- [x] Add self-hosted OCR/text extraction endpoint to the local AI service
|
||||
- [x] Add backend AI-service text extraction integration for profile CV uploads
|
||||
- [x] Add OCR support for supported image CV uploads (`png`, `jpg`, `jpeg`, `webp`)
|
||||
- [x] Add AI service latency/OCR telemetry to the system page
|
||||
- [x] Add frontend test coverage for AI service status rendering
|
||||
- [ ] Extend AI extraction to job attachment ingestion
|
||||
- [ ] Consider full internal service/class rename from `Summarizer*` to `AiService*`
|
||||
|
||||
## Build / UI Issues
|
||||
- [x] Fix visible build error text appearing on page load/footer
|
||||
- [x] Resolve naming inconsistency: `jobtrack` → `Jobbjakt`
|
||||
@@ -32,7 +42,7 @@ Last updated: 2026-03-23
|
||||
- [x] Add zoom in/out support for image cropping
|
||||
- [x] Use square cropped avatar output
|
||||
- [x] Add CV upload support
|
||||
- [ ] Verify/complete OCR/text extraction for uploaded CV PDFs
|
||||
- [x] Verify/complete OCR/text extraction for uploaded CV PDFs
|
||||
|
||||
## Settings & System
|
||||
- [x] Restore missing follow-up days settings
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
import React from 'react';
|
||||
import { render, screen, waitFor } from '@testing-library/react';
|
||||
|
||||
import AdminSystemPage from './pages/AdminSystemPage';
|
||||
import { I18nProvider } from './i18n/I18nProvider';
|
||||
import { api } from './api';
|
||||
|
||||
const mockedApi = api as jest.Mocked<typeof api>;
|
||||
|
||||
describe('AdminSystemPage', () => {
  // Canned `/admin/system` response: a healthy AI service with OCR available.
  const systemStatus = {
    environment: 'Production',
    contentRoot: '/app',
    version: '1.2.3',
    commitSha: 'abc1234',
    buildStamp: '2026-03-23 11:00 UTC',
    storage: { dataRoot: '/data', dbPath: '/data/jobtracker.db', dbExists: true, dbSizeBytes: 2048, companyCount: 3, jobCount: 7, deletedCount: 1 },
    email: { enabled: true, host: 'smtp.example.test', port: 587, enableSsl: true, from: 'noreply@example.test', fromName: 'Jobbjakt' },
    database: { provider: 'mariadb', looksConfigured: true, canConnect: true, target: 'server=db', usesFileStorage: false, warning: null },
    runtime: { framework: '.NET 9', osDescription: 'Linux', processArchitecture: 'X64', machineName: 'app-01' },
    auth: { required: true, hasJwtKey: true, googleConfigured: true, gmailConfigured: true },
    ai: {
      healthy: true,
      model: 'distilbart',
      device: 'cpu',
      gpuAvailable: false,
      gpuName: null,
      ocrAvailable: true,
      ocrLanguages: 'eng',
      healthLatencyMs: 12.4,
      probeLatencyMs: 25.8,
      lastProbeAt: '2026-03-23T10:00:00Z',
      lastProbeSuccessAt: '2026-03-23T10:00:00Z',
      lastProbeFailureAt: null,
      probeFailures: 0,
      requests: 18,
      cacheHits: 9,
      cacheMisses: 9,
      failures: 0,
      averageLatencyMs: 42.2,
      ocrRequests: 5,
      ocrFailures: 0,
      averageOcrLatencyMs: 88.4,
      lastOcrSuccessAt: '2026-03-23T10:05:00Z',
      lastOcrFailureAt: null,
      lastSuccessAt: '2026-03-23T10:04:00Z',
      lastFailureAt: null,
      lastError: null,
    },
  };

  it('renders AI service health, latency, and OCR readiness', async () => {
    // Only the system endpoint returns the fixture; every other GET gets an empty payload.
    mockedApi.get.mockImplementation((url: string) =>
      Promise.resolve({ data: url === '/admin/system' ? systemStatus : {} } as any),
    );

    render(
      <I18nProvider>
        <AdminSystemPage />
      </I18nProvider>,
    );

    // The page loads asynchronously, so wait for the AI card before asserting on its details.
    await waitFor(() => {
      expect(screen.getByText('AI service')).toBeTruthy();
    });

    // The dot is escaped so the matcher only accepts the literal "25.8".
    expect(screen.getByText(/25\.8 ms probe/i)).toBeTruthy();
    expect(screen.getByText('OCR eng')).toBeTruthy();
    expect(screen.getByText('OCR avg latency')).toBeTruthy();
    expect(screen.getByText('88.4 ms')).toBeTruthy();
  });
});
|
||||
@@ -171,7 +171,7 @@ export const translations = {
|
||||
profileHeadline: "Profile headline",
|
||||
profileHeadlineHelp: "Stored only in this browser to personalize your workspace.",
|
||||
profileMasterCv: "Master CV",
|
||||
profileMasterCvBody: "Upload a PDF, DOCX, plain text file, or markdown file. The app extracts text where supported and populates your master CV text for tailoring and outreach.",
|
||||
profileMasterCvBody: "Upload a PDF, DOCX, plain text file, markdown file, or image scan. The AI service extracts text where possible and falls back to OCR for supported scanned files.",
|
||||
profileUploadCv: "Upload CV",
|
||||
profileUploading: "Uploading...",
|
||||
profileCopyCvText: "Copy CV text",
|
||||
@@ -179,7 +179,7 @@ export const translations = {
|
||||
profileCvUploadFailed: "Failed to upload CV.",
|
||||
profileCvTextLabel: "Profile CV / master resume text",
|
||||
profileCvTextHelp: "Keep this updated and specific. Include recent roles, tools, achievements, measurable outcomes, and the work you want to be hired for next. If extraction misses something, edit it here manually.",
|
||||
profileCvPreferredUploads: "Supported uploads: PDF, DOCX, TXT, MD.",
|
||||
profileCvPreferredUploads: "Supported uploads: PDF, DOCX, TXT, MD, PNG, JPG, JPEG, WEBP.",
|
||||
profileSaveChanges: "Save changes",
|
||||
profileUpdated: "Profile updated.",
|
||||
profileUpdateFailed: "Failed to update profile.",
|
||||
@@ -272,7 +272,7 @@ export const translations = {
|
||||
adminUsersCreated: "User created.",
|
||||
adminUsersCreateFailed: "Failed to create user.",
|
||||
adminSystemTitle: "System status",
|
||||
adminSystemSubtitle: "Production diagnostics for runtime, database, auth, email, and summarizer health.",
|
||||
adminSystemSubtitle: "Production diagnostics for runtime, database, auth, email, AI service health, and OCR readiness.",
|
||||
adminSystemRunProbe: "Run probe now",
|
||||
adminSystemRunningProbe: "Running probe...",
|
||||
adminSystemRefresh: "Refresh",
|
||||
@@ -284,13 +284,13 @@ export const translations = {
|
||||
adminSystemSmtp: "SMTP",
|
||||
adminSystemEnabled: "Enabled",
|
||||
adminSystemDisabled: "Disabled",
|
||||
adminSystemSummarizer: "Summarizer",
|
||||
adminSystemSummarizer: "AI service",
|
||||
adminSystemHealthy: "Healthy",
|
||||
adminSystemNoLatencyData: "No latency data",
|
||||
adminSystemDatabaseStorage: "Database and storage",
|
||||
adminSystemRuntimeAuth: "Runtime and auth",
|
||||
adminSystemEmailConfig: "Email configuration",
|
||||
adminSystemSummarizerRuntime: "Summarizer runtime",
|
||||
adminSystemSummarizerRuntime: "AI runtime",
|
||||
adminSystemSmtpTest: "SMTP test email",
|
||||
adminSystemSmtpTestBody: "Send a quick delivery check using the configured SMTP settings. Leave the recipient blank to use your admin email.",
|
||||
adminSystemRecipientEmail: "Recipient email",
|
||||
@@ -299,7 +299,7 @@ export const translations = {
|
||||
adminSystemMessage: "Message",
|
||||
adminSystemSendTestEmail: "Send test email",
|
||||
adminSystemSending: "Sending...",
|
||||
adminSystemSummarizerTelemetry: "Summarizer telemetry",
|
||||
adminSystemSummarizerTelemetry: "AI service telemetry",
|
||||
adminSystemDatabaseConnected: "Database connected",
|
||||
adminSystemDatabaseIssue: "Database issue",
|
||||
adminSystemAuthEnforced: "Auth enforced",
|
||||
@@ -591,7 +591,7 @@ export const translations = {
|
||||
profileHeadline: "Profiloverskrift",
|
||||
profileHeadlineHelp: "Lagres bare i denne nettleseren for å gjøre arbeidsområdet mer personlig.",
|
||||
profileMasterCv: "Hoved-CV",
|
||||
profileMasterCvBody: "Last opp en PDF, DOCX, ren tekstfil eller markdown-fil. Appen henter ut tekst der det støttes og fyller inn hoved-CV-en din for tilpasning og kontakt.",
|
||||
profileMasterCvBody: "Last opp en PDF, DOCX, ren tekstfil, markdown-fil eller et bildeskann. AI-tjenesten henter ut tekst der det er mulig og faller tilbake til OCR for støttede skannede filer.",
|
||||
profileUploadCv: "Last opp CV",
|
||||
profileUploading: "Laster opp...",
|
||||
profileCopyCvText: "Kopier CV-tekst",
|
||||
@@ -599,7 +599,7 @@ export const translations = {
|
||||
profileCvUploadFailed: "Kunne ikke laste opp CV.",
|
||||
profileCvTextLabel: "Profil-CV / hovedtekst for CV",
|
||||
profileCvTextHelp: "Hold denne oppdatert og konkret. Ta med nylige roller, verktøy, prestasjoner, målbare resultater og arbeidet du vil bli ansatt for neste gang. Hvis tekstuttrekket mangler noe, kan du redigere manuelt her.",
|
||||
profileCvPreferredUploads: "Støttede opplastinger: PDF, DOCX, TXT, MD.",
|
||||
profileCvPreferredUploads: "Støttede opplastinger: PDF, DOCX, TXT, MD, PNG, JPG, JPEG, WEBP.",
|
||||
profileSaveChanges: "Lagre endringer",
|
||||
profileUpdated: "Profil oppdatert.",
|
||||
profileUpdateFailed: "Kunne ikke oppdatere profil.",
|
||||
@@ -692,7 +692,7 @@ export const translations = {
|
||||
adminUsersCreated: "Bruker opprettet.",
|
||||
adminUsersCreateFailed: "Kunne ikke opprette bruker.",
|
||||
adminSystemTitle: "Systemstatus",
|
||||
adminSystemSubtitle: "Produksjonsdiagnostikk for kjøretid, database, autentisering, e-post og oppsummeringshelse.",
|
||||
adminSystemSubtitle: "Produksjonsdiagnostikk for kjøretid, database, autentisering, e-post, AI-tjenestehelse og OCR-beredskap.",
|
||||
adminSystemRunProbe: "Kjør probe nå",
|
||||
adminSystemRunningProbe: "Kjører probe...",
|
||||
adminSystemRefresh: "Oppdater",
|
||||
@@ -704,13 +704,13 @@ export const translations = {
|
||||
adminSystemSmtp: "SMTP",
|
||||
adminSystemEnabled: "Aktivert",
|
||||
adminSystemDisabled: "Deaktivert",
|
||||
adminSystemSummarizer: "Oppsummerer",
|
||||
adminSystemSummarizer: "AI-tjeneste",
|
||||
adminSystemHealthy: "Frisk",
|
||||
adminSystemNoLatencyData: "Ingen latensdata",
|
||||
adminSystemDatabaseStorage: "Database og lagring",
|
||||
adminSystemRuntimeAuth: "Kjøretid og autentisering",
|
||||
adminSystemEmailConfig: "E-postkonfigurasjon",
|
||||
adminSystemSummarizerRuntime: "Oppsummeringskjøretid",
|
||||
adminSystemSummarizerRuntime: "AI-kjøretid",
|
||||
adminSystemSmtpTest: "SMTP-test e-post",
|
||||
adminSystemSmtpTestBody: "Send en rask leveringssjekk med de konfigurerte SMTP-innstillingene. La mottakeren stå tom for å bruke admin-eposten din.",
|
||||
adminSystemRecipientEmail: "Mottaker e-post",
|
||||
@@ -719,7 +719,7 @@ export const translations = {
|
||||
adminSystemMessage: "Melding",
|
||||
adminSystemSendTestEmail: "Send test-e-post",
|
||||
adminSystemSending: "Sender...",
|
||||
adminSystemSummarizerTelemetry: "Oppsummeringstelemetri",
|
||||
adminSystemSummarizerTelemetry: "AI-tjenestetelemetri",
|
||||
adminSystemDatabaseConnected: "Database tilkoblet",
|
||||
adminSystemDatabaseIssue: "Databaseproblem",
|
||||
adminSystemAuthEnforced: "Autentisering påkrevd",
|
||||
|
||||
@@ -14,12 +14,14 @@ import {
|
||||
import { api, getApiErrorMessage } from "../api";
|
||||
import { useI18n } from "../i18n/I18nProvider";
|
||||
|
||||
type SummarizerMetrics = {
|
||||
type AiServiceMetrics = {
|
||||
healthy: boolean;
|
||||
model?: string | null;
|
||||
device?: string | null;
|
||||
gpuAvailable?: boolean;
|
||||
gpuName?: string | null;
|
||||
ocrAvailable?: boolean | null;
|
||||
ocrLanguages?: string | null;
|
||||
healthLatencyMs?: number | null;
|
||||
probeLatencyMs?: number | null;
|
||||
lastProbeAt?: string | null;
|
||||
@@ -31,6 +33,11 @@ type SummarizerMetrics = {
|
||||
cacheMisses: number;
|
||||
failures: number;
|
||||
averageLatencyMs?: number | null;
|
||||
ocrRequests: number;
|
||||
ocrFailures: number;
|
||||
averageOcrLatencyMs?: number | null;
|
||||
lastOcrSuccessAt?: string | null;
|
||||
lastOcrFailureAt?: string | null;
|
||||
lastSuccessAt?: string | null;
|
||||
lastFailureAt?: string | null;
|
||||
lastError?: string | null;
|
||||
@@ -79,7 +86,7 @@ type SystemStatus = {
|
||||
googleConfigured: boolean;
|
||||
gmailConfigured: boolean;
|
||||
};
|
||||
summarizer: SummarizerMetrics;
|
||||
ai: AiServiceMetrics;
|
||||
};
|
||||
|
||||
function formatBytes(bytes?: number | null) {
|
||||
@@ -148,10 +155,10 @@ export default function AdminSystemPage() {
|
||||
return "success" as const;
|
||||
}, [status]);
|
||||
|
||||
const summarizerTone = useMemo(() => {
|
||||
const aiTone = useMemo(() => {
|
||||
if (!status) return "default" as const;
|
||||
if (!status.summarizer.healthy) return "error" as const;
|
||||
if (status.summarizer.probeFailures > 0 || status.summarizer.failures > 0) return "warning" as const;
|
||||
if (!status.ai.healthy) return "error" as const;
|
||||
if (status.ai.probeFailures > 0 || status.ai.failures > 0 || (status.ai.ocrFailures ?? 0) > 0) return "warning" as const;
|
||||
return "success" as const;
|
||||
}, [status]);
|
||||
|
||||
@@ -184,10 +191,10 @@ export default function AdminSystemPage() {
|
||||
setRunningProbe(true);
|
||||
setError(null);
|
||||
try {
|
||||
await api.post("/admin/system/summarizer/probe");
|
||||
await api.post("/admin/system/ai/probe");
|
||||
await load();
|
||||
} catch (e: any) {
|
||||
setError(getApiErrorMessage(e, "Failed to run summarizer probe."));
|
||||
setError(getApiErrorMessage(e, "Failed to run AI service probe."));
|
||||
} finally {
|
||||
setRunningProbe(false);
|
||||
}
|
||||
@@ -204,7 +211,7 @@ export default function AdminSystemPage() {
|
||||
|
||||
{error ? <Alert severity="error">{error}</Alert> : null}
|
||||
{status?.database.warning ? <Alert severity={status.database.canConnect ? "warning" : "error"}>{status.database.warning}</Alert> : null}
|
||||
{status?.summarizer.lastError ? <Alert severity={status.summarizer.healthy ? "warning" : "error"}>{status.summarizer.lastError}</Alert> : null}
|
||||
{status?.ai.lastError ? <Alert severity={status.ai.healthy ? "warning" : "error"}>{status.ai.lastError}</Alert> : null}
|
||||
|
||||
<Box sx={{ display: "grid", gridTemplateColumns: { xs: "1fr", md: "repeat(4, 1fr)" }, gap: 2 }}>
|
||||
<SummaryCard
|
||||
@@ -226,13 +233,13 @@ export default function AdminSystemPage() {
|
||||
/>
|
||||
<SummaryCard
|
||||
title={t("adminSystemSummarizer")}
|
||||
value={status?.summarizer.healthy ? t("adminSystemHealthy") : t("adminSystemOffline")}
|
||||
subtitle={status?.summarizer.probeLatencyMs != null
|
||||
? `${status.summarizer.probeLatencyMs} ms probe · ${status.summarizer.device || "unknown device"}`
|
||||
: status?.summarizer.healthLatencyMs != null
|
||||
? `${status.summarizer.healthLatencyMs} ms health · ${status.summarizer.device || "unknown device"}`
|
||||
value={status?.ai.healthy ? t("adminSystemHealthy") : t("adminSystemOffline")}
|
||||
subtitle={status?.ai.probeLatencyMs != null
|
||||
? `${status.ai.probeLatencyMs} ms probe · ${status.ai.device || "unknown device"}`
|
||||
: status?.ai.healthLatencyMs != null
|
||||
? `${status.ai.healthLatencyMs} ms health · ${status.ai.device || "unknown device"}`
|
||||
: t("adminSystemNoLatencyData")}
|
||||
tone={summarizerTone}
|
||||
tone={aiTone}
|
||||
/>
|
||||
</Box>
|
||||
|
||||
@@ -288,15 +295,15 @@ export default function AdminSystemPage() {
|
||||
<Paper sx={{ p: 2, borderRadius: 3 }}>
|
||||
<Typography variant="h6" sx={{ fontWeight: 900, mb: 1 }}>{t("adminSystemSummarizerRuntime")}</Typography>
|
||||
<Stack spacing={0.75}>
|
||||
<DetailRow label="Model" value={status?.summarizer.model || "-"} />
|
||||
<DetailRow label="Device" value={status?.summarizer.device || "-"} />
|
||||
<DetailRow label="GPU available" value={status?.summarizer.gpuAvailable ? "Yes" : "No"} />
|
||||
<DetailRow label="GPU name" value={status?.summarizer.gpuName || "-"} />
|
||||
<DetailRow label="Health latency" value={status?.summarizer.healthLatencyMs != null ? `${status.summarizer.healthLatencyMs} ms` : "-"} />
|
||||
<DetailRow label="Probe latency" value={status?.summarizer.probeLatencyMs != null ? `${status.summarizer.probeLatencyMs} ms` : "-"} />
|
||||
<DetailRow label="Last probe" value={formatDate(status?.summarizer.lastProbeAt)} />
|
||||
<DetailRow label="Last successful probe" value={formatDate(status?.summarizer.lastProbeSuccessAt)} />
|
||||
<DetailRow label="Last summarization success" value={formatDate(status?.summarizer.lastSuccessAt)} />
|
||||
<DetailRow label="Model" value={status?.ai.model || "-"} />
|
||||
<DetailRow label="Device" value={status?.ai.device || "-"} />
|
||||
<DetailRow label="GPU available" value={status?.ai.gpuAvailable ? "Yes" : "No"} />
|
||||
<DetailRow label="GPU name" value={status?.ai.gpuName || "-"} />
|
||||
<DetailRow label="Health latency" value={status?.ai.healthLatencyMs != null ? `${status.ai.healthLatencyMs} ms` : "-"} />
|
||||
<DetailRow label="Probe latency" value={status?.ai.probeLatencyMs != null ? `${status.ai.probeLatencyMs} ms` : "-"} />
|
||||
<DetailRow label="Last probe" value={formatDate(status?.ai.lastProbeAt)} />
|
||||
<DetailRow label="Last successful probe" value={formatDate(status?.ai.lastProbeSuccessAt)} />
|
||||
<DetailRow label="Last summarization success" value={formatDate(status?.ai.lastSuccessAt)} />
|
||||
</Stack>
|
||||
</Paper>
|
||||
</Box>
|
||||
@@ -320,20 +327,23 @@ export default function AdminSystemPage() {
|
||||
|
||||
<Paper sx={{ p: 2, borderRadius: 3 }}>
|
||||
<Typography variant="h6" sx={{ fontWeight: 900, mb: 1 }}>{t("adminSystemSummarizerTelemetry")}</Typography>
|
||||
<Box sx={{ display: "grid", gridTemplateColumns: { xs: "1fr 1fr", md: "repeat(6, 1fr)" }, gap: 2 }}>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Requests</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.requests ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache hits</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.cacheHits ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache misses</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.cacheMisses ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.failures ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Probe failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.probeFailures ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Avg latency</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.summarizer.averageLatencyMs != null ? `${status.summarizer.averageLatencyMs} ms` : "-"}</Typography></Box>
|
||||
<Box sx={{ display: "grid", gridTemplateColumns: { xs: "1fr 1fr", md: "repeat(8, 1fr)" }, gap: 2 }}>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Requests</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.requests ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache hits</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.cacheHits ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Cache misses</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.cacheMisses ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.failures ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Probe failures</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.probeFailures ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>Avg latency</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.averageLatencyMs != null ? `${status.ai.averageLatencyMs} ms` : "-"}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>OCR requests</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.ocrRequests ?? 0}</Typography></Box>
|
||||
<Box><Typography variant="overline" sx={{ color: "text.secondary" }}>OCR avg latency</Typography><Typography variant="h6" sx={{ fontWeight: 900 }}>{status?.ai.averageOcrLatencyMs != null ? `${status.ai.averageOcrLatencyMs} ms` : "-"}</Typography></Box>
|
||||
</Box>
|
||||
<Box sx={{ display: "flex", gap: 1, flexWrap: "wrap", mt: 2 }}>
|
||||
<Chip label={status?.database.canConnect ? t("adminSystemDatabaseConnected") : t("adminSystemDatabaseIssue")} color={status?.database.canConnect ? "success" : "error"} size="small" />
|
||||
<Chip label={status?.auth.required ? t("adminSystemAuthEnforced") : t("adminSystemAuthOptional")} color={status?.auth.required ? "success" : "warning"} size="small" />
|
||||
<Chip label={status?.auth.googleConfigured ? t("adminSystemGoogleReady") : t("adminSystemGoogleOff")} variant="outlined" size="small" />
|
||||
<Chip label={status?.auth.gmailConfigured ? t("adminSystemGmailReady") : t("adminSystemGmailIncomplete")} variant="outlined" size="small" />
|
||||
<Chip label={status?.summarizer.gpuAvailable ? t("adminSystemGpuVisible") : t("adminSystemCpuMode")} color={status?.summarizer.gpuAvailable ? "success" : "default"} size="small" />
|
||||
<Chip label={status?.ai.gpuAvailable ? t("adminSystemGpuVisible") : t("adminSystemCpuMode")} color={status?.ai.gpuAvailable ? "success" : "default"} size="small" />
|
||||
<Chip label={status?.ai.ocrAvailable ? `OCR ${status.ai.ocrLanguages || "enabled"}` : "OCR unavailable"} variant="outlined" size="small" />
|
||||
</Box>
|
||||
</Paper>
|
||||
</Box>
|
||||
|
||||
@@ -29,7 +29,7 @@ type MeResponse = {
|
||||
} | null;
|
||||
};
|
||||
|
||||
const CV_UPLOAD_ACCEPT = ".pdf,.docx,.txt,.md,application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,text/plain,text/markdown";
|
||||
const CV_UPLOAD_ACCEPT = ".pdf,.docx,.txt,.md,image/png,image/jpeg,image/webp,application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,text/plain,text/markdown";
|
||||
const AVATAR_UPLOAD_ACCEPT = "image/png,image/jpeg,image/webp";
|
||||
|
||||
function initialsFrom(values: Array<string | undefined>) {
|
||||
|
||||
@@ -9,6 +9,11 @@ jest.mock('./api', () => ({
|
||||
delete: jest.fn(() => Promise.resolve({ data: {} })),
|
||||
interceptors: { request: { use: jest.fn() }, response: { use: jest.fn() } },
|
||||
},
|
||||
getApiErrorMessage: jest.fn((error: any, fallback?: string) => {
|
||||
if (typeof error?.response?.data === 'string' && error.response.data.trim()) return error.response.data;
|
||||
if (typeof error?.message === 'string' && error.message.trim()) return error.message;
|
||||
return fallback || 'Request failed.';
|
||||
}),
|
||||
}));
|
||||
|
||||
jest.mock('./components/GoogleAuthCard', () => () => null);
|
||||
|
||||
@@ -5,6 +5,9 @@ ENV PIP_NO_CACHE_DIR=1 \
|
||||
TRANSFORMERS_NO_TF=1 \
|
||||
HF_HUB_DISABLE_TELEMETRY=1
|
||||
WORKDIR /app
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends tesseract-ocr tesseract-ocr-eng \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
COPY requirements.txt ./
|
||||
RUN python -m pip install --upgrade pip setuptools wheel \
|
||||
&& python -m pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
|
||||
|
||||
+22
-11
@@ -1,16 +1,22 @@
|
||||
# Local Hugging Face Summarizer
|
||||
# Local AI Service
|
||||
|
||||
This small service runs a Hugging Face summarization model locally and exposes a simple HTTP API.
|
||||
This service runs a local Hugging Face summarization model and also exposes document text extraction with OCR for supported PDFs and images.
|
||||
|
||||
Install (recommended: virtualenv)
|
||||
## Capabilities
|
||||
- job/role summarization
|
||||
- PDF text extraction
|
||||
- OCR fallback for scanned PDFs
|
||||
- OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`)
|
||||
- DOCX / TXT / MD extraction
|
||||
|
||||
Windows (CPU PyTorch wheel may be required):
|
||||
## Install
|
||||
|
||||
Windows:
|
||||
|
||||
```powershell
|
||||
python -m venv .venv
|
||||
.\.venv\Scripts\Activate.ps1
|
||||
pip install -r requirements.txt
|
||||
# If torch wheel installation is needed, follow instructions at https://pytorch.org
|
||||
python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
|
||||
```
|
||||
|
||||
@@ -23,10 +29,15 @@ pip install -r requirements.txt
|
||||
python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
|
||||
```
|
||||
|
||||
API
|
||||
- `GET /health` — health check
|
||||
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }` returns `{ "summary": "...", "cached": false }`
|
||||
## Docker
|
||||
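The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can be processed inside the container. When running the service outside Docker, install the Tesseract binary and its English language data yourself; `pytesseract` only wraps the `tesseract` executable and `pip install -r requirements.txt` does not provide it.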
|
||||
|
||||
Notes
|
||||
- Model will be downloaded on first run and can be several hundred MB.
|
||||
- For lower memory usage, consider `sshleifer/tiny-distilbart-cnn-6-6` or `t5-small`.
|
||||
## API
|
||||
- `GET /health` — health check and runtime capabilities
|
||||
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`
|
||||
- `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata
|
||||
|
||||
## Notes
|
||||
- Model weights are downloaded on first run.
|
||||
- OCR quality depends on scan quality and language support.
|
||||
- Default OCR language is English (`eng`).
|
||||
|
||||
+107
-3
@@ -1,16 +1,25 @@
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi import FastAPI, File, HTTPException, UploadFile
|
||||
from pydantic import BaseModel, Field
|
||||
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
||||
from cachetools import TTLCache
|
||||
from PIL import Image
|
||||
from pypdf import PdfReader
|
||||
from docx import Document
|
||||
import fitz
|
||||
import hashlib
|
||||
import io
|
||||
import re
|
||||
import torch
|
||||
import pytesseract
|
||||
|
||||
app = FastAPI(title="Local Summarizer")
|
||||
app = FastAPI(title="Local AI Service")
|
||||
|
||||
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
|
||||
MAX_INPUT_CHARS = 20000
|
||||
MAX_CONTEXT_CHARS = 2200
|
||||
MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
|
||||
OCR_LANGUAGES = "eng"
|
||||
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
|
||||
|
||||
|
||||
def _load_runtime():
|
||||
@@ -48,6 +57,8 @@ async def health():
|
||||
"device": str(device),
|
||||
"gpu_available": GPU_AVAILABLE,
|
||||
"gpu_name": GPU_NAME,
|
||||
"ocr_available": True,
|
||||
"ocr_languages": OCR_LANGUAGES,
|
||||
}
|
||||
|
||||
|
||||
@@ -68,7 +79,6 @@ _TECH_PRIORITY = [
|
||||
"aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
|
||||
]
|
||||
|
||||
|
||||
_MUST_HAVE_HINTS = [
|
||||
"must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
|
||||
]
|
||||
@@ -339,3 +349,97 @@ async def summarize(req: SummarizeRequest):
|
||||
out = "\n".join(lines).strip()
|
||||
cache[key] = out
|
||||
return {"summary": out, "cached": False}
|
||||
|
||||
|
||||
def _normalize_text(value: str) -> str:
|
||||
value = value.replace("\x00", " ")
|
||||
return re.sub(r"\s+", " ", value).strip()
|
||||
|
||||
|
||||
def _ocr_image(image: Image.Image) -> str:
    """Run Tesseract OCR over a PIL image and return the normalized text.

    Images whose mode is neither "RGB" nor "L" are converted to RGB before
    being handed to pytesseract; the recognised text is then passed through
    _normalize_text.
    """
    needs_conversion = image.mode not in ("RGB", "L")
    ocr_input = image.convert("RGB") if needs_conversion else image
    return _normalize_text(pytesseract.image_to_string(ocr_input, lang=OCR_LANGUAGES))
|
||||
|
||||
|
||||
def _extract_pdf_text(data: bytes) -> tuple[str, bool, int]:
    """Extract text from a PDF, falling back to OCR when the text layer is sparse.

    The embedded text layer is read with pypdf first. If that yields at least
    80 normalized characters it is returned as-is. Otherwise every page is
    rendered with PyMuPDF and passed through _ocr_image.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        A tuple ``(text, ocr_used, page_count)`` where ``ocr_used`` is True
        only when the OCR fallback produced the text.

    Raises:
        Any exception raised by PyMuPDF, Pillow or pytesseract during the OCR
        fallback propagates to the caller. Errors from the pypdf pass are
        swallowed on purpose so a malformed text layer still gets OCR'd.
    """
    page_count = 0
    extracted_pages = []
    try:
        reader = PdfReader(io.BytesIO(data))
        page_count = len(reader.pages)
        for page in reader.pages:
            extracted_pages.append(page.extract_text() or "")
    except Exception:
        # Best effort: discard partial results and let the OCR path try instead.
        extracted_pages = []

    text = _normalize_text("\n".join(extracted_pages))
    if len(text) >= 80:
        return text, False, page_count

    doc = fitz.open(stream=data, filetype="pdf")
    try:
        page_count = doc.page_count
        ocr_pages = []
        for page in doc:
            # Render with a 2x scale matrix before OCR.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                ocr_pages.append(_ocr_image(image))
    finally:
        # Release the document even when rendering or OCR fails on some page.
        doc.close()
    return _normalize_text("\n".join(ocr_pages)), True, page_count
|
||||
|
||||
|
||||
def _extract_docx_text(data: bytes) -> str:
    """Return the normalized text of a DOCX file's paragraphs.

    Only ``document.paragraphs`` is read; paragraphs that are empty or
    whitespace-only are skipped, the rest are joined and normalized.
    """
    parsed = Document(io.BytesIO(data))
    lines = []
    for paragraph in parsed.paragraphs:
        stripped = (paragraph.text or "").strip()
        if stripped:
            lines.append(stripped)
    return _normalize_text("\n".join(lines))
|
||||
|
||||
|
||||
def _extract_plain_text(data: bytes) -> str:
    """Decode a TXT/MD upload as UTF-8 and return the normalized text.

    Undecodable bytes are dropped rather than raising. The ``utf-8-sig``
    codec is used so a leading byte-order mark is stripped; with plain
    ``utf-8`` the BOM would survive as U+FEFF at the start of the text,
    because it is not whitespace and _normalize_text does not remove it.
    Input without a BOM decodes exactly as it would with ``utf-8``.
    """
    return _normalize_text(data.decode("utf-8-sig", errors="ignore"))
|
||||
|
||||
|
||||
@app.post("/extract-text")
async def extract_text(file: UploadFile = File(...)):
    """Extract readable text from an uploaded document or image.

    The handler is chosen from the file name's extension: ``.txt``/``.md``
    are decoded as text, ``.docx`` and ``.pdf`` go through their extractors
    (PDF may fall back to OCR), and the extensions in IMAGE_EXTENSIONS are
    OCR'd directly.

    Returns:
        A dict with the extracted ``text``, whether ``ocr_used``, the upload's
        ``content_type``, ``page_count`` (None for text/DOCX, 1 for images),
        the ``characters`` count and the ``file_name``.

    Raises:
        HTTPException 400: empty upload, upload larger than
            MAX_EXTRACT_FILE_BYTES, or unsupported extension.
        HTTPException 500: the extractor raised an unexpected error.
        HTTPException 422: extraction succeeded but produced no text.
    """
    filename = file.filename or "document"
    extension = "." + filename.rsplit(".", 1)[1].lower() if "." in filename else ""
    # Read at most one byte past the limit: enough to detect an oversized
    # upload without copying an arbitrarily large file into memory.
    data = await file.read(MAX_EXTRACT_FILE_BYTES + 1)
    if not data:
        raise HTTPException(status_code=400, detail="The uploaded file was empty.")
    if len(data) > MAX_EXTRACT_FILE_BYTES:
        raise HTTPException(status_code=400, detail="The uploaded file is too large for AI extraction.")

    try:
        if extension in {".txt", ".md"}:
            text = _extract_plain_text(data)
            ocr_used = False
            page_count = None
        elif extension == ".docx":
            text = _extract_docx_text(data)
            ocr_used = False
            page_count = None
        elif extension == ".pdf":
            text, ocr_used, page_count = _extract_pdf_text(data)
        elif extension in IMAGE_EXTENSIONS:
            # Close the decoded image as soon as OCR has finished with it.
            with Image.open(io.BytesIO(data)) as image:
                text = _ocr_image(image)
            ocr_used = True
            page_count = 1
        else:
            raise HTTPException(status_code=400, detail="This file type is not supported for AI extraction.")
    except HTTPException:
        # Preserve the deliberate 400 raised above instead of wrapping it as a 500.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"AI extraction failed: {exc}") from exc

    if not text:
        raise HTTPException(status_code=422, detail="AI extraction did not find readable text in the uploaded file.")

    return {
        "text": text,
        "ocr_used": ocr_used,
        "content_type": file.content_type,
        "page_count": page_count,
        "characters": len(text),
        "file_name": filename,
    }
|
||||
|
||||
@@ -4,3 +4,8 @@ transformers==4.48.3
|
||||
cachetools==5.5.2
|
||||
pydantic==2.10.6
|
||||
torch==2.6.0
|
||||
pillow==11.1.0
|
||||
pytesseract==0.3.13
|
||||
pypdf==5.4.0
|
||||
pymupdf==1.25.5
|
||||
python-docx==1.1.2
|
||||
|
||||
Reference in New Issue
Block a user