Refactor backend project and tighten CV test coverage

2026-04-01 10:42:55 +02:00
parent 44000f96f2
commit 18d1de45cb
9 changed files with 246 additions and 19 deletions
@@ -26,6 +26,7 @@ OCR_LANGUAGES = "eng"
 IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
 OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/")
 OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "")
+SKIP_MODEL_LOAD = os.getenv("AI_SERVICE_SKIP_MODEL_LOAD", "") == "1"


 def _load_runtime():
@@ -39,7 +40,10 @@ def _load_runtime():
    return tokenizer, model, device, has_cuda, gpu_name


-tokenizer, model, device, GPU_AVAILABLE, GPU_NAME = _load_runtime()
+if SKIP_MODEL_LOAD:
+    tokenizer, model, device, GPU_AVAILABLE, GPU_NAME = None, None, torch.device("cpu"), False, None
+else:
+    tokenizer, model, device, GPU_AVAILABLE, GPU_NAME = _load_runtime()
 cache = TTLCache(maxsize=1024, ttl=60 * 60)


@@ -298,6 +302,8 @@ def _role_focused_excerpt(text: str) -> dict:


 def _model_summarize(text: str, max_length: int, min_length: int) -> str:
+    if tokenizer is None or model is None:
+        raise HTTPException(status_code=503, detail="Summarizer model is not loaded.")
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device) if hasattr(inputs, "attention_mask") else None