refactor, security updates, cv extraction upgrades

2026-04-11 01:34:32 +02:00
parent 806b200ac5
commit 27fd70a2d7
59 changed files with 6817 additions and 1561 deletions
@@ -0,0 +1,10 @@
+.venv
+__pycache__
+.pytest_cache
+tmp
+*.out
+*.err
+*.pyc
+*.pyo
+*.pyd
+.pytest_cache/
@@ -54,7 +54,7 @@ The script:
 - writes pytest cache under `tmp/pytest-cache` to avoid stale root-owned `.pytest_cache` directories

 ## API
- `GET /health` — health check and runtime capabilities, including Ollama version/model metadata when configured
+- `GET /health` — health check and runtime capabilities, including lazy model state (`model_loaded`, `model_disabled`, `summarize_available`, `model_load_error`) plus Ollama version/model metadata when configured
 - `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`
 - `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata
 - `POST /cv/classify-block` — JSON body `{ "block": "..." }`, uses Ollama when `OLLAMA_MODEL` is configured
@@ -27,6 +27,17 @@ IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
 OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/")
 OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "")
 SKIP_MODEL_LOAD = os.getenv("AI_SERVICE_SKIP_MODEL_LOAD", "") == "1"
+EAGER_MODEL_LOAD = os.getenv("AI_SERVICE_EAGER_MODEL_LOAD", "") == "1"
+
+
+# Summarizer runtime placeholders; `_ensure_runtime_loaded()` replaces them
+# with the real tokenizer/model/device once a load succeeds.
+tokenizer, model = None, None
+device = torch.device("cpu")
+GPU_AVAILABLE, GPU_NAME = False, None
+
+# Load-state flags reported by `GET /health`.
+MODEL_DISABLED = SKIP_MODEL_LOAD
+MODEL_LOADED = False
+if SKIP_MODEL_LOAD:
+    MODEL_LOAD_ERROR = "Model loading is disabled by AI_SERVICE_SKIP_MODEL_LOAD."
+else:
+    MODEL_LOAD_ERROR = None


 def _load_runtime():
@@ -40,10 +51,31 @@ def _load_runtime():
    return tokenizer, model, device, has_cuda, gpu_name


-if SKIP_MODEL_LOAD:
-    tokenizer, model, device, GPU_AVAILABLE, GPU_NAME = None, None, torch.device("cpu"), False, None
-else:
-    tokenizer, model, device, GPU_AVAILABLE, GPU_NAME = _load_runtime()
+def _ensure_runtime_loaded():
+    """Load the summarizer runtime on first use and report whether it is usable.
+
+    Populates the module-level ``tokenizer``, ``model``, ``device``,
+    ``GPU_AVAILABLE`` and ``GPU_NAME`` from ``_load_runtime()`` and keeps
+    ``MODEL_LOADED`` / ``MODEL_LOAD_ERROR`` in sync with the outcome.
+
+    Returns:
+        True when a tokenizer and model are available. False when loading is
+        disabled through ``MODEL_DISABLED`` or when the load attempt raised;
+        a failed attempt is not cached, so the next call tries again.
+    """
+    global tokenizer, model, device, GPU_AVAILABLE, GPU_NAME, MODEL_LOAD_ERROR, MODEL_LOADED
+    if MODEL_DISABLED:
+        MODEL_LOAD_ERROR = "Model loading is disabled by AI_SERVICE_SKIP_MODEL_LOAD."
+        return False
+    if MODEL_LOADED and tokenizer is not None and model is not None:
+        return True
+    try:
+        tokenizer, model, device, GPU_AVAILABLE, GPU_NAME = _load_runtime()
+    except Exception as exc:
+        # Imported locally so this block does not depend on the file's import list.
+        import logging
+
+        # Keep the traceback in the logs; callers only ever see the short message.
+        logging.getLogger(__name__).exception("Summarizer model failed to load")
+        tokenizer, model = None, None
+        device = torch.device("cpu")
+        GPU_AVAILABLE = False
+        GPU_NAME = None
+        MODEL_LOADED = False
+        # Some exceptions stringify to "", which would read as "no error" in
+        # /health; fall back to the exception class name in that case.
+        MODEL_LOAD_ERROR = str(exc) or type(exc).__name__
+        return False
+    MODEL_LOAD_ERROR = None
+    MODEL_LOADED = True
+    return True
+
+
+# Opt-in warm-up at import time (AI_SERVICE_EAGER_MODEL_LOAD=1); otherwise the
+# model is loaded on the first summarize call.
+if not SKIP_MODEL_LOAD and EAGER_MODEL_LOAD:
+    _ensure_runtime_loaded()
+
 cache = TTLCache(maxsize=1024, ttl=60 * 60)


@@ -54,6 +86,10 @@ class SummarizeRequest(BaseModel):
    top_skills: int = Field(default=8, ge=3, le=12)


+class CvNormalizeRequest(BaseModel):
+    # Request body for `POST /cv/normalize`: the raw CV text, which must be
+    # non-empty and at most 50,000 characters long.
+    text: str = Field(
+        min_length=1,
+        max_length=50000,
+    )
+
+
 class CvClassifyBlockRequest(BaseModel):
    block: str = Field(min_length=1, max_length=6000)

@@ -127,6 +163,10 @@ async def health():
        "gpu_name": GPU_NAME,
        "ocr_available": True,
        "ocr_languages": OCR_LANGUAGES,
+        "model_loaded": MODEL_LOADED,
+        "model_disabled": MODEL_DISABLED,
+        "summarize_available": MODEL_LOADED and not MODEL_DISABLED,
+        "model_load_error": MODEL_LOAD_ERROR,
        **_ollama_status(),
    }

@@ -324,8 +364,8 @@ def _role_focused_excerpt(text: str) -> dict:


 def _model_summarize(text: str, max_length: int, min_length: int) -> str:
-    if tokenizer is None or model is None:
-        raise HTTPException(status_code=503, detail="Summarizer model is not loaded.")
+    if not _ensure_runtime_loaded() or tokenizer is None or model is None:
+        raise HTTPException(status_code=503, detail=MODEL_LOAD_ERROR or "Summarizer model is not loaded.")
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device) if hasattr(inputs, "attention_mask") else None
@@ -363,7 +403,7 @@ def _ollama_generate_json(prompt: str):
    )

    try:
-        with urllib_request.urlopen(req, timeout=30) as response:
+        with urllib_request.urlopen(req, timeout=120) as response:
            body = json.loads(response.read().decode("utf-8"))
    except HTTPError as ex:
        raise HTTPException(status_code=502, detail=f"Ollama request failed with {ex.code}.")
@@ -384,6 +424,64 @@ def _ollama_generate_json(prompt: str):
        raise HTTPException(status_code=502, detail="Ollama did not return valid JSON.")


+@app.post("/cv/normalize")
+async def normalize_cv(req: CvNormalizeRequest):
+    # Ask Ollama to rewrite messy CV text into the sectioned layout described
+    # in the prompt below, and return its `confidence`, `reason` and
+    # `normalized_text` fields (each is None when the model omitted it).
+    # Raises HTTPException(502) when the model's JSON is not an object; errors
+    # raised by `_ollama_generate_json` propagate unchanged.
+    # Imported locally so this block does not depend on the file's import list.
+    import asyncio
+
+    prompt = f"""
+You normalize messy CV text into parser-friendly master-CV text.
+Return ONLY valid JSON with this exact shape:
+{{
+  "confidence": 0.0,
+  "reason": "short reason",
+  "normalized_text": "string"
+}}
+
+Rules for normalized_text:
+- Preserve facts only. Do not invent.
+- Use markdown section headings exactly like these when data exists:
+  # Contact
+  # Professional Summary
+  # Work Experience
+  # Education
+  # Skills
+  # Languages
+  # Interests
+- Under # Contact, put one plain value per line, no labels unless unavoidable:
+  Full name line
+  email line
+  phone line
+  website line
+  location line
+- Under # Professional Summary, write 1-3 plain sentences or bullet lines.
+- Preserve explicitly mentioned technologies, tools, and methods as skills when they appear in the source.
+- Never output helper words like "line", "value", "field", or "item".
+- Under # Work Experience, for each job use this exact shape:
+  Job title only
+  Company, Location
+  2019 - Present
+  - bullet
+  - bullet
+- Under # Education, for each entry use this exact shape:
+  Qualification line
+  Institution, Location line
+  2016 - 2019 line
+  - detail
+- Under # Skills and # Languages, use one bullet per item.
+- Remove OCR/layout noise.
+- Do not output placeholders like Not specified.
+- If uncertain, omit the field/line rather than invent.
+
+CV text:
+{req.text.strip()}
+""".strip()
+
+    # `_ollama_generate_json` performs a blocking HTTP request with a long
+    # timeout; run it in a worker thread so this coroutine does not stall the
+    # event loop for every other request while the model is generating.
+    parsed = await asyncio.to_thread(_ollama_generate_json, prompt)
+    # Valid JSON is not necessarily an object; a list or scalar has no `.get`.
+    if not isinstance(parsed, dict):
+        raise HTTPException(status_code=502, detail="Ollama did not return a JSON object.")
+    return {
+        "confidence": parsed.get("confidence"),
+        "reason": parsed.get("reason"),
+        "normalized_text": parsed.get("normalized_text"),
+    }
+
+
@app.post("/cv/classify-block")
 async def classify_cv_block(req: CvClassifyBlockRequest):
    prompt = f"""
@@ -1,5 +1,4 @@
 import importlib
-import os
 import sys
 from pathlib import Path

@@ -11,16 +10,23 @@ if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))


-def load_app_module(monkeypatch):
-    monkeypatch.setenv("AI_SERVICE_SKIP_MODEL_LOAD", "1")
-    monkeypatch.delenv("OLLAMA_MODEL", raising=False)
+def load_app_module(monkeypatch, *, skip_model_load=True, ollama_model=None):
+    if skip_model_load:
+        monkeypatch.setenv("AI_SERVICE_SKIP_MODEL_LOAD", "1")
+    else:
+        monkeypatch.delenv("AI_SERVICE_SKIP_MODEL_LOAD", raising=False)
+    monkeypatch.delenv("AI_SERVICE_EAGER_MODEL_LOAD", raising=False)
+    if ollama_model is None:
+        monkeypatch.delenv("OLLAMA_MODEL", raising=False)
+    else:
+        monkeypatch.setenv("OLLAMA_MODEL", ollama_model)
    if "app" in sys.modules:
        del sys.modules["app"]
    module = importlib.import_module("app")
    return importlib.reload(module)


-def test_health_reports_runtime_without_ollama(monkeypatch):
+def test_health_reports_runtime_without_ollama_and_without_forcing_model_load(monkeypatch):
    module = load_app_module(monkeypatch)
    client = TestClient(module.app)

@@ -30,12 +36,46 @@ def test_health_reports_runtime_without_ollama(monkeypatch):
    payload = response.json()
    assert payload["ok"] is True
    assert payload["device"] == "cpu"
+    assert payload["model_loaded"] is False
+    assert payload["model_disabled"] is True
+    assert payload["summarize_available"] is False
+    assert "disabled" in payload["model_load_error"].lower()
    assert payload["ollama_configured"] is False
    assert payload["ollama_model"] is None
    assert payload["ollama_installed_models"] == []
    assert payload["ollama_loaded_models"] == []


+def test_summarize_returns_503_with_explicit_reason_when_model_loading_is_disabled(monkeypatch):
+    """With model loading skipped, /summarize answers 503 and names the reason."""
+    app_module = load_app_module(monkeypatch)
+    request_body = {"text": "Platform engineering role with APIs and Python experience."}
+
+    result = TestClient(app_module.app).post("/summarize", json=request_body)
+
+    assert result.status_code == 503
+    detail = result.json()["detail"]
+    assert "disabled" in detail.lower()
+
+
+def test_health_reports_ollama_unreachable_when_configured_but_not_available(monkeypatch):
+    """A configured Ollama that cannot be reached still yields a 200 health report."""
+    configured_model = "qwen2.5:7b"
+    app_module = load_app_module(monkeypatch, ollama_model=configured_model)
+
+    def refuse_connection(path: str):
+        # Stand-in for `_ollama_json` that fails the way a dead socket would.
+        raise OSError("connection refused")
+
+    monkeypatch.setattr(app_module, "_ollama_json", refuse_connection)
+
+    result = TestClient(app_module.app).get("/health")
+
+    assert result.status_code == 200
+    body = result.json()
+    assert body["ollama_configured"] is True
+    assert body["ollama_reachable"] is False
+    assert body["ollama_model"] == configured_model
+    assert body["ollama_model_available"] is False
+
+
 def test_classify_block_returns_structured_json(monkeypatch):
    module = load_app_module(monkeypatch)