Improve CV parsing and profile editor flow

2026-03-29 14:29:18 +02:00
parent 99fc94bc18
commit 44000f96f2
18 changed files with 1028 additions and 44 deletions
@@ -8,9 +8,13 @@ from docx import Document
 import fitz
 import hashlib
 import io
+import json
+import os
 import re
 import torch
 import pytesseract
+from urllib import request as urllib_request
+from urllib.error import URLError, HTTPError

 app = FastAPI(title="Local AI Service")  # application object; the @app.get/@app.post routes below register on it

@@ -20,6 +24,8 @@ MAX_CONTEXT_CHARS = 2200
 MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024  # 8 MiB
 OCR_LANGUAGES = "eng"  # also reported by /health as "ocr_languages"; presumably the OCR language code - TODO confirm
 IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}  # presumably the upload suffixes routed to OCR - TODO confirm
+OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/")  # trailing "/" removed so "/api/..." paths can be appended
+OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "")  # empty (the default) means Ollama is treated as not configured


 def _load_runtime():
@@ -44,11 +50,47 @@ class SummarizeRequest(BaseModel):
    top_skills: int = Field(default=8, ge=3, le=12)


+class CvClassifyBlockRequest(BaseModel):
+    block: str = Field(min_length=1, max_length=6000)
+
+
 def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
    """Build a string key from the text's SHA-256 hex digest and the three numeric settings.

    The result has the form ``<sha256 hex of text>:<max_length>:<min_length>:<top_skills>``.
    """
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return ":".join([digest, str(max_length), str(min_length), str(top_skills)])


+def _ollama_status():
+    configured = bool(OLLAMA_MODEL)
+    if not configured:
+        return {
+            "ollama_configured": False,
+            "ollama_reachable": False,
+            "ollama_model": None,
+            "ollama_model_available": False,
+        }
+
+    req = urllib_request.Request(f"{OLLAMA_BASE_URL}/api/tags", method="GET")
+    try:
+        with urllib_request.urlopen(req, timeout=5) as response:
+            body = json.loads(response.read().decode("utf-8"))
+    except Exception:
+        return {
+            "ollama_configured": True,
+            "ollama_reachable": False,
+            "ollama_model": OLLAMA_MODEL,
+            "ollama_model_available": False,
+        }
+
+    models = body.get("models") or []
+    names = {item.get("name") for item in models if isinstance(item, dict)}
+    return {
+        "ollama_configured": True,
+        "ollama_reachable": True,
+        "ollama_model": OLLAMA_MODEL,
+        "ollama_model_available": OLLAMA_MODEL in names,
+    }
+
+
@app.get("/health")
 async def health():
    return {
@@ -59,6 +101,7 @@ async def health():
        "gpu_name": GPU_NAME,
        "ocr_available": True,
        "ocr_languages": OCR_LANGUAGES,
+        **_ollama_status(),
    }


@@ -272,6 +315,93 @@ def _model_summarize(text: str, max_length: int, min_length: int) -> str:
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()


+def _ollama_generate_json(prompt: str):
+    if not OLLAMA_MODEL:
+        raise HTTPException(status_code=503, detail="OLLAMA_MODEL is not configured.")
+
+    payload = json.dumps({
+        "model": OLLAMA_MODEL,
+        "prompt": prompt,
+        "stream": False,
+        "format": "json",
+        "options": {"temperature": 0.1}
+    }).encode("utf-8")
+
+    req = urllib_request.Request(
+        f"{OLLAMA_BASE_URL}/api/generate",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+
+    try:
+        with urllib_request.urlopen(req, timeout=30) as response:
+            body = json.loads(response.read().decode("utf-8"))
+    except HTTPError as ex:
+        raise HTTPException(status_code=502, detail=f"Ollama request failed with {ex.code}.")
+    except URLError as ex:
+        raise HTTPException(status_code=503, detail=f"Ollama is unreachable: {ex.reason}.")
+
+    raw = (body.get("response") or "").strip()
+    if not raw:
+        raise HTTPException(status_code=502, detail="Ollama returned an empty response.")
+
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError:
+        start = raw.find("{")
+        end = raw.rfind("}")
+        if start >= 0 and end > start:
+            return json.loads(raw[start:end + 1])
+        raise HTTPException(status_code=502, detail="Ollama did not return valid JSON.")
+
+
+@app.post("/cv/classify-block")
+async def classify_cv_block(req: CvClassifyBlockRequest):
+    prompt = f"""
+You classify one CV text block into structured JSON.
+Return ONLY valid JSON with this exact shape:
+{{
+  "section": "Contact|Professional Summary|Work Experience|Education|Skills|Languages|Interests|Other",
+  "confidence": 0.0,
+  "reason": "short reason",
+  "title": string|null,
+  "company": string|null,
+  "location": string|null,
+  "start": string|null,
+  "end": string|null,
+  "bullets": string[]
+}}
+
+Rules:
+- Preserve facts only.
+- section must be one of the listed values.
+- Use Work Experience only for job/employment blocks.
+- For Contact blocks, keep title/company/start/end null and bullets empty.
+- For non-work blocks, title/company/start/end should usually be null.
+- location must look like a place, not a sentence.
+- dates must be one of: year, month+year, dd/mm/yyyy, Present, Current.
+- bullets should only be job tasks/achievements, not titles, companies, dates, or headings.
+- If unsure, choose Other and keep fields null/empty.
+
+Block:
+{req.block.strip()}
+""".strip()
+
+    parsed = _ollama_generate_json(prompt)
+    return {
+        "section": parsed.get("section") or "Other",
+        "confidence": parsed.get("confidence"),
+        "reason": parsed.get("reason"),
+        "title": parsed.get("title"),
+        "company": parsed.get("company"),
+        "location": parsed.get("location"),
+        "start": parsed.get("start"),
+        "end": parsed.get("end"),
+        "bullets": parsed.get("bullets") or [],
+    }
+
+
@app.post("/summarize")
 async def summarize(req: SummarizeRequest):
    if req.min_length >= req.max_length: