Improve CV parsing and profile editor flow

This commit is contained in:
2026-03-29 14:29:18 +02:00
parent 99fc94bc18
commit 44000f96f2
18 changed files with 1028 additions and 44 deletions
+130
View File
@@ -8,9 +8,13 @@ from docx import Document
import fitz
import hashlib
import io
import json
import os
import re
import torch
import pytesseract
from urllib import request as urllib_request
from urllib.error import URLError, HTTPError
# Application object for this service; the @app.get / @app.post decorators
# further down in this module register their routes on it.
app = FastAPI(title="Local AI Service")
@@ -20,6 +24,8 @@ MAX_CONTEXT_CHARS = 2200
# 8 MiB. NOTE(review): presumably the size cap for files submitted for text
# extraction -- confirm against the extraction endpoint.
MAX_EXTRACT_FILE_BYTES = 8 * 1024**2

# OCR language setting; reported verbatim by /health as "ocr_languages".
OCR_LANGUAGES = "eng"

# File suffixes (lower-case, with leading dot) recognised as images.
IMAGE_EXTENSIONS = {
    ".jpeg",
    ".jpg",
    ".png",
    ".webp",
}

# Root URL of the Ollama server.  Trailing slashes are removed so request
# paths can be appended as f"{OLLAMA_BASE_URL}/api/...".
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/")

# Name of the Ollama model to use; an empty string means "not configured".
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "")
def _load_runtime():
@@ -44,11 +50,47 @@ class SummarizeRequest(BaseModel):
top_skills: int = Field(default=8, ge=3, le=12)
class CvClassifyBlockRequest(BaseModel):
    # Request body for POST /cv/classify-block.
    # `block` is the raw text of one CV block: required, 1..6000 characters.
    block: str = Field(..., min_length=1, max_length=6000)
def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
h = hashlib.sha256(text.encode("utf-8")).hexdigest()
return f"{h}:{max_length}:{min_length}:{top_skills}"
def _ollama_canonical_model(name: str) -> str:
    """Return ``name`` with Ollama's implicit ``latest`` tag made explicit."""
    # Only the last path segment can carry a tag; a colon earlier in the name
    # belongs to a registry "host:port" prefix.
    last_segment = name.rsplit("/", 1)[-1]
    return name if ":" in last_segment else f"{name}:latest"


def _ollama_status():
    """Report whether Ollama is configured, reachable and has the model.

    Returns:
        A dict with the keys ``ollama_configured``, ``ollama_reachable``,
        ``ollama_model`` and ``ollama_model_available``.  The dict is spread
        into the ``/health`` payload, so this function never raises: every
        failure while talking to Ollama is reported as "not reachable".
    """
    status = {
        "ollama_configured": bool(OLLAMA_MODEL),
        "ollama_reachable": False,
        "ollama_model": OLLAMA_MODEL or None,
        "ollama_model_available": False,
    }
    if not OLLAMA_MODEL:
        return status

    req = urllib_request.Request(f"{OLLAMA_BASE_URL}/api/tags", method="GET")
    try:
        with urllib_request.urlopen(req, timeout=5) as response:
            body = json.loads(response.read().decode("utf-8"))
    except Exception:
        # Deliberately broad: a health probe must degrade to "unreachable"
        # on any network, HTTP or decoding problem instead of failing.
        return status

    # A JSON answer that is not an object is treated like an undecodable one:
    # whatever answered is not a usable Ollama endpoint.
    if not isinstance(body, dict):
        return status

    models = body.get("models")
    if not isinstance(models, list):
        models = []

    # Ollama lists models with an explicit tag ("llama3:latest") while an
    # untagged configured name implicitly means ":latest", so both sides are
    # canonicalised before comparing.
    installed = set()
    for item in models:
        if not isinstance(item, dict):
            continue
        for field in ("name", "model"):
            value = item.get(field)
            if isinstance(value, str) and value:
                installed.add(_ollama_canonical_model(value))

    status["ollama_reachable"] = True
    status["ollama_model_available"] = _ollama_canonical_model(OLLAMA_MODEL) in installed
    return status
@app.get("/health")
async def health():
return {
@@ -59,6 +101,7 @@ async def health():
"gpu_name": GPU_NAME,
"ocr_available": True,
"ocr_languages": OCR_LANGUAGES,
**_ollama_status(),
}
@@ -272,6 +315,93 @@ def _model_summarize(text: str, max_length: int, min_length: int) -> str:
return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
def _ollama_generate_json(prompt: str):
    """Send ``prompt`` to Ollama's ``/api/generate`` and return the parsed JSON.

    The request is non-streaming, asks for JSON output and uses a low
    temperature.  This call blocks for up to 30 seconds.

    Args:
        prompt: Full prompt text to send to the configured model.

    Returns:
        The value decoded from the model's ``response`` text.  If the text as
        a whole is not valid JSON, the span from the first ``{`` to the last
        ``}`` is tried as a fallback.

    Raises:
        HTTPException: 503 when ``OLLAMA_MODEL`` is not configured or Ollama
            cannot be reached (including timeouts); 502 when Ollama answers
            with an HTTP error, a malformed envelope, an empty response or
            text that cannot be parsed as JSON.
    """
    if not OLLAMA_MODEL:
        raise HTTPException(status_code=503, detail="OLLAMA_MODEL is not configured.")

    payload = json.dumps({
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False,
        "format": "json",
        "options": {"temperature": 0.1},
    }).encode("utf-8")
    req = urllib_request.Request(
        f"{OLLAMA_BASE_URL}/api/generate",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    # Order matters: HTTPError is a URLError, and URLError is an OSError.
    try:
        with urllib_request.urlopen(req, timeout=30) as response:
            body = json.loads(response.read().decode("utf-8"))
    except HTTPError as ex:
        raise HTTPException(
            status_code=502, detail=f"Ollama request failed with {ex.code}."
        ) from ex
    except URLError as ex:
        raise HTTPException(
            status_code=503, detail=f"Ollama is unreachable: {ex.reason}."
        ) from ex
    except OSError as ex:
        # Timeouts / connection resets while reading the body are raised as
        # plain OSError subclasses, not wrapped in URLError.
        raise HTTPException(
            status_code=503, detail=f"Ollama is unreachable: {ex}."
        ) from ex
    except ValueError as ex:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        raise HTTPException(
            status_code=502, detail="Ollama returned a malformed response."
        ) from ex

    if not isinstance(body, dict):
        raise HTTPException(status_code=502, detail="Ollama returned a malformed response.")

    raw = body.get("response")
    raw = raw.strip() if isinstance(raw, str) else ""
    if not raw:
        raise HTTPException(status_code=502, detail="Ollama returned an empty response.")

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    # Fallback: the model may have wrapped the object in extra text, so try
    # the outermost {...} span before giving up.
    start = raw.find("{")
    end = raw.rfind("}")
    if 0 <= start < end:
        try:
            return json.loads(raw[start:end + 1])
        except json.JSONDecodeError:
            pass

    raise HTTPException(status_code=502, detail="Ollama did not return valid JSON.")
# Section labels the classifier may return, in the order shown to the model.
_CV_SECTIONS = (
    "Contact",
    "Professional Summary",
    "Work Experience",
    "Education",
    "Skills",
    "Languages",
    "Interests",
    "Other",
)
# Case-insensitive lookup from whatever the model wrote to the canonical label.
_CV_SECTION_BY_KEY = {name.casefold(): name for name in _CV_SECTIONS}


@app.post("/cv/classify-block")
async def classify_cv_block(req: CvClassifyBlockRequest):
    """Classify one CV text block into a section plus structured fields.

    Builds a prompt around ``req.block``, asks Ollama for a JSON object and
    returns a dict with the keys ``section``, ``confidence``, ``reason``,
    ``title``, ``company``, ``location``, ``start``, ``end`` and ``bullets``.

    ``section`` is always one of the labels listed in the prompt (anything
    else becomes ``"Other"``) and ``bullets`` is always a list of non-empty
    strings.  The remaining fields are passed through as the model gave them.

    Raises:
        HTTPException: whatever ``_ollama_generate_json`` raises, plus 502
            when the model's answer is valid JSON but not an object.
    """
    import asyncio  # local import: only this endpoint needs it

    section_choices = "|".join(_CV_SECTIONS)
    prompt = f"""
You classify one CV text block into structured JSON.
Return ONLY valid JSON with this exact shape:
{{
"section": "{section_choices}",
"confidence": 0.0,
"reason": "short reason",
"title": string|null,
"company": string|null,
"location": string|null,
"start": string|null,
"end": string|null,
"bullets": string[]
}}
Rules:
- Preserve facts only.
- section must be one of the listed values.
- Use Work Experience only for job/employment blocks.
- For Contact blocks, keep title/company/start/end null and bullets empty.
- For non-work blocks, title/company/start/end should usually be null.
- location must look like a place, not a sentence.
- dates must be one of: year, month+year, dd/mm/yyyy, Present, Current.
- bullets should only be job tasks/achievements, not titles, companies, dates, or headings.
- If unsure, choose Other and keep fields null/empty.
Block:
{req.block.strip()}
""".strip()

    # _ollama_generate_json performs a blocking HTTP call (up to 30 s); run
    # it in the default executor so the event loop keeps serving requests.
    loop = asyncio.get_running_loop()
    parsed = await loop.run_in_executor(None, _ollama_generate_json, prompt)

    if not isinstance(parsed, dict):
        raise HTTPException(status_code=502, detail="Ollama did not return a JSON object.")

    # Map the model's label onto the canonical spelling; unknown -> "Other".
    section = parsed.get("section")
    section_key = section.strip().casefold() if isinstance(section, str) else ""
    section = _CV_SECTION_BY_KEY.get(section_key, "Other")

    # The model occasionally returns a bare string or mixed list for bullets.
    bullets = parsed.get("bullets")
    if isinstance(bullets, str):
        bullets = [bullets]
    elif not isinstance(bullets, list):
        bullets = []
    bullets = [item.strip() for item in bullets if isinstance(item, str) and item.strip()]

    return {
        "section": section,
        "confidence": parsed.get("confidence"),
        "reason": parsed.get("reason"),
        "title": parsed.get("title"),
        "company": parsed.get("company"),
        "location": parsed.get("location"),
        "start": parsed.get("start"),
        "end": parsed.get("end"),
        "bullets": bullets,
    }
@app.post("/summarize")
async def summarize(req: SummarizeRequest):
if req.min_length >= req.max_length: