Improve CV parsing and profile editor flow
This commit is contained in:
@@ -8,9 +8,13 @@ from docx import Document
|
||||
import fitz
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import torch
|
||||
import pytesseract
|
||||
from urllib import request as urllib_request
|
||||
from urllib.error import URLError, HTTPError
|
||||
|
||||
# FastAPI application object; the @app.get/@app.post handlers in this module
# register their routes on it.
app = FastAPI(title="Local AI Service")
|
||||
|
||||
@@ -20,6 +24,8 @@ MAX_CONTEXT_CHARS = 2200
|
||||
# 8 MiB. NOTE(review): the name suggests this caps files accepted for text
# extraction; the code that enforces it is not visible in this section.
MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024

# Reported by /health as "ocr_languages".
# NOTE(review): presumably also passed to pytesseract - confirm at the call site.
OCR_LANGUAGES = "eng"

# Lower-case, dot-prefixed image file suffixes.
# NOTE(review): presumably the files routed to OCR - confirm at the call site.
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}

# Base URL of the Ollama HTTP API, without a trailing slash so that paths such
# as "/api/tags" can be appended directly. A variable that is set but empty
# (or only whitespace / "/") falls back to the local default instead of
# yielding a scheme-less URL that urllib rejects.
OLLAMA_BASE_URL = (
    os.getenv("OLLAMA_BASE_URL", "").strip().rstrip("/")
    or "http://127.0.0.1:11434"
)

# Name of the Ollama model to use. Empty means "Ollama is not configured";
# surrounding whitespace is dropped so a blank value is not treated as a name.
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "").strip()
|
||||
|
||||
|
||||
def _load_runtime():
|
||||
@@ -44,11 +50,47 @@ class SummarizeRequest(BaseModel):
|
||||
top_skills: int = Field(default=8, ge=3, le=12)
|
||||
|
||||
|
||||
class CvClassifyBlockRequest(BaseModel):
    # Request body accepted by POST /cv/classify-block.
    #
    # block: text of a single CV block to classify. Required; must contain
    # between 1 and 6000 characters.
    block: str = Field(..., min_length=1, max_length=6000)
|
||||
|
||||
|
||||
def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
    """Build a string key from a text and its summarization parameters.

    The key has the form ``<sha256-hex-of-text>:<max_length>:<min_length>:<top_skills>``,
    so equal inputs always produce the same key.
    NOTE(review): presumably used as a cache key for summaries - the caller is
    not visible in this section.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return "{}:{}:{}:{}".format(
        hasher.hexdigest(), max_length, min_length, top_skills
    )
|
||||
|
||||
|
||||
def _ollama_status():
    """Report whether Ollama is configured, reachable and serving the model.

    Queries ``{OLLAMA_BASE_URL}/api/tags`` (5 second timeout) when
    ``OLLAMA_MODEL`` is set.

    Returns:
        A dict with the keys ``ollama_configured``, ``ollama_reachable``,
        ``ollama_model`` and ``ollama_model_available``.

    This is a best-effort probe: it does not raise. Any failure to obtain a
    usable model listing is reported as ``ollama_reachable: False``.
    """
    if not OLLAMA_MODEL:
        return {
            "ollama_configured": False,
            "ollama_reachable": False,
            "ollama_model": None,
            "ollama_model_available": False,
        }

    status = {
        "ollama_configured": True,
        "ollama_reachable": False,
        "ollama_model": OLLAMA_MODEL,
        "ollama_model_available": False,
    }

    try:
        # Request() itself raises ValueError for a malformed base URL, so it
        # has to live inside the try as well.
        req = urllib_request.Request(f"{OLLAMA_BASE_URL}/api/tags", method="GET")
        with urllib_request.urlopen(req, timeout=5) as response:
            body = json.loads(response.read().decode("utf-8"))
    except Exception:
        # Deliberately broad: a status probe must never fail its caller.
        return status

    if not isinstance(body, dict):
        # Something answered, but not with a model listing; treat it as
        # "no usable Ollama" instead of crashing on body.get below.
        return status

    models = body.get("models")
    if not isinstance(models, list):
        models = []
    names = {
        item.get("name")
        for item in models
        if isinstance(item, dict) and isinstance(item.get("name"), str)
    }

    # Ollama lists models as "name:tag" and treats a missing tag as "latest",
    # so an untagged OLLAMA_MODEL must also match its ":latest" entry. Only
    # the part after the last "/" is checked, because a registry host may
    # itself contain a ":" (port).
    wanted = {OLLAMA_MODEL}
    if ":" not in OLLAMA_MODEL.rsplit("/", 1)[-1]:
        wanted.add(f"{OLLAMA_MODEL}:latest")

    status["ollama_reachable"] = True
    status["ollama_model_available"] = not wanted.isdisjoint(names)
    return status
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {
|
||||
@@ -59,6 +101,7 @@ async def health():
|
||||
"gpu_name": GPU_NAME,
|
||||
"ocr_available": True,
|
||||
"ocr_languages": OCR_LANGUAGES,
|
||||
**_ollama_status(),
|
||||
}
|
||||
|
||||
|
||||
@@ -272,6 +315,93 @@ def _model_summarize(text: str, max_length: int, min_length: int) -> str:
|
||||
return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
|
||||
|
||||
|
||||
def _ollama_generate_json(prompt: str):
    """Send ``prompt`` to Ollama's ``/api/generate`` and return the decoded JSON.

    The request is non-streaming, asks for JSON output and uses a temperature
    of 0.1. The call blocks for up to 30 seconds.

    Args:
        prompt: Full prompt text to send to the model.

    Returns:
        The value decoded from the model's ``response`` text. If the text is
        not valid JSON as a whole, the span from the first ``{`` to the last
        ``}`` is tried instead.

    Raises:
        HTTPException: 503 when ``OLLAMA_MODEL`` is unset, the base URL is
            invalid, or Ollama cannot be reached (including timeouts);
            502 when Ollama answers with an HTTP error, an undecodable
            envelope, an empty response, or text that is not valid JSON.
    """
    if not OLLAMA_MODEL:
        raise HTTPException(status_code=503, detail="OLLAMA_MODEL is not configured.")

    payload = json.dumps({
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False,
        "format": "json",
        "options": {"temperature": 0.1},
    }).encode("utf-8")

    try:
        req = urllib_request.Request(
            f"{OLLAMA_BASE_URL}/api/generate",
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
    except ValueError as ex:
        # Raised by Request() for a URL without a usable scheme.
        raise HTTPException(
            status_code=503, detail="OLLAMA_BASE_URL is not a valid URL."
        ) from ex

    try:
        with urllib_request.urlopen(req, timeout=30) as response:
            raw_body = response.read()
    except HTTPError as ex:
        raise HTTPException(
            status_code=502, detail=f"Ollama request failed with {ex.code}."
        ) from ex
    except URLError as ex:
        raise HTTPException(
            status_code=503, detail=f"Ollama is unreachable: {ex.reason}."
        ) from ex
    except OSError as ex:
        # Read timeouts and connection resets while receiving the body are
        # plain OSError subclasses, not URLError.
        raise HTTPException(
            status_code=503, detail=f"Ollama is unreachable: {ex}."
        ) from ex

    try:
        body = json.loads(raw_body.decode("utf-8"))
    except ValueError as ex:
        # Covers both UnicodeDecodeError and json.JSONDecodeError.
        raise HTTPException(
            status_code=502, detail="Ollama returned an invalid response."
        ) from ex

    text = body.get("response") if isinstance(body, dict) else None
    raw = text.strip() if isinstance(text, str) else ""
    if not raw:
        raise HTTPException(status_code=502, detail="Ollama returned an empty response.")

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    # The model may wrap the object in extra text; try the outermost {...}.
    start = raw.find("{")
    end = raw.rfind("}")
    if start >= 0 and end > start:
        try:
            return json.loads(raw[start:end + 1])
        except json.JSONDecodeError:
            pass

    raise HTTPException(status_code=502, detail="Ollama did not return valid JSON.")
|
||||
|
||||
|
||||
@app.post("/cv/classify-block")
async def classify_cv_block(req: CvClassifyBlockRequest):
    """Classify one CV text block into a section plus structured fields.

    Builds a prompt around ``req.block``, sends it to Ollama through
    ``_ollama_generate_json`` and returns a dict with the keys ``section``,
    ``confidence``, ``reason``, ``title``, ``company``, ``location``,
    ``start``, ``end`` and ``bullets``.

    ``section`` is always one of the values listed in the prompt (anything
    else becomes ``"Other"``) and ``bullets`` is always a list of non-empty
    strings. The remaining fields are passed through as the model returned
    them.

    Raises:
        HTTPException: whatever ``_ollama_generate_json`` raises, or 502 when
            the model's JSON is not an object.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    prompt = f"""
You classify one CV text block into structured JSON.
Return ONLY valid JSON with this exact shape:
{{
"section": "Contact|Professional Summary|Work Experience|Education|Skills|Languages|Interests|Other",
"confidence": 0.0,
"reason": "short reason",
"title": string|null,
"company": string|null,
"location": string|null,
"start": string|null,
"end": string|null,
"bullets": string[]
}}

Rules:
- Preserve facts only.
- section must be one of the listed values.
- Use Work Experience only for job/employment blocks.
- For Contact blocks, keep title/company/start/end null and bullets empty.
- For non-work blocks, title/company/start/end should usually be null.
- location must look like a place, not a sentence.
- dates must be one of: year, month+year, dd/mm/yyyy, Present, Current.
- bullets should only be job tasks/achievements, not titles, companies, dates, or headings.
- If unsure, choose Other and keep fields null/empty.

Block:
{req.block.strip()}
""".strip()

    # _ollama_generate_json does blocking network I/O (up to 30 s); run it in
    # the default executor so the event loop keeps serving other requests.
    loop = asyncio.get_running_loop()
    parsed = await loop.run_in_executor(None, _ollama_generate_json, prompt)

    if not isinstance(parsed, dict):
        # Valid JSON, but e.g. a list or a number: the .get calls below would fail.
        raise HTTPException(status_code=502, detail="Ollama did not return a JSON object.")

    # Canonical section names, keyed case-insensitively so "work experience"
    # still maps to the documented spelling.
    sections = {
        name.casefold(): name
        for name in (
            "Contact",
            "Professional Summary",
            "Work Experience",
            "Education",
            "Skills",
            "Languages",
            "Interests",
            "Other",
        )
    }
    section = parsed.get("section")
    if isinstance(section, str):
        section = sections.get(section.strip().casefold(), "Other")
    else:
        section = "Other"

    bullets = parsed.get("bullets")
    if isinstance(bullets, str):
        # A single bullet returned as a bare string.
        bullets = [bullets]
    if not isinstance(bullets, list):
        bullets = []
    bullets = [
        item.strip()
        for item in bullets
        if isinstance(item, str) and item.strip()
    ]

    return {
        "section": section,
        "confidence": parsed.get("confidence"),
        "reason": parsed.get("reason"),
        "title": parsed.get("title"),
        "company": parsed.get("company"),
        "location": parsed.get("location"),
        "start": parsed.get("start"),
        "end": parsed.get("end"),
        "bullets": bullets,
    }
|
||||
|
||||
|
||||
@app.post("/summarize")
|
||||
async def summarize(req: SummarizeRequest):
|
||||
if req.min_length >= req.max_length:
|
||||
|
||||
Reference in New Issue
Block a user