feat: improve admin observability and translation-first summaries

2026-03-22 21:37:30 +01:00
parent 8014c1e890
commit 4c49ffb0d6
8 changed files with 585 additions and 261 deletions
@@ -10,12 +10,21 @@ app = FastAPI(title="Local Summarizer")

 MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
 MAX_INPUT_CHARS = 20000
+# Cap on how many characters of the cleaned text are appended as the
+# "Context:" part of the focused excerpt built in _role_focused_excerpt.
+MAX_CONTEXT_CHARS = 2200

-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
-model.eval()
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
+
+def _load_runtime():
+    """Load the tokenizer and model for MODEL_NAME and move the model to a device.
+
+    Returns:
+        A 5-tuple ``(tokenizer, model, device, has_cuda, gpu_name)``.
+        ``device`` is the CUDA device when ``torch.cuda.is_available()`` is
+        true and the CPU device otherwise; ``gpu_name`` is ``None`` when CUDA
+        is not available.
+    """
+    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+    net = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+    # Put the model in evaluation mode before it is used for generation.
+    net.eval()
+
+    cuda_ok = torch.cuda.is_available()
+    if cuda_ok:
+        target = torch.device("cuda")
+    else:
+        target = torch.device("cpu")
+    net.to(target)
+
+    # The device name is only queried when a CUDA device exists.
+    name = None
+    if cuda_ok:
+        name = torch.cuda.get_device_name(0)
+    return tok, net, target, cuda_ok, name
+
+
+# Load the tokenizer and model once, at module import; GPU_AVAILABLE and
+# GPU_NAME are the values reported by the /health endpoint.
+tokenizer, model, device, GPU_AVAILABLE, GPU_NAME = _load_runtime()
 cache = TTLCache(maxsize=1024, ttl=60 * 60)


@@ -33,7 +42,13 @@ def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:

@app.get("/health")
 async def health():
-    return {"ok": True, "model": MODEL_NAME}
+    """Report liveness plus the model name, the device in use and GPU details."""
+    return {
+        "ok": True,
+        "model": MODEL_NAME,
+        "device": str(device),
+        "gpu_available": GPU_AVAILABLE,
+        "gpu_name": GPU_NAME,
+    }


 _TECH = [
@@ -54,6 +69,17 @@ _TECH_PRIORITY = [
 ]


+# Lower-case phrases matched as substrings of lower-cased text.
+# Used by _role_focused_excerpt as a fallback to pick "requirements"
+# sentences when no requirements were extracted.
+_MUST_HAVE_HINTS = [
+    "must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
+]
+# Fallback phrases for picking "nice to have" sentences when none were extracted.
+_NICE_TO_HAVE_HINTS = [
+    "nice to have", "bonus", "preferred", "advantageous", "extra plus",
+]
+# Phrases that mark a requirement (or, as a fallback, a sentence) as worth
+# listing under the interview/screening focus.
+_SCREENING_HINTS = [
+    "experience with", "hands-on", "demonstrated", "proven", "track record", "delivered",
+]
+
+
 def _rank_tech_skills(skills):
    ordered = []
    seen = set()
@@ -84,7 +110,7 @@ def _extract_bullets(lines, max_items=8):
            continue
        if re.match(r"^([-*]|\u2022)\s+", s):
            s = re.sub(r"^([-*]|\u2022)\s+", "", s).strip()
-            if 3 <= len(s) <= 200:
+            if 3 <= len(s) <= 220:
                out.append(s)
        if len(out) >= max_items:
            break
@@ -96,6 +122,7 @@ def _top_keywords(text: str, limit=6):
    stop = {
        "with", "from", "that", "this", "will", "have", "your", "their", "about", "role", "team", "work",
        "experience", "skills", "requirements", "responsibilities", "company", "using", "ability", "years",
+        "looking", "candidate", "position", "working", "across", "strong", "building", "support",
    }
    counts = {}
    for word in words:
@@ -106,6 +133,27 @@ def _top_keywords(text: str, limit=6):
    return [word for word, _ in ordered[:limit]]


+def _first_matching_sentences(text: str, hints, limit=3):
+    """Return the first sentences of ``text`` that contain any of ``hints``.
+
+    The text is split after ``.``, ``!`` or ``?`` followed by whitespace.
+    Matching is a case-insensitive substring test. A matching sentence is
+    kept only if its stripped length is between 20 and 220 characters.
+
+    Args:
+        text: Text to scan.
+        hints: Iterable of phrases to look for (any case; may be one-shot).
+        limit: Maximum number of sentences to return; values <= 0 yield ``[]``.
+
+    Returns:
+        A list of at most ``limit`` stripped sentences, in document order.
+    """
+    # Without this guard a non-positive limit could still return one sentence,
+    # because the limit was only checked after appending.
+    if limit <= 0:
+        return []
+    # Materialise and lower-case once: hints are compared against lower-cased
+    # sentences, and a generator would be exhausted after the first sentence.
+    lowered_hints = [hint.lower() for hint in hints]
+    found = []
+    for sentence in re.split(r"(?<=[.!?])\s+", text):
+        low = sentence.lower()
+        if not any(hint in low for hint in lowered_hints):
+            continue
+        cleaned = sentence.strip()
+        if 20 <= len(cleaned) <= 220:
+            found.append(cleaned)
+            if len(found) >= limit:
+                break
+    return found
+
+
+def _trim_line(text: str, max_len: int = 140) -> str:
+    """Collapse whitespace, strip bullet residue and cap the length of a line.
+
+    Runs of whitespace become a single space, and leading/trailing spaces,
+    hyphens, bullet characters and tabs are removed. If the result is longer
+    than ``max_len`` it is cut to ``max_len - 1`` characters, right-stripped,
+    and an ellipsis character is appended.
+
+    Args:
+        text: Line to normalise.
+        max_len: Maximum length of the returned string; values <= 0 yield ``""``.
+
+    Returns:
+        The normalised line, never longer than ``max_len`` characters.
+    """
+    # A non-positive max_len would make the slice below count from the end
+    # of the string and return something longer than requested.
+    if max_len <= 0:
+        return ""
+    text = re.sub(r"\s+", " ", text).strip(" -•\t")
+    if len(text) <= max_len:
+        return text
+    return text[: max_len - 1].rstrip() + "…"
+
+
 def _role_focused_excerpt(text: str) -> dict:
    cleaned = _strip_html(text)
    lines = [ln.strip() for ln in cleaned.splitlines()]
@@ -162,6 +210,11 @@ def _role_focused_excerpt(text: str) -> dict:
        responsibilities = any_bullets[:6]
        requirements = any_bullets[6:10]

+    if not requirements:
+        requirements = [_trim_line(x) for x in _first_matching_sentences(cleaned, _MUST_HAVE_HINTS, limit=4)]
+    if not nice:
+        nice = [_trim_line(x) for x in _first_matching_sentences(cleaned, _NICE_TO_HAVE_HINTS, limit=3)]
+
    focused_parts = []
    if responsibilities:
        focused_parts.append("Responsibilities:\n- " + "\n- ".join(responsibilities))
@@ -169,7 +222,14 @@ def _role_focused_excerpt(text: str) -> dict:
        focused_parts.append("Requirements:\n- " + "\n- ".join(requirements))
    if nice:
        focused_parts.append("Nice to have:\n- " + "\n- ".join(nice))
-    focused_parts.append("Context:\n" + cleaned[:1500])
+    focused_parts.append("Context:\n" + cleaned[:MAX_CONTEXT_CHARS])
+
+    screen_focus = []
+    for item in requirements[:4]:
+        if any(hint in item.lower() for hint in _SCREENING_HINTS) or len(screen_focus) < 2:
+            screen_focus.append(_trim_line(item))
+    if not screen_focus:
+        screen_focus = [_trim_line(x) for x in _first_matching_sentences(cleaned, _SCREENING_HINTS, limit=3)]

    return {
        "cleaned": cleaned,
@@ -180,6 +240,7 @@ def _role_focused_excerpt(text: str) -> dict:
        "tech": tech_found,
        "soft": soft_found,
        "keywords": _top_keywords(cleaned),
+        "screen_focus": screen_focus[:3],
    }


@@ -193,7 +254,9 @@ def _model_summarize(text: str, max_length: int, min_length: int) -> str:
            attention_mask=attention_mask,
            max_length=max_length,
            min_length=min_length,
-            num_beams=4,
+            num_beams=3,
+            length_penalty=1.0,
+            no_repeat_ngram_size=3,
            early_stopping=True,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
@@ -211,53 +274,64 @@ async def summarize(req: SummarizeRequest):
    info = _role_focused_excerpt(req.text)
    summary = _model_summarize(info["focused_input"], req.max_length, req.min_length)

+    ranked_tech = []
+    for t in _rank_tech_skills(info["tech"]):
+        if t not in ranked_tech:
+            ranked_tech.append(t)
+
+    uniq_soft = []
+    for s in info["soft"]:
+        if s not in uniq_soft:
+            uniq_soft.append(s)
+
    lines = ["Role summary:", summary]

    if info["requirements"]:
        lines.append("")
        lines.append("What the company wants most:")
-        for x in info["requirements"][:7]:
-            lines.append(f"- {x}")
+        for x in info["requirements"][:5]:
+            lines.append(f"- {_trim_line(x)}")
+
+    if ranked_tech:
+        lines.append("")
+        lines.append("Top hard skills:")
+        for skill in ranked_tech[: req.top_skills]:
+            lines.append(f"- {skill}")
+
+    if info["keywords"]:
+        lines.append("")
+        lines.append("Keywords to mirror:")
+        for keyword in info["keywords"][:5]:
+            lines.append(f"- {keyword}")

    if info["responsibilities"]:
        lines.append("")
        lines.append("What you would be doing:")
-        for x in info["responsibilities"][:6]:
-            lines.append(f"- {x}")
+        for x in info["responsibilities"][:4]:
+            lines.append(f"- {_trim_line(x)}")

    if info["nice"]:
        lines.append("")
        lines.append("Nice to have:")
-        for x in info["nice"][:5]:
-            lines.append(f"- {x}")
+        for x in info["nice"][:3]:
+            lines.append(f"- {_trim_line(x)}")

-    if info["tech"]:
-        uniq = []
-        for t in _rank_tech_skills(info["tech"]):
-            if t not in uniq:
-                uniq.append(t)
+    if uniq_soft:
        lines.append("")
-        lines.append("Top hard skills: " + ", ".join(uniq[: req.top_skills]))
-
-    if info["soft"]:
-        uniq_soft = []
-        for s in info["soft"]:
-            if s not in uniq_soft:
-                uniq_soft.append(s)
-        lines.append("")
-        lines.append("Relevant soft skills: " + ", ".join(uniq_soft[:8]))
-
-    if info["keywords"]:
-        lines.append("")
-        lines.append("Key themes: " + ", ".join(info["keywords"][:6]))
+        lines.append("Relevant soft skills:")
+        for soft in uniq_soft[:5]:
+            lines.append(f"- {soft}")

    lines.append("")
    lines.append("Interview focus:")
-    if info["requirements"]:
+    if info["screen_focus"]:
+        for x in info["screen_focus"]:
+            lines.append(f"- Be ready to prove: {_trim_line(x)}")
+    elif info["requirements"]:
        for x in info["requirements"][:3]:
-            lines.append(f"- Prepare examples that demonstrate: {x}")
-    elif info["tech"]:
-        for x in _rank_tech_skills(info["tech"])[:3]:
+            lines.append(f"- Prepare examples that demonstrate: {_trim_line(x)}")
+    elif ranked_tech:
+        for x in ranked_tech[:3]:
            lines.append(f"- Be ready to explain your hands-on experience with {x}")
    else:
        lines.append("- Prepare examples showing relevant impact, collaboration, and delivery.")