First Commit

2026-03-21 11:55:27 +01:00
commit 2e8a29b4d0
1757 changed files with 166084 additions and 0 deletions
@@ -0,0 +1,218 @@
+from fastapi import FastAPI
+from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from cachetools import TTLCache
+import hashlib
+import re
+import torch
+
+# ASGI application object; the route decorators further down register
+# their handlers on it.
+app = FastAPI(title="Local Summarizer")
+
+# Checkpoint name passed to both from_pretrained() calls below and echoed by
+# the /health endpoint.
+MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
+
+# Tokenizer and model are loaded once, at import time, and shared by every
+# request handled by this process.
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+# Evaluation mode: this file only runs inference, never training.
+model.eval()
+# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+# Finished summaries, keyed by the string _key() builds; holds at most 1024
+# entries and drops each one after the TTL below.
+cache = TTLCache(maxsize=1024, ttl=60 * 60)  # 1 hour cache
+
+
+# Request body of POST /summarize.
+class SummarizeRequest(BaseModel):
+    # Job-post text to summarize; it is passed through _strip_html() before
+    # any further processing, so pasted HTML is accepted.
+    text: str
+    # Forwarded unchanged to model.generate() as max_length.
+    max_length: int = 160
+    # Forwarded unchanged to model.generate() as min_length.
+    min_length: int = 45
+
+
+def _key(text: str, max_length: int, min_length: int) -> str:
+    """Build the cache key for one summarize request.
+
+    The key is the SHA-256 hex digest of the UTF-8 encoded text, followed by
+    the two length limits, all joined with colons. Hashing keeps the key
+    short no matter how long the submitted text is.
+    """
+    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
+    return ":".join((digest, str(max_length), str(min_length)))
+
+
+@app.get("/health")
+async def health():
+    """Liveness probe: report that the service is up and which model it serves."""
+    status = {"ok": True, "model": MODEL_NAME}
+    return status
+
+
+# Lower-case technology keywords looked up in the cleaned job text.
+# Matches are reported in list order, so the order here is part of the output.
+_TECH = [
+    # languages, runtimes and frameworks
+    "python", "c#", "dotnet", ".net", "java", "javascript", "typescript",
+    "react", "node",
+    # databases and stores
+    "sql", "postgres", "postgresql", "mysql", "sqlite", "mongodb", "redis",
+    # cloud and infrastructure
+    "aws", "azure", "gcp", "docker", "kubernetes", "terraform", "linux",
+    # tooling and API styles
+    "git", "ci/cd", "graphql", "rest",
+]
+
+
+def _strip_html(text: str) -> str:
+    """Turn pasted HTML into plain text while keeping its line structure.
+
+    Line breaks are produced for ``<br>`` and for closing ``</p>``,
+    ``</div>`` and ``</h1>``..``</h6>`` tags, and every ``<li>`` starts a new
+    ``- `` line, so headings and list items from an HTML job post end up on
+    lines of their own. All remaining tags become a single space, HTML
+    entities are decoded, and runs of three or more newlines are collapsed
+    to one blank line.
+
+    Args:
+        text: Raw text, with or without HTML markup.
+
+    Returns:
+        The cleaned text with leading and trailing whitespace removed.
+    """
+    import html  # local import: only this helper needs entity decoding
+
+    text = re.sub(r"<\s*br\s*/?>", "\n", text, flags=re.IGNORECASE)
+    # Give each list item its own "- " line; otherwise HTML lists collapse
+    # into one long line and no bullet can be recognised in them.
+    text = re.sub(r"<li\b[^>]*>", "\n- ", text, flags=re.IGNORECASE)
+    # Closing block-level tags end the current line.
+    text = re.sub(r"</(?:p|div|h[1-6])\s*>", "\n", text, flags=re.IGNORECASE)
+    text = re.sub(r"<[^>]+>", " ", text)
+    # Decode entities only after the tags are gone, so escaped markup such
+    # as "&lt;b&gt;" survives as literal text instead of being stripped.
+    # Non-breaking spaces are normalised to plain spaces.
+    text = html.unescape(text).replace("\u00a0", " ")
+    return re.sub(r"\n{3,}", "\n\n", text).strip()
+
+
+def _extract_bullets(lines, max_items=8):
+    """Collect the text of bullet lines, without their markers.
+
+    A bullet line is one that, after stripping, starts with ``-``, ``*`` or
+    the bullet character U+2022 followed by whitespace. The marker is
+    removed and the remaining text is kept when it is between 3 and 200
+    characters long.
+
+    Args:
+        lines: Iterable of text lines.
+        max_items: Maximum number of bullets to return; zero or a negative
+            value yields an empty list.
+
+    Returns:
+        List of bullet texts in input order, at most ``max_items`` long.
+    """
+    out = []
+    # The loop only checks the limit after appending, so a non-positive
+    # limit has to be rejected before any line is looked at.
+    if max_items <= 0:
+        return out
+    for ln in lines:
+        s = ln.strip()
+        marker = re.match(r"^(?:[-*]|\u2022)\s+", s)
+        if marker is None:
+            continue
+        # Reuse the match instead of running the same pattern a second time.
+        item = s[marker.end():].strip()
+        if 3 <= len(item) <= 200:
+            out.append(item)
+            if len(out) >= max_items:
+                break
+    return out
+
+
+def _role_focused_excerpt(text: str) -> dict:
+    """Split a job post into role sections and build a condensed model input.
+
+    The text is cleaned with _strip_html() and scanned line by line. A
+    non-bullet line that equals, or starts with, one of the known heading
+    phrases opens that section; every other non-empty line is filed under
+    the section currently open (lines before the first heading are ignored).
+    Bullets are then pulled from each section with _extract_bullets(). When
+    neither responsibilities nor requirements produced a bullet, the first
+    ten bullets found anywhere are used instead: six as responsibilities,
+    the rest as requirements.
+
+    Technology keywords from _TECH are matched as whole tokens, so "java" is
+    not reported for "javascript" and "git" is not reported for "digital".
+    Compound spellings such as "nodejs" or "restful" therefore do not count
+    as "node" / "rest"; add them to _TECH if they should be reported.
+
+    Args:
+        text: Raw job-post text, with or without HTML markup.
+
+    Returns:
+        dict with the keys
+        "cleaned" (tag-free text),
+        "focused_input" (section bullets followed by the first 1500
+        characters of the cleaned text),
+        "responsibilities", "requirements", "nice" (lists of bullet texts)
+        and "tech" (matched _TECH entries, in _TECH order).
+    """
+    cleaned = _strip_html(text)
+    lines = [ln.strip() for ln in cleaned.splitlines()]
+
+    headings = {
+        "responsibilities": ["responsibilities", "what you will do", "what you'll do", "the role", "your role", "you will"],
+        "requirements": ["requirements", "what we are looking for", "what we're looking for", "skills", "experience", "must have"],
+        "nice": ["nice to have", "bonus", "preferred"],
+    }
+
+    def match_heading(s: str):
+        # A bullet is list content, never a heading. Without this guard the
+        # strip() below removes the "- " marker, so a bullet such as
+        # "- Experience with SQL" would match the "experience" heading and
+        # be swallowed instead of being collected.
+        if re.match(r"^(?:[-*]|\u2022)\s+", s):
+            return None
+        sl = s.lower().strip(":- ")
+        for k, words in headings.items():
+            for w in words:
+                if sl == w or sl.startswith(w + " "):
+                    return k
+        return None
+
+    low = cleaned.lower()
+
+    def mentions(term: str) -> bool:
+        # Require a token boundary on each side that starts/ends with a
+        # letter or digit. Terms like ".net" or "c#" begin or end with
+        # punctuation, so that side is left unanchored ("asp.net" matches).
+        left = r"(?<!\w)" if term[:1].isalnum() else ""
+        right = r"(?!\w)" if term[-1:].isalnum() else ""
+        return re.search(left + re.escape(term) + right, low) is not None
+
+    section = None
+    buckets = {"responsibilities": [], "requirements": [], "nice": []}
+
+    for ln in lines:
+        if not ln:
+            continue
+        h = match_heading(ln)
+        if h:
+            section = h
+            continue
+        if section is not None:
+            buckets[section].append(ln)
+
+    responsibilities = _extract_bullets(buckets["responsibilities"], max_items=7)
+    requirements = _extract_bullets(buckets["requirements"], max_items=7)
+    nice = _extract_bullets(buckets["nice"], max_items=5)
+
+    tech_found = [t for t in _TECH if mentions(t)]
+
+    # Fallback: pick bullet-like lines anywhere if sections are missing.
+    if not responsibilities and not requirements:
+        any_bullets = _extract_bullets(lines, max_items=10)
+        responsibilities = any_bullets[:6]
+        requirements = any_bullets[6:10]
+
+    focused_parts = []
+    if responsibilities:
+        focused_parts.append("Responsibilities:\n- " + "\n- ".join(responsibilities))
+    if requirements:
+        focused_parts.append("Requirements:\n- " + "\n- ".join(requirements))
+    if nice:
+        focused_parts.append("Nice to have:\n- " + "\n- ".join(nice))
+
+    # Always include a small slice of the original for context.
+    focused_parts.append("Context:\n" + cleaned[:1500])
+
+    return {
+        "cleaned": cleaned,
+        "focused_input": "\n\n".join(focused_parts),
+        "responsibilities": responsibilities,
+        "requirements": requirements,
+        "nice": nice,
+        "tech": tech_found,
+    }
+
+
+def _model_summarize(text: str, max_length: int, min_length: int) -> str:
+    """Generate a summary of *text* with the module-level model.
+
+    The text is tokenized with truncation at 1024 tokens, moved to the
+    configured device and decoded from the first sequence that 4-beam
+    generation returns. Special tokens are dropped and surrounding
+    whitespace is stripped from the result.
+
+    Args:
+        text: Text to summarize.
+        max_length: Passed to ``model.generate`` as ``max_length``.
+        min_length: Passed to ``model.generate`` as ``min_length``.
+
+    Returns:
+        The decoded summary string.
+    """
+    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
+    token_ids = encoded.input_ids.to(device)
+
+    # The mask is optional: pass None through when the tokenizer gave none.
+    mask = getattr(encoded, "attention_mask", None)
+    if mask is not None:
+        mask = mask.to(device)
+
+    generation_args = {
+        "attention_mask": mask,
+        "max_length": max_length,
+        "min_length": min_length,
+        "num_beams": 4,
+        "early_stopping": True,
+    }
+    # Inference only, so gradient tracking is switched off.
+    with torch.no_grad():
+        generated = model.generate(token_ids, **generation_args)
+
+    best = generated[0]
+    return tokenizer.decode(best, skip_special_tokens=True).strip()
+
+
+@app.post("/summarize")
+async def summarize(req: SummarizeRequest):
+    """Summarize a job post and list its key points.
+
+    The response text starts with a model-written role summary and is
+    followed, when available, by up to six responsibilities, up to six
+    requirements and up to fourteen technology keywords. Results are cached
+    per (text, max_length, min_length).
+
+    Args:
+        req: Request body with the text and the generation length limits.
+
+    Returns:
+        ``{"summary": <text>, "cached": <bool>}``. The summary is an empty
+        string when the input contains no text after HTML stripping.
+
+    NOTE(review): generation runs synchronously inside this coroutine, so
+    the event loop handles no other request until it finishes.
+    """
+    key = _key(req.text, req.max_length, req.min_length)
+    # EAFP: a TTL entry can expire between a membership test and the read,
+    # so read once and treat KeyError as a miss.
+    try:
+        hit = cache[key]
+    except KeyError:
+        pass
+    else:
+        return {"summary": hit, "cached": True}
+
+    info = _role_focused_excerpt(req.text)
+
+    # Nothing left after cleaning (empty, whitespace-only or tags-only
+    # input): there is nothing to summarize, so skip the model entirely.
+    if not info["cleaned"]:
+        return {"summary": "", "cached": False}
+
+    # Summarize the role-focused excerpt instead of the whole job post.
+    summary = _model_summarize(info["focused_input"], req.max_length, req.min_length)
+
+    lines = ["Role summary:", summary]
+
+    if info["responsibilities"]:
+        lines.append("")
+        lines.append("Key responsibilities:")
+        lines.extend(f"- {x}" for x in info["responsibilities"][:6])
+
+    if info["requirements"]:
+        lines.append("")
+        lines.append("Key requirements:")
+        lines.extend(f"- {x}" for x in info["requirements"][:6])
+
+    if info["tech"]:
+        # Keep this short; it's just a hint based on keyword matches.
+        # dict.fromkeys drops duplicates while preserving first-seen order.
+        uniq = list(dict.fromkeys(info["tech"]))
+        lines.append("")
+        lines.append("Tech keywords: " + ", ".join(uniq[:14]))
+
+    out = "\n".join(lines).strip()
+    cache[key] = out
+    return {"summary": out, "cached": False}