# jobtrackingapp/tools/summarizer/app.py

import hashlib
import html
import re

import torch
from cachetools import TTLCache
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# FastAPI application object; the route decorators in this module register on it.
app = FastAPI(title="Local Summarizer")

# Checkpoint name passed to both from_pretrained() calls below and echoed by /health.
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
# Maximum accepted length of SummarizeRequest.text, in characters.
MAX_INPUT_CHARS = 20000

# The local summarizer is intentionally simple, but we still validate request sizes
# so accidental giant pastes do not cause avoidable latency or memory spikes.
# Tokenizer and model are loaded once, at import time, and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Switch the model to evaluation mode; it is only used for generation here.
model.eval()
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Finished summaries keyed by _key(); at most 1024 entries, each kept for one hour.
cache = TTLCache(maxsize=1024, ttl=60 * 60)  # 1 hour cache


class SummarizeRequest(BaseModel):
    """Request body accepted by POST /summarize."""

    # Job-post text to summarize; non-empty and at most MAX_INPUT_CHARS characters.
    text: str = Field(min_length=1, max_length=MAX_INPUT_CHARS)
    # Passed through to model.generate() as max_length; allowed range 24-256.
    max_length: int = Field(default=160, ge=24, le=256)
    # Passed through to model.generate() as min_length; allowed range 8-180.
    # The endpoint additionally rejects requests where min_length >= max_length.
    min_length: int = Field(default=45, ge=8, le=180)


def _key(text: str, max_length: int, min_length: int) -> str:
    """Build the cache key for one summarization request.

    The key is the SHA-256 hex digest of the UTF-8 encoded text followed by
    the two length limits, joined with colons, so the same text requested
    with different limits is cached separately.
    """
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    parts = (digest.hexdigest(), str(max_length), str(min_length))
    return ":".join(parts)


@app.get("/health")
async def health():
    """Liveness probe: report that the service is up and which model it serves."""
    status = {"ok": True}
    status["model"] = MODEL_NAME
    return status


# Hard-skill keywords looked up in the cleaned job text. Entries are lowercase
# because the text is lowercased before the lookup; list order is the order in
# which detected skills are reported.
_TECH = [
    "python",
    "c#",
    "dotnet",
    ".net",
    "java",
    "javascript",
    "typescript",
    "react",
    "node",
    "sql",
    "postgres",
    "postgresql",
    "mysql",
    "sqlite",
    "mongodb",
    "redis",
    "aws",
    "azure",
    "gcp",
    "docker",
    "kubernetes",
    "terraform",
    "linux",
    "git",
    "ci/cd",
    "graphql",
    "rest",
]

# Soft-skill phrases looked up in the cleaned job text. Entries are lowercase
# because the text is lowercased before the lookup; multi-word phrases are
# written with a single space between words.
_SOFT = [
    "communication",
    "collaboration",
    "teamwork",
    "problem solving",
    "leadership",
    "mentoring",
    "ownership",
    "initiative",
    "adaptability",
    "stakeholder management",
    "detail oriented",
]


def _strip_html(text: str) -> str:
    """Reduce pasted HTML to plain text while keeping its line structure.

    This is a best-effort cleaner for job descriptions pasted from the web,
    not a full HTML parser.

    * ``<br>`` and the closing tags of ``p``, ``div``, ``li`` and ``h1``-``h6``
      become newlines.
    * Every opening ``<li>`` starts a new ``"- "`` line, so HTML lists turn
      into dash bullets.
    * Any other tag is replaced by a single space.
    * HTML entities (``&amp;``, ``&nbsp;``, ``&#39;`` ...) are decoded.
    * Runs of three or more newlines collapse to one blank line, and the
      result is stripped.

    Args:
        text: Raw text that may contain HTML markup.

    Returns:
        The cleaned plain-text version of ``text``.
    """
    text = re.sub(r"<\s*br\s*/?>", "\n", text, flags=re.IGNORECASE)
    # Turn list items into dash bullets on their own line.
    text = re.sub(r"<\s*li(\s[^>]*)?>", "\n- ", text, flags=re.IGNORECASE)
    # Block-level closers end the current line so neighbouring blocks do not run together.
    text = re.sub(r"<\s*/\s*(p|div|li|h[1-6])\s*>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after the tags are gone, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being stripped as a tag.
    text = html.unescape(text)
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def _extract_bullets(lines, max_items=8):
    """Collect the text of bullet-style lines.

    A bullet is a line that, once stripped, starts with "-", "*" or the
    U+2022 bullet character followed by whitespace. The marker is removed and
    the remaining text is kept when it is between 3 and 200 characters long.
    Blank lines are skipped, and scanning stops as soon as ``max_items``
    entries have been gathered.

    Args:
        lines: Iterable of text lines.
        max_items: Number of collected bullets after which scanning stops.

    Returns:
        A list with the bullet texts in their original order.
    """
    marker = re.compile(r"^([-*]|\u2022)\s+")
    collected = []
    for raw in lines:
        candidate = raw.strip()
        if candidate == "":
            continue
        hit = marker.match(candidate)
        if hit is not None:
            # Drop the marker and its trailing whitespace, keep the rest.
            item = candidate[hit.end():].strip()
            if 3 <= len(item) <= 200:
                collected.append(item)
        if len(collected) >= max_items:
            break
    return collected


# A line that starts with a bullet marker ("-", "*" or U+2022) followed by whitespace.
_BULLET_LINE = re.compile(r"^([-*]|\u2022)\s+")


def _mentions_term(haystack: str, term: str) -> bool:
    """Return True when ``term`` occurs in ``haystack`` as a standalone token.

    A word boundary is required on each side of the term that starts or ends
    with an alphanumeric character, so "java" is not reported for
    "javascript" and "sql" is not reported for "mysql". Terms that begin or
    end with punctuation (".net", "c#") get no boundary on that side, which
    keeps "asp.net" matching ".net".
    """
    left = r"(?<!\w)" if term[:1].isalnum() else ""
    right = r"(?!\w)" if term[-1:].isalnum() else ""
    return re.search(left + re.escape(term) + right, haystack) is not None


def _role_focused_excerpt(text: str) -> dict:
    """Pull the role-relevant parts out of a job post.

    The text is cleaned with ``_strip_html`` and scanned line by line.
    Heading lines (for example "Responsibilities" or "Nice to have") switch
    the current section, and every other non-empty line is filed under the
    section it appears in. Bullets are then extracted per section. When
    neither responsibilities nor requirements were found, bullets from
    anywhere in the text are used instead: the first six as responsibilities
    and the next four as requirements.

    Bullet lines are never interpreted as headings. Otherwise an item such as
    "- Experience with AWS" would match the "experience" heading keyword and
    be dropped from its section.

    Skills from ``_TECH`` and ``_SOFT`` are detected as standalone tokens in
    the lowercased text (see ``_mentions_term``). Derived word forms such as
    "restful" or "dockerized" are therefore not counted as "rest" or "docker".

    Args:
        text: Raw job-post text, possibly containing HTML.

    Returns:
        A dict with these keys:

        * ``cleaned``: the HTML-stripped text.
        * ``focused_input``: the extracted sections plus the first 1500
          characters of the cleaned text, intended as model input.
        * ``responsibilities``, ``requirements``, ``nice``: lists of bullets.
        * ``tech``, ``soft``: detected skills, in ``_TECH`` / ``_SOFT`` order.
    """
    cleaned = _strip_html(text)
    lines = [ln.strip() for ln in cleaned.splitlines()]

    headings = {
        "responsibilities": ["responsibilities", "what you will do", "what you'll do", "the role", "your role", "you will"],
        "requirements": ["requirements", "what we are looking for", "what we're looking for", "skills", "experience", "must have"],
        "nice": ["nice to have", "bonus", "preferred"],
    }

    def match_heading(s: str):
        # NOTE(review): "\x7f" (DEL) in the strip set looks unintended, possibly
        # a garbled dash character; confirm before changing it.
        sl = s.lower().strip(":-\x7f ")
        for k, words in headings.items():
            for w in words:
                if sl == w or sl.startswith(w + " "):
                    return k
        return None

    buckets = {"responsibilities": [], "requirements": [], "nice": []}
    section = None

    for ln in lines:
        if not ln:
            continue
        # Only non-bullet lines can be headings; bullets are always content.
        if not _BULLET_LINE.match(ln):
            h = match_heading(ln)
            if h:
                section = h
                continue
        # Lines that appear before the first heading belong to no section.
        if section is not None:
            buckets[section].append(ln)

    responsibilities = _extract_bullets(buckets["responsibilities"], max_items=7)
    requirements = _extract_bullets(buckets["requirements"], max_items=7)
    nice = _extract_bullets(buckets["nice"], max_items=5)

    low = cleaned.lower()
    tech_found = [t for t in _TECH if _mentions_term(low, t)]
    soft_found = [s for s in _SOFT if _mentions_term(low, s)]

    # Fallback: pick bullet-like lines anywhere if sections are missing.
    if not responsibilities and not requirements:
        any_bullets = _extract_bullets(lines, max_items=10)
        responsibilities = any_bullets[:6]
        requirements = any_bullets[6:10]

    focused_parts = []
    if responsibilities:
        focused_parts.append("Responsibilities:\n- " + "\n- ".join(responsibilities))
    if requirements:
        focused_parts.append("Requirements:\n- " + "\n- ".join(requirements))
    if nice:
        focused_parts.append("Nice to have:\n- " + "\n- ".join(nice))

    # Always include a small slice of the original for context.
    focused_parts.append("Context:\n" + cleaned[:1500])

    return {
        "cleaned": cleaned,
        "focused_input": "\n\n".join(focused_parts),
        "responsibilities": responsibilities,
        "requirements": requirements,
        "nice": nice,
        "tech": tech_found,
        "soft": soft_found,
    }


def _model_summarize(text: str, max_length: int, min_length: int) -> str:
    """Generate a summary of ``text`` with the module-level model.

    The text is tokenized with truncation to 1024 tokens, moved to the
    configured device, and decoded from the first sequence returned by a
    4-beam search run without gradient tracking.

    Args:
        text: Input text to summarize.
        max_length: Passed to ``model.generate`` as ``max_length``.
        min_length: Passed to ``model.generate`` as ``min_length``.

    Returns:
        The decoded summary with special tokens removed and surrounding
        whitespace stripped.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    token_ids = encoded.input_ids.to(device)

    # Forward the attention mask only when the tokenizer produced one.
    mask = None
    if hasattr(encoded, "attention_mask"):
        mask = encoded.attention_mask.to(device)

    generation_options = {
        "attention_mask": mask,
        "max_length": max_length,
        "min_length": min_length,
        "num_beams": 4,
        "early_stopping": True,
    }
    with torch.no_grad():
        generated = model.generate(token_ids, **generation_options)

    best_sequence = generated[0]
    decoded = tokenizer.decode(best_sequence, skip_special_tokens=True)
    return decoded.strip()


@app.post("/summarize")
async def summarize(req: SummarizeRequest):
    """Summarize a job post and append the extracted sections and skills.

    The response text starts with a model-generated "Role summary" of the
    role-focused excerpt. It is followed, when present, by the requirement,
    responsibility and nice-to-have bullets and by the detected hard and soft
    skills. Results are cached per (text, max_length, min_length).

    Args:
        req: Validated request body.

    Returns:
        ``{"summary": <text>, "cached": <bool>}``, where ``cached`` tells
        whether the text was served from the cache.

    Raises:
        HTTPException: 400 when ``min_length`` is not smaller than ``max_length``.
    """
    if req.min_length >= req.max_length:
        raise HTTPException(status_code=400, detail="min_length must be smaller than max_length.")

    key = _key(req.text, req.max_length, req.min_length)
    # Single lookup: a separate `in` test followed by indexing can raise
    # KeyError when the entry expires between the two operations.
    cached = cache.get(key)
    if cached is not None:
        return {"summary": cached, "cached": True}

    info = _role_focused_excerpt(req.text)

    # Summarize the role-focused excerpt instead of the whole job post.
    # NOTE(review): this is a synchronous model call inside an async handler,
    # so other requests wait while it runs; consider offloading to a worker thread.
    summary = _model_summarize(info["focused_input"], req.max_length, req.min_length)

    lines = ["Role summary:", summary]

    sections = (
        ("What they need from you:", info["requirements"][:7]),
        ("What you would be doing:", info["responsibilities"][:6]),
        ("Nice to have:", info["nice"][:5]),
    )
    for heading, items in sections:
        if not items:
            continue
        lines.append("")
        lines.append(heading)
        lines.extend(f"- {item}" for item in items)

    if info["tech"]:
        # NOTE(review): _rank_tech_skills is not defined in the visible part of
        # this module. Use it when it exists, otherwise keep the detection
        # order instead of failing the request with a NameError.
        ranker = globals().get("_rank_tech_skills")
        ranked = ranker(info["tech"]) if callable(ranker) else info["tech"]
        uniq = list(dict.fromkeys(ranked))  # order-preserving de-duplication
        lines.append("")
        lines.append("Top hard skills: " + ", ".join(uniq[:10]))

    if info["soft"]:
        uniq_soft = list(dict.fromkeys(info["soft"]))
        lines.append("")
        lines.append("Relevant soft skills: " + ", ".join(uniq_soft[:8]))

    out = "\n".join(lines).strip()
    cache[key] = out
    return {"summary": out, "cached": False}