# jobtrackingapp/tools/summarizer/app.py

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from cachetools import TTLCache
import hashlib
import re
import torch

# FastAPI application object; the route decorators further down register on it.
app = FastAPI(title="Local Summarizer")

# Model identifier handed to both from_pretrained() calls below and reported
# by the /health endpoint.
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
# Maximum number of characters accepted in SummarizeRequest.text.
MAX_INPUT_CHARS = 20000

# Tokenizer and model are loaded once, as a side effect of importing this
# module, and then shared by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Put the model in evaluation mode; it is only used for generation here.
model.eval()
# Run on CUDA when torch reports it as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Finished reports keyed by _key(...): at most 1024 entries, each with a TTL of
# 60 * 60 (one hour, assuming TTLCache's default seconds-based timer).
cache = TTLCache(maxsize=1024, ttl=60 * 60)


class SummarizeRequest(BaseModel):
    # Request body of POST /summarize. The per-field bounds are declared via
    # Field; the endpoint additionally rejects min_length >= max_length.

    # Job description to summarize (plain text or HTML), 1..MAX_INPUT_CHARS chars.
    text: str = Field(min_length=1, max_length=MAX_INPUT_CHARS)
    # Forwarded to model.generate(max_length=...) through _model_summarize.
    max_length: int = Field(default=160, ge=24, le=256)
    # Forwarded to model.generate(min_length=...) through _model_summarize.
    min_length: int = Field(default=45, ge=8, le=180)
    # Maximum number of entries shown on the "Top hard skills" line.
    top_skills: int = Field(default=8, ge=3, le=12)


def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
    """Return the cache key for one summarization request.

    The key is the SHA-256 hex digest of the UTF-8 encoded ``text`` followed
    by the three numeric options, all joined with ``:``. Hashing keeps the key
    short no matter how long the submitted text is.
    """
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return "{}:{}:{}:{}".format(digest, max_length, min_length, top_skills)


@app.get("/health")
async def health():
    # Liveness probe: always reports ok together with the configured model name.
    payload = {"ok": True, "model": MODEL_NAME}
    return payload


# Technology terms (lowercase) that _role_focused_excerpt looks for in the
# cleaned, lowercased description. _top_keywords also skips these words so they
# are not repeated as generic keywords.
_TECH = [
    "python", "c#", "dotnet", ".net", "java", "javascript", "typescript", "react", "node", "sql",
    "postgres", "postgresql", "mysql", "sqlite", "mongodb", "redis", "aws", "azure", "gcp",
    "docker", "kubernetes", "terraform", "linux", "git", "ci/cd", "graphql", "rest",
]

# Soft-skill phrases (lowercase), used the same way as _TECH.
_SOFT = [
    "communication", "collaboration", "teamwork", "problem solving", "leadership", "mentoring",
    "ownership", "initiative", "adaptability", "stakeholder management", "detail oriented",
]

# Display order used by _rank_tech_skills: skills listed here come first, in
# this order. Terms from _TECH that are absent here (e.g. "java", "redis")
# keep their detection order and are placed after the prioritized ones.
_TECH_PRIORITY = [
    "python", "c#", ".net", "dotnet", "typescript", "javascript", "react", "node",
    "sql", "postgresql", "postgres", "mysql", "sqlite", "docker", "kubernetes",
    "aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
]


def _rank_tech_skills(skills):
    """Order ``skills`` by ``_TECH_PRIORITY`` and drop duplicates.

    Skills that appear in ``_TECH_PRIORITY`` come first, in priority order.
    Every remaining skill follows in the order it was given. Each skill is
    returned at most once.
    """
    ranked = []
    placed = set()

    # Pass 1: walk the priority table and pick up the skills that are present.
    for preferred in _TECH_PRIORITY:
        if preferred not in placed and preferred in skills:
            ranked.append(preferred)
            placed.add(preferred)

    # Pass 2: append whatever was not prioritized, keeping the caller's order.
    for candidate in skills:
        if candidate in placed:
            continue
        placed.add(candidate)
        ranked.append(candidate)

    return ranked


def _strip_html(text: str) -> str:
    """Convert an HTML fragment to plain text while keeping its line structure.

    * ``<br>`` and the closing tags of common block elements become newlines.
    * Each ``<li ...>`` opens a new line that starts with ``"- "``, so list
      items survive as dash bullets instead of being merged into one line.
    * Every remaining tag is replaced by a single space.
    * HTML entities are decoded (``&amp;`` -> ``&``); non-breaking spaces are
      turned into ordinary spaces.
    * Runs of three or more newlines collapse to one blank line, and the
      result is stripped.

    Text without tags or entities is returned unchanged apart from the
    newline collapsing and stripping.
    """
    import html  # local import: only this function needs it

    text = re.sub(r"<\s*br\s*/?>", "\n", text, flags=re.IGNORECASE)
    # \b keeps tags such as <link> from being mistaken for <li>.
    text = re.sub(r"<\s*li\b[^>]*>", "\n- ", text, flags=re.IGNORECASE)
    text = re.sub(
        r"</\s*(?:p|div|li|ul|ol|h[1-6]|tr)\s*>", "\n", text, flags=re.IGNORECASE
    )
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after the tags are gone, so escaped markup such as
    # "&lt;b&gt;" stays literal text instead of being stripped as a tag.
    text = html.unescape(text).replace("\xa0", " ")
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def _extract_bullets(lines, max_items=8):
    """Collect the text of bullet lines found in ``lines``.

    A bullet line starts (after stripping) with ``-``, ``*`` or ``\u2022``
    followed by whitespace. The marker is removed and the remaining text is
    kept when it is between 3 and 200 characters long. Blank lines are
    ignored. Scanning stops once ``max_items`` bullets have been collected.
    """
    marker = re.compile(r"^([-*]|\u2022)\s+")
    bullets = []
    for raw in lines:
        candidate = raw.strip()
        if candidate:
            if marker.match(candidate):
                body = marker.sub("", candidate).strip()
                if 3 <= len(body) <= 200:
                    bullets.append(body)
            # The limit is checked after every non-blank line, bullet or not.
            if len(bullets) >= max_items:
                break
    return bullets


def _top_keywords(text: str, limit=6):
    """Return the most frequent non-trivial words of ``text``.

    Tokens are lowercase runs of at least three characters that start with a
    letter and may contain letters and ``+ # . / -``. Trailing ``.``, ``/``
    and ``-`` are removed before counting: without that, a word at the end of
    a sentence ("team.") would bypass the stop-word and skill filters and be
    counted separately from the bare word.

    Stop words and every entry of ``_TECH`` and ``_SOFT`` are excluded.

    Args:
        text: Text to analyse.
        limit: Maximum number of keywords to return.

    Returns:
        Keywords ordered by descending frequency, ties broken alphabetically.
    """
    stop = {
        "with", "from", "that", "this", "will", "have", "your", "their", "about", "role", "team", "work",
        "experience", "skills", "requirements", "responsibilities", "company", "using", "ability", "years",
    }
    # One set makes the per-token exclusion test a single hash lookup.
    excluded = stop.union(_TECH, _SOFT)

    counts = {}
    for raw in re.findall(r"[a-zA-Z][a-zA-Z+#./-]{2,}", text.lower()):
        word = raw.rstrip("./-")
        # Trimming can shorten a token below the three-character minimum.
        if len(word) < 3 or word in excluded:
            continue
        counts[word] = counts.get(word, 0) + 1

    ordered = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    return [word for word, _ in ordered[:limit]]


def _role_focused_excerpt(text: str) -> dict:
    """Extract the role-relevant parts of a job description.

    The text is cleaned with ``_strip_html`` and split into lines. Heading
    lines ("Responsibilities", "Requirements", "Nice to have", ...) switch
    the current section; the bullets of each section are collected with
    ``_extract_bullets``. If no responsibilities and no requirements are
    found, the first ten bullets of the whole text are used instead (six as
    responsibilities, four as requirements).

    Skills from ``_TECH`` and ``_SOFT`` are detected as whole terms, so
    "java" is not reported for "javascript", nor "rest" for "interest".
    Multi-word phrases also match when hyphenated ("detail-oriented").

    Returns:
        A dict with the keys ``cleaned`` (tag-free text), ``focused_input``
        (text handed to the model: the collected sections plus the first
        1500 cleaned characters), ``responsibilities``, ``requirements``,
        ``nice`` (bullet lists), ``tech``, ``soft`` (detected skills in table
        order) and ``keywords`` (from ``_top_keywords``).
    """
    cleaned = _strip_html(text)
    lines = [ln.strip() for ln in cleaned.splitlines()]

    headings = {
        "responsibilities": ["responsibilities", "what you will do", "what you'll do", "the role", "your role", "you will"],
        "requirements": ["requirements", "what we are looking for", "what we're looking for", "skills", "experience", "must have"],
        "nice": ["nice to have", "bonus", "preferred"],
    }
    bullet_prefix = re.compile(r"^([-*]|\u2022)\s+")

    def match_heading(s: str):
        # A bullet is a heading only when it consists of the heading phrase
        # alone. Prefix matching is reserved for non-bullet lines; otherwise
        # items such as "- Experience with Python" or "- You will build APIs"
        # would be consumed as headings and lost from the bullet lists.
        is_bullet = bullet_prefix.match(s) is not None
        sl = s.lower().strip(":-\x7f ")
        for key, words in headings.items():
            for word in words:
                if sl == word:
                    return key
                if not is_bullet and sl.startswith(word + " "):
                    return key
        return None

    section = None
    collected = {"responsibilities": [], "requirements": [], "nice": []}

    for ln in lines:
        if not ln:
            continue
        heading = match_heading(ln)
        if heading:
            section = heading
            continue
        # Lines that appear before the first heading belong to no section.
        if section is not None:
            collected[section].append(ln)

    responsibilities = _extract_bullets(collected["responsibilities"], max_items=7)
    requirements = _extract_bullets(collected["requirements"], max_items=7)
    nice = _extract_bullets(collected["nice"], max_items=5)

    low = cleaned.lower()
    # Common spellings that should still count as a mention of the base term
    # now that matching is no longer a plain substring test.
    aliases = {"rest": ("restful",), "node": ("nodejs",), "react": ("reactjs",)}

    def mentions(term: str) -> bool:
        """Return True if ``term`` (or one of its aliases) occurs in ``low``."""
        for variant in (term, *aliases.get(term, ())):
            # Words of a phrase may be separated by whitespace or hyphens.
            body = r"[\s-]+".join(re.escape(part) for part in variant.split(" "))
            # Require a boundary only on sides where the term itself ends in
            # a letter or digit, so ".net" and "c#" keep working.
            left = r"(?<![a-z0-9])" if variant[:1].isalnum() else ""
            right = r"(?![a-z0-9])" if variant[-1:].isalnum() else ""
            if re.search(left + body + right, low):
                return True
        return False

    tech_found = [t for t in _TECH if mentions(t)]
    soft_found = [s for s in _SOFT if mentions(s)]

    if not responsibilities and not requirements:
        # No recognizable sections: fall back to any bullets in the text.
        any_bullets = _extract_bullets(lines, max_items=10)
        responsibilities = any_bullets[:6]
        requirements = any_bullets[6:10]

    focused_parts = []
    if responsibilities:
        focused_parts.append("Responsibilities:\n- " + "\n- ".join(responsibilities))
    if requirements:
        focused_parts.append("Requirements:\n- " + "\n- ".join(requirements))
    if nice:
        focused_parts.append("Nice to have:\n- " + "\n- ".join(nice))
    # Always give the model some of the raw text as additional context.
    focused_parts.append("Context:\n" + cleaned[:1500])

    return {
        "cleaned": cleaned,
        "focused_input": "\n\n".join(focused_parts),
        "responsibilities": responsibilities,
        "requirements": requirements,
        "nice": nice,
        "tech": tech_found,
        "soft": soft_found,
        "keywords": _top_keywords(cleaned),
    }


def _model_summarize(text: str, max_length: int, min_length: int) -> str:
    """Generate a summary of ``text`` with the module-level model.

    The input is tokenized with truncation at 1024 tokens, moved to the
    configured device and decoded from the first sequence returned by
    ``model.generate`` (4 beams, early stopping). ``max_length`` and
    ``min_length`` are forwarded to ``generate`` unchanged.

    Returns:
        The decoded summary with special tokens removed and outer whitespace
        stripped.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    token_ids = encoded.input_ids.to(device)
    mask = None
    if hasattr(encoded, "attention_mask"):
        mask = encoded.attention_mask.to(device)

    generation_options = {
        "attention_mask": mask,
        "max_length": max_length,
        "min_length": min_length,
        "num_beams": 4,
        "early_stopping": True,
    }
    # Generation needs no gradients.
    with torch.no_grad():
        generated = model.generate(token_ids, **generation_options)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip()


def _build_report(info: dict, summary: str, top_skills: int) -> str:
    """Render the plain-text report returned by the ``/summarize`` endpoint.

    Args:
        info: Result of ``_role_focused_excerpt``.
        summary: Model-generated summary text.
        top_skills: Maximum number of entries on the "Top hard skills" line.

    Returns:
        The report as a single newline-joined, stripped string.
    """
    lines = ["Role summary:", summary]

    def add_bullets(title, items):
        # A section is emitted only when it has at least one item.
        if items:
            lines.append("")
            lines.append(title)
            lines.extend(f"- {item}" for item in items)

    add_bullets("What the company wants most:", info["requirements"][:7])
    add_bullets("What you would be doing:", info["responsibilities"][:6])
    add_bullets("Nice to have:", info["nice"][:5])

    # Ranked once and reused for the skills line and the interview hints.
    # dict.fromkeys removes duplicates while keeping order.
    ranked_tech = list(dict.fromkeys(_rank_tech_skills(info["tech"]))) if info["tech"] else []
    if ranked_tech:
        lines.append("")
        lines.append("Top hard skills: " + ", ".join(ranked_tech[:top_skills]))

    if info["soft"]:
        lines.append("")
        lines.append("Relevant soft skills: " + ", ".join(list(dict.fromkeys(info["soft"]))[:8]))

    if info["keywords"]:
        lines.append("")
        lines.append("Key themes: " + ", ".join(info["keywords"][:6]))

    lines.append("")
    lines.append("Interview focus:")
    if info["requirements"]:
        lines.extend(
            f"- Prepare examples that demonstrate: {x}" for x in info["requirements"][:3]
        )
    elif ranked_tech:
        lines.extend(
            f"- Be ready to explain your hands-on experience with {x}" for x in ranked_tech[:3]
        )
    else:
        lines.append("- Prepare examples showing relevant impact, collaboration, and delivery.")

    return "\n".join(lines).strip()


@app.post("/summarize")
async def summarize(req: SummarizeRequest):
    """Summarize a job description into a structured plain-text report.

    Responds with ``{"summary": <report>, "cached": <bool>}``. Identical
    requests (same text and options) are answered from an in-memory cache.
    Returns HTTP 400 when ``min_length`` is not smaller than ``max_length``.
    """
    if req.min_length >= req.max_length:
        raise HTTPException(status_code=400, detail="min_length must be smaller than max_length.")

    key = _key(req.text, req.max_length, req.min_length, req.top_skills)
    # One guarded lookup instead of `key in cache` followed by `cache[key]`:
    # a TTL entry can expire between those two calls and raise KeyError.
    try:
        cached = cache[key]
    except KeyError:
        cached = None
    if cached is not None:
        return {"summary": cached, "cached": True}

    info = _role_focused_excerpt(req.text)
    # NOTE(review): model inference runs synchronously inside this coroutine,
    # so other requests wait until it finishes.
    summary = _model_summarize(info["focused_input"], req.max_length, req.min_length)

    out = _build_report(info, summary, req.top_skills)
    cache[key] = out
    return {"summary": out, "cached": False}