# jobtrackingapp/tools/summarizer/app.py

import hashlib
import html
import re

import torch
from cachetools import TTLCache
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ASGI application object; the route decorators further down register handlers on it.
app = FastAPI(title="Local Summarizer")

# Hugging Face model identifier handed to from_pretrained() in _load_runtime().
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
# Upper bound on SummarizeRequest.text length (enforced by the pydantic Field).
MAX_INPUT_CHARS = 20000
# How many leading characters of the cleaned text are appended as the
# "Context:" section of the model input in _role_focused_excerpt().
MAX_CONTEXT_CHARS = 2200


def _load_runtime():
    """Load the tokenizer and model for ``MODEL_NAME`` and place the model on a device.

    Returns:
        A 5-tuple ``(tokenizer, model, device, has_cuda, gpu_name)`` where
        ``device`` is the ``torch.device`` the model was moved to,
        ``has_cuda`` is the result of ``torch.cuda.is_available()`` and
        ``gpu_name`` is the name of CUDA device 0, or ``None`` without CUDA.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    # Inference only: switch the module to evaluation mode.
    loaded_model.eval()

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        target = torch.device("cuda")
    else:
        target = torch.device("cpu")
    loaded_model.to(target)

    card_name = None
    if cuda_ready:
        card_name = torch.cuda.get_device_name(0)
    return loaded_tokenizer, loaded_model, target, cuda_ready, card_name


# Loaded once at import time; every request shares these module-level objects.
tokenizer, model, device, GPU_AVAILABLE, GPU_NAME = _load_runtime()
# Bounded cache of finished reports: at most 1024 entries, ttl of 60 * 60
# (one hour with cachetools' default seconds-based timer).
cache = TTLCache(maxsize=1024, ttl=60 * 60)


class SummarizeRequest(BaseModel):
    """Request body accepted by the ``/summarize`` endpoint.

    Per-field bounds are declared here; the ``min_length < max_length``
    relationship is checked separately inside the endpoint handler.
    """

    # Raw job-description text; non-empty and at most MAX_INPUT_CHARS characters.
    text: str = Field(min_length=1, max_length=MAX_INPUT_CHARS)
    # Forwarded to model.generate() as max_length for the generated summary.
    max_length: int = Field(default=160, ge=24, le=256)
    # Forwarded to model.generate() as min_length for the generated summary.
    min_length: int = Field(default=45, ge=8, le=180)
    # Maximum number of entries listed under "Top hard skills:".
    top_skills: int = Field(default=8, ge=3, le=12)


def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
    """Build the cache key for one summarize request.

    The key is the SHA-256 hex digest of the UTF-8 encoded text followed by
    the three numeric options, all joined with ``:``.
    """
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    options = (max_length, min_length, top_skills)
    return ":".join([digest, *(str(option) for option in options)])


@app.get("/health")
async def health():
    """Report liveness plus the model name and the device the model runs on."""
    status = {"ok": True, "model": MODEL_NAME}
    status["device"] = str(device)
    status["gpu_available"] = GPU_AVAILABLE
    status["gpu_name"] = GPU_NAME
    return status


# Technology terms searched for in the lower-cased job text. Several entries
# are aliases of one another (e.g. "dotnet"/".net", "postgres"/"postgresql").
_TECH = [
    "python", "c#", "dotnet", ".net", "java", "javascript", "typescript", "react", "node", "sql",
    "postgres", "postgresql", "mysql", "sqlite", "mongodb", "redis", "aws", "azure", "gcp",
    "docker", "kubernetes", "terraform", "linux", "git", "ci/cd", "graphql", "rest",
]

# Soft-skill phrases searched for in the lower-cased job text.
_SOFT = [
    "communication", "collaboration", "teamwork", "problem solving", "leadership", "mentoring",
    "ownership", "initiative", "adaptability", "stakeholder management", "detail oriented",
]

# Display order used by _rank_tech_skills(): detected skills named here are
# listed first, in this order; any other detected skill follows afterwards.
_TECH_PRIORITY = [
    "python", "c#", ".net", "dotnet", "typescript", "javascript", "react", "node",
    "sql", "postgresql", "postgres", "mysql", "sqlite", "docker", "kubernetes",
    "aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
]


# Lower-case phrases used to pick sentences that state hard requirements.
_MUST_HAVE_HINTS = [
    "must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
]
# Lower-case phrases used to pick sentences that state optional extras.
_NICE_TO_HAVE_HINTS = [
    "nice to have", "bonus", "preferred", "advantageous", "extra plus",
]
# Lower-case phrases used to pick items for the "Interview focus:" section.
_SCREENING_HINTS = [
    "experience with", "hands-on", "demonstrated", "proven", "track record", "delivered",
]


def _rank_tech_skills(skills):
    """Order detected skills by ``_TECH_PRIORITY`` and drop duplicates.

    Skills named in ``_TECH_PRIORITY`` come first, in priority order; the
    remaining skills follow in the order they first appear in ``skills``.
    """
    detected = set(skills)
    # dict keys keep insertion order and are unique, which gives both the
    # ranking and the de-duplication in one structure.
    ranked = dict.fromkeys(name for name in _TECH_PRIORITY if name in detected)
    # update() leaves already-present keys where they are and appends the rest.
    ranked.update(dict.fromkeys(skills))
    return list(ranked)


def _strip_html(text: str) -> str:
    """Convert an HTML fragment to plain text while keeping its line structure.

    - ``<br>`` and the closing tags of common block elements (``p``, ``div``,
      ``li``, ``h1``-``h6``) become newlines.
    - ``<li>`` becomes a ``- `` bullet on its own line, so HTML lists are
      still recognisable as bullets by line-based parsing.
    - Every remaining tag is replaced by a single space.
    - HTML entities (``&amp;``, ``&nbsp;``, ...) are decoded.

    Returns:
        The stripped text with runs of three or more newlines collapsed to
        two and leading/trailing whitespace removed.
    """
    text = re.sub(r"<\s*br\s*/?>", "\n", text, flags=re.IGNORECASE)
    # Turn list items into dash bullets before the generic tag removal
    # flattens them; \b keeps tags such as <link> from matching.
    text = re.sub(r"<\s*li\b[^>]*>", "\n- ", text, flags=re.IGNORECASE)
    text = re.sub(r"</\s*(?:p|div|li|h[1-6])\s*>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after tags are gone, so an escaped "&lt;b&gt;" in
    # the copy is kept as literal text instead of being stripped as a tag.
    text = html.unescape(text).replace("\xa0", " ")
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def _extract_bullets(lines, max_items=8):
    """Collect the text of bullet lines (``-``, ``*`` or ``•`` followed by whitespace).

    Blank lines are skipped. A bullet is kept only when its text, without the
    marker, is between 3 and 220 characters long. Scanning stops once
    ``max_items`` bullets have been collected.
    """
    marker = re.compile(r"^([-*]|\u2022)\s+")
    collected = []
    for raw in lines:
        candidate = raw.strip()
        if candidate:
            found = marker.match(candidate)
            if found is not None:
                body = candidate[found.end():].strip()
                if 3 <= len(body) <= 220:
                    collected.append(body)
            # The limit is checked after every non-blank line, bullet or not.
            if len(collected) >= max_items:
                break
    return collected


def _top_keywords(text: str, limit=6):
    """Return the most frequent non-trivial words of ``text``.

    Words are lower-cased tokens of at least three characters. Stop words and
    any term listed in ``_TECH`` or ``_SOFT`` are ignored, because skills are
    reported separately.

    Args:
        text: Plain text to analyse.
        limit: Maximum number of keywords to return.

    Returns:
        Keywords ordered by descending frequency, ties broken alphabetically.
    """
    stop = {
        "with", "from", "that", "this", "will", "have", "your", "their", "about", "role", "team", "work",
        "experience", "skills", "requirements", "responsibilities", "company", "using", "ability", "years",
        "looking", "candidate", "position", "working", "across", "strong", "building", "support",
        # Common function words; without these, "and"/"the" dominate the ranking.
        "and", "the", "for", "our", "you", "are", "can", "has", "all", "not", "but", "who", "what",
        "they", "them", "into", "such", "also", "more", "other", "well", "must", "within", "including",
    }
    # One set for O(1) membership instead of scanning three containers per word.
    skip = stop.union(_TECH, _SOFT)

    counts = {}
    for token in re.findall(r"[a-zA-Z][a-zA-Z+#./-]{2,}", text.lower()):
        # The token pattern allows '.', '/' and '-' inside words ("ci/cd",
        # "node.js"), so sentence punctuation sticks to the last word
        # ("team."). Trim it so filtering and counting see the bare word.
        word = token.rstrip("./-")
        if len(word) < 3 or word in skip:
            continue
        counts[word] = counts.get(word, 0) + 1

    ordered = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    return [word for word, _ in ordered[:limit]]


def _first_matching_sentences(text: str, hints, limit=3):
    """Return the first sentences of ``text`` that contain any of ``hints``.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace.
    Hints are compared against the lower-cased sentence, and a matching
    sentence is kept only when its stripped length is between 20 and 220
    characters. Scanning stops once ``limit`` sentences were collected.
    """
    picked = []
    for chunk in re.split(r"(?<=[.!?])\s+", text):
        lowered = chunk.lower()
        mentions_hint = any(hint in lowered for hint in hints)
        if mentions_hint:
            candidate = chunk.strip()
            too_short = len(candidate) < 20
            too_long = len(candidate) > 220
            if not (too_short or too_long):
                picked.append(candidate)
        # Checked after every sentence, whether or not it matched.
        if len(picked) >= limit:
            break
    return picked


def _trim_line(text: str, max_len: int = 140) -> str:
    """Normalise ``text`` to a single line of at most ``max_len`` characters.

    Whitespace runs are collapsed to one space and surrounding spaces, tabs,
    dashes and bullet characters are removed. Longer results are cut and
    finished with an ellipsis character.
    """
    single_line = re.sub(r"\s+", " ", text)
    single_line = single_line.strip(" -•\t")
    if len(single_line) > max_len:
        # Reserve one character of the budget for the ellipsis.
        head = single_line[: max_len - 1]
        return head.rstrip() + "…"
    return single_line


def _mentions_term(haystack: str, term: str) -> bool:
    """Return True when ``term`` occurs in ``haystack`` as a standalone token.

    A boundary is required on each side of the term that starts/ends with a
    letter or digit, so "java" does not match inside "javascript" and "rest"
    does not match inside "interested". Terms that begin or end with
    punctuation (".net", "c#") skip the check on that side, which keeps
    "asp.net" matching ".net". ``haystack`` is expected to be lower-cased.
    """
    left = r"(?<![a-z0-9])" if term[:1].isalnum() else ""
    right = r"(?![a-z0-9])" if term[-1:].isalnum() else ""
    return re.search(left + re.escape(term) + right, haystack) is not None


def _role_focused_excerpt(text: str) -> dict:
    """Extract the role-relevant parts of a job description.

    The text is stripped of HTML, split into lines and walked once: heading
    lines switch the current section (responsibilities / requirements /
    nice-to-have) and the bullet lines of each section are collected. When no
    section bullets are found, any bullets in the text are used instead;
    requirements and nice-to-haves further fall back to sentences containing
    the corresponding hint phrases.

    Args:
        text: Raw job description, plain text or HTML.

    Returns:
        A dict with the keys ``cleaned`` (HTML-stripped text),
        ``focused_input`` (the text handed to the summarization model),
        ``responsibilities``, ``requirements``, ``nice``, ``tech``, ``soft``,
        ``keywords`` and ``screen_focus`` (at most three items).
    """
    cleaned = _strip_html(text)
    lines = [ln.strip() for ln in cleaned.splitlines()]

    headings = {
        "responsibilities": ["responsibilities", "what you will do", "what you'll do", "the role", "your role", "you will"],
        "requirements": ["requirements", "what we are looking for", "what we're looking for", "skills", "experience", "must have"],
        "nice": ["nice to have", "bonus", "preferred"],
    }

    def match_heading(s: str):
        # A bulleted line is content unless its whole text is a heading
        # phrase. Without this, "- Experience with Docker" would be taken for
        # an "experience ..." heading and silently dropped from the bullets.
        bullet = re.match(r"^([-*]|\u2022)\s+", s)
        body = s[bullet.end():] if bullet else s
        sl = body.lower().strip(":-\x7f ")
        for key, words in headings.items():
            for word in words:
                if sl == word:
                    return key
                if bullet is None and sl.startswith(word + " "):
                    return key
        return None

    section = None
    resp_lines = []
    req_lines = []
    nice_lines = []

    for ln in lines:
        if not ln:
            continue
        heading = match_heading(ln)
        if heading:
            section = heading
            continue
        if section == "responsibilities":
            resp_lines.append(ln)
        elif section == "requirements":
            req_lines.append(ln)
        elif section == "nice":
            nice_lines.append(ln)

    responsibilities = _extract_bullets(resp_lines, max_items=7)
    requirements = _extract_bullets(req_lines, max_items=7)
    nice = _extract_bullets(nice_lines, max_items=5)

    low = cleaned.lower()
    # Token-aware matching for technologies: plain substring tests report
    # "java" for every "javascript" and "rest" for every "interested".
    tech_found = [t for t in _TECH if _mentions_term(low, t)]
    # Soft skills stay substring-based so inflected forms still count.
    soft_found = [s for s in _SOFT if s in low]

    if not responsibilities and not requirements:
        # No recognised sections: split whatever bullets exist between the two.
        any_bullets = _extract_bullets(lines, max_items=10)
        responsibilities = any_bullets[:6]
        requirements = any_bullets[6:10]

    if not requirements:
        requirements = [_trim_line(x) for x in _first_matching_sentences(cleaned, _MUST_HAVE_HINTS, limit=4)]
    if not nice:
        nice = [_trim_line(x) for x in _first_matching_sentences(cleaned, _NICE_TO_HAVE_HINTS, limit=3)]

    focused_parts = []
    if responsibilities:
        focused_parts.append("Responsibilities:\n- " + "\n- ".join(responsibilities))
    if requirements:
        focused_parts.append("Requirements:\n- " + "\n- ".join(requirements))
    if nice:
        focused_parts.append("Nice to have:\n- " + "\n- ".join(nice))
    focused_parts.append("Context:\n" + cleaned[:MAX_CONTEXT_CHARS])

    screen_focus = []
    for item in requirements[:4]:
        # Always take the first two requirements; beyond that only those that
        # read like screening criteria.
        if any(hint in item.lower() for hint in _SCREENING_HINTS) or len(screen_focus) < 2:
            screen_focus.append(_trim_line(item))
    if not screen_focus:
        screen_focus = [_trim_line(x) for x in _first_matching_sentences(cleaned, _SCREENING_HINTS, limit=3)]

    return {
        "cleaned": cleaned,
        "focused_input": "\n\n".join(focused_parts),
        "responsibilities": responsibilities,
        "requirements": requirements,
        "nice": nice,
        "tech": tech_found,
        "soft": soft_found,
        "keywords": _top_keywords(cleaned),
        "screen_focus": screen_focus[:3],
    }


def _model_summarize(text: str, max_length: int, min_length: int) -> str:
    """Generate a summary of ``text`` with the module-level model.

    The input is tokenized with truncation at 1024 tokens, moved to the
    model's device and decoded from the first generated sequence.

    Args:
        text: Text to summarize.
        max_length: ``max_length`` passed to ``model.generate``.
        min_length: ``min_length`` passed to ``model.generate``.

    Returns:
        The decoded summary without special tokens, stripped of surrounding
        whitespace.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    token_ids = encoded.input_ids.to(device)
    mask = None
    if hasattr(encoded, "attention_mask"):
        mask = encoded.attention_mask.to(device)

    generation_options = {
        "attention_mask": mask,
        "max_length": max_length,
        "min_length": min_length,
        "num_beams": 3,
        "length_penalty": 1.0,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
    }
    # Gradients are not needed for inference.
    with torch.no_grad():
        generated = model.generate(token_ids, **generation_options)

    first_sequence = generated[0]
    decoded = tokenizer.decode(first_sequence, skip_special_tokens=True)
    return decoded.strip()


def _build_report(info: dict, summary: str, top_skills: int) -> str:
    """Render the plain-text report returned by the ``/summarize`` endpoint.

    Args:
        info: Result of ``_role_focused_excerpt``.
        summary: Model-generated role summary.
        top_skills: Maximum number of hard skills to list.

    Returns:
        The report as one newline-joined string.
    """
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    ranked_tech = list(dict.fromkeys(_rank_tech_skills(info["tech"])))
    soft_skills = list(dict.fromkeys(info["soft"]))

    lines = ["Role summary:", summary]

    def add_section(title, items):
        # A section with nothing to list is omitted entirely, header included.
        if items:
            lines.append("")
            lines.append(title)
            lines.extend(f"- {item}" for item in items)

    add_section("What the company wants most:", [_trim_line(x) for x in info["requirements"][:5]])
    add_section("Top hard skills:", ranked_tech[:top_skills])
    add_section("Keywords to mirror:", info["keywords"][:5])
    add_section("What you would be doing:", [_trim_line(x) for x in info["responsibilities"][:4]])
    add_section("Nice to have:", [_trim_line(x) for x in info["nice"][:3]])
    add_section("Relevant soft skills:", soft_skills[:5])

    # The interview section is always present; it uses the most specific
    # source of talking points that is available.
    lines.append("")
    lines.append("Interview focus:")
    if info["screen_focus"]:
        lines.extend(f"- Be ready to prove: {_trim_line(x)}" for x in info["screen_focus"])
    elif info["requirements"]:
        lines.extend(
            f"- Prepare examples that demonstrate: {_trim_line(x)}" for x in info["requirements"][:3]
        )
    elif ranked_tech:
        lines.extend(
            f"- Be ready to explain your hands-on experience with {x}" for x in ranked_tech[:3]
        )
    else:
        lines.append("- Prepare examples showing relevant impact, collaboration, and delivery.")

    return "\n".join(lines).strip()


@app.post("/summarize")
async def summarize(req: SummarizeRequest):
    """Summarize a job description and return a structured text report.

    Results are cached per (text, max_length, min_length, top_skills).

    Args:
        req: Validated request body.

    Returns:
        ``{"summary": <report>, "cached": <bool>}`` where ``cached`` tells
        whether the report was served from the cache.

    Raises:
        HTTPException: 400 when ``min_length`` is not smaller than
            ``max_length``.
    """
    if req.min_length >= req.max_length:
        raise HTTPException(status_code=400, detail="min_length must be smaller than max_length.")

    key = _key(req.text, req.max_length, req.min_length, req.top_skills)
    # Single lookup: with a TTL cache an entry can expire between a
    # membership test and the subsequent read, which would raise KeyError.
    # Stored reports are never None, so None safely means "miss".
    cached = cache.get(key)
    if cached is not None:
        return {"summary": cached, "cached": True}

    info = _role_focused_excerpt(req.text)
    summary = _model_summarize(info["focused_input"], req.max_length, req.min_length)

    out = _build_report(info, summary, req.top_skills)
    cache[key] = out
    return {"summary": out, "cached": False}