Files
jobtrackingapp/tools/summarizer/app.py
T

270 lines
7.6 KiB
Python

import hashlib
import html
import re

import torch
from cachetools import TTLCache
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# FastAPI application instance; the @app.get / @app.post decorators in this
# file register their routes on it.
app = FastAPI(title="Local Summarizer")
# Hugging Face model id loaded below and echoed back by the /health endpoint.
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
# Maximum accepted length of the request text, in characters (used as
# max_length on SummarizeRequest.text).
# The local summarizer is intentionally simple, but we still validate request sizes
# so accidental giant pastes do not cause avoidable latency or memory spikes.
MAX_INPUT_CHARS = 20000
# Tokenizer and model are loaded once, when the module is imported, and are
# shared by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Inference only: switch the model to evaluation mode.
model.eval()
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Finished summaries keyed by _key(): at most 1024 entries, each kept for 3600 s.
cache = TTLCache(maxsize=1024, ttl=60 * 60) # 1 hour cache
class SummarizeRequest(BaseModel):
    """JSON body accepted by ``POST /summarize``.

    ``max_length`` and ``min_length`` are forwarded unchanged to
    ``model.generate`` by the endpoint; the endpoint itself rejects requests
    where ``min_length >= max_length``.
    """

    # Required job-posting text, between 1 and MAX_INPUT_CHARS characters.
    text: str = Field(..., min_length=1, max_length=MAX_INPUT_CHARS)
    # Upper generation bound; defaults to 160, allowed range 24..256.
    max_length: int = Field(160, ge=24, le=256)
    # Lower generation bound; defaults to 45, allowed range 8..180.
    min_length: int = Field(45, ge=8, le=180)
def _key(text: str, max_length: int, min_length: int) -> str:
    """Return the cache key for one summarization request.

    The key is ``"<sha256 hex of the UTF-8 text>:<max_length>:<min_length>"``,
    so the same text requested with different length limits is cached
    separately.
    """
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return ":".join((digest, str(max_length), str(min_length)))
@app.get("/health")
async def health():
    """Report that the service is up and which model it was started with."""
    status = dict(ok=True, model=MODEL_NAME)
    return status
# Technology keywords that _role_focused_excerpt() looks for in the cleaned,
# lower-cased posting text. Entries must therefore be written in lower case.
# Several entries are spelling variants of the same thing ("dotnet"/".net",
# "postgres"/"postgresql"), so one posting can report more than one of them.
_TECH = [
"python",
"c#",
"dotnet",
".net",
"java",
"javascript",
"typescript",
"react",
"node",
"sql",
"postgres",
"postgresql",
"mysql",
"sqlite",
"mongodb",
"redis",
"aws",
"azure",
"gcp",
"docker",
"kubernetes",
"terraform",
"linux",
"git",
"ci/cd",
"graphql",
"rest",
]
# Soft-skill phrases that _role_focused_excerpt() looks for in the cleaned,
# lower-cased posting text. Entries must therefore be written in lower case;
# multi-word phrases are matched with exactly the spacing written here.
_SOFT = [
"communication",
"collaboration",
"teamwork",
"problem solving",
"leadership",
"mentoring",
"ownership",
"initiative",
"adaptability",
"stakeholder management",
"detail oriented",
]
def _strip_html(text: str) -> str:
    """Reduce pasted HTML to plain text while keeping its line structure.

    Good enough for job descriptions pasted from the web; this is not a
    general HTML parser.

    * ``<br>`` and the closing tags of block-level elements (paragraphs,
      headings, lists, list items, divs, table rows) become line breaks, so a
      heading and each list item end up on their own line.
    * An opening ``<li>`` additionally starts its line with ``"- "`` so the
      item is recognisable as a bullet.
    * Every other tag is replaced by a single space.
    * HTML entities (``&amp;``, ``&#35;``, ``&nbsp;`` ...) are decoded.
    * Runs of three or more newlines are collapsed to one blank line and the
      result is stripped.

    Args:
        text: Raw posting text, with or without HTML markup.

    Returns:
        The plain-text version of ``text``.
    """
    text = re.sub(r"<\s*br\s*/?>", "\n", text, flags=re.IGNORECASE)
    # \b keeps <link ...> and similar tags from being mistaken for <li>.
    text = re.sub(r"<\s*li\b[^>]*>", "\n- ", text, flags=re.IGNORECASE)
    text = re.sub(
        r"</\s*(?:p|div|li|ul|ol|h[1-6]|tr)\s*>",
        "\n",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after the tags are gone, so an escaped "&lt;b&gt;"
    # survives as the literal text "<b>" instead of being stripped as a tag.
    text = html.unescape(text)
    return re.sub(r"\n{3,}", "\n\n", text).strip()
def _extract_bullets(lines, max_items=8):
    """Collect up to ``max_items`` short, non-empty entries from ``lines``.

    Each line is stripped; a leading ``-``, ``*`` or bullet character followed
    by whitespace is removed. Entries are kept when 3 to 200 characters remain.
    Lines without a marker are kept as well, subject to the same length rule.

    NOTE(review): the indentation of this function was lost in the source this
    was restored from; the length check and the ``max_items`` cut-off are
    assumed to sit at loop level, not under the marker check - confirm.
    """
    marker = re.compile(r"^([-*]|\u2022)\s+")
    picked = []
    for raw in lines:
        entry = raw.strip()
        if not entry:
            continue
        # For a line without a marker, sub() returns it unchanged and the
        # extra strip() is a no-op because the line is already stripped.
        entry = marker.sub("", entry).strip()
        if 3 <= len(entry) <= 200:
            picked.append(entry)
        if len(picked) >= max_items:
            break
    return picked
def _find_keywords(haystack: str, keywords) -> list:
    """Return the entries of ``keywords`` mentioned in ``haystack``, in list order.

    ``haystack`` must already be lower-cased. Compared with a plain substring
    test this avoids two kinds of false positives:

    * A keyword that starts with a letter or digit must not continue a longer
      word, so "rest" is not found in "interest" and "git" not in "digital".
      The end of the keyword is left open so that "restful" still counts as
      "rest".
    * Keywords are tried longest first and their matches are blanked out, so
      "javascript" does not additionally report "java".

    Empty keywords are ignored.
    """
    hits = set()
    remaining = haystack
    for kw in sorted(keywords, key=len, reverse=True):
        if not kw or kw in hits:
            continue
        # Keywords such as ".net" start with punctuation and need no guard;
        # they keep matching inside e.g. "asp.net".
        guard = r"(?<![a-z0-9])" if kw[0].isalnum() else ""
        pattern = re.compile(guard + re.escape(kw))
        if pattern.search(remaining):
            hits.add(kw)
            # Blank out what was matched so shorter keywords cannot match
            # inside it.
            remaining = pattern.sub(" ", remaining)
    return [kw for kw in keywords if kw in hits]


def _role_focused_excerpt(text: str) -> dict:
    """Pull the role-relevant parts out of a job posting.

    The text is cleaned with ``_strip_html`` and walked line by line. A line
    that equals, or starts with, one of the known heading phrases switches the
    current section (responsibilities / requirements / nice to have); the lines
    that follow are collected for that section and reduced with
    ``_extract_bullets``. When neither responsibilities nor requirements are
    found, up to ten entries are taken from the whole text instead: the first
    six as responsibilities, the rest as requirements.

    Args:
        text: Raw posting text, with or without HTML markup.

    Returns:
        A dict with the keys ``cleaned`` (cleaned text), ``focused_input``
        (the section lists plus the first 1500 cleaned characters, meant as
        model input), ``responsibilities``, ``requirements``, ``nice`` (lists
        of entries) and ``tech`` / ``soft`` (keywords from ``_TECH`` /
        ``_SOFT`` found in the text).
    """
    cleaned = _strip_html(text)
    lines = [ln.strip() for ln in cleaned.splitlines()]
    headings = {
        "responsibilities": ["responsibilities", "what you will do", "what you'll do", "the role", "your role", "you will"],
        "requirements": ["requirements", "what we are looking for", "what we're looking for", "skills", "experience", "must have"],
        "nice": ["nice to have", "bonus", "preferred"],
    }

    def match_heading(s: str):
        # NOTE(review): "\x7f" (DEL) in the strip set looks like a mangled
        # character, possibly a bullet or dash - confirm against the original.
        # NOTE(review): because "-" is stripped, a bullet such as
        # "- Experience with Python" is treated as a heading and dropped.
        sl = s.lower().strip(":-\x7f ")
        for k, words in headings.items():
            for w in words:
                if sl == w or sl.startswith(w + " "):
                    return k
        return None

    # One bucket per section; lines seen before the first heading are ignored.
    buckets = {name: [] for name in headings}
    section = None
    for ln in lines:
        if not ln:
            continue
        heading = match_heading(ln)
        if heading:
            section = heading
            continue
        if section is not None:
            buckets[section].append(ln)

    responsibilities = _extract_bullets(buckets["responsibilities"], max_items=7)
    requirements = _extract_bullets(buckets["requirements"], max_items=7)
    nice = _extract_bullets(buckets["nice"], max_items=5)

    low = cleaned.lower()
    tech_found = _find_keywords(low, _TECH)
    soft_found = _find_keywords(low, _SOFT)

    # Fallback: pick bullet-like lines anywhere if sections are missing.
    if not responsibilities and not requirements:
        any_bullets = _extract_bullets(lines, max_items=10)
        responsibilities = any_bullets[:6]
        requirements = any_bullets[6:10]

    focused_parts = []
    if responsibilities:
        focused_parts.append("Responsibilities:\n- " + "\n- ".join(responsibilities))
    if requirements:
        focused_parts.append("Requirements:\n- " + "\n- ".join(requirements))
    if nice:
        focused_parts.append("Nice to have:\n- " + "\n- ".join(nice))
    # Always include a small slice of the original for context.
    focused_parts.append("Context:\n" + cleaned[:1500])

    return {
        "cleaned": cleaned,
        "focused_input": "\n\n".join(focused_parts),
        "responsibilities": responsibilities,
        "requirements": requirements,
        "nice": nice,
        "tech": tech_found,
        "soft": soft_found,
    }
def _model_summarize(text: str, max_length: int, min_length: int) -> str:
    """Run the module-level seq2seq model on ``text`` and return the summary.

    The input is tokenized with truncation at 1024 tokens, generation uses
    beam search with four beams and early stopping, and ``max_length`` /
    ``min_length`` are passed straight to ``model.generate``. The decoded
    result has special tokens removed and surrounding whitespace stripped.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    ids = encoded.input_ids.to(device)
    mask = None
    if hasattr(encoded, "attention_mask"):
        mask = encoded.attention_mask.to(device)

    generation_args = dict(
        attention_mask=mask,
        max_length=max_length,
        min_length=min_length,
        num_beams=4,
        early_stopping=True,
    )
    # Gradients are never needed for generation.
    with torch.no_grad():
        generated = model.generate(ids, **generation_args)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip()
@app.post("/summarize")
async def summarize(req: SummarizeRequest):
    """Summarize a job posting and append the extracted lists to the summary.

    The response text starts with "Role summary:" and the model output. It is
    followed, where present, by requirements, responsibilities, nice-to-haves,
    hard skills and soft skills. Results are cached per (text, max_length,
    min_length).

    Args:
        req: Validated request body.

    Returns:
        ``{"summary": <text>, "cached": <bool>}``; ``cached`` is True when the
        text came from the cache.

    Raises:
        HTTPException: 400 when ``min_length`` is not smaller than
            ``max_length``.
    """
    if req.min_length >= req.max_length:
        raise HTTPException(status_code=400, detail="min_length must be smaller than max_length.")

    key = _key(req.text, req.max_length, req.min_length)
    # One lookup instead of `key in cache` followed by `cache[key]`: with a
    # TTL cache the entry can expire between the two and raise KeyError.
    try:
        return {"summary": cache[key], "cached": True}
    except KeyError:
        pass

    info = _role_focused_excerpt(req.text)
    # Summarize the role-focused excerpt instead of the whole job post.
    # NOTE(review): this call is synchronous inside an async handler, so it
    # presumably blocks the event loop while the model runs - confirm whether
    # that is acceptable for this local tool.
    summary = _model_summarize(info["focused_input"], req.max_length, req.min_length)

    lines = ["Role summary:", summary]
    listed_sections = (
        ("What they need from you:", info["requirements"][:7]),
        ("What you would be doing:", info["responsibilities"][:6]),
        ("Nice to have:", info["nice"][:5]),
    )
    for title, items in listed_sections:
        if items:
            lines.append("")
            lines.append(title)
            lines.extend(f"- {x}" for x in items)

    if info["tech"]:
        # _rank_tech_skills is not defined in the visible part of this module;
        # presumably it orders the found keywords - verify.
        # dict.fromkeys() drops duplicates while keeping first-seen order.
        uniq = list(dict.fromkeys(_rank_tech_skills(info["tech"])))
        lines.append("")
        lines.append("Top hard skills: " + ", ".join(uniq[:10]))
    if info["soft"]:
        uniq_soft = list(dict.fromkeys(info["soft"]))
        lines.append("")
        lines.append("Relevant soft skills: " + ", ".join(uniq_soft[:8]))

    out = "\n".join(lines).strip()
    cache[key] = out
    return {"summary": out, "cached": False}
skills: " + ", ".join(uniq[:14]))
if info["soft"]:
uniq_soft = []
for s in info["soft"]:
if s not in uniq_soft:
uniq_soft.append(s)
lines.append("")
lines.append("Relevant soft skills: " + ", ".join(uniq_soft[:8]))
out = "\n".join(lines).strip()
cache[key] = out
return {"summary": out, "cached": False}