270 lines
7.6 KiB
Python
270 lines
7.6 KiB
Python
from fastapi import FastAPI, HTTPException
|
|
from pydantic import BaseModel, Field
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
|
from cachetools import TTLCache
|
|
import hashlib
|
|
import re
|
|
import torch
|
|
|
|
app = FastAPI(title="Local Summarizer")

MODEL_NAME = "sshleifer/distilbart-cnn-12-6"

# The local summarizer is intentionally simple, but request sizes are still
# validated so accidental giant pastes do not cause avoidable latency or
# memory spikes. This is the character cap applied to the request text.
MAX_INPUT_CHARS = 20_000

# Prefer CUDA when torch reports it as available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and weights are loaded once, at import time.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.to(device)
model.eval()

# Finished summaries are kept for one hour, at most 1024 entries.
cache = TTLCache(maxsize=1024, ttl=3600)
|
|
|
|
|
|
class SummarizeRequest(BaseModel):
    """Request body accepted by the ``/summarize`` endpoint."""

    # Text to summarize; required, 1..MAX_INPUT_CHARS characters.
    text: str = Field(
        ...,
        min_length=1,
        max_length=MAX_INPUT_CHARS,
    )
    # Upper bound passed to generation as ``max_length`` (24..256).
    max_length: int = Field(
        default=160,
        ge=24,
        le=256,
    )
    # Lower bound passed to generation as ``min_length`` (8..180).
    min_length: int = Field(
        default=45,
        ge=8,
        le=180,
    )
|
|
|
|
|
|
def _key(text: str, max_length: int, min_length: int) -> str:
    """Build the cache key for one summarization request.

    The key is the SHA-256 hex digest of the UTF-8 encoded text followed by
    both length settings, joined with colons.
    """
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return ":".join((digest, str(max_length), str(min_length)))
|
|
|
|
|
|
@app.get("/health")
async def health():
    """Report that the service is up and which model name it is configured with."""
    status = {"ok": True, "model": MODEL_NAME}
    return status
|
|
|
|
|
|
# Lower-case hard-skill keywords looked up in the lower-cased job text.
_TECH = [
    # languages, runtimes and UI frameworks
    "python", "c#", "dotnet", ".net", "java", "javascript", "typescript",
    "react", "node",
    # databases and data stores
    "sql", "postgres", "postgresql", "mysql", "sqlite", "mongodb", "redis",
    # cloud and infrastructure
    "aws", "azure", "gcp", "docker", "kubernetes", "terraform", "linux",
    # tooling and API styles
    "git", "ci/cd", "graphql", "rest",
]
|
|
|
|
# Lower-case soft-skill phrases looked up in the lower-cased job text.
_SOFT = [
    "communication", "collaboration", "teamwork", "problem solving",
    "leadership", "mentoring", "ownership", "initiative", "adaptability",
    "stakeholder management", "detail oriented",
]
|
|
|
|
|
|
def _strip_html(text: str) -> str:
    """Convert pasted HTML into plain text that can be processed line by line.

    Good enough for job descriptions pasted from the web; this is not a real
    HTML parser.

    * ``<br>`` and closing block tags (``</p>``, ``</li>``, ``</div>``,
      ``</ul>``, ``</ol>``, ``</tr>``, ``</h1>``..``</h6>``) become newlines.
    * Every ``<li>`` starts a new ``"- "`` line, so HTML lists survive as
      bullet lines instead of being merged into one long line.
    * Any remaining tag is replaced by a single space.
    * HTML entities are decoded (``&amp;`` -> ``&``) and non-breaking spaces
      become ordinary spaces.
    * Runs of three or more newlines collapse to one blank line, and the
      result is stripped.

    Args:
        text: Raw text that may contain HTML markup.

    Returns:
        The cleaned plain-text version of ``text``.
    """
    import html  # local import: only this helper needs it

    text = re.sub(r"<\s*br\s*/?>", "\n", text, flags=re.IGNORECASE)
    # Open each list item on its own line with a marker the bullet extractor
    # recognises.
    text = re.sub(r"<\s*li\b[^>]*>", "\n- ", text, flags=re.IGNORECASE)
    text = re.sub(
        r"</\s*(?:p|li|div|ul|ol|tr|h[1-6])\s*>",
        "\n",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after the tags are gone, so an escaped "&lt;b&gt;"
    # stays visible text instead of being stripped as markup.
    text = html.unescape(text).replace("\xa0", " ")
    return re.sub(r"\n{3,}", "\n\n", text).strip()
|
|
|
|
|
|
def _extract_bullets(lines, max_items=8):
    """Collect short list items from ``lines``.

    Each line is stripped; empty lines are skipped. A leading ``-``, ``*`` or
    bullet character followed by whitespace is removed. Items whose remaining
    length is between 3 and 200 characters are kept, and scanning stops once
    ``max_items`` items have been collected.

    NOTE(review): the reviewed copy of this file had lost its indentation.
    The length filter is reconstructed as applying to every non-empty line,
    not only to lines that carry a marker -- confirm against the real file.
    """
    marker = re.compile(r"^([-*]|\u2022)\s+")
    picked = []
    for raw in lines:
        candidate = raw.strip()
        if not candidate:
            continue
        # Removing the marker is a no-op for lines that do not start with one.
        candidate = marker.sub("", candidate).strip()
        if 3 <= len(candidate) <= 200:
            picked.append(candidate)
        if len(picked) >= max_items:
            break
    return picked
|
|
|
|
|
|
def _mentioned_terms(haystack: str, terms) -> list:
    """Return the entries of ``terms`` mentioned in the lower-cased ``haystack``.

    A term that begins with a letter or digit must start at a word start (no
    ASCII letter or digit immediately before it). This stops mid-word hits
    such as "git" in "digital" or "aws" in "laws", while prefix hits such as
    "node" in "nodejs" still count. Terms that begin with punctuation
    (".net") are matched as plain substrings.
    """
    found = []
    for term in terms:
        pattern = re.escape(term)
        if term[:1].isalnum():
            pattern = r"(?<![a-z0-9])" + pattern
        if re.search(pattern, haystack):
            found.append(term)
    return found


def _role_focused_excerpt(text: str) -> dict:
    """Pull the role-relevant parts out of a job post.

    The text is cleaned with ``_strip_html`` and walked line by line. A line
    that matches a known heading switches the current section
    (responsibilities / requirements / nice to have); other lines are
    collected under the current section and reduced to bullet items with
    ``_extract_bullets``. When neither responsibilities nor requirements
    were found, bullet-like lines from the whole text are used instead
    (first six as responsibilities, the next four as requirements).

    Args:
        text: Raw job description, possibly containing HTML.

    Returns:
        A dict with the keys ``cleaned`` (cleaned text), ``focused_input``
        (the sections plus the first 1500 characters of the cleaned text,
        intended as model input), ``responsibilities``, ``requirements``,
        ``nice`` (lists of bullet strings), and ``tech`` / ``soft`` (entries
        of ``_TECH`` / ``_SOFT`` mentioned in the text).
    """
    cleaned = _strip_html(text)
    lines = [ln.strip() for ln in cleaned.splitlines()]

    headings = {
        "responsibilities": ["responsibilities", "what you will do", "what you'll do", "the role", "your role", "you will"],
        "requirements": ["requirements", "what we are looking for", "what we're looking for", "skills", "experience", "must have"],
        "nice": ["nice to have", "bonus", "preferred"],
    }
    bullet_prefix = re.compile(r"^([-*]|\u2022)\s+")

    def match_heading(s: str):
        sl = s.lower().strip(":-\x7f ")
        # A bulleted line is list content. It only counts as a heading when
        # it is exactly a heading name; otherwise an item such as
        # "- Experience with SQL" would be consumed as a "requirements"
        # heading and silently dropped from the output.
        allow_prefix = bullet_prefix.match(s) is None
        for k, words in headings.items():
            for w in words:
                if sl == w or (allow_prefix and sl.startswith(w + " ")):
                    return k
        return None

    section = None
    buckets = {"responsibilities": [], "requirements": [], "nice": []}

    for ln in lines:
        if not ln:
            continue
        h = match_heading(ln)
        if h:
            section = h
            continue
        # Lines seen before the first heading belong to no section.
        if section is not None:
            buckets[section].append(ln)

    responsibilities = _extract_bullets(buckets["responsibilities"], max_items=7)
    requirements = _extract_bullets(buckets["requirements"], max_items=7)
    nice = _extract_bullets(buckets["nice"], max_items=5)

    low = cleaned.lower()
    tech_found = _mentioned_terms(low, _TECH)
    soft_found = _mentioned_terms(low, _SOFT)

    # Fallback: pick bullet-like lines anywhere if sections are missing.
    if not responsibilities and not requirements:
        any_bullets = _extract_bullets(lines, max_items=10)
        responsibilities = any_bullets[:6]
        requirements = any_bullets[6:10]

    focused_parts = []
    if responsibilities:
        focused_parts.append("Responsibilities:\n- " + "\n- ".join(responsibilities))
    if requirements:
        focused_parts.append("Requirements:\n- " + "\n- ".join(requirements))
    if nice:
        focused_parts.append("Nice to have:\n- " + "\n- ".join(nice))

    # Always include a small slice of the original for context.
    focused_parts.append("Context:\n" + cleaned[:1500])

    return {
        "cleaned": cleaned,
        "focused_input": "\n\n".join(focused_parts),
        "responsibilities": responsibilities,
        "requirements": requirements,
        "nice": nice,
        "tech": tech_found,
        "soft": soft_found,
    }
|
|
|
|
|
|
def _model_summarize(text: str, max_length: int, min_length: int) -> str:
    """Generate a summary of ``text`` with the module-level model.

    The input is tokenized with truncation at 1024 tokens, moved to the
    configured device, and decoded from the first sequence returned by a
    4-beam generation run with gradients disabled.

    Args:
        text: Text to summarize.
        max_length: Passed to ``model.generate`` as ``max_length``.
        min_length: Passed to ``model.generate`` as ``min_length``.

    Returns:
        The decoded summary with special tokens removed and whitespace stripped.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    token_ids = encoded.input_ids.to(device)

    # The mask is optional: pass None when the tokenizer output has none.
    if hasattr(encoded, "attention_mask"):
        mask = encoded.attention_mask.to(device)
    else:
        mask = None

    generation_args = {
        "attention_mask": mask,
        "max_length": max_length,
        "min_length": min_length,
        "num_beams": 4,
        "early_stopping": True,
    }
    with torch.no_grad():
        generated = model.generate(token_ids, **generation_args)

    best = generated[0]
    return tokenizer.decode(best, skip_special_tokens=True).strip()
|
|
|
|
|
|
@app.post("/summarize")
async def summarize(req: SummarizeRequest):
    """Summarize a job post and append the extracted bullet sections.

    The response text starts with the model summary of the role-focused
    excerpt, followed (when present) by requirements, responsibilities,
    nice-to-have items, hard skills and soft skills. Results are cached per
    (text, max_length, min_length).

    Args:
        req: Validated request body.

    Returns:
        ``{"summary": <text>, "cached": <bool>}``.

    Raises:
        HTTPException: 400 when ``min_length`` is not smaller than
            ``max_length``.
    """
    if req.min_length >= req.max_length:
        raise HTTPException(status_code=400, detail="min_length must be smaller than max_length.")

    key = _key(req.text, req.max_length, req.min_length)
    # One lookup instead of "key in cache" followed by "cache[key]": with a
    # TTL cache the entry can expire between the two steps and raise KeyError.
    # Stored summaries are always non-empty strings, so None means "absent".
    cached = cache.get(key)
    if cached is not None:
        return {"summary": cached, "cached": True}

    info = _role_focused_excerpt(req.text)

    # Summarize the role-focused excerpt instead of the whole job post.
    # NOTE(review): this is a synchronous model call inside an async handler,
    # so other requests wait while generation runs -- consider offloading it.
    summary = _model_summarize(info["focused_input"], req.max_length, req.min_length)

    lines = ["Role summary:", summary]

    sections = (
        ("What they need from you:", info["requirements"][:7]),
        ("What you would be doing:", info["responsibilities"][:6]),
        ("Nice to have:", info["nice"][:5]),
    )
    for title, items in sections:
        if items:
            lines.append("")
            lines.append(title)
            lines.extend(f"- {x}" for x in items)

    if info["tech"]:
        # NOTE(review): _rank_tech_skills is not defined in the reviewed part
        # of this file -- confirm it exists, otherwise this branch raises
        # NameError for any text that mentions a known technology.
        # dict.fromkeys removes duplicates while keeping first-seen order.
        uniq = list(dict.fromkeys(_rank_tech_skills(info["tech"])))
        lines.append("")
        lines.append("Top hard skills: " + ", ".join(uniq[:10]))

    if info["soft"]:
        uniq_soft = list(dict.fromkeys(info["soft"]))
        lines.append("")
        lines.append("Relevant soft skills: " + ", ".join(uniq_soft[:8]))

    out = "\n".join(lines).strip()
    cache[key] = out
    return {"summary": out, "cached": False}
|
|
skills: " + ", ".join(uniq[:14]))
|
|
|
|
if info["soft"]:
|
|
uniq_soft = []
|
|
for s in info["soft"]:
|
|
if s not in uniq_soft:
|
|
uniq_soft.append(s)
|
|
lines.append("")
|
|
lines.append("Relevant soft skills: " + ", ".join(uniq_soft[:8]))
|
|
|
|
out = "\n".join(lines).strip()
|
|
cache[key] = out
|
|
return {"summary": out, "cached": False}
|