Files
jobtrackingapp/tools/summarizer/app.py
T

268 lines
8.7 KiB
Python

import hashlib
import html
import re

import torch
from cachetools import TTLCache
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# FastAPI application object; the /health and /summarize routes register on it.
app = FastAPI(title="Local Summarizer")
# Hugging Face model identifier used for both the tokenizer and the model.
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
# Maximum accepted length of SummarizeRequest.text, in characters.
MAX_INPUT_CHARS = 20000
# Tokenizer and model are loaded once, at import time, and shared by all requests.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Switch the model to evaluation mode; it is only used for generation here.
model.eval()
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Finished summaries keyed by _key(): at most 1024 entries, each kept for one hour.
cache = TTLCache(maxsize=1024, ttl=60 * 60)
class SummarizeRequest(BaseModel):
    """Request body accepted by the ``/summarize`` endpoint."""

    # Posting text to summarize; required, 1 to MAX_INPUT_CHARS characters.
    text: str = Field(..., min_length=1, max_length=MAX_INPUT_CHARS)
    # Upper length bound handed to the model's generate() call (24-256).
    max_length: int = Field(160, ge=24, le=256)
    # Lower length bound handed to the model's generate() call (8-180).
    min_length: int = Field(45, ge=8, le=180)
    # Number of entries shown on the "Top hard skills" line (3-12).
    top_skills: int = Field(8, ge=3, le=12)
def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
    """Build the cache key for one summarize request.

    The key is the SHA-256 hex digest of ``text`` followed by the three numeric
    options, so changing the text or any option yields a different key.

    Args:
        text: Raw request text.
        max_length: Requested maximum summary length.
        min_length: Requested minimum summary length.
        top_skills: Requested number of hard skills to list.

    Returns:
        A string of the form ``"<sha256 hex>:<max>:<min>:<top_skills>"``.
    """
    # "surrogatepass" keeps hashing from raising UnicodeEncodeError when the
    # text carries a lone surrogate (e.g. decoded from a JSON "\ud83d" escape).
    # Text without surrogates encodes to exactly the same bytes as before.
    digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).hexdigest()
    return f"{digest}:{max_length}:{min_length}:{top_skills}"
@app.get("/health")
async def health():
    """Report that the service is running and name the configured model."""
    status = {"ok": True}
    status["model"] = MODEL_NAME
    return status
# Hard-skill terms looked for in the lower-cased posting text; they are also
# excluded from the generic keyword list built by _top_keywords().
_TECH = [
    "python", "c#", "dotnet", ".net", "java", "javascript", "typescript", "react", "node", "sql",
    "postgres", "postgresql", "mysql", "sqlite", "mongodb", "redis", "aws", "azure", "gcp",
    "docker", "kubernetes", "terraform", "linux", "git", "ci/cd", "graphql", "rest",
]
# Soft-skill phrases looked for in the lower-cased posting text; likewise
# excluded from the generic keyword list.
_SOFT = [
    "communication", "collaboration", "teamwork", "problem solving", "leadership", "mentoring",
    "ownership", "initiative", "adaptability", "stakeholder management", "detail oriented",
]
# Display order used by _rank_tech_skills(): skills named here come first, in
# this order. Terms from _TECH that are absent here (e.g. "java", "redis")
# are listed after them.
_TECH_PRIORITY = [
    "python", "c#", ".net", "dotnet", "typescript", "javascript", "react", "node",
    "sql", "postgresql", "postgres", "mysql", "sqlite", "docker", "kubernetes",
    "aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
]
def _rank_tech_skills(skills):
    """Return ``skills`` without repeats, prioritized technologies first.

    Skills named in ``_TECH_PRIORITY`` come first, in that list's order; the
    remaining skills follow in the order they first appear in ``skills``.
    """
    # dict.fromkeys drops repeats while keeping first-seen order.
    distinct = list(dict.fromkeys(skills))
    position = {}
    for index, name in enumerate(_TECH_PRIORITY):
        position.setdefault(name, index)
    unranked = len(_TECH_PRIORITY)
    # sorted() is stable, so skills without a priority keep their input order.
    return sorted(distinct, key=lambda name: position.get(name, unranked))
def _strip_html(text: str) -> str:
    """Convert an HTML (or plain-text) job posting to plain text.

    Line structure is preserved where the markup implies it: ``<br>``, list
    items and the end of block-level elements become newlines, so bullet lists
    stay one item per line. Other tags become a space, ``<script>``/``<style>``
    bodies are dropped, and character references such as ``&amp;`` are decoded.

    Args:
        text: Raw posting text, possibly containing HTML markup.

    Returns:
        The cleaned text, stripped, with runs of three or more newlines
        collapsed to a single blank line.
    """
    # Script and style bodies are code, not posting content.
    text = re.sub(
        r"<(script|style)\b[^>]*>.*?</\1\s*>",
        " ",
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    text = re.sub(r"<\s*br\b[^>]*>", "\n", text, flags=re.IGNORECASE)
    # Start of a list item or end of a block element => line break, otherwise
    # "<li>A</li><li>B</li>" would collapse into the single line "A B".
    text = re.sub(
        r"<li\b[^>]*>|</(?:p|div|li|ul|ol|tr|h[1-6])\s*>",
        "\n",
        text,
        flags=re.IGNORECASE,
    )
    # Only strip things that look like markup (tag name, "/", "!" or "?" right
    # after "<"), so plain-text comparisons like "<5 years ... >3" survive.
    text = re.sub(r"<(?:/?[a-zA-Z]|[!?])[^>]*>", " ", text)
    # Decode entities last so an escaped "&lt;b&gt;" is kept as literal text.
    text = html.unescape(text).replace("\xa0", " ")
    return re.sub(r"\n{3,}", "\n\n", text).strip()
# Leading list marker: "-", "*", common bullet/dash glyphs, or "1." / "1)".
# A digit run must be followed by "." or ")" and whitespace, so text such as
# "3.5 years" or "5 years" is not mistaken for a numbered item.
_BULLET_PREFIX = re.compile(
    r"^(?:[-*\u2022\u2013\u2014\u00b7\u25e6\u25aa\u2023]|\d{1,3}[.)])\s+"
)


def _extract_bullets(lines, max_items=8):
    """Collect up to ``max_items`` short content lines, without list markers.

    Every non-empty line is considered, bulleted or not. A leading list marker
    is removed, and the line is kept when the remaining text is between 3 and
    200 characters long.

    Args:
        lines: Iterable of text lines.
        max_items: Maximum number of items to return; values <= 0 yield ``[]``.

    Returns:
        The kept lines, in input order.
    """
    if max_items <= 0:
        return []
    out = []
    for ln in lines:
        s = _BULLET_PREFIX.sub("", ln.strip(), count=1).strip()
        if 3 <= len(s) <= 200:
            out.append(s)
            if len(out) >= max_items:
                break
    return out
def _top_keywords(text: str, limit=6):
    """Return the most frequent non-trivial words of ``text``.

    Words are lower-cased tokens of at least three characters that start with
    a letter and may contain ``+ # . / -`` (so "c++" or "ci/cd" stay intact).
    Stop words and anything already listed in ``_TECH`` or ``_SOFT`` are
    skipped.

    Args:
        text: Plain text to analyse.
        limit: Maximum number of keywords to return; values <= 0 yield ``[]``.

    Returns:
        Keywords ordered by descending frequency, ties broken alphabetically.
    """
    stop = {
        "with", "from", "that", "this", "will", "have", "your", "their", "about", "role", "team", "work",
        "experience", "skills", "requirements", "responsibilities", "company", "using", "ability", "years",
    }
    excluded = stop.union(_TECH, _SOFT)
    counts = {}
    for token in re.findall(r"[a-zA-Z][a-zA-Z+#./-]{2,}", text.lower()):
        # The token pattern also swallows sentence punctuation; trimming it
        # makes "software." and "software" count as the same word and lets
        # "python." hit the exclusion lists.
        word = token.rstrip("./-")
        if len(word) < 3 or word in excluded:
            continue
        counts[word] = counts.get(word, 0) + 1
    ordered = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    # max() guards against a negative limit slicing from the end of the list.
    return [word for word, _ in ordered[:max(limit, 0)]]
def _heading_section(line: str, headings: dict):
    """Return the section key if ``line`` looks like a section heading, else None."""
    label = line.lower().strip(":-\x7f ")
    bulleted = re.match(r"^([-*]|\u2022)\s+", line) is not None
    ends_with_colon = line.endswith(":")
    ends_like_sentence = label.endswith((".", "!", "?"))
    word_count = len(label.split())
    for key, phrases in headings.items():
        for phrase in phrases:
            if label == phrase:
                return key
            # A bullet such as "- Experience with Python" is content, not a
            # heading, even though it starts with a heading phrase.
            if bulleted or not label.startswith(phrase + " "):
                continue
            # Accept "Requirements and qualifications" or "What you'll do here:",
            # but not a full sentence like "You will build and run our APIs."
            extra_words = word_count - len(phrase.split())
            if ends_with_colon or (extra_words <= 3 and not ends_like_sentence):
                return key
    return None


def _mentions_term(haystack: str, term: str) -> bool:
    """Return True if lower-cased ``haystack`` contains ``term`` as a whole token."""
    pattern = re.escape(term)
    # Boundaries are only added next to alphanumeric edges of the term, so
    # ".net" still matches inside "asp.net" and "c#" needs no trailing check.
    if term[:1].isalnum():
        pattern = r"(?<![a-z0-9])" + pattern
    if term[-1:].isalnum():
        # Common spellings "reactjs", "nodejs" and "restful" still count.
        pattern += r"(?:js|ful)?(?![a-z0-9])"
    return re.search(pattern, haystack) is not None


def _role_focused_excerpt(text: str) -> dict:
    """Extract the role-relevant parts of a job posting.

    The posting is cleaned with ``_strip_html`` and split into lines. Lines
    that follow a recognised heading are filed under responsibilities,
    requirements or nice-to-have. If neither responsibilities nor requirements
    are found, the first short lines of the posting are used instead.

    Args:
        text: Raw posting text, possibly containing HTML.

    Returns:
        A dict with the keys ``cleaned`` (plain text), ``focused_input`` (text
        to feed the summarization model), ``responsibilities``,
        ``requirements``, ``nice`` (lists of lines), ``tech`` and ``soft``
        (detected skills, in ``_TECH`` / ``_SOFT`` order) and ``keywords``.
    """
    cleaned = _strip_html(text)
    lines = [ln.strip() for ln in cleaned.splitlines()]
    headings = {
        "responsibilities": ["responsibilities", "what you will do", "what you'll do", "the role", "your role", "you will"],
        "requirements": ["requirements", "what we are looking for", "what we're looking for", "skills", "experience", "must have"],
        "nice": ["nice to have", "bonus", "preferred"],
    }

    # File every non-heading line under the most recent heading; lines that
    # appear before the first heading belong to no section.
    section_lines = {key: [] for key in headings}
    section = None
    for ln in lines:
        if not ln:
            continue
        heading = _heading_section(ln, headings)
        if heading:
            section = heading
            continue
        if section is not None:
            section_lines[section].append(ln)

    responsibilities = _extract_bullets(section_lines["responsibilities"], max_items=7)
    requirements = _extract_bullets(section_lines["requirements"], max_items=7)
    nice = _extract_bullets(section_lines["nice"], max_items=5)

    low = cleaned.lower()
    # Whole-token matching: a plain substring test reports "java" for
    # "javascript", "rest" for "interest" and "git" for "digital".
    tech_found = [t for t in _TECH if _mentions_term(low, t)]
    # Treat hyphens and line breaks as spaces so "problem-solving" and
    # "detail-oriented" match their entries in _SOFT.
    low_spaced = re.sub(r"[-\s]+", " ", low)
    soft_found = [s for s in _SOFT if s in low_spaced]

    if not responsibilities and not requirements:
        # No recognisable sections: fall back to the first short lines.
        any_bullets = _extract_bullets(lines, max_items=10)
        responsibilities = any_bullets[:6]
        requirements = any_bullets[6:10]

    focused_parts = []
    if responsibilities:
        focused_parts.append("Responsibilities:\n- " + "\n- ".join(responsibilities))
    if requirements:
        focused_parts.append("Requirements:\n- " + "\n- ".join(requirements))
    if nice:
        focused_parts.append("Nice to have:\n- " + "\n- ".join(nice))
    # Always add the opening of the posting as general context for the model.
    focused_parts.append("Context:\n" + cleaned[:1500])

    return {
        "cleaned": cleaned,
        "focused_input": "\n\n".join(focused_parts),
        "responsibilities": responsibilities,
        "requirements": requirements,
        "nice": nice,
        "tech": tech_found,
        "soft": soft_found,
        "keywords": _top_keywords(cleaned),
    }
def _model_summarize(text: str, max_length: int, min_length: int) -> str:
    """Summarize ``text`` with the module-level tokenizer and model.

    The input is tokenized with truncation at 1024 tokens, and the summary is
    generated with a 4-beam search between ``min_length`` and ``max_length``.

    Returns:
        The decoded first generated sequence, without special tokens and
        stripped of surrounding whitespace.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    token_ids = encoded.input_ids.to(device)
    # Pass no mask at all when the tokenizer did not produce one.
    mask = None
    if hasattr(encoded, "attention_mask"):
        mask = encoded.attention_mask.to(device)
    generation_options = {
        "attention_mask": mask,
        "max_length": max_length,
        "min_length": min_length,
        "num_beams": 4,
        "early_stopping": True,
    }
    # Generation only; gradient tracking is disabled for the call.
    with torch.no_grad():
        generated = model.generate(token_ids, **generation_options)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True).strip()
def _render_summary(info: dict, summary: str, top_skills: int) -> str:
    """Format the model summary and the extracted posting details as text."""
    lines = ["Role summary:", summary]

    bullet_sections = (
        ("What the company wants most:", info["requirements"][:7]),
        ("What you would be doing:", info["responsibilities"][:6]),
        ("Nice to have:", info["nice"][:5]),
    )
    for title, items in bullet_sections:
        if items:
            lines.append("")
            lines.append(title)
            lines.extend(f"- {item}" for item in items)

    # _rank_tech_skills already removes repeats, so no extra pass is needed.
    ranked_tech = _rank_tech_skills(info["tech"])
    if ranked_tech:
        lines.append("")
        lines.append("Top hard skills: " + ", ".join(ranked_tech[:top_skills]))

    soft = list(dict.fromkeys(info["soft"]))
    if soft:
        lines.append("")
        lines.append("Relevant soft skills: " + ", ".join(soft[:8]))

    if info["keywords"]:
        lines.append("")
        lines.append("Key themes: " + ", ".join(info["keywords"][:6]))

    lines.append("")
    lines.append("Interview focus:")
    if info["requirements"]:
        for item in info["requirements"][:3]:
            lines.append(f"- Prepare examples that demonstrate: {item}")
    elif ranked_tech:
        for item in ranked_tech[:3]:
            lines.append(f"- Be ready to explain your hands-on experience with {item}")
    else:
        lines.append("- Prepare examples showing relevant impact, collaboration, and delivery.")

    return "\n".join(lines).strip()


@app.post("/summarize")
async def summarize(req: SummarizeRequest):
    """Summarize a job posting and list its key requirements and skills.

    Args:
        req: Validated request body.

    Returns:
        ``{"summary": <text>, "cached": <bool>}``; ``cached`` is True when the
        text was served from the in-memory cache.

    Raises:
        HTTPException: 400 when ``min_length`` is not smaller than ``max_length``.
    """
    if req.min_length >= req.max_length:
        raise HTTPException(status_code=400, detail="min_length must be smaller than max_length.")

    key = _key(req.text, req.max_length, req.min_length, req.top_skills)
    # A single lookup instead of "key in cache" followed by "cache[key]": the
    # entry can expire between the two steps, which would raise KeyError.
    try:
        cached = cache[key]
    except KeyError:
        cached = None
    if cached is not None:
        return {"summary": cached, "cached": True}

    info = _role_focused_excerpt(req.text)
    # NOTE(review): this is a plain blocking call inside a coroutine, so the
    # event loop is occupied until generation finishes.
    summary = _model_summarize(info["focused_input"], req.max_length, req.min_length)

    out = _render_summary(info, summary, req.top_skills)
    cache[key] = out
    return {"summary": out, "cached": False}