feat: add application package generation and grouped readiness workflows

2026-03-22 18:28:02 +01:00
parent f1c7c38a19
commit 9188039e9d
14 changed files with 1014 additions and 373 deletions
@@ -11,25 +11,24 @@ app = FastAPI(title="Local Summarizer")
 MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
 MAX_INPUT_CHARS = 20000

-# The local summarizer is intentionally simple, but we still validate request sizes
-# so accidental giant pastes do not cause avoidable latency or memory spikes.
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
 model.eval()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
-cache = TTLCache(maxsize=1024, ttl=60 * 60)  # 1 hour cache
+cache = TTLCache(maxsize=1024, ttl=60 * 60)


 class SummarizeRequest(BaseModel):
    text: str = Field(min_length=1, max_length=MAX_INPUT_CHARS)
    max_length: int = Field(default=160, ge=24, le=256)
    min_length: int = Field(default=45, ge=8, le=180)
+    top_skills: int = Field(default=8, ge=3, le=12)


-def _key(text: str, max_length: int, min_length: int) -> str:
+def _key(text: str, max_length: int, min_length: int, top_skills: int) -> str:
    h = hashlib.sha256(text.encode("utf-8")).hexdigest()
-    return f"{h}:{max_length}:{min_length}"
+    return f"{h}:{max_length}:{min_length}:{top_skills}"


@app.get("/health")
@@ -38,52 +37,39 @@ async def health():


 _TECH = [
-    "python",
-    "c#",
-    "dotnet",
-    ".net",
-    "java",
-    "javascript",
-    "typescript",
-    "react",
-    "node",
-    "sql",
-    "postgres",
-    "postgresql",
-    "mysql",
-    "sqlite",
-    "mongodb",
-    "redis",
-    "aws",
-    "azure",
-    "gcp",
-    "docker",
-    "kubernetes",
-    "terraform",
-    "linux",
-    "git",
-    "ci/cd",
-    "graphql",
-    "rest",
+    "python", "c#", "dotnet", ".net", "java", "javascript", "typescript", "react", "node", "sql",
+    "postgres", "postgresql", "mysql", "sqlite", "mongodb", "redis", "aws", "azure", "gcp",
+    "docker", "kubernetes", "terraform", "linux", "git", "ci/cd", "graphql", "rest",
 ]

 _SOFT = [
-    "communication",
-    "collaboration",
-    "teamwork",
-    "problem solving",
-    "leadership",
-    "mentoring",
-    "ownership",
-    "initiative",
-    "adaptability",
-    "stakeholder management",
-    "detail oriented",
+    "communication", "collaboration", "teamwork", "problem solving", "leadership", "mentoring",
+    "ownership", "initiative", "adaptability", "stakeholder management", "detail oriented",
+]
+
+_TECH_PRIORITY = [
+    "python", "c#", ".net", "dotnet", "typescript", "javascript", "react", "node",
+    "sql", "postgresql", "postgres", "mysql", "sqlite", "docker", "kubernetes",
+    "aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
 ]


+def _rank_tech_skills(skills):
+    """Return ``skills`` de-duplicated, with ``_TECH_PRIORITY`` entries first in priority order.
+
+    Skills that are not in ``_TECH_PRIORITY`` follow in their original first-seen order.
+    """
+    # dict.fromkeys drops duplicates while preserving first-seen order.
+    unique = dict.fromkeys(skills)
+    # One pass over the priority list with O(1) membership tests instead of rescanning
+    # ``skills`` for every priority entry.
+    ordered = [name for name in dict.fromkeys(_TECH_PRIORITY) if name in unique]
+    placed = set(ordered)
+    ordered.extend(name for name in unique if name not in placed)
+    return ordered
+
+
 def _strip_html(text: str) -> str:
-    # Good enough for job descriptions pasted from the web.
    text = re.sub(r"<\s*br\s*/?>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"</p\s*>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
@@ -105,6 +91,21 @@ def _extract_bullets(lines, max_items=8):
    return out


+def _top_keywords(text: str, limit=6):
+    """Return up to ``limit`` most frequent words of ``text``; ties sort alphabetically."""
+    stop = {
+        "with", "from", "that", "this", "will", "have", "your", "their", "about", "role", "team", "work",
+        "experience", "skills", "requirements", "responsibilities", "company", "using", "ability", "years",
+    }
+    skip = stop.union(_TECH, _SOFT)  # stop words plus every name already listed in _TECH / _SOFT
+    counts = {}
+    for raw in re.findall(r"[a-zA-Z][a-zA-Z+#./-]{2,}", text.lower()):
+        word = raw.rstrip("./-")  # "team." / "python." must count as "team" / "python", not as new words
+        if len(word) >= 3 and word not in skip:
+            counts[word] = counts.get(word, 0) + 1
+    return [word for word, _ in sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:limit]]
+
+
 def _role_focused_excerpt(text: str) -> dict:
    cleaned = _strip_html(text)
    lines = [ln.strip() for ln in cleaned.splitlines()]
@@ -117,10 +118,10 @@ def _role_focused_excerpt(text: str) -> dict:

    def match_heading(s: str):
        sl = s.lower().strip(":-\x7f ")
-        for k, words in headings.items():
-            for w in words:
-                if sl == w or sl.startswith(w + " "):
-                    return k
+        for key, words in headings.items():
+            for word in words:
+                if sl == word or sl.startswith(word + " "):
+                    return key
        return None

    section = None
@@ -131,11 +132,10 @@ def _role_focused_excerpt(text: str) -> dict:
    for ln in lines:
        if not ln:
            continue
-        h = match_heading(ln)
-        if h:
-            section = h
+        heading = match_heading(ln)
+        if heading:
+            section = heading
            continue
-
        if section == "responsibilities":
            resp_lines.append(ln)
        elif section == "requirements":
@@ -157,7 +157,6 @@ def _role_focused_excerpt(text: str) -> dict:
        if s in low:
            soft_found.append(s)

-    # Fallback: pick bullet-like lines anywhere if sections are missing.
    if not responsibilities and not requirements:
        any_bullets = _extract_bullets(lines, max_items=10)
        responsibilities = any_bullets[:6]
@@ -170,8 +169,6 @@ def _role_focused_excerpt(text: str) -> dict:
        focused_parts.append("Requirements:\n- " + "\n- ".join(requirements))
    if nice:
        focused_parts.append("Nice to have:\n- " + "\n- ".join(nice))
-
-    # Always include a small slice of the original for context.
    focused_parts.append("Context:\n" + cleaned[:1500])

    return {
@@ -182,6 +179,7 @@ def _role_focused_excerpt(text: str) -> dict:
        "nice": nice,
        "tech": tech_found,
        "soft": soft_found,
+        "keywords": _top_keywords(cleaned),
    }


@@ -206,20 +204,18 @@ async def summarize(req: SummarizeRequest):
    if req.min_length >= req.max_length:
        raise HTTPException(status_code=400, detail="min_length must be smaller than max_length.")

-    key = _key(req.text, req.max_length, req.min_length)
+    key = _key(req.text, req.max_length, req.min_length, req.top_skills)
    if key in cache:
        return {"summary": cache[key], "cached": True}

    info = _role_focused_excerpt(req.text)
-
-    # Summarize the role-focused excerpt instead of the whole job post.
    summary = _model_summarize(info["focused_input"], req.max_length, req.min_length)

    lines = ["Role summary:", summary]

    if info["requirements"]:
        lines.append("")
-        lines.append("What they need from you:")
+        lines.append("What the company wants most:")
        for x in info["requirements"][:7]:
            lines.append(f"- {x}")

@@ -241,7 +237,7 @@ async def summarize(req: SummarizeRequest):
            if t not in uniq:
                uniq.append(t)
        lines.append("")
-        lines.append("Top hard skills: " + ", ".join(uniq[:10]))
+        lines.append("Top hard skills: " + ", ".join(uniq[: req.top_skills]))

    if info["soft"]:
        uniq_soft = []
@@ -251,18 +247,20 @@ async def summarize(req: SummarizeRequest):
        lines.append("")
        lines.append("Relevant soft skills: " + ", ".join(uniq_soft[:8]))

-    out = "\n".join(lines).strip()
-    cache[key] = out
-    return {"summary": out, "cached": False}
-skills: " + ", ".join(uniq[:14]))
-
-    if info["soft"]:
-        uniq_soft = []
-        for s in info["soft"]:
-            if s not in uniq_soft:
-                uniq_soft.append(s)
+    if info["keywords"]:
        lines.append("")
-        lines.append("Relevant soft skills: " + ", ".join(uniq_soft[:8]))
+        lines.append("Key themes: " + ", ".join(info["keywords"][:6]))
+
+    lines.append("")
+    lines.append("Interview focus:")
+    if info["requirements"]:
+        for x in info["requirements"][:3]:
+            lines.append(f"- Prepare examples that demonstrate: {x}")
+    elif info["tech"]:
+        for x in _rank_tech_skills(info["tech"])[:3]:
+            lines.append(f"- Be ready to explain your hands-on experience with {x}")
+    else:
+        lines.append("- Prepare examples showing relevant impact, collaboration, and delivery.")

    out = "\n".join(lines).strip()
    cache[key] = out