First Commit

2026-03-21 11:55:27 +01:00
commit 2e8a29b4d0
1757 changed files with 166084 additions and 0 deletions
@@ -0,0 +1,7 @@
+FROM python:3.11-slim
+
+# PYTHONUNBUFFERED: uvicorn log lines reach `docker logs` without buffering delays.
+# HF_HOME: keep the Hugging Face model cache in a directory the runtime user owns,
+# because the model is downloaded when the app module is first imported.
+ENV PYTHONUNBUFFERED=1 \
+    HF_HOME=/home/app/.cache/huggingface
+
+# Dedicated unprivileged account with a fixed UID/GID so orchestrators can
+# verify the container does not run as root.
+RUN groupadd --system --gid 10001 app \
+    && useradd --system --uid 10001 --gid app --create-home --home-dir /home/app app
+
+WORKDIR /app
+
+# Install dependencies before copying the source so this layer stays cached
+# until requirements.txt changes.
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY --chown=app:app . .
+
+# Everything that needs root is done; drop privileges for runtime.
+USER app
+
+EXPOSE 8001
+
+# Probe the app's own /health endpoint with the Python interpreter so no extra
+# tooling (curl/wget) has to be installed. The long start period covers the
+# model download and load performed at startup.
+HEALTHCHECK --interval=30s --timeout=5s --start-period=180s --retries=3 \
+    CMD ["python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8001/health', timeout=3)"]
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8001"]
@@ -0,0 +1,32 @@
+# Local Hugging Face Summarizer
+
+This small service runs a Hugging Face summarization model locally and exposes a simple HTTP API.
+
+Install (recommended: virtualenv)
+
+Windows (CPU PyTorch wheel may be required):
+
+```powershell
+python -m venv .venv
+.\.venv\Scripts\Activate.ps1
+pip install -r requirements.txt
+# If the torch wheel fails to install, follow the platform-specific instructions at https://pytorch.org
+python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
+```
+
+Linux / macOS:
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
+```
+
+API
+- `GET /health` — health check
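+- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }` (`max_length` and `min_length` are optional and default to 160 and 45) returns `{ "summary": "...", "cached": false }`; `cached` is `true` when the response was served from the one-hour in-memory cache.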
+
+Notes
+- The model is downloaded on first run and can be several hundred MB.
+- For lower memory usage, consider setting `MODEL_NAME` in the `app` module to `sshleifer/tiny-distilbart-cnn-6-6` or `t5-small`.
@@ -0,0 +1,218 @@
+from fastapi import FastAPI
+from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from cachetools import TTLCache
+import hashlib
+import re
+import torch
+
+# ASGI application object; uvicorn is pointed at it as `app:app`.
+app = FastAPI(title="Local Summarizer")
+
+# Hugging Face checkpoint used for both the tokenizer and the seq2seq model.
+MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
+
+# Tokenizer and model are loaded once at import time, so importing this module
+# performs the load before any request is served.
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+# The model is only used for generation, never trained, so put it in eval mode.
+model.eval()
+# Use the GPU when torch reports CUDA is available, otherwise stay on the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+# Finished responses keyed by _key(); at most 1024 entries, each kept for 60 * 60 seconds.
+cache = TTLCache(maxsize=1024, ttl=60 * 60)  # 1 hour cache
+
+
+class SummarizeRequest(BaseModel):
+    """Request body accepted by ``POST /summarize``.
+
+    ``max_length`` and ``min_length`` are forwarded unchanged to
+    ``model.generate`` and are also part of the cache key.
+    """
+
+    # Raw job-post text; HTML tags are stripped before summarization.
+    text: str
+    # Passed to generation as ``max_length``.
+    max_length: int = 160
+    # Passed to generation as ``min_length``.
+    min_length: int = 45
+
+
+def _key(text: str, max_length: int, min_length: int) -> str:
+    h = hashlib.sha256(text.encode("utf-8")).hexdigest()
+    return f"{h}:{max_length}:{min_length}"
+
+
+@app.get("/health")
+async def health():
+    return {"ok": True, "model": MODEL_NAME}
+
+
+# Lower-case technology keywords searched for in the lower-cased, HTML-stripped
+# job text; hits are reported on the "Tech keywords" line of the summary.
+_TECH = [
+    "python",
+    "c#",
+    "dotnet",
+    ".net",
+    "java",
+    "javascript",
+    "typescript",
+    "react",
+    "node",
+    "sql",
+    "postgres",
+    "postgresql",
+    "mysql",
+    "sqlite",
+    "mongodb",
+    "redis",
+    "aws",
+    "azure",
+    "gcp",
+    "docker",
+    "kubernetes",
+    "terraform",
+    "linux",
+    "git",
+    "ci/cd",
+    "graphql",
+    "rest",
+]
+
+
+def _strip_html(text: str) -> str:
+    # Good enough for job descriptions pasted from the web.
+    text = re.sub(r"<\s*br\s*/?>", "\n", text, flags=re.IGNORECASE)
+    text = re.sub(r"</p\s*>", "\n", text, flags=re.IGNORECASE)
+    text = re.sub(r"<[^>]+>", " ", text)
+    return re.sub(r"\n{3,}", "\n\n", text).strip()
+
+
+def _extract_bullets(lines, max_items=8):
+    out = []
+    for ln in lines:
+        s = ln.strip()
+        if not s:
+            continue
+        if re.match(r"^([-*]|\u2022)\s+", s):
+            s = re.sub(r"^([-*]|\u2022)\s+", "", s).strip()
+            if 3 <= len(s) <= 200:
+                out.append(s)
+        if len(out) >= max_items:
+            break
+    return out
+
+
+def _role_focused_excerpt(text: str) -> dict:
+    cleaned = _strip_html(text)
+    lines = [ln.strip() for ln in cleaned.splitlines()]
+
+    headings = {
+        "responsibilities": ["responsibilities", "what you will do", "what you'll do", "the role", "your role", "you will"],
+        "requirements": ["requirements", "what we are looking for", "what we're looking for", "skills", "experience", "must have"],
+        "nice": ["nice to have", "bonus", "preferred"],
+    }
+
+    def match_heading(s: str):
+        sl = s.lower().strip(":- ")
+        for k, words in headings.items():
+            for w in words:
+                if sl == w or sl.startswith(w + " "):
+                    return k
+        return None
+
+    section = None
+    resp_lines = []
+    req_lines = []
+    nice_lines = []
+
+    for ln in lines:
+        if not ln:
+            continue
+        h = match_heading(ln)
+        if h:
+            section = h
+            continue
+
+        if section == "responsibilities":
+            resp_lines.append(ln)
+        elif section == "requirements":
+            req_lines.append(ln)
+        elif section == "nice":
+            nice_lines.append(ln)
+
+    responsibilities = _extract_bullets(resp_lines, max_items=7)
+    requirements = _extract_bullets(req_lines, max_items=7)
+    nice = _extract_bullets(nice_lines, max_items=5)
+
+    tech_found = []
+    low = cleaned.lower()
+    for t in _TECH:
+        if t in low:
+            tech_found.append(t)
+
+    # Fallback: pick bullet-like lines anywhere if sections are missing.
+    if not responsibilities and not requirements:
+        any_bullets = _extract_bullets(lines, max_items=10)
+        responsibilities = any_bullets[:6]
+        requirements = any_bullets[6:10]
+
+    focused_parts = []
+    if responsibilities:
+        focused_parts.append("Responsibilities:\n- " + "\n- ".join(responsibilities))
+    if requirements:
+        focused_parts.append("Requirements:\n- " + "\n- ".join(requirements))
+    if nice:
+        focused_parts.append("Nice to have:\n- " + "\n- ".join(nice))
+
+    # Always include a small slice of the original for context.
+    focused_parts.append("Context:\n" + cleaned[:1500])
+
+    return {
+        "cleaned": cleaned,
+        "focused_input": "\n\n".join(focused_parts),
+        "responsibilities": responsibilities,
+        "requirements": requirements,
+        "nice": nice,
+        "tech": tech_found,
+    }
+
+
+def _model_summarize(text: str, max_length: int, min_length: int) -> str:
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
+    input_ids = inputs.input_ids.to(device)
+    attention_mask = inputs.attention_mask.to(device) if hasattr(inputs, "attention_mask") else None
+    with torch.no_grad():
+        outputs = model.generate(
+            input_ids,
+            attention_mask=attention_mask,
+            max_length=max_length,
+            min_length=min_length,
+            num_beams=4,
+            early_stopping=True,
+        )
+    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+
+
+@app.post("/summarize")
+async def summarize(req: SummarizeRequest):
+    key = _key(req.text, req.max_length, req.min_length)
+    if key in cache:
+        return {"summary": cache[key], "cached": True}
+
+    info = _role_focused_excerpt(req.text)
+
+    # Summarize the role-focused excerpt instead of the whole job post.
+    summary = _model_summarize(info["focused_input"], req.max_length, req.min_length)
+
+    lines = ["Role summary:", summary]
+
+    if info["responsibilities"]:
+        lines.append("")
+        lines.append("Key responsibilities:")
+        for x in info["responsibilities"][:6]:
+            lines.append(f"- {x}")
+
+    if info["requirements"]:
+        lines.append("")
+        lines.append("Key requirements:")
+        for x in info["requirements"][:6]:
+            lines.append(f"- {x}")
+
+    if info["tech"]:
+        # Keep this short; it's just a hint based on keyword matches.
+        uniq = []
+        for t in info["tech"]:
+            if t not in uniq:
+                uniq.append(t)
+        lines.append("")
+        lines.append("Tech keywords: " + ", ".join(uniq[:14]))
+
+    out = "\n".join(lines).strip()
+    cache[key] = out
+    return {"summary": out, "cached": False}
@@ -0,0 +1,6 @@
+fastapi>=0.85
+uvicorn[standard]>=0.18
+transformers>=4.30
+torch>=1.13
+cachetools>=5.0
+pydantic>=1.10
@@ -0,0 +1,12 @@
+Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
+D:\Job tracker\.venv\Lib\site-packages\huggingface_hub\file_download.py:129: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\Cesnimda\.cache\huggingface\hub\models--sshleifer--distilbart-cnn-12-6. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.
+To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
+  warnings.warn(message)
+Please make sure the generation config includes `forced_bos_token_id=0`. 
+
+Loading weights:   0%|          | 0/358 [00:00<?, ?it/s]
+Loading weights: 100%|##########| 358/358 [00:00<00:00, 24690.63it/s]
+The tied weights mapping and config for this model specifies to tie model.shared.weight to model.decoder.embed_tokens.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning
+The tied weights mapping and config for this model specifies to tie model.shared.weight to model.encoder.embed_tokens.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning
+INFO:     Started server process [30848]
+INFO:     Waiting for application startup.
@@ -0,0 +1,8 @@
+INFO:     127.0.0.1:61287 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:56126 - "POST /summarize HTTP/1.1" 200 OK
+INFO:     127.0.0.1:55075 - "POST /summarize HTTP/1.1" 200 OK
+INFO:     127.0.0.1:55075 - "POST /summarize HTTP/1.1" 200 OK
+INFO:     127.0.0.1:55075 - "POST /summarize HTTP/1.1" 200 OK
+INFO:     127.0.0.1:55075 - "POST /summarize HTTP/1.1" 200 OK
+INFO:     127.0.0.1:55075 - "POST /summarize HTTP/1.1" 200 OK
+INFO:     127.0.0.1:51005 - "POST /summarize HTTP/1.1" 200 OK