Evolve summarizer into AI service with OCR support
This commit is contained in:
@@ -5,6 +5,9 @@ ENV PIP_NO_CACHE_DIR=1 \
|
||||
TRANSFORMERS_NO_TF=1 \
|
||||
HF_HUB_DISABLE_TELEMETRY=1
|
||||
WORKDIR /app
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends tesseract-ocr tesseract-ocr-eng \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
COPY requirements.txt ./
|
||||
RUN python -m pip install --upgrade pip setuptools wheel \
|
||||
&& python -m pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
|
||||
|
||||
+22
-11
@@ -1,16 +1,22 @@
|
||||
# Local Hugging Face Summarizer
|
||||
# Local AI Service
|
||||
|
||||
This small service runs a Hugging Face summarization model locally and exposes a simple HTTP API.
|
||||
This service runs a local Hugging Face summarization model and also exposes document text extraction with OCR for supported PDFs and images.
|
||||
|
||||
Install (recommended: virtualenv)
|
||||
## Capabilities
|
||||
- job/role summarization
|
||||
- PDF text extraction
|
||||
- OCR fallback for scanned PDFs
|
||||
- OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`)
|
||||
- DOCX / TXT / MD extraction
|
||||
|
||||
Windows (CPU PyTorch wheel may be required):
|
||||
## Install
|
||||
|
||||
Windows:
|
||||
|
||||
```powershell
|
||||
python -m venv .venv
|
||||
.\.venv\Scripts\Activate.ps1
|
||||
pip install -r requirements.txt
|
||||
# If torch wheel installation is needed, follow instructions at https://pytorch.org
|
||||
python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
|
||||
```
|
||||
|
||||
@@ -23,10 +29,15 @@ pip install -r requirements.txt
|
||||
python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
|
||||
```
|
||||
|
||||
API
|
||||
- `GET /health` — health check
|
||||
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }` returns `{ "summary": "...", "cached": false }`
|
||||
## Docker
|
||||
The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can be processed inside the container.
|
||||
|
||||
Notes
|
||||
- Model will be downloaded on first run and can be several hundred MB.
|
||||
- For lower memory usage, consider `sshleifer/tiny-distilbart-cnn-6-6` or `t5-small`.
|
||||
## API
|
||||
- `GET /health` — health check and runtime capabilities
|
||||
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`
|
||||
- `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata
|
||||
|
||||
## Notes
|
||||
- Model weights are downloaded on first run.
|
||||
- OCR quality depends on scan quality and language support.
|
||||
- Default OCR language is English (`eng`).
|
||||
|
||||
+107
-3
@@ -1,16 +1,25 @@
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi import FastAPI, File, HTTPException, UploadFile
|
||||
from pydantic import BaseModel, Field
|
||||
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
||||
from cachetools import TTLCache
|
||||
from PIL import Image
|
||||
from pypdf import PdfReader
|
||||
from docx import Document
|
||||
import fitz
|
||||
import hashlib
|
||||
import io
|
||||
import re
|
||||
import torch
|
||||
import pytesseract
|
||||
|
||||
app = FastAPI(title="Local Summarizer")
|
||||
app = FastAPI(title="Local AI Service")
|
||||
|
||||
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
|
||||
MAX_INPUT_CHARS = 20000
|
||||
MAX_CONTEXT_CHARS = 2200
|
||||
MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
|
||||
OCR_LANGUAGES = "eng"
|
||||
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
|
||||
|
||||
|
||||
def _load_runtime():
|
||||
@@ -48,6 +57,8 @@ async def health():
|
||||
"device": str(device),
|
||||
"gpu_available": GPU_AVAILABLE,
|
||||
"gpu_name": GPU_NAME,
|
||||
"ocr_available": True,
|
||||
"ocr_languages": OCR_LANGUAGES,
|
||||
}
|
||||
|
||||
|
||||
@@ -68,7 +79,6 @@ _TECH_PRIORITY = [
|
||||
"aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
|
||||
]
|
||||
|
||||
|
||||
_MUST_HAVE_HINTS = [
|
||||
"must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
|
||||
]
|
||||
@@ -339,3 +349,97 @@ async def summarize(req: SummarizeRequest):
|
||||
out = "\n".join(lines).strip()
|
||||
cache[key] = out
|
||||
return {"summary": out, "cached": False}
|
||||
|
||||
|
||||
def _normalize_text(value: str) -> str:
|
||||
value = value.replace("\x00", " ")
|
||||
return re.sub(r"\s+", " ", value).strip()
|
||||
|
||||
|
||||
def _ocr_image(image: Image.Image) -> str:
    """Return the normalized text that pytesseract recognizes in *image*.

    Images whose mode is neither ``"RGB"`` nor ``"L"`` are converted to
    ``"RGB"`` before recognition. The recognition language is taken from
    the module-level ``OCR_LANGUAGES`` setting.
    """
    already_supported = image.mode in ("RGB", "L")
    prepared = image if already_supported else image.convert("RGB")
    raw_text = pytesseract.image_to_string(prepared, lang=OCR_LANGUAGES)
    return _normalize_text(raw_text)
|
||||
|
||||
|
||||
def _extract_pdf_text(data: bytes) -> tuple[str, bool, int]:
    """Extract text from a PDF, falling back to OCR when the text layer is thin.

    The embedded text layer is read with pypdf first. If that yields at
    least 80 normalized characters it is returned as-is. Otherwise every
    page is rendered with PyMuPDF at 2x scale and passed through OCR.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        A ``(text, ocr_used, page_count)`` tuple. ``ocr_used`` is ``True``
        only when the returned text came from OCR.

    Raises:
        Exception: Whatever PyMuPDF, Pillow or the OCR helper raises while
            rendering/recognizing pages; the caller is expected to map it
            to an HTTP error.
    """
    page_count = 0
    extracted_pages = []
    try:
        reader = PdfReader(io.BytesIO(data))
        page_count = len(reader.pages)
        extracted_pages = [page.extract_text() or "" for page in reader.pages]
    except Exception:
        # Deliberate best effort: a PDF that pypdf cannot parse may still be
        # renderable, so fall through to the OCR path with no native text.
        extracted_pages = []

    text = _normalize_text("\n".join(extracted_pages))
    if len(text) >= 80:
        return text, False, page_count

    doc = fitz.open(stream=data, filetype="pdf")
    try:
        page_count = doc.page_count
        ocr_pages = []
        for page in doc:
            # Render at 2x scale without an alpha channel before OCR.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                ocr_pages.append(_ocr_image(image))
    finally:
        # Release the document even when rendering or OCR fails part-way.
        doc.close()

    ocr_text = _normalize_text("\n".join(ocr_pages))
    if not ocr_text and text:
        # OCR found nothing, but the PDF had a short native text layer:
        # returning it beats reporting the document as unreadable.
        return text, False, page_count
    return ocr_text, True, page_count
|
||||
|
||||
|
||||
def _extract_docx_text(data: bytes) -> str:
    """Return the normalized text of all non-blank paragraphs in a DOCX file.

    Only ``document.paragraphs`` is read; each paragraph is stripped and
    blank ones are dropped before the pieces are joined and normalized.
    """
    document = Document(io.BytesIO(data))
    parts = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if not content:
            continue
        stripped = content.strip()
        if stripped:
            parts.append(stripped)
    joined = "\n".join(parts)
    return _normalize_text(joined)
|
||||
|
||||
|
||||
def _extract_plain_text(data: bytes) -> str:
    """Decode plain-text bytes as UTF-8 and return the normalized text.

    ``utf-8-sig`` is used so that a leading byte-order mark (common in files
    saved by Windows editors) is dropped instead of leaking into the output
    as U+FEFF, which whitespace normalization does not remove. Bytes that
    are not valid UTF-8 are skipped rather than raising.
    """
    return _normalize_text(data.decode("utf-8-sig", errors="ignore"))
|
||||
|
||||
|
||||
@app.post("/extract-text")
async def extract_text(file: UploadFile = File(...)):
    """Extract readable text from an uploaded document or image.

    The handler is chosen from the file name's extension: ``.txt``/``.md``
    are decoded as text, ``.docx`` and ``.pdf`` use their dedicated
    extractors, and extensions in ``IMAGE_EXTENSIONS`` are run through OCR.

    Args:
        file: The multipart upload to process.

    Returns:
        A dict with the keys ``text``, ``ocr_used``, ``content_type``,
        ``page_count``, ``characters`` and ``file_name``.

    Raises:
        HTTPException: 400 for an empty, oversized or unsupported upload;
            422 when no text could be extracted; 500 when extraction itself
            fails.
    """
    filename = file.filename or "document"
    extension = "." + filename.rsplit(".", 1)[1].lower() if "." in filename else ""

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without loading the whole thing into memory first.
    data = await file.read(MAX_EXTRACT_FILE_BYTES + 1)
    if not data:
        raise HTTPException(status_code=400, detail="The uploaded file was empty.")
    if len(data) > MAX_EXTRACT_FILE_BYTES:
        raise HTTPException(status_code=400, detail="The uploaded file is too large for AI extraction.")

    # Defaults for the formats that involve neither OCR nor pages.
    ocr_used = False
    page_count = None

    # NOTE(review): the extractors below run synchronously inside this
    # coroutine, so a long OCR job holds up other requests on the same worker.
    try:
        if extension in {".txt", ".md"}:
            text = _extract_plain_text(data)
        elif extension == ".docx":
            text = _extract_docx_text(data)
        elif extension == ".pdf":
            text, ocr_used, page_count = _extract_pdf_text(data)
        elif extension in IMAGE_EXTENSIONS:
            # Close the decoded image as soon as OCR has finished with it.
            with Image.open(io.BytesIO(data)) as image:
                text = _ocr_image(image)
            ocr_used = True
            page_count = 1
        else:
            raise HTTPException(status_code=400, detail="This file type is not supported for AI extraction.")
    except HTTPException:
        # Let deliberate HTTP errors through untouched.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"AI extraction failed: {exc}") from exc

    if not text:
        raise HTTPException(status_code=422, detail="AI extraction did not find readable text in the uploaded file.")

    return {
        "text": text,
        "ocr_used": ocr_used,
        "content_type": file.content_type,
        "page_count": page_count,
        "characters": len(text),
        "file_name": filename,
    }
|
||||
|
||||
@@ -4,3 +4,8 @@ transformers==4.48.3
|
||||
cachetools==5.5.2
|
||||
pydantic==2.10.6
|
||||
torch==2.6.0
|
||||
pillow==11.1.0
|
||||
pytesseract==0.3.13
|
||||
pypdf==5.4.0
|
||||
pymupdf==1.25.5
|
||||
python-docx==1.1.2
|
||||
|
||||
Reference in New Issue
Block a user