Evolve summarizer into AI service with OCR support

This commit is contained in:
cesnimda
2026-03-23 20:12:34 +01:00
parent 90fdd8e1a5
commit 653f713a78
20 changed files with 475 additions and 129 deletions
+3
View File
@@ -5,6 +5,9 @@ ENV PIP_NO_CACHE_DIR=1 \
TRANSFORMERS_NO_TF=1 \
HF_HUB_DISABLE_TELEMETRY=1
WORKDIR /app
RUN apt-get update \
&& apt-get install -y --no-install-recommends tesseract-ocr tesseract-ocr-eng \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel \
&& python -m pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
+22 -11
View File
@@ -1,16 +1,22 @@
# Local Hugging Face Summarizer
# Local AI Service
This small service runs a Hugging Face summarization model locally and exposes a simple HTTP API.
This service runs a local Hugging Face summarization model and also exposes document text extraction with OCR for supported PDFs and images.
Install (recommended: virtualenv)
## Capabilities
- job/role summarization
- PDF text extraction
- OCR fallback for scanned PDFs
- OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`)
- DOCX / TXT / MD extraction
Windows (CPU PyTorch wheel may be required):
## Install
Windows:
```powershell
python -m venv .venv
.\.venv\Scripts\Activate.ps1
pip install -r requirements.txt
# If torch wheel installation is needed, follow instructions at https://pytorch.org
python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
```
@@ -23,10 +29,15 @@ pip install -r requirements.txt
python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
```
API
- `GET /health` — health check
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }` returns `{ "summary": "...", "cached": false }`
## Docker
The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can be processed inside the container.
Notes
- Model will be downloaded on first run and can be several hundred MB.
- For lower memory usage, consider `sshleifer/tiny-distilbart-cnn-6-6` or `t5-small`.
## API
- `GET /health` — health check and runtime capabilities
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`, returns `{ "summary": "...", "cached": false }`
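- `POST /extract-text` — multipart upload (form field `file`, at most 8 MiB); returns the extracted `text` plus metadata (`ocr_used`, `page_count`, `characters`, `content_type`, `file_name`)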
## Notes
- Model weights are downloaded on first run.
- OCR quality depends on scan quality and language support.
- Default OCR language is English (`eng`).
+107 -3
View File
@@ -1,16 +1,25 @@
from fastapi import FastAPI, HTTPException
from fastapi import FastAPI, File, HTTPException, UploadFile
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from cachetools import TTLCache
from PIL import Image
from pypdf import PdfReader
from docx import Document
import fitz
import hashlib
import io
import re
import torch
import pytesseract
app = FastAPI(title="Local Summarizer")
app = FastAPI(title="Local AI Service")
# Hugging Face model identifier.
# NOTE(review): presumably loaded by _load_runtime, whose body is outside this view — confirm.
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
# NOTE(review): usage is outside this view; presumably caps on the summarizer's input text — confirm.
MAX_INPUT_CHARS = 20000
MAX_CONTEXT_CHARS = 2200
# Largest upload /extract-text accepts (8 MiB); bigger files are rejected with HTTP 400.
MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
# Passed as `lang` to pytesseract and reported by /health as "ocr_languages".
OCR_LANGUAGES = "eng"
# Lower-cased, dot-prefixed file extensions that /extract-text OCRs as images.
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
def _load_runtime():
@@ -48,6 +57,8 @@ async def health():
"device": str(device),
"gpu_available": GPU_AVAILABLE,
"gpu_name": GPU_NAME,
"ocr_available": True,
"ocr_languages": OCR_LANGUAGES,
}
@@ -68,7 +79,6 @@ _TECH_PRIORITY = [
"aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
]
# Lower-case phrases.
# NOTE(review): presumably used to spot "must-have" requirement sentences in
# job text; the code that consumes this list is outside this view — confirm.
_MUST_HAVE_HINTS = [
"must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
]
@@ -339,3 +349,97 @@ async def summarize(req: SummarizeRequest):
out = "\n".join(lines).strip()
cache[key] = out
return {"summary": out, "cached": False}
def _normalize_text(value: str) -> str:
value = value.replace("\x00", " ")
return re.sub(r"\s+", " ", value).strip()
def _ocr_image(image: Image.Image) -> str:
    """Run Tesseract OCR on *image* and return the normalized text.

    Images whose mode is neither ``RGB`` nor ``L`` are converted to ``RGB``
    before being handed to pytesseract. Recognition uses the languages
    configured in ``OCR_LANGUAGES``.
    """
    needs_conversion = image.mode not in ("RGB", "L")
    source = image.convert("RGB") if needs_conversion else image
    recognized = pytesseract.image_to_string(source, lang=OCR_LANGUAGES)
    return _normalize_text(recognized)
def _extract_pdf_text(data: bytes) -> tuple[str, bool, int]:
    """Extract text from a PDF, falling back to OCR when little text is found.

    The embedded text layer is read first with pypdf. If that yields fewer
    than 80 normalized characters (or pypdf fails), every page is rendered
    with PyMuPDF at 2x scale and run through OCR instead.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        A ``(text, ocr_used, page_count)`` tuple: the normalized text, whether
        the OCR fallback produced it, and the number of pages in the document.

    Raises:
        Exception: Errors from PyMuPDF or OCR in the fallback path propagate
            to the caller; pypdf errors are swallowed on purpose.
    """
    page_count = 0
    try:
        reader = PdfReader(io.BytesIO(data))
        page_count = len(reader.pages)
        extracted_pages = [page.extract_text() or "" for page in reader.pages]
    except Exception:
        # Deliberate best effort: a PDF that pypdf cannot parse may still be
        # renderable by PyMuPDF, so fall through to the OCR path.
        extracted_pages = []

    text = _normalize_text("\n".join(extracted_pages))
    # Below this length the text layer is treated as missing (scanned PDF).
    if len(text) >= 80:
        return text, False, page_count

    ocr_pages = []
    doc = fitz.open(stream=data, filetype="pdf")
    try:
        page_count = doc.page_count
        for page in doc:
            # Render at 2x scale so small print is legible to the OCR engine.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                ocr_pages.append(_ocr_image(image))
    finally:
        # Release the document even when rendering or OCR fails part-way.
        doc.close()
    return _normalize_text("\n".join(ocr_pages)), True, page_count
def _extract_docx_text(data: bytes) -> str:
    """Return the normalized text of all non-blank paragraphs in a DOCX file.

    Only ``document.paragraphs`` is read; each paragraph is stripped and
    empty ones are dropped before the pieces are joined and normalized.
    """
    document = Document(io.BytesIO(data))
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if not content:
            continue
        trimmed = content.strip()
        if trimmed:
            kept.append(trimmed)
    return _normalize_text("\n".join(kept))
def _extract_plain_text(data: bytes) -> str:
    """Decode a plain-text upload as UTF-8 and return the normalized text.

    Undecodable bytes are dropped rather than raising, so a partly corrupt
    file still yields its readable portion.
    """
    # "utf-8-sig" strips a leading byte-order mark. Plain "utf-8" would keep
    # it as U+FEFF, which is not whitespace and so survives normalization.
    return _normalize_text(data.decode("utf-8-sig", errors="ignore"))
@app.post("/extract-text")
async def extract_text(file: UploadFile = File(...)):
    """Extract text from an uploaded document, using OCR where needed.

    Supported extensions are ``.txt``/``.md`` (decoded as text), ``.docx``,
    ``.pdf`` (text layer with OCR fallback) and those in ``IMAGE_EXTENSIONS``
    (always OCR). The type is chosen from the file name's extension only.

    Args:
        file: The multipart upload.

    Returns:
        A dict with the extracted ``text``, ``ocr_used``, the upload's
        ``content_type``, ``page_count`` (``None`` for text and DOCX files,
        ``1`` for images), ``characters`` and ``file_name``.

    Raises:
        HTTPException: 400 for an empty, oversized or unsupported file,
            500 when extraction itself fails, and 422 when no readable text
            was found.
    """
    filename = file.filename or "document"
    extension = ("." + filename.rsplit(".", 1)[1].lower()) if "." in filename else ""

    # Read one byte past the limit at most: enough to detect an oversized
    # upload without copying an arbitrarily large file into memory.
    data = await file.read(MAX_EXTRACT_FILE_BYTES + 1)
    if not data:
        raise HTTPException(status_code=400, detail="The uploaded file was empty.")
    if len(data) > MAX_EXTRACT_FILE_BYTES:
        raise HTTPException(status_code=400, detail="The uploaded file is too large for AI extraction.")

    ocr_used = False
    page_count = None
    try:
        if extension in {".txt", ".md"}:
            text = _extract_plain_text(data)
        elif extension == ".docx":
            text = _extract_docx_text(data)
        elif extension == ".pdf":
            text, ocr_used, page_count = _extract_pdf_text(data)
        elif extension in IMAGE_EXTENSIONS:
            # Close the image handle once OCR has consumed it.
            with Image.open(io.BytesIO(data)) as image:
                text = _ocr_image(image)
            ocr_used = True
            page_count = 1
        else:
            raise HTTPException(status_code=400, detail="This file type is not supported for AI extraction.")
    except HTTPException:
        # Let the explicit 400 above pass through instead of becoming a 500.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"AI extraction failed: {exc}") from exc

    if not text:
        raise HTTPException(status_code=422, detail="AI extraction did not find readable text in the uploaded file.")

    return {
        "text": text,
        "ocr_used": ocr_used,
        "content_type": file.content_type,
        "page_count": page_count,
        "characters": len(text),
        "file_name": filename,
    }
+5
View File
@@ -4,3 +4,8 @@ transformers==4.48.3
cachetools==5.5.2
pydantic==2.10.6
torch==2.6.0
pillow==11.1.0
pytesseract==0.3.13
pypdf==5.4.0
pymupdf==1.25.5
python-docx==1.1.2