Evolve summarizer into AI service with OCR support

2026-03-23 20:12:34 +01:00
parent 90fdd8e1a5
commit 653f713a78
20 changed files with 475 additions and 129 deletions
@@ -5,6 +5,9 @@ ENV PIP_NO_CACHE_DIR=1 \
    TRANSFORMERS_NO_TF=1 \
    HF_HUB_DISABLE_TELEMETRY=1
 WORKDIR /app
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends tesseract-ocr tesseract-ocr-eng \
+    && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt ./
 RUN python -m pip install --upgrade pip setuptools wheel \
    && python -m pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
@@ -1,16 +1,22 @@
-# Local Hugging Face Summarizer
+# Local AI Service

-This small service runs a Hugging Face summarization model locally and exposes a simple HTTP API.
+This service runs a local Hugging Face summarization model and also exposes document text extraction with OCR for supported PDFs and images.

-Install (recommended: virtualenv)
+## Capabilities
+- job/role summarization
+- PDF text extraction
+- OCR fallback for scanned PDFs
+- OCR for image uploads (`png`, `jpg`, `jpeg`, `webp`)
+- DOCX / TXT / MD extraction

-Windows (CPU PyTorch wheel may be required):
+## Install
+
+Windows:

 ```powershell
 python -m venv .venv
 .\.venv\Scripts\Activate.ps1
 pip install -r requirements.txt
-# If torch wheel installation is needed, follow instructions at https://pytorch.org
 python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
 ```

@@ -23,10 +29,15 @@ pip install -r requirements.txt
 python -m uvicorn app:app --host 127.0.0.1 --port 8001 --workers 1
 ```

-API
- `GET /health` — health check
- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }` returns `{ "summary": "...", "cached": false }`
+## Docker
+The Dockerfile installs Tesseract OCR so scanned PDFs and supported images can be processed inside the container.

-Notes
- Model will be downloaded on first run and can be several hundred MB.
- For lower memory usage, consider `sshleifer/tiny-distilbart-cnn-6-6` or `t5-small`.
+## API
+- `GET /health` — health check and runtime capabilities
+- `POST /summarize` — JSON body `{ "text": "...", "max_length": 150, "min_length": 30 }`
+- `POST /extract-text` — multipart file upload, returns extracted text and OCR metadata
+
+## Notes
+- Model weights are downloaded on first run.
+- OCR quality depends on scan quality and language support.
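+- Default OCR language is English (`eng`).
+- Outside Docker, OCR requires the Tesseract binary to be installed and available on `PATH`; `pip install -r requirements.txt` only installs the Python wrapper.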
@@ -1,16 +1,25 @@
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, File, HTTPException, UploadFile
 from pydantic import BaseModel, Field
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from cachetools import TTLCache
+from PIL import Image
+from pypdf import PdfReader
+from docx import Document
+import fitz
 import hashlib
+import io
 import re
 import torch
+import pytesseract

-app = FastAPI(title="Local Summarizer")
+app = FastAPI(title="Local AI Service")

 MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
 MAX_INPUT_CHARS = 20000
 MAX_CONTEXT_CHARS = 2200
+# Largest upload accepted by /extract-text, in bytes (8 MiB); larger files are rejected.
+MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
+# Language code(s) handed to pytesseract as its ``lang`` argument.
+OCR_LANGUAGES = "eng"
+# Lowercase file extensions (with leading dot) that /extract-text routes to image OCR.
+IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}


 def _load_runtime():
@@ -48,6 +57,8 @@ async def health():
        "device": str(device),
        "gpu_available": GPU_AVAILABLE,
        "gpu_name": GPU_NAME,
+        "ocr_available": True,
+        "ocr_languages": OCR_LANGUAGES,
    }


@@ -68,7 +79,6 @@ _TECH_PRIORITY = [
    "aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
 ]

-
 _MUST_HAVE_HINTS = [
    "must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
 ]
@@ -339,3 +349,97 @@ async def summarize(req: SummarizeRequest):
    out = "\n".join(lines).strip()
    cache[key] = out
    return {"summary": out, "cached": False}
+
+
+def _normalize_text(value: str) -> str:
+    """Return *value* with NUL bytes blanked and whitespace runs collapsed.
+
+    Every ``\\x00`` becomes a space, each run of whitespace becomes a single
+    space, and leading/trailing whitespace is removed.
+    """
+    without_nuls = value.replace("\x00", " ")
+    collapsed = re.sub(r"\s+", " ", without_nuls)
+    return collapsed.strip()
+
+
+def _ocr_image(image: Image.Image) -> str:
+    """Run Tesseract OCR on *image* and return the normalized text.
+
+    Images whose mode is neither ``RGB`` nor ``L`` are converted to ``RGB``
+    before being passed to pytesseract with the configured OCR languages.
+    """
+    prepared = image if image.mode in ("RGB", "L") else image.convert("RGB")
+    raw_text = pytesseract.image_to_string(prepared, lang=OCR_LANGUAGES)
+    return _normalize_text(raw_text)
+
+
+def _extract_pdf_text(data: bytes) -> tuple[str, bool, int]:
+    """Extract text from a PDF, falling back to OCR when little text is found.
+
+    The embedded text layer is read with pypdf first. If that yields at least
+    80 normalized characters it is returned as-is. Otherwise every page is
+    rendered with PyMuPDF at 2x scale and passed through OCR.
+
+    Args:
+        data: Raw bytes of the PDF file.
+
+    Returns:
+        A ``(text, ocr_used, page_count)`` tuple.
+
+    Raises:
+        Exception: Whatever PyMuPDF, Pillow or pytesseract raise during the
+            OCR fallback; pypdf failures are swallowed so the fallback runs.
+    """
+    page_count = 0
+    extracted_pages = []
+    try:
+        reader = PdfReader(io.BytesIO(data))
+        page_count = len(reader.pages)
+        for page in reader.pages:
+            extracted_pages.append(page.extract_text() or "")
+    except Exception:
+        # Best effort: any text-layer failure discards partial output and
+        # lets the OCR path below handle the document instead.
+        extracted_pages = []
+
+    text = _normalize_text("\n".join(extracted_pages))
+    if len(text) >= 80:
+        return text, False, page_count
+
+    doc = fitz.open(stream=data, filetype="pdf")
+    try:
+        page_count = doc.page_count
+        ocr_pages = []
+        for page in doc:
+            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
+            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
+                ocr_pages.append(_ocr_image(image))
+    finally:
+        # Release the document even when rendering or OCR raises.
+        doc.close()
+    return _normalize_text("\n".join(ocr_pages)), True, page_count
+
+
+def _extract_docx_text(data: bytes) -> str:
+    """Return the normalized text of all non-blank paragraphs in a DOCX file.
+
+    Only ``document.paragraphs`` is read; each paragraph is stripped and
+    blank ones are dropped before the parts are joined and normalized.
+    """
+    document = Document(io.BytesIO(data))
+    parts = []
+    for paragraph in document.paragraphs:
+        stripped = (paragraph.text or "").strip()
+        if stripped:
+            parts.append(stripped)
+    return _normalize_text("\n".join(parts))
+
+
+def _extract_plain_text(data: bytes) -> str:
+    """Decode *data* as UTF-8 text and return it normalized.
+
+    Undecodable bytes are dropped. ``utf-8-sig`` is used so that a leading
+    byte-order mark is removed instead of surviving as a stray U+FEFF
+    character at the start of the extracted text.
+    """
+    return _normalize_text(data.decode("utf-8-sig", errors="ignore"))
+
+
+@app.post("/extract-text")
+async def extract_text(file: UploadFile = File(...)):
+    """Extract text from an uploaded document, using OCR where needed.
+
+    The handler is chosen from the lowercase file extension: ``.txt``/``.md``
+    are decoded as text, ``.docx`` is read paragraph by paragraph, ``.pdf``
+    uses its text layer with an OCR fallback, and image extensions are OCRed.
+
+    Args:
+        file: The multipart upload to process.
+
+    Returns:
+        A dict with the extracted ``text``, whether ``ocr_used``, the upload's
+        ``content_type``, ``page_count`` (``None`` for text and DOCX files),
+        the number of ``characters`` and the ``file_name``.
+
+    Raises:
+        HTTPException: 400 for an empty, oversized or unsupported upload,
+            500 when extraction fails, 422 when no readable text is found.
+    """
+    filename = file.filename or "document"
+    extension = "." + filename.rsplit(".", 1)[1].lower() if "." in filename else ""
+    # Read one byte past the limit: enough to detect an oversized upload
+    # without pulling the whole file into memory first.
+    data = await file.read(MAX_EXTRACT_FILE_BYTES + 1)
+    if not data:
+        raise HTTPException(status_code=400, detail="The uploaded file was empty.")
+    if len(data) > MAX_EXTRACT_FILE_BYTES:
+        raise HTTPException(status_code=400, detail="The uploaded file is too large for AI extraction.")
+
+    # Defaults for the formats that have no pages and never use OCR.
+    ocr_used = False
+    page_count = None
+    try:
+        if extension in {".txt", ".md"}:
+            text = _extract_plain_text(data)
+        elif extension == ".docx":
+            text = _extract_docx_text(data)
+        elif extension == ".pdf":
+            text, ocr_used, page_count = _extract_pdf_text(data)
+        elif extension in IMAGE_EXTENSIONS:
+            # Close the decoded image once OCR has finished with it.
+            with Image.open(io.BytesIO(data)) as image:
+                text = _ocr_image(image)
+            ocr_used = True
+            page_count = 1
+        else:
+            raise HTTPException(status_code=400, detail="This file type is not supported for AI extraction.")
+    except HTTPException:
+        raise
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"AI extraction failed: {exc}") from exc
+
+    if not text:
+        raise HTTPException(status_code=422, detail="AI extraction did not find readable text in the uploaded file.")
+
+    return {
+        "text": text,
+        "ocr_used": ocr_used,
+        "content_type": file.content_type,
+        "page_count": page_count,
+        "characters": len(text),
+        "file_name": filename,
+    }
@@ -4,3 +4,8 @@ transformers==4.48.3
 cachetools==5.5.2
 pydantic==2.10.6
 torch==2.6.0
+pillow==11.1.0
+pytesseract==0.3.13
+pypdf==5.4.0
+pymupdf==1.25.5
+python-docx==1.1.2