Evolve summarizer into AI service with OCR support

2026-03-23 20:12:34 +01:00
parent 90fdd8e1a5
commit 653f713a78
20 changed files with 475 additions and 129 deletions
@@ -1,16 +1,25 @@
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, File, HTTPException, UploadFile
 from pydantic import BaseModel, Field
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from cachetools import TTLCache
+from PIL import Image
+from pypdf import PdfReader
+from docx import Document
+import fitz
 import hashlib
+import io
 import re
 import torch
+import pytesseract

-app = FastAPI(title="Local Summarizer")
+app = FastAPI(title="Local AI Service")

 MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
 MAX_INPUT_CHARS = 20000
 MAX_CONTEXT_CHARS = 2200
+MAX_EXTRACT_FILE_BYTES = 8 * 1024 * 1024
+OCR_LANGUAGES = "eng"
+IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}


 def _load_runtime():
@@ -48,6 +57,8 @@ async def health():
        "device": str(device),
        "gpu_available": GPU_AVAILABLE,
        "gpu_name": GPU_NAME,
+        "ocr_available": True,
+        "ocr_languages": OCR_LANGUAGES,
    }


@@ -68,7 +79,6 @@ _TECH_PRIORITY = [
    "aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
 ]

-
 _MUST_HAVE_HINTS = [
    "must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
 ]
@@ -339,3 +349,97 @@ async def summarize(req: SummarizeRequest):
    out = "\n".join(lines).strip()
    cache[key] = out
    return {"summary": out, "cached": False}
+
+
+def _normalize_text(value: str) -> str:
+    """Collapse *value* into a single line of space-separated tokens.
+
+    NUL characters are replaced by spaces, every run of whitespace is
+    squeezed into one space, and leading/trailing whitespace is removed.
+    """
+    without_nul = value.replace("\x00", " ")
+    collapsed = re.sub(r"\s+", " ", without_nul)
+    return collapsed.strip()
+
+
+def _ocr_image(image: Image.Image) -> str:
+    """Run Tesseract OCR on *image* and return the normalized text.
+
+    Images whose mode is neither ``RGB`` nor ``L`` are converted to RGB
+    before being handed to pytesseract with the configured OCR languages.
+    """
+    ocr_input = image if image.mode in ("RGB", "L") else image.convert("RGB")
+    raw_text = pytesseract.image_to_string(ocr_input, lang=OCR_LANGUAGES)
+    return _normalize_text(raw_text)
+
+
+def _extract_pdf_text(data: bytes) -> tuple[str, bool, int]:
+    """Extract text from a PDF, falling back to OCR when the text layer is thin.
+
+    The embedded text layer is read with pypdf first. When that yields fewer
+    than 80 normalized characters (or pypdf fails), every page is rendered
+    with PyMuPDF at 2x zoom and passed through OCR instead.
+
+    Args:
+        data: Raw bytes of the PDF file.
+
+    Returns:
+        A ``(text, ocr_used, page_count)`` tuple.
+
+    Raises:
+        Exception: Errors raised by PyMuPDF, Pillow or pytesseract during the
+            OCR fallback propagate to the caller.
+    """
+    page_count = 0
+    extracted_pages = []
+    try:
+        reader = PdfReader(io.BytesIO(data))
+        page_count = len(reader.pages)
+        extracted_pages = [page.extract_text() or "" for page in reader.pages]
+    except Exception:
+        # Deliberate best-effort: a PDF that pypdf cannot parse is treated as
+        # having no text layer, so the OCR fallback below gets a chance.
+        extracted_pages = []
+
+    text = _normalize_text("\n".join(extracted_pages))
+    if len(text) >= 80:
+        return text, False, page_count
+
+    doc = fitz.open(stream=data, filetype="pdf")
+    try:
+        page_count = doc.page_count
+        ocr_pages = []
+        for page in doc:
+            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
+            # Close each rendered page image as soon as it has been OCR'd.
+            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
+                ocr_pages.append(_ocr_image(image))
+    finally:
+        # Release the document even when rendering or OCR raises.
+        doc.close()
+    return _normalize_text("\n".join(ocr_pages)), True, page_count
+
+
+def _extract_docx_text(data: bytes) -> str:
+    """Return the normalized text of every non-blank paragraph in a .docx file.
+
+    Only the entries of ``document.paragraphs`` are read; each one is
+    stripped, blank ones are dropped, and the rest are joined and normalized.
+    """
+    docx = Document(io.BytesIO(data))
+    non_blank = []
+    for paragraph in docx.paragraphs:
+        content = paragraph.text
+        if content and content.strip():
+            non_blank.append(content.strip())
+    return _normalize_text("\n".join(non_blank))
+
+
+def _extract_plain_text(data: bytes) -> str:
+    """Decode *data* as UTF-8 and return the normalized text.
+
+    Undecodable bytes are dropped rather than raising. The ``utf-8-sig``
+    codec is used so that a leading byte-order mark is removed; a plain
+    ``utf-8`` decode would keep it as U+FEFF, which is not whitespace and
+    would therefore survive normalization at the start of the text.
+    """
+    return _normalize_text(data.decode("utf-8-sig", errors="ignore"))
+
+
+@app.post("/extract-text")
+async def extract_text(file: UploadFile = File(...)):
+    """Extract readable text from an uploaded document or image.
+
+    The handler is chosen from the file name's extension: ``.txt``/``.md``
+    are decoded as text, ``.docx`` and ``.pdf`` go through their extractors
+    (PDFs may fall back to OCR), and files in ``IMAGE_EXTENSIONS`` are OCR'd.
+
+    Returns:
+        A dict with the extracted ``text``, whether OCR was used, the upload's
+        content type, the page count (``None`` for text and .docx files),
+        the character count and the file name.
+
+    Raises:
+        HTTPException: 400 for an empty, oversized or unsupported upload,
+            500 when extraction itself fails, and 422 when no readable text
+            was found.
+    """
+    filename = file.filename or "document"
+    extension = "." + filename.rsplit(".", 1)[1].lower() if "." in filename else ""
+    # Read at most one byte past the limit: enough to detect an oversized
+    # upload without loading an arbitrarily large file into memory.
+    data = await file.read(MAX_EXTRACT_FILE_BYTES + 1)
+    if not data:
+        raise HTTPException(status_code=400, detail="The uploaded file was empty.")
+    if len(data) > MAX_EXTRACT_FILE_BYTES:
+        raise HTTPException(status_code=400, detail="The uploaded file is too large for AI extraction.")
+
+    # Defaults for the extractors that neither use OCR nor know about pages.
+    ocr_used = False
+    page_count = None
+    try:
+        if extension in {".txt", ".md"}:
+            text = _extract_plain_text(data)
+        elif extension == ".docx":
+            text = _extract_docx_text(data)
+        elif extension == ".pdf":
+            text, ocr_used, page_count = _extract_pdf_text(data)
+        elif extension in IMAGE_EXTENSIONS:
+            # Close the decoded image once OCR is done.
+            with Image.open(io.BytesIO(data)) as image:
+                text = _ocr_image(image)
+            ocr_used = True
+            page_count = 1
+        else:
+            raise HTTPException(status_code=400, detail="This file type is not supported for AI extraction.")
+    except HTTPException:
+        # Let the deliberate 400 above through instead of wrapping it as a 500.
+        raise
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"AI extraction failed: {exc}") from exc
+
+    if not text:
+        raise HTTPException(status_code=422, detail="AI extraction did not find readable text in the uploaded file.")
+
+    return {
+        "text": text,
+        "ocr_used": ocr_used,
+        "content_type": file.content_type,
+        "page_count": page_count,
+        "characters": len(text),
+        "file_name": filename,
+    }