Evolve summarizer into AI service with OCR support
This commit is contained in:
+107
-3
@@ -1,16 +1,25 @@
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi import FastAPI, File, HTTPException, UploadFile
|
||||
from pydantic import BaseModel, Field
|
||||
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
||||
from cachetools import TTLCache
|
||||
from PIL import Image
|
||||
from pypdf import PdfReader
|
||||
from docx import Document
|
||||
import fitz
|
||||
import hashlib
|
||||
import io
|
||||
import re
|
||||
import torch
|
||||
import pytesseract
|
||||
|
||||
app = FastAPI(title="Local Summarizer")
|
||||
app = FastAPI(title="Local AI Service")
|
||||
|
||||
# Identifier of the summarization checkpoint.
# NOTE(review): presumably consumed by _load_runtime() - confirm.
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"

# Character budgets. NOTE(review): the code that enforces them is not
# visible in this part of the file - confirm their exact meaning.
MAX_INPUT_CHARS = 20_000
MAX_CONTEXT_CHARS = 2_200

# Largest upload, in bytes, that the /extract-text endpoint accepts (8 MiB).
MAX_EXTRACT_FILE_BYTES = 8 * 1024**2

# Value passed as ``lang`` to pytesseract when running OCR.
OCR_LANGUAGES = "eng"

# File extensions that /extract-text sends directly to image OCR.
IMAGE_EXTENSIONS = {".jpeg", ".jpg", ".png", ".webp"}
|
||||
|
||||
|
||||
def _load_runtime():
|
||||
@@ -48,6 +57,8 @@ async def health():
|
||||
"device": str(device),
|
||||
"gpu_available": GPU_AVAILABLE,
|
||||
"gpu_name": GPU_NAME,
|
||||
"ocr_available": True,
|
||||
"ocr_languages": OCR_LANGUAGES,
|
||||
}
|
||||
|
||||
|
||||
@@ -68,7 +79,6 @@ _TECH_PRIORITY = [
|
||||
"aws", "azure", "gcp", "terraform", "graphql", "rest", "git",
|
||||
]
|
||||
|
||||
|
||||
_MUST_HAVE_HINTS = [
|
||||
"must have", "required", "requirements", "you have", "you bring", "essential", "we are looking for",
|
||||
]
|
||||
@@ -339,3 +349,97 @@ async def summarize(req: SummarizeRequest):
|
||||
out = "\n".join(lines).strip()
|
||||
cache[key] = out
|
||||
return {"summary": out, "cached": False}
|
||||
|
||||
|
||||
def _normalize_text(value: str) -> str:
|
||||
value = value.replace("\x00", " ")
|
||||
return re.sub(r"\s+", " ", value).strip()
|
||||
|
||||
|
||||
def _ocr_image(image: Image.Image) -> str:
    """Run OCR on *image* and return the recognized text, normalized.

    Images whose mode is neither "RGB" nor "L" are converted to "RGB" first;
    the result of ``pytesseract.image_to_string`` (using ``OCR_LANGUAGES``)
    is then passed through ``_normalize_text``.
    """
    needs_conversion = image.mode not in ("RGB", "L")
    ocr_input = image.convert("RGB") if needs_conversion else image
    recognized = pytesseract.image_to_string(ocr_input, lang=OCR_LANGUAGES)
    return _normalize_text(recognized)
|
||||
|
||||
|
||||
def _extract_pdf_text(data: bytes) -> tuple[str, bool, int]:
    """Extract text from a PDF, falling back to OCR when it has little text.

    The embedded text layer is read with pypdf first. If that produces fewer
    than 80 normalized characters, or pypdf raises, every page is rendered
    with PyMuPDF at 2x zoom and run through ``_ocr_image`` instead.

    Args:
        data: Raw bytes of the PDF file.

    Returns:
        A ``(text, ocr_used, page_count)`` tuple: the normalized text, whether
        the OCR fallback produced it, and the number of pages reported by the
        library that produced the text.

    Raises:
        Exception: Errors from PyMuPDF, Pillow or pytesseract during the OCR
            fallback propagate to the caller.
    """
    page_count = 0
    try:
        reader = PdfReader(io.BytesIO(data))
        page_count = len(reader.pages)
        extracted_pages = [page.extract_text() or "" for page in reader.pages]
    except Exception:
        # Deliberate best effort: any pypdf failure (malformed or encrypted
        # file, ...) discards partial output and lets the OCR path try.
        extracted_pages = []

    text = _normalize_text("\n".join(extracted_pages))
    if len(text) >= 80:
        return text, False, page_count

    doc = fitz.open(stream=data, filetype="pdf")
    try:
        page_count = doc.page_count
        ocr_pages = []
        for page in doc:
            # Render at 2x zoom without an alpha channel before OCR.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                ocr_pages.append(_ocr_image(image))
    finally:
        # Close the document even when rendering or OCR fails part-way.
        doc.close()
    return _normalize_text("\n".join(ocr_pages)), True, page_count
|
||||
|
||||
|
||||
def _extract_docx_text(data: bytes) -> str:
    """Return the normalized text of the paragraphs in a .docx file.

    Only ``document.paragraphs`` is read. Paragraphs that are empty or
    contain nothing but whitespace are skipped; the rest are stripped,
    joined with newlines and passed through ``_normalize_text``.
    """
    document = Document(io.BytesIO(data))
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if not content:
            continue
        trimmed = content.strip()
        if trimmed:
            kept.append(trimmed)
    joined = "\n".join(kept)
    return _normalize_text(joined)
|
||||
|
||||
|
||||
def _extract_plain_text(data: bytes) -> str:
    """Decode *data* as UTF-8 text and return it normalized.

    Undecodable bytes are dropped rather than raising. The ``utf-8-sig``
    codec is used so that a leading byte-order mark is removed: U+FEFF is
    not whitespace, so ``_normalize_text`` would otherwise leave it at the
    start of the extracted text.
    """
    return _normalize_text(data.decode("utf-8-sig", errors="ignore"))
|
||||
|
||||
|
||||
@app.post("/extract-text")
async def extract_text(file: UploadFile = File(...)):
    """Extract readable text from an uploaded document or image.

    The extractor is chosen from the file name's extension: ``.txt``/``.md``
    are decoded as text, ``.docx`` and ``.pdf`` go to their dedicated
    helpers, and extensions in ``IMAGE_EXTENSIONS`` are sent to OCR.

    Args:
        file: The uploaded file.

    Returns:
        A dict with the keys ``text``, ``ocr_used``, ``content_type``,
        ``page_count`` (``None`` for text and .docx input), ``characters``
        and ``file_name``.

    Raises:
        HTTPException: 400 for an empty upload, an upload larger than
            ``MAX_EXTRACT_FILE_BYTES`` or an unsupported extension; 500 when
            an extractor raises; 422 when no text could be extracted.
    """
    filename = file.filename or "document"
    extension = ("." + filename.rsplit(".", 1)[1].lower()) if "." in filename else ""

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without copying an arbitrarily large body into memory.
    data = await file.read(MAX_EXTRACT_FILE_BYTES + 1)
    if not data:
        raise HTTPException(status_code=400, detail="The uploaded file was empty.")
    if len(data) > MAX_EXTRACT_FILE_BYTES:
        raise HTTPException(status_code=400, detail="The uploaded file is too large for AI extraction.")

    # Defaults for the formats that involve neither OCR nor pages.
    ocr_used = False
    page_count = None
    try:
        if extension in {".txt", ".md"}:
            text = _extract_plain_text(data)
        elif extension == ".docx":
            text = _extract_docx_text(data)
        elif extension == ".pdf":
            text, ocr_used, page_count = _extract_pdf_text(data)
        elif extension in IMAGE_EXTENSIONS:
            # The context manager releases the decoded image once OCR is done.
            with Image.open(io.BytesIO(data)) as image:
                text = _ocr_image(image)
            ocr_used = True
            page_count = 1
        else:
            raise HTTPException(status_code=400, detail="This file type is not supported for AI extraction.")
    except HTTPException:
        # Let the deliberate 400 above through instead of wrapping it as a 500.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"AI extraction failed: {exc}") from exc

    if not text:
        raise HTTPException(status_code=422, detail="AI extraction did not find readable text in the uploaded file.")

    return {
        "text": text,
        "ocr_used": ocr_used,
        "content_type": file.content_type,
        "page_count": page_count,
        "characters": len(text),
        "file_name": filename,
    }
|
||||
|
||||
Reference in New Issue
Block a user