"""
Image loading and preprocessing for OCR pipeline.
Handles JPEG, PNG, TIFF, BMP, WebP, and multi-page PDF files.
"""

import os
from pathlib import Path
from typing import List, Tuple
from PIL import Image


# Lowercase, dot-prefixed suffixes that load_images_from_file() opens as images.
SUPPORTED_IMAGE_EXTENSIONS = {
    ".bmp",
    ".jpeg",
    ".jpg",
    ".png",
    ".tif",
    ".tiff",
    ".webp",
}
# Suffixes that load_images_from_file() hands to the PDF page renderer.
SUPPORTED_PDF_EXTENSIONS = {".pdf"}
# Union of both groups; this is what is_supported_file() checks against.
ALL_SUPPORTED = SUPPORTED_IMAGE_EXTENSIONS.union(SUPPORTED_PDF_EXTENSIONS)


def is_supported_file(filepath: str) -> bool:
    """Report whether *filepath* carries an extension this module can load.

    Only the final suffix of the path is inspected, compared
    case-insensitively against ``ALL_SUPPORTED``. The file itself is never
    opened, so a ``True`` result does not imply the file exists or is valid.
    """
    suffix = Path(filepath).suffix
    return suffix.lower() in ALL_SUPPORTED


def load_images_from_file(filepath: str, max_dimension: int = 2048) -> List[Tuple[Image.Image, str]]:
    """Load *filepath* as a list of ``(RGB image, page label)`` pairs.

    PDF files are delegated to ``_load_pdf_pages`` and yield one entry per
    page. Any other supported image file yields a single entry labelled
    ``"1"``.

    Args:
        filepath: Path to the image or PDF file. The type is decided from the
            lowercased file extension only.
        max_dimension: Upper bound, in pixels, for the longer side of each
            returned image. Values ``<= 0`` disable resizing.

    Returns:
        A list of ``(image, label)`` tuples, where ``label`` is the 1-based
        page number as a string.

    Raises:
        ValueError: If the extension is not a supported image or PDF type.
    """
    ext = Path(filepath).suffix.lower()
    if ext in SUPPORTED_PDF_EXTENSIONS:
        return _load_pdf_pages(filepath, max_dimension)
    if ext not in SUPPORTED_IMAGE_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {ext}")

    # Image.open() is lazy and keeps the file open. convert() decodes the
    # pixels into a new image, so the source can be closed right afterwards
    # instead of waiting for garbage collection to release the handle.
    with Image.open(filepath) as source:
        img = source.convert("RGB")
    if max_dimension > 0:
        img = _resize_if_needed(img, max_dimension)
    return [(img, "1")]


def _load_pdf_pages(filepath: str, max_dimension: int) -> List[Tuple[Image.Image, str]]:
    """Render every page of a PDF to an RGB image at 300 DPI.

    PyMuPDF (``fitz``) is used when it can be imported; otherwise the
    function falls back to ``pdf2image``.

    Args:
        filepath: Path to the PDF file.
        max_dimension: Upper bound, in pixels, for the longer side of each
            rendered page. Values ``<= 0`` disable resizing.

    Returns:
        A list of ``(image, label)`` tuples in page order, where ``label`` is
        the 1-based page number as a string.

    Raises:
        ImportError: If neither PyMuPDF nor pdf2image is installed.
    """
    # Keep the try bodies limited to the imports so that an unrelated
    # ImportError raised while rendering is not mistaken for "backend missing".
    try:
        import fitz  # PyMuPDF
    except ImportError:
        fitz = None

    if fitz is not None:
        doc = fitz.open(filepath)
        try:
            # PDF user space is 72 units per inch; scale it up to 300 DPI.
            # The matrix is the same for every page, so build it once.
            mat = fitz.Matrix(300 / 72, 300 / 72)
            results = []
            for i, page in enumerate(doc, start=1):
                pix = page.get_pixmap(matrix=mat)
                # NOTE(review): assumes the pixmap holds 3-channel RGB samples
                # without alpha (get_pixmap is called with its defaults) -
                # confirm against the installed PyMuPDF version.
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                if max_dimension > 0:
                    img = _resize_if_needed(img, max_dimension)
                results.append((img, str(i)))
            return results
        finally:
            # Release the document even when a page fails to render.
            doc.close()

    try:
        from pdf2image import convert_from_path
    except ImportError as err:
        raise ImportError("PDF processing requires PyMuPDF or pdf2image.") from err

    results = []
    for i, page_img in enumerate(convert_from_path(filepath, dpi=300, fmt="png"), start=1):
        img = page_img.convert("RGB")
        if max_dimension > 0:
            img = _resize_if_needed(img, max_dimension)
        results.append((img, str(i)))
    return results


def _resize_if_needed(img: Image.Image, max_dim: int) -> Image.Image:
    """Shrink *img* so that neither side exceeds *max_dim*, keeping aspect ratio.

    Args:
        img: Image to constrain.
        max_dim: Maximum allowed width and height in pixels. A value ``<= 0``
            means "no limit", matching how callers treat ``max_dimension``.

    Returns:
        ``img`` itself when it already fits (or when ``max_dim <= 0``),
        otherwise a new image downscaled with LANCZOS resampling.
    """
    if max_dim <= 0:
        return img
    w, h = img.size
    if w <= max_dim and h <= max_dim:
        return img
    scale = min(max_dim / w, max_dim / h)
    # For very elongated images the shorter side can truncate to 0, which
    # resize() rejects; keep every side at least one pixel.
    new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
    return img.resize(new_size, Image.LANCZOS)
