"""
OCR-to-TEI Pipeline Orchestrator.

Revised to:
- Accept a pre-loaded OCREngine (model stays resident)
- Support metadata inference from first page
- Support custom user tags
- Return OCR results separately so XML can be regenerated after metadata edits
"""

from typing import Dict, List, Optional, Callable
from pathlib import Path
from .image_loader import load_images_from_file, is_supported_file
from .ocr_engine import OCREngine
from .tei_generator import TEIMarkupGenerator
from .tei_schema import validate_tei_structure


class PipelineConfig:
    """User-adjustable settings consumed by ``OCRTEIPipeline``.

    Every instance starts from the defaults below; callers mutate the
    attributes they want to override before handing the config to the
    pipeline.
    """

    def __init__(self) -> None:
        # --- OCR engine selection -------------------------------------
        # Key of the OCR model to load; defaults to the engine's own choice.
        self.model_key: str = OCREngine.DEFAULT_MODEL
        # Document genre forwarded to OCR and TEI generation.
        self.genre: str = "auto"

        # --- Image loading --------------------------------------------
        # 0 = let engine decide per-device
        self.max_image_dimension: int = 0

        # --- TEI header -----------------------------------------------
        # User-supplied metadata; merged over the inferred metadata.
        self.metadata: Dict = dict()

        # --- Engine storage -------------------------------------------
        # Passed to the engine as ``cache_dir`` when the pipeline creates it.
        self.cache_dir: Optional[str] = None

        # --- Custom markup --------------------------------------------
        # [{"name": "PERSON", "description": "..."}]
        self.custom_tags: List[Dict] = list()


class PipelineStatus:
    """Mutable progress/result snapshot of a single pipeline run.

    The pipeline updates one instance in place and hands that same object
    to the progress callback after each change.
    """

    def __init__(self) -> None:
        # --- Progress reporting ---------------------------------------
        # Human-readable description of the step currently executing.
        self.current_step: str = ""
        # Page being processed and the number of pages loaded.
        self.current_page: int = 0
        self.total_pages: int = 0
        # Overall completion; the pipeline sets it to 1.0 when finished.
        self.progress_fraction: float = 0.0

        # --- Outcome flags --------------------------------------------
        self.is_complete: bool = False
        self.has_error: bool = False
        # String form of the exception when ``has_error`` is set.
        self.error_message: str = ""

        # --- Results --------------------------------------------------
        # Generated TEI document, or None until generation has finished.
        self.result_xml: Optional[str] = None
        # Messages reported by TEI structure validation.
        self.validation_errors: List[str] = list()


class OCRTEIPipeline:
    """Run OCR on a document and turn the result into TEI XML.

    The pipeline loads page images, makes sure the OCR model is loaded,
    infers bibliographic metadata from the first pages, runs OCR on every
    page and finally generates and validates TEI XML. OCR results are kept
    on the instance so the XML can be regenerated with edited metadata
    without repeating the OCR step.
    """

    def __init__(self, config: "Optional[PipelineConfig]" = None,
                 engine: "Optional[OCREngine]" = None):
        """Create a pipeline.

        Args:
            config: Settings to use; a default ``PipelineConfig`` is created
                when omitted.
            engine: Already constructed OCR engine to share. When omitted a
                new engine is created from ``config.model_key`` and
                ``config.cache_dir``.
        """
        self.config = config or PipelineConfig()
        self.status = PipelineStatus()
        # Use shared engine if provided, else create a new one
        if engine is not None:
            self.engine = engine
            self._owns_engine = False
        else:
            self.engine = OCREngine(
                model_key=self.config.model_key,
                cache_dir=self.config.cache_dir)
            self._owns_engine = True
        self.generator = TEIMarkupGenerator()
        self._cancel_requested = False
        # Store OCR results so XML can be regenerated with new metadata
        self.ocr_results: List = []
        self.inferred_metadata: Dict = {}

    def cancel(self):
        """Request cancellation; ``run`` raises ``InterruptedError`` at its next check."""
        self._cancel_requested = True

    def run(self, filepath: str, progress_callback: Optional[Callable] = None) -> str:
        """Process ``filepath`` end to end and return the TEI XML string.

        Args:
            filepath: Path of the document to process.
            progress_callback: Optional callable invoked with the current
                ``PipelineStatus`` whenever the status changes.

        Returns:
            The generated TEI XML.

        Raises:
            ValueError: If the file type is not supported.
            InterruptedError: If ``cancel`` was called during the run.
            Exception: Any other failure is recorded on ``self.status`` and
                re-raised unchanged.
        """
        self._cancel_requested = False
        # Drop the previous document's data up front. Otherwise a failed or
        # cancelled run would leave stale OCR results behind and
        # ``regenerate_xml`` would silently build XML for the wrong document.
        self.ocr_results = []
        self.inferred_metadata = {}
        self.status = PipelineStatus()

        def update(msg=None):
            if msg:
                self.status.current_step = msg
            if progress_callback:
                progress_callback(self.status)

        try:
            update("Validating input file...")
            if not is_supported_file(filepath):
                raise ValueError(f"Unsupported file type: {Path(filepath).suffix}")

            # Load images — use engine's per-device max dimension
            update("Loading images...")
            max_dim = self.config.max_image_dimension or self.engine.get_max_image_dim()
            pages = load_images_from_file(filepath, max_dim)
            self.status.total_pages = len(pages)
            update(f"Loaded {len(pages)} page(s).")

            if self._cancel_requested:
                raise InterruptedError("Cancelled.")

            # Ensure model loaded
            update("Loading OCR model...")
            self.engine.load_model(progress_callback=update)

            if self._cancel_requested:
                raise InterruptedError("Cancelled.")

            # Infer metadata from the first (at most two) pages
            self.status.progress_fraction = 0.05
            update("Inferring bibliographic metadata...")
            first_images = [img for img, _ in pages[:2]]
            # Fall back to an empty dict so the merge below cannot fail if
            # the engine yields nothing.
            self.inferred_metadata = self.engine.infer_metadata(first_images) or {}

            if self._cancel_requested:
                raise InterruptedError("Cancelled.")

            # OCR all pages
            def ocr_progress(msg, page_num, total):
                # Checking here lets a cancel request take effect as soon as
                # the engine next reports progress, instead of only after
                # every page has been processed.
                if self._cancel_requested:
                    raise InterruptedError("Cancelled.")
                self.status.current_step = msg
                self.status.current_page = page_num
                # OCR covers the 0.1–0.8 span of the overall progress bar.
                done = page_num / total if total else 0.0
                self.status.progress_fraction = 0.1 + (done * 0.7)
                if progress_callback:
                    progress_callback(self.status)

            self.ocr_results = self.engine.process_pages(
                pages, genre=self.config.genre,
                custom_tags=self.config.custom_tags,
                progress_callback=ocr_progress)

            if self._cancel_requested:
                raise InterruptedError("Cancelled.")

            # Build TEI XML — merge inferred metadata with user-provided.
            # Only non-empty user values override what was inferred.
            user_meta = {k: v for k, v in self.config.metadata.items() if v}
            merged_meta = {**self.inferred_metadata, **user_meta}
            return self._generate_xml(merged_meta, filepath, update)

        except InterruptedError as e:
            # Cancellation is recorded on the status but not announced
            # through the progress callback.
            self.status.has_error = True
            self.status.error_message = str(e)
            raise
        except Exception as e:
            self.status.has_error = True
            self.status.error_message = str(e)
            update(f"Error: {e}")
            raise

    def regenerate_xml(self, metadata: Dict, filepath: str = "") -> str:
        """Regenerate TEI XML from stored OCR results with updated metadata.

        Args:
            metadata: Metadata to use as given; it is not merged with the
                inferred metadata.
            filepath: Optional source path; its stem is passed to the
                generator as the source filename.

        Raises:
            RuntimeError: If no OCR results are stored.
        """
        if not self.ocr_results:
            raise RuntimeError("No OCR results available. Run the pipeline first.")
        return self._generate_xml(metadata, filepath)

    def _generate_xml(self, metadata: Dict, filepath: str = "", update=None) -> str:
        """Generate and validate TEI XML, recording the outcome on the status.

        ``update`` is the progress reporter used by ``run``; when it is None
        no intermediate progress is reported.
        """
        if update:
            self.status.progress_fraction = 0.85
            update("Generating TEI XML markup...")

        source_filename = Path(filepath).stem if filepath else ""
        xml_string = self.generator.generate_tei(
            self.ocr_results, metadata=metadata,
            genre=self.config.genre, source_filename=source_filename)

        if update:
            self.status.progress_fraction = 0.95
            update("Validating TEI XML...")

        # Validation problems are stored on the status but do not stop the
        # XML from being returned.
        _is_valid, errors = validate_tei_structure(xml_string)
        self.status.validation_errors = errors

        self.status.progress_fraction = 1.0
        self.status.is_complete = True
        self.status.result_xml = xml_string
        if update:
            update("Processing complete.")

        return xml_string
