"""
TEI Markup Generator — v2

Converts OCR output (with simplified structural markers) into TEI P5 XML.

SOLUTION 2: Robust cleanup layer (clean_raw_output) runs before parsing:
- Strips markdown code fences (```plaintext, ```)
- Strips model preamble ("Here is the transcription...")
- Normalizes malformed tags and legacy bracket-style tags
- Removes duplicate running headers that appear as plain text
- Strips orphaned closing tags

SOLUTION 3: Improved front-matter heuristic:
- Only classifies page as front matter if heading area >> body text
- Splits heading from body within a page, rather than dumping whole page

Coordinated with Solution 1's simplified markers:
  ## heading ##, RH: ..., PN: ..., blank lines = paragraphs,
  _italic_, **bold**, FN{...}, SC{...}, etc.
"""

import re
from typing import List, Tuple, Dict, Optional
from lxml import etree
from .tei_schema import (
    TEI_NS, XML_NS, NSMAP,
    make_element, create_tei_skeleton, serialize_tei,
    validate_tei_structure, repair_tei_xml
)


# ── Solution 2: Cleanup Layer ──────────────────────────────────────────

def clean_raw_output(text: str) -> str:
    """Clean VLM output before structural parsing.

    Handles all known artifact categories:
    - Markdown code fences
    - Model preamble ("Here is the transcription:" and similar)
    - Legacy bracket-style tags from old prompts (converted to the
      simplified ``## .. ##`` / ``RH:`` / ``FN{..}`` marker format)
    - Malformed / orphaned tags
    - Lines that the model wrapped entirely in square brackets

    Args:
        text: Raw model output for a single page. ``None`` or an empty
            string yields ``""``.

    Returns:
        The cleaned page text, stripped of leading/trailing whitespace and
        with runs of three or more newlines collapsed to one blank line.
    """
    if not text:
        return ""

    def flatten(payload: str) -> str:
        # Marker payloads must be single-line: the downstream parsers match
        # markers such as FN{...} or "RH: ..." with patterns that do not
        # cross newlines.
        return re.sub(r'\s+', ' ', payload).strip()

    def convert(source: str, tag: str, render) -> str:
        # Replace every [TAG]...[/TAG] pair with render(inner_text).
        return re.sub(
            rf'\[{tag}\]\s*(.*?)\s*\[/{tag}\]',
            lambda m: render(m.group(1)),
            source,
            flags=re.DOTALL,
        )

    def braced(prefix: str):
        # Renderer producing PREFIX{flattened payload}.
        return lambda payload: f'{prefix}{{{flatten(payload)}}}'

    def heading(payload: str) -> str:
        return f'## {flatten(payload)} ##'

    # 1. Strip markdown code fences
    #    Handles: ```plaintext\n...\n```, ```\n...\n```, ```json, etc.
    text = re.sub(r'^```\w*\s*\n?', '', text, flags=re.MULTILINE)
    text = re.sub(r'\n?```\s*$', '', text, flags=re.MULTILINE)
    # Also catch inline code fences
    text = text.replace('```plaintext', '').replace('```', '')

    # 2. Strip model preamble.
    #    A preamble is only recognised when the phrase is directly followed
    #    by a colon, when its line ends with a colon, or when the phrase is
    #    alone on its line. This keeps genuine content that merely starts
    #    with the same words ("Output of the mill rose...") intact.
    preamble_re = re.compile(
        r'^(?:Here\s+is\s+the\s+transcription'
        r'|The\s+transcription\s+(?:is|of)'
        r'|Transcription'
        r'|Output'
        r'|Below\s+is)\b'
        r'(?:[ \t]*:\s*'
        r'|[^\n:]{0,80}:[ \t]*(?:\n|$)'
        r'|[ \t]*(?:\n|$))',
        re.IGNORECASE,
    )
    # Leading whitespace is discarded by the final strip() anyway; removing
    # it here lets the start-anchored preamble pattern see the first word.
    text = text.lstrip()
    for _ in range(5):  # tolerate a few stacked preamble lines
        without_preamble = preamble_re.sub('', text, count=1)
        if without_preamble == text:
            break
        text = without_preamble.lstrip()

    # 3. Normalize legacy bracket-style tags to the simplified format.
    text = convert(text, 'HEADING', heading)
    text = convert(text, 'RUNNING_HEADER', lambda p: f'RH: {flatten(p)}')
    # Running footers are dropped entirely.
    text = convert(text, 'RUNNING_FOOTER', lambda p: '')

    # Page numbers. The malformed form "[PAGE_NUMBER] 12 [PAGE_NUMBER]"
    # (missing slash) is repaired FIRST; otherwise the well-formed rule
    # below would match from that opening tag up to the next real
    # [/PAGE_NUMBER] and swallow all text in between.
    text = re.sub(r'\[PAGE_NUMBER\]\s*(\d+)\s*\[PAGE_NUMBER\]',
                  lambda m: f'PN: {m.group(1).strip()}', text)
    text = convert(text, 'PAGE_NUMBER', lambda p: f'PN: {flatten(p)}')

    # Paragraph tags become line breaks (adjacent close/open tags thus
    # produce a blank line between paragraphs).
    text = re.sub(r'\[/?PARAGRAPH\]', '\n', text)

    text = convert(text, 'FOOTNOTE', braced('FN'))
    text = convert(text, 'MARGIN_NOTE', braced('MN'))
    text = convert(text, 'ITALIC', lambda p: f'_{p}_')
    text = convert(text, 'BOLD', lambda p: f'**{p}**')
    text = convert(text, 'SMALLCAPS', braced('SC'))
    text = convert(text, 'BLACKLETTER', braced('BL'))

    # Decorative initial: exactly one word character between the tags.
    text = re.sub(r'\[DECORATIVE_INITIAL\]\s*(\w)\s*\[/DECORATIVE_INITIAL\]',
                  lambda m: f'DI{{{m.group(1)}}}', text)

    text = convert(text, 'FIGURE', braced('FIG'))
    text = re.sub(r'\[FIGURE\]', '', text)  # unpaired figure marker
    text = convert(text, 'CATCHWORD', braced('CW'))

    # Column break
    text = text.replace('[COLUMN_BREAK]', '\n')

    # Poetry legacy tags (line structure is meaningful here, so payloads
    # are only trimmed, never flattened).
    text = convert(text, 'STANZA', lambda p: f'\n{p.strip()}\n')
    text = convert(text, 'LINE_INDENT', lambda p: f'>>{p.strip()}')
    text = convert(text, 'LINE', lambda p: p.strip())

    # Drama legacy tags
    text = convert(text, 'SPEAKER', braced('SP'))
    text = convert(text, 'SPEECH', lambda p: p.strip())
    text = convert(text, 'STAGE_DIRECTION', braced('SD'))
    text = convert(text, 'ACT', heading)
    text = convert(text, 'SCENE', heading)

    # Manuscript legacy tags
    text = convert(text, 'DELETION', braced('DEL'))
    text = convert(text, 'ADDITION', braced('ADD'))
    text = convert(text, 'UNCLEAR', braced('UNC'))
    text = re.sub(r'\[ILLEGIBLE\]', '[illegible]', text)
    text = convert(text, 'SALUTATION', braced('SAL'))
    text = convert(text, 'CLOSING', braced('CLOSE'))
    text = convert(text, 'SIGNED', braced('SIGN'))
    text = convert(text, 'DATELINE', braced('DATE'))

    # 4. Strip any remaining orphaned bracket tags:
    #    [/ANYTHING] or [ANYTHING] that was not handled above.
    text = re.sub(r'\[/?[A-Z_]+\]', '', text)

    # Bracket-priming artifact: the model wrapped every line in [...].
    # Only unwrap a line that is ENTIRELY wrapped by a single bracket pair;
    # a line beginning with the [illegible] marker is left alone.
    cleaned_lines = []
    for line in text.split('\n'):
        stripped = line.strip()
        fully_wrapped = (
            stripped.startswith('[') and stripped.endswith(']')
            and stripped.count('[') == 1 and stripped.count(']') == 1
            and not stripped.startswith('[illegible]')
        )
        cleaned_lines.append(stripped[1:-1].strip() if fully_wrapped else line)
    text = '\n'.join(cleaned_lines)

    # 5. Collapse excessive blank lines (3+ newlines → 2)
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


# ── TEI Generator ──────────────────────────────────────────────────────

class TEIMarkupGenerator:
    """Generates TEI P5 XML from OCR text using the simplified marker format.

    The generator is stateless. Elements are created through the project
    helper ``make_element`` (assumed to append the new element to ``parent``
    and return it — confirm against ``tei_schema``).
    """

    # "## ... ##" heading that may wrap over several lines but never spans
    # a blank line (so two stray "##" in different paragraphs do not pair up).
    _HEADING_RE = re.compile(r'##((?:(?!\n\s*\n).)*?)##', re.DOTALL)
    _DRAMA_HEADING_RE = re.compile(r'\s*##((?:(?!\n\s*\n).)*?)##\s*', re.DOTALL)
    # Start of any structural marker recognised by the drama parser.
    _DRAMA_MARKER_RE = re.compile(r'SP\{|SD\{|##')

    def __init__(self):
        """Create a generator; no configuration is required."""

    def generate_tei(
        self,
        pages: List[Tuple[str, str]],
        metadata: Dict,
        genre: str = "auto",
        source_filename: Optional[str] = None,
    ) -> str:
        """Convert OCR pages into a serialized TEI document.

        Args:
            pages: ``(raw_text, page_label)`` tuples in reading order.
            metadata: Passed unchanged to ``create_tei_skeleton``.
            genre: ``"poetry"``, ``"drama"``, ``"manuscript"``, or ``"auto"``
                to detect it from the text; any other value is treated as
                prose.
            source_filename: When given, used to build the ``facs``
                attribute of each ``<pb>``.

        Returns:
            The XML string from ``serialize_tei``; if
            ``validate_tei_structure`` reports it invalid, the result of
            ``repair_tei_xml`` instead.
        """
        # Clean all pages first (Solution 2)
        cleaned_pages = [(clean_raw_output(text), label) for text, label in pages]

        tei = create_tei_skeleton(metadata)
        text_el = tei.find(f"{{{TEI_NS}}}text")

        if genre == "auto":
            genre = self._detect_genre(cleaned_pages)

        # Solution 3: Improved front-matter detection
        front_content, body_pages = self._extract_front_matter(cleaned_pages)

        first_page_in_body = False
        if front_content:
            front = make_element("front", parent=text_el)
            div = make_element("div", attrib={"type": "title"}, parent=front)
            # The front matter always comes from the first page, so its page
            # break belongs here even when no further pages follow.
            self._add_page_break(div, cleaned_pages[0][1], source_filename)
            for heading_text in front_content:
                make_element("head", parent=div).text = heading_text
            # When the first page also contributes body text, the body must
            # not emit a second <pb> for that same page.
            first_page_in_body = len(body_pages) == len(cleaned_pages)

        body = make_element("body", parent=text_el)

        builders = {
            "poetry": self._build_poetry_body,
            "drama": self._build_drama_body,
            "manuscript": self._build_manuscript_body,
        }
        build = builders.get(genre, self._build_prose_body)
        build(body, body_pages, source_filename, skip_first_pb=first_page_in_body)

        xml_string = serialize_tei(tei)

        is_valid, _errors = validate_tei_structure(xml_string)
        if not is_valid:
            xml_string = repair_tei_xml(xml_string)

        return xml_string

    # ── Genre Detection ──────────────────────────────────────────────

    def _detect_genre(self, pages: List[Tuple[str, str]]) -> str:
        """Guess the genre of the cleaned pages.

        Explicit structural markers are trusted over the line-length
        heuristic: drama (``SP{``, ``SD{``) or manuscript (``DEL{``,
        ``ADD{``, ``UNC{``, ``SAL{``, ``CLOSE{``) is returned when at least
        two distinct markers of that family occur anywhere in the text
        (the higher count wins, drama on a tie). Only otherwise is
        ``"poetry"`` returned, if some page looks like verse. The fallback
        is ``"prose"``.
        """
        all_text = " ".join(page_text for page_text, _ in pages)

        drama_score = sum(1 for marker in ("SP{", "SD{") if marker in all_text)
        manuscript_score = sum(
            1 for marker in ("DEL{", "ADD{", "UNC{", "SAL{", "CLOSE{")
            if marker in all_text
        )

        # Marker-based genres first: short speech or letter lines would
        # otherwise be mistaken for verse by the heuristic below.
        marker_scores = {"drama": drama_score, "manuscript": manuscript_score}
        best = max(marker_scores, key=marker_scores.get)
        if marker_scores[best] >= 2:
            return best

        if any(self._looks_like_verse(page_text) for page_text, _ in pages):
            return "poetry"
        return "prose"

    @staticmethod
    def _looks_like_verse(text: str) -> bool:
        """Return True when a page consists mostly of verse-length lines.

        Title pages (very short lines, few lines per block, short page) are
        excluded so they are not mistaken for poetry.
        """
        content_lines = [
            line.strip() for line in text.split('\n')
            if line.strip()
            and not line.strip().startswith(('RH:', 'PN:', '##'))
        ]
        if len(content_lines) < 5:
            return False

        short_lines = sum(1 for line in content_lines if 5 < len(line) < 65)
        avg_len = sum(len(line) for line in content_lines) / len(content_lines)

        blocks = [b for b in re.split(r'\n\s*\n', text) if b.strip()]
        avg_lines_per_block = len(content_lines) / max(len(blocks), 1)

        is_title_like = (avg_len < 35 and avg_lines_per_block < 4
                         and len(text) < 600)
        return not is_title_like and short_lines > len(content_lines) * 0.5

    # ── Solution 3: Front-Matter Detection ───────────────────────────

    @staticmethod
    def _front_blocks(text: str) -> List[str]:
        """Return the blank-line separated blocks of ``text`` as single lines.

        ``RH:`` / ``PN:`` lines and ``CW{...}`` markers are removed first;
        the lines of each block are joined with single spaces.
        """
        text = re.sub(r'^RH:.*$', '', text, flags=re.MULTILINE)
        text = re.sub(r'^PN:.*$', '', text, flags=re.MULTILINE)
        text = re.sub(r'CW\{.*?\}', '', text)
        merged_blocks = []
        for block in re.split(r'\n\s*\n', text):
            merged = ' '.join(l.strip() for l in block.split('\n') if l.strip())
            if merged:
                merged_blocks.append(merged)
        return merged_blocks

    def _extract_front_matter(self, pages: List[Tuple[str, str]]):
        """Extract front-matter headings from the first page if it's title-like.

        Returns ``(front_headings, body_pages)`` where ``front_headings`` is a
        list of strings (may be empty) and ``body_pages`` is the page list
        for the body (possibly with the first page's body text reinserted).

        HEURISTIC: A page is front matter if EITHER:
        A) It has ``## heading ##`` markers. With little other text (at most
           one block, < 300 chars) the whole page is a title page: headings
           and the leftover text are returned in page order and the page is
           removed from the body. Otherwise it is a chapter opening: only
           the headings are returned and the remaining text stays in the
           body as the first page.
        B) It looks like a title page: at least three short lines, no long
           blocks, < 500 chars of content, mostly upper-case line starts.
        """
        if not pages:
            return [], pages

        first_text, first_label = pages[0]

        # Path A: Explicit ## headings ##
        # Odd-indexed segments are heading texts, even-indexed ones are the
        # text around them.
        segments = re.split(r'##(.*?)##', first_text, flags=re.DOTALL)

        if len(segments) > 1:
            headings = [re.sub(r'\s+', ' ', h).strip() for h in segments[1::2]]
            remaining_text = ''.join(segments[0::2]).strip()

            # Measure body text (excluding RH/PN lines)
            body_only = re.sub(r'^RH:.*$', '', remaining_text, flags=re.MULTILINE)
            body_only = re.sub(r'^PN:.*$', '', body_only, flags=re.MULTILINE)
            body_only = body_only.strip()

            para_blocks = [p for p in re.split(r'\n\s*\n', body_only) if p.strip()]

            if len(para_blocks) <= 1 and len(body_only) < 300:
                # True title page. The page is removed from the body, so
                # its leftover text (author, imprint, ...) is kept as front
                # matter rather than silently discarded.
                front = []
                for index, segment in enumerate(segments):
                    if index % 2:
                        front.append(re.sub(r'\s+', ' ', segment).strip())
                    else:
                        front.extend(self._front_blocks(segment))
                return front, pages[1:]

            # Chapter opening: heading + body
            modified_first = (remaining_text, first_label)
            return headings, [modified_first] + list(pages[1:])

        # Path B: No explicit headings — check if it looks like a title page
        # (all short lines, no long prose blocks, mostly uppercase starts).
        content = re.sub(r'^RH:.*$', '', first_text, flags=re.MULTILINE)
        content = re.sub(r'^PN:.*$', '', content, flags=re.MULTILINE)
        content = re.sub(r'CW\{.*?\}', '', content).strip()

        if content:
            lines = [l.strip() for l in content.split('\n') if l.strip()]
            blocks = [b.strip() for b in re.split(r'\n\s*\n', content) if b.strip()]

            all_short = bool(lines) and all(len(l) < 70 for l in lines)
            no_long_blocks = bool(blocks) and all(len(b) < 200 for b in blocks)
            short_total = len(content) < 500

            # Most lines should start with an uppercase letter; this keeps
            # prose that starts mid-sentence from being caught.
            upper_starts = sum(1 for l in lines if l[0].isupper())
            mostly_upper = bool(lines) and upper_starts >= len(lines) * 0.7

            if (all_short and no_long_blocks and short_total
                    and mostly_upper and len(lines) >= 3):
                # Title page: every block becomes one heading.
                return self._front_blocks(first_text), pages[1:]

        return [], pages

    # ── Page Parsing ─────────────────────────────────────────────────

    def _add_page_break(self, parent, label, source_filename):
        """Append a ``<pb>`` to ``parent``.

        ``n`` is set when a label is available; ``facs`` is
        ``"<source_filename>:<label>"`` (or just the filename when there is
        no label, instead of a literal ``":None"`` suffix).
        """
        has_label = label is not None and str(label) != ""
        attrib = {}
        if has_label:
            attrib["n"] = label
        if source_filename:
            attrib["facs"] = (f"{source_filename}:{label}" if has_label
                              else source_filename)
        make_element("pb", attrib=attrib, parent=parent)

    def _pages_with_breaks(self, div, pages, source_filename, skip_first_pb=False):
        """Yield each page's text after appending its ``<pb>`` to ``div``.

        With ``skip_first_pb`` the first page gets no ``<pb>`` (the caller
        already emitted it elsewhere).
        """
        for index, (text, label) in enumerate(pages):
            if not (skip_first_pb and index == 0):
                self._add_page_break(div, label, source_filename)
            yield text

    def _extract_forme_work(self, parent, text):
        """Extract RH: and PN: lines, create <fw> elements, return cleaned text.

        Also detects and extracts UNTAGGED running headers and page numbers
        that the VLM outputs as plain text (common artifact): up to two
        short standalone lines at the top of the page.
        """
        # 1. Explicitly tagged forme work. Only horizontal whitespace may
        #    follow "RH:" / "PN:" so that an empty tag cannot capture the
        #    next line of body text as its value.
        tagged = (
            (r'^RH:[ \t]*(.+)$', re.MULTILINE, "header"),
            (r'^PN:[ \t]*(.+)$', re.MULTILINE, "pageNum"),
            (r'CW\{(.*?)\}', 0, "catch"),
        )
        for pattern, flags, fw_type in tagged:
            for m in re.finditer(pattern, text, flags):
                make_element("fw", text=m.group(1).strip(),
                             attrib={"type": fw_type}, parent=parent)

        text = re.sub(r'^RH:.*$', '', text, flags=re.MULTILINE)
        text = re.sub(r'^PN:.*$', '', text, flags=re.MULTILINE)
        text = re.sub(r'CW\{.*?\}', '', text)

        # 2. Untagged running header / page number at the top of the page.
        #    A candidate line must be short (< 80 chars), followed by a blank
        #    line or the end of the page, not a ## heading, and must not look
        #    like a continued sentence (lowercase / dash / paren start).
        lines = text.strip().split('\n')
        pos = 0          # index of the first line not yet consumed
        extracted = 0    # number of forme-work lines taken so far
        while extracted < 2 and pos < len(lines):
            first = lines[pos].strip()

            if not first:
                pos += 1
                continue

            if first.startswith('##'):
                break

            if first[0].islower() or first[0] in '—–-(':
                break

            is_short = len(first) < 80
            followed_by_blank = (pos + 1 >= len(lines)
                                 or lines[pos + 1].strip() == '')
            if not (is_short and followed_by_blank):
                break

            # Bare page number (digits, possibly spaced)
            if re.sub(r'\s+', '', first).isdigit():
                make_element("fw", text=first,
                             attrib={"type": "pageNum"}, parent=parent)
                pos += 1
                extracted += 1
                continue

            # Combined header + page number: "123 Title" or "Title 123"
            num_match = (re.match(r'^(\d+)\s+(.+)$', first)
                         or re.match(r'^(.+?)\s+(\d+)$', first))
            if num_match:
                g1, g2 = num_match.group(1), num_match.group(2)
                if g1.isdigit():
                    make_element("fw", text=g1, attrib={"type": "pageNum"}, parent=parent)
                    make_element("fw", text=g2, attrib={"type": "header"}, parent=parent)
                else:
                    make_element("fw", text=g1, attrib={"type": "header"}, parent=parent)
                    make_element("fw", text=g2, attrib={"type": "pageNum"}, parent=parent)
                pos += 1
                extracted += 1
                continue

            # Short title-case / all-caps line without a number
            words = first.split()
            is_titleish = bool(words) and (
                sum(1 for w in words if w[0].isupper()) >= len(words) * 0.5)
            if is_titleish and len(first) < 60:
                make_element("fw", text=first,
                             attrib={"type": "header"}, parent=parent)
                pos += 1
                extracted += 1
                continue

            # Doesn't match any header pattern — it's body text
            break

        return '\n'.join(lines[pos:])

    def _parse_figures(self, text):
        """Extract FIG{...} markers, return (cleaned_text, list_of_descriptions)."""
        figures = [m.group(1) for m in re.finditer(r'FIG\{(.*?)\}', text)]
        text = re.sub(r'FIG\{.*?\}', '', text)
        return text, figures

    def _emit_figures(self, parent, figures):
        """Append one ``<figure>`` per description (with a ``<p>`` if non-empty)."""
        for desc in figures:
            fig = make_element("figure", parent=parent)
            if desc:
                make_element("p", text=desc, parent=fig)

    # ── Prose ────────────────────────────────────────────────────────

    def _build_prose_body(self, body_el, pages, source_filename, skip_first_pb=False):
        """Build ``<div type="text">`` with one ``<pb>`` + parsed content per page."""
        div = make_element("div", attrib={"type": "text"}, parent=body_el)
        for text in self._pages_with_breaks(div, pages, source_filename, skip_first_pb):
            self._parse_prose_page(div, text)

    def _parse_prose_page(self, parent, text):
        """Parse a prose page into TEI elements (forme work, then content)."""
        text = self._extract_forme_work(parent, text)
        self._parse_prose_content(parent, text)

    def _parse_prose_content(self, parent, text):
        """Parse page text whose forme work has already been extracted.

        Emits, in this order: figures, then headings and paragraphs in
        document order, then the page's footnotes.
        """
        text, figures = self._parse_figures(text)
        self._emit_figures(parent, figures)

        # Footnotes are set aside and appended after the page's paragraphs.
        footnotes = [m.group(1) for m in re.finditer(r'FN\{(.*?)\}', text)]
        text = re.sub(r'FN\{.*?\}', '', text)

        # Split on ## ... ## (DOTALL: the VLM sometimes emits multi-line
        # headings like "## Chapter One\nTHE TITLE ##").
        for part in re.split(r'(##.*?##)', text, flags=re.DOTALL):
            part = part.strip()
            if not part:
                continue

            heading_match = re.match(r'^##\s*(.*?)\s*##$', part, re.DOTALL)
            if heading_match:
                heading_text = re.sub(r'\s+', ' ', heading_match.group(1)).strip()
                make_element("head", parent=parent).text = heading_text
                continue

            for para_text in re.split(r'\n\s*\n', part):
                para_text = para_text.strip()
                if not para_text:
                    continue
                # Merge the lines of a paragraph into one
                para_text = re.sub(r'\n\s*', ' ', para_text)
                p = make_element("p", parent=parent)
                self._apply_inline_markup(p, para_text)

        for fn_text in footnotes:
            note = make_element("note", attrib={"place": "foot"}, parent=parent)
            self._apply_inline_markup(note, fn_text)

    # ── Poetry ───────────────────────────────────────────────────────

    def _build_poetry_body(self, body_el, pages, source_filename, skip_first_pb=False):
        """Build ``<div type="poem">``; figures are kept as ``<figure>`` elements."""
        div = make_element("div", attrib={"type": "poem"}, parent=body_el)
        for text in self._pages_with_breaks(div, pages, source_filename, skip_first_pb):
            text = self._extract_forme_work(div, text)
            text, figures = self._parse_figures(text)
            self._emit_figures(div, figures)
            self._parse_poetry_content(div, text)

    def _parse_poetry_content(self, parent, text):
        """Turn verse text into ``<head>``, ``<lg type="stanza">`` and ``<l>``.

        Headings are split off first, so a heading directly followed by
        verse (no blank line) or wrapped over two lines is still a
        ``<head>``. Stanzas are separated by blank lines; a leading ``>>``
        marks an indented line.
        """
        segments = self._HEADING_RE.split(text)
        for index, segment in enumerate(segments):
            if index % 2:
                # Odd segments are the text between "##" pairs.
                heading_text = re.sub(r'\s+', ' ', segment).strip()
                make_element("head", parent=parent).text = heading_text
                continue

            for stanza_text in re.split(r'\n\s*\n', segment):
                stanza_text = stanza_text.strip()
                if not stanza_text:
                    continue
                lg = make_element("lg", attrib={"type": "stanza"}, parent=parent)
                for line in stanza_text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue
                    attrib = {}
                    if line.startswith('>>'):
                        line = line[2:].strip()
                        attrib["rend"] = "indent"
                    l_el = make_element("l", attrib=attrib, parent=lg)
                    self._apply_inline_markup(l_el, line)

    # ── Drama ────────────────────────────────────────────────────────

    def _build_drama_body(self, body_el, pages, source_filename, skip_first_pb=False):
        """Build ``<div type="text">`` for drama; figures are kept as ``<figure>``."""
        div = make_element("div", attrib={"type": "text"}, parent=body_el)
        for text in self._pages_with_breaks(div, pages, source_filename, skip_first_pb):
            text = self._extract_forme_work(div, text)
            text, figures = self._parse_figures(text)
            self._emit_figures(div, figures)
            self._parse_drama_content(div, text)

    def _parse_drama_content(self, parent, text):
        """Process drama text sequentially: headings, stage directions, speeches.

        ``SP{name}`` opens an ``<sp>`` whose speech runs to the next marker;
        ``SD{...}`` becomes ``<stage>``; anything else becomes ``<p>``.
        A marker that cannot be parsed (e.g. a lone ``##`` or an unclosed
        ``SP{``) is emitted as plain text up to the NEXT marker only, so the
        rest of the page keeps its structure.
        """
        remaining = text
        while remaining.strip():
            # Heading
            m = self._DRAMA_HEADING_RE.match(remaining)
            if m:
                heading_text = re.sub(r'\s+', ' ', m.group(1)).strip()
                make_element("head", parent=parent).text = heading_text
                remaining = remaining[m.end():]
                continue

            # Stage direction: SD{...}
            m = re.match(r'\s*SD\{(.*?)\}\s*', remaining)
            if m:
                make_element("stage", text=m.group(1).strip(), parent=parent)
                remaining = remaining[m.end():]
                continue

            # Speaker + speech: SP{name} followed by text until next marker
            m = re.match(r'\s*SP\{(.*?)\}\s*', remaining)
            if m:
                sp = make_element("sp", parent=parent)
                make_element("speaker", text=m.group(1).strip(), parent=sp)
                remaining = remaining[m.end():]
                speech_end = self._DRAMA_MARKER_RE.search(remaining)
                if speech_end:
                    speech_text = remaining[:speech_end.start()].strip()
                    remaining = remaining[speech_end.start():]
                else:
                    speech_text = remaining.strip()
                    remaining = ''
                if speech_text:
                    p = make_element("p", parent=sp)
                    self._apply_inline_markup(p, speech_text)
                continue

            # Plain text. If it starts with a marker that failed to parse
            # above, look for the next marker AFTER it so that progress is
            # made without swallowing the rest of the page.
            remaining = remaining.lstrip()
            broken_marker = self._DRAMA_MARKER_RE.match(remaining)
            search_from = broken_marker.end() if broken_marker else 0
            next_marker = self._DRAMA_MARKER_RE.search(remaining, search_from)
            if next_marker:
                chunk = remaining[:next_marker.start()].strip()
                remaining = remaining[next_marker.start():]
            else:
                chunk = remaining.strip()
                remaining = ''
            for para in re.split(r'\n\s*\n', chunk):
                para = para.strip()
                if para:
                    p = make_element("p", parent=parent)
                    self._apply_inline_markup(p, para)

    # ── Manuscript ───────────────────────────────────────────────────

    def _build_manuscript_body(self, body_el, pages, source_filename, skip_first_pb=False):
        """Build ``<div type="letter">``; forme work is extracted once per page."""
        div = make_element("div", attrib={"type": "letter"}, parent=body_el)
        for text in self._pages_with_breaks(div, pages, source_filename, skip_first_pb):
            text = self._extract_forme_work(div, text)
            self._parse_manuscript_content(div, text)

    def _parse_manuscript_content(self, parent, text):
        """Emit ``<opener>``, body content and ``<closer>`` for a letter page.

        Expects ``text`` with forme work already removed: the body is parsed
        with ``_parse_prose_content`` so the untagged-header heuristic is not
        run a second time (which could consume short body lines).
        """
        # Opener: DATE{...} → <dateline>, SAL{...} → <salute>
        if 'DATE{' in text or 'SAL{' in text:
            opener = make_element("opener", parent=parent)
            for m in re.finditer(r'DATE\{(.*?)\}', text):
                make_element("dateline", text=m.group(1).strip(), parent=opener)
            for m in re.finditer(r'SAL\{(.*?)\}', text):
                make_element("salute", text=m.group(1).strip(), parent=opener)
            text = re.sub(r'DATE\{.*?\}', '', text)
            text = re.sub(r'SAL\{.*?\}', '', text)

        # Closer parts are collected now and emitted after the body.
        closer_texts = [m.group(1) for m in re.finditer(r'CLOSE\{(.*?)\}', text)]
        signed_texts = [m.group(1) for m in re.finditer(r'SIGN\{(.*?)\}', text)]
        text = re.sub(r'CLOSE\{.*?\}', '', text)
        text = re.sub(r'SIGN\{.*?\}', '', text)

        # Body (figures, headings, paragraphs, footnotes)
        self._parse_prose_content(parent, text)

        if closer_texts or signed_texts:
            closer = make_element("closer", parent=parent)
            for t in closer_texts:
                make_element("salute", text=t.strip(), parent=closer)
            for t in signed_texts:
                make_element("signed", text=t.strip(), parent=closer)

    # ── Inline Markup ────────────────────────────────────────────────

    def _apply_inline_markup(self, element, text):
        """Apply inline TEI markup.

        Handles the simplified marker format:
          _italic_   **bold**   SC{smallcaps}   BL{blackletter}
          DI{X}   DEL{text}   ADD{text}   UNC{text}   [illegible]
          MN{margin note}   FN{footnote}
        Any other ``NAME{text}`` becomes ``<seg type="name">``.

        Plain text between markers is appended to ``element`` in order.
        An italic span must neither open nor close in the middle of a word,
        so underscores inside identifiers like ``some_name`` are left alone.
        """
        if not text:
            return

        pattern = re.compile(
            r'(?<!\w)_(?P<italic>.*?)_(?!\w)|'       # italic (not mid-word)
            r'\*\*(?P<bold>.*?)\*\*|'                # bold
            r'SC\{(?P<sc>.*?)\}|'                    # small caps
            r'BL\{(?P<bl>.*?)\}|'                    # blackletter
            r'DI\{(?P<di>\w)\}|'                     # decorative initial
            r'DEL\{(?P<deleted>.*?)\}|'              # deletion
            r'ADD\{(?P<added>.*?)\}|'                # addition
            r'UNC\{(?P<unclear>.*?)\}|'              # unclear
            r'(?P<gap>\[illegible\])|'               # illegible
            r'MN\{(?P<margin>.*?)\}|'                # margin note
            r'FN\{(?P<foot>.*?)\}|'                  # footnote left inline
            r'(?P<tag>[A-Z][A-Z0-9_]+)\{(?P<tag_text>.*?)\}'  # custom tag
        )
        hi_rend = {"italic": "italic", "bold": "bold",
                   "sc": "smallcaps", "bl": "blackletterType"}

        cursor = 0
        for match in pattern.finditer(text):
            # Text between the previous marker and this one
            self._append_text(element, text[cursor:match.start()])

            # Exactly one alternative matched; lastgroup names its payload.
            kind = match.lastgroup
            if kind in hi_rend:
                hi = make_element("hi", attrib={"rend": hi_rend[kind]}, parent=element)
                hi.text = match.group(kind)
            elif kind == "di":
                seg = make_element("seg", attrib={"rend": "decorInit"}, parent=element)
                seg.text = match.group(kind)
            elif kind == "deleted":
                del_el = make_element("del", parent=element)
                del_el.text = match.group(kind)
            elif kind == "added":
                add_el = make_element("add", parent=element)
                add_el.text = match.group(kind)
            elif kind == "unclear":
                unc = make_element("unclear", parent=element)
                unc.text = match.group(kind)
            elif kind == "gap":
                make_element("gap", attrib={"reason": "illegible"}, parent=element)
            elif kind == "margin":
                note = make_element("note", attrib={"place": "margin"}, parent=element)
                note.text = match.group(kind)
            elif kind == "foot":
                note = make_element("note", attrib={"place": "foot"}, parent=element)
                note.text = match.group(kind)
            elif kind == "tag_text":
                seg = make_element("seg", attrib={"type": match.group("tag").lower()},
                                   parent=element)
                seg.text = match.group(kind)
            cursor = match.end()

        # Remaining text after the last marker
        self._append_text(element, text[cursor:])

    def _append_text(self, element, text):
        """Append text to an element, respecting existing children.

        With children present the text goes to the last child's ``tail``;
        otherwise to the element's own ``text``. Empty text is ignored.
        """
        if not text:
            return
        children = list(element)
        if children:
            children[-1].tail = (children[-1].tail or "") + text
        else:
            element.text = (element.text or "") + text
