"""
TEI P5 Schema for OCR Output.

Header strategy (TEI P5 compliant):
- <fileDesc> is required and must contain <titleStmt>, <publicationStmt>, <sourceDesc>
- We keep <titleStmt> and <publicationStmt> as minimal stubs
  (title echoed, publicationStmt just says "unpublished — generated by TEI Pipeline")
- All real bibliographic metadata about the SOURCE goes into <sourceDesc><bibl>
- Language goes into <profileDesc><langUsage>
"""

from lxml import etree
from datetime import datetime

# Namespace URI used for every TEI element this module creates or looks up.
TEI_NS = "http://www.tei-c.org/ns/1.0"
# Namespace URI behind the reserved "xml" prefix (xml:lang, xml:id, ...).
XML_NS = "http://www.w3.org/XML/1998/namespace"
# lxml namespace map: TEI is the default (un-prefixed) namespace, plus "xml".
NSMAP = {None: TEI_NS, "xml": XML_NS}


def make_element(tag, text=None, attrib=None, parent=None):
    """Create an element in the TEI namespace and return it.

    Args:
        tag: Local element name; it is qualified with ``TEI_NS``.
        text: Text content for the element. Falsy values (``None``, ``""``)
            leave the element's text unset.
        attrib: Optional mapping of attribute names to values. Names that
            start with ``xml:`` are stored in the XML namespace; every other
            name is set verbatim.
        parent: Element to append the new node to. When ``None`` a detached
            element carrying the module's ``NSMAP`` is created instead.

    Returns:
        The newly created lxml element.
    """
    qualified_tag = f"{{{TEI_NS}}}{tag}"
    if parent is None:
        node = etree.Element(qualified_tag, nsmap=NSMAP)
    else:
        node = etree.SubElement(parent, qualified_tag)

    if text:
        node.text = text

    for name, value in (attrib or {}).items():
        # "xml:lang" -> "{http://www.w3.org/XML/1998/namespace}lang"
        key = f"{{{XML_NS}}}{name[4:]}" if name.startswith("xml:") else name
        node.set(key, value)

    return node


def create_tei_skeleton(metadata=None):
    """Build a minimal TEI document tree with a populated header.

    The header follows the strategy described in the module docstring:
    ``titleStmt`` and ``publicationStmt`` are stubs, the bibliographic data
    goes into ``sourceDesc/bibl`` and the language into
    ``profileDesc/langUsage``. An empty ``<text>`` element is appended for
    the caller to fill.

    Args:
        metadata: Optional mapping. The keys read are ``title``, ``author``,
            ``date``, ``publisher``, ``pubPlace`` and ``language``. Missing,
            ``None`` or empty values are skipped; ``title`` then falls back
            to ``"[Untitled]"`` and ``language`` to ``"en"``.

    Returns:
        The root ``TEI`` lxml element.
    """
    import re

    metadata = metadata or {}

    # `@when` only accepts W3C date formats. This covers the forms a source
    # date realistically takes: YYYY, YYYY-MM, YYYY-MM-DD and a full
    # dateTime, each with an optional timezone.
    w3c_date = re.compile(
        r"-?\d{4,}(?:-\d{2}(?:-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?)?)?)?"
        r"(?:Z|[+-]\d{2}:\d{2})?"
    )

    tei = etree.Element(f"{{{TEI_NS}}}TEI", nsmap=NSMAP)
    header = make_element("teiHeader", parent=tei)
    file_desc = make_element("fileDesc", parent=header)

    # ── titleStmt (required, minimal) ────────────────────────────────
    title_stmt = make_element("titleStmt", parent=file_desc)
    # `or` rather than a .get() default: a key that is present but None/""
    # must also fall back, otherwise the required <title> ends up empty.
    title = metadata.get("title") or "[Untitled]"
    make_element("title", text=title, parent=title_stmt)

    # ── publicationStmt (required, minimal stub) ─────────────────────
    pub_stmt = make_element("publicationStmt", parent=file_desc)
    make_element("p", text="Unpublished — generated by TEI Pipeline.", parent=pub_stmt)

    # ── sourceDesc (the real metadata lives here) ────────────────────
    source_desc = make_element("sourceDesc", parent=file_desc)
    bibl = make_element("bibl", parent=source_desc)

    # Populate <bibl> with whatever we have
    make_element("title", text=title, parent=bibl)
    if metadata.get("author"):
        make_element("author", text=metadata["author"], parent=bibl)
    if metadata.get("date"):
        date_text = str(metadata["date"])
        # Free-form dates ("c. 1850", "March 1850") stay as readable text
        # but must not be copied into @when, which would make the header
        # schema-invalid.
        when = {"when": date_text} if w3c_date.fullmatch(date_text) else None
        make_element("date", text=date_text, attrib=when, parent=bibl)
    if metadata.get("publisher"):
        make_element("publisher", text=metadata["publisher"], parent=bibl)
    if metadata.get("pubPlace"):
        make_element("pubPlace", text=metadata["pubPlace"], parent=bibl)

    # ── profileDesc (language) ───────────────────────────────────────
    # A None language would reach el.set("ident", None) and raise; an empty
    # one would yield ident="". Both fall back to the default instead.
    lang = metadata.get("language") or "en"
    profile_desc = make_element("profileDesc", parent=header)
    lang_usage = make_element("langUsage", parent=profile_desc)
    make_element("language", text=lang, attrib={"ident": lang}, parent=lang_usage)

    # ── text ─────────────────────────────────────────────────────────
    text_el = make_element("text", parent=tei)
    # xml:lang is only written for non-default languages.
    if lang != "en":
        text_el.set(f"{{{XML_NS}}}lang", lang)

    return tei


def serialize_tei(tei_root):
    """Return the tree as a pretty-printed string with an XML declaration.

    lxml is asked for a ``str`` (``encoding="unicode"``) without its own
    declaration; a fixed UTF-8 declaration line is placed in front of it.

    Args:
        tei_root: Root lxml element to serialize.

    Returns:
        The serialized document as ``str``.
    """
    markup = etree.tostring(
        tei_root,
        encoding="unicode",
        pretty_print=True,
        xml_declaration=False,
    )
    return f'<?xml version="1.0" encoding="UTF-8"?>\n{markup}'


def validate_tei_structure(xml_string):
    """Check that a document has the basic TEI skeleton.

    This is a structural smoke test, not schema validation: it parses the
    input and looks for ``teiHeader/fileDesc`` with its three required
    children, plus a top-level ``text`` element, all in the TEI namespace.

    Args:
        xml_string: The document as ``str`` or ``bytes``. A ``str`` is
            encoded to UTF-8 before parsing.

    Returns:
        A ``(is_valid, errors)`` tuple. ``errors`` is a list of messages and
        is empty when ``is_valid`` is true. A document that cannot be parsed
        yields a single "XML syntax error" message.
    """
    payload = xml_string.encode("utf-8") if isinstance(xml_string, str) else xml_string
    try:
        root = etree.fromstring(payload)
    except etree.XMLSyntaxError as exc:
        return False, [f"XML syntax error: {exc}"]

    def qualified(name):
        return f"{{{TEI_NS}}}{name}"

    problems = []

    # The root may be TEI in any namespace or in none at all.
    if not (root.tag.endswith("}TEI") or root.tag == "TEI"):
        problems.append(f"Root element should be TEI, got {root.tag}")

    header = root.find(qualified("teiHeader"))
    file_desc = None if header is None else header.find(qualified("fileDesc"))
    if header is None:
        problems.append("Missing teiHeader element")
    elif file_desc is None:
        problems.append("Missing fileDesc in teiHeader")
    else:
        problems.extend(
            f"Missing {child} in fileDesc"
            for child in ("titleStmt", "publicationStmt", "sourceDesc")
            if file_desc.find(qualified(child)) is None
        )

    if root.find(qualified("text")) is None:
        problems.append("Missing text element")

    return not problems, problems


def repair_tei_xml(xml_string):
    """Apply best-effort textual repairs to almost-well-formed TEI XML.

    The repairs work on the raw string, with no parsing, in this order:

    1. Add the TEI default namespace to ``<TEI ...>`` start tags that have
       no ``xmlns`` attribute. This is skipped when the namespace
       declaration already appears anywhere in the document.
    2. Self-close unclosed ``pb``, ``lb``, ``cb``, ``milestone`` and ``gap``
       start tags. A tag is left alone when it is already self-closed, or
       when the next tag of the same name is its closing tag
       (e.g. ``<gap><desc>...</desc></gap>``).
    3. Escape every ``&`` that does not begin one of the five predefined
       entities or a well-formed numeric character reference.
    4. Make sure the document starts with an XML declaration, dropping a
       leading BOM and whitespace that would otherwise precede it.

    Ampersands inside comments and CDATA sections are escaped as well; the
    function does not distinguish them from ordinary text.

    Args:
        xml_string: The document as ``str``.

    Returns:
        The repaired document as ``str``.
    """
    import re

    tei_ns = "http://www.tei-c.org/ns/1.0"
    xml_decl = '<?xml version="1.0" encoding="UTF-8"?>\n'
    empty_names = ("pb", "lb", "cb", "milestone", "gap")

    # 1. Namespace: also covers <TEI xml:lang="..."> and similar, which an
    #    exact "<TEI>" match would miss.
    if f'xmlns="{tei_ns}"' not in xml_string:
        xml_string = re.sub(
            r'<TEI(?=[\s>])(?![^>]*\sxmlns\s*=)',
            f'<TEI xmlns="{tei_ns}"',
            xml_string,
        )

    # 2. Self-closing. The lookahead after the name keeps <gapfill> and the
    #    like untouched; quoted attribute values are consumed whole so a "/"
    #    inside them (facs="img/001.jpg") does not block the repair.
    open_tag = re.compile(
        r'<(' + "|".join(empty_names) + r')(?=[\s>])'
        r'((?:[^<>"\']|"[^"]*"|\'[^\']*\')*)(?<!/)>'
    )
    next_same_name = {
        name: re.compile(rf'<(/?){name}(?=[\s/>])') for name in empty_names
    }
    source = xml_string

    def self_close(match):
        name, attrs = match.groups()
        # If the next tag of this name is a closing tag, the element is
        # already properly closed; rewriting it would orphan that </name>.
        follower = next_same_name[name].search(source, match.end())
        if follower is not None and follower.group(1):
            return match.group(0)
        return f"<{name}{attrs}/>"

    xml_string = open_tag.sub(self_close, source)

    # 3. Bare ampersands. A numeric reference must be complete ("&#233;",
    #    "&#xE9;"); a dangling "&#" is escaped like any other stray "&".
    xml_string = re.sub(
        r'&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)',
        '&amp;',
        xml_string,
    )

    # 4. Declaration. Nothing may precede it, so strip a leading BOM and
    #    whitespace. Requiring whitespace after "<?xml" avoids mistaking
    #    <?xml-stylesheet ...?> for the declaration.
    xml_string = xml_string.lstrip("\ufeff").lstrip()
    if not re.match(r'<\?xml\s', xml_string):
        xml_string = xml_decl + xml_string
    return xml_string
