# r360mx-docs-converter/apply_template.py

#!/usr/bin/env python3
"""
apply_template.py v2 - Conversión de ENERGY REPORT a formato R360MX.

Correcciones respecto a v1:
- Las imágenes del template NUNCA se sobrescriben (se parte del template y
  las imágenes del source se renombran con numeración que evita colisiones).
- Los estilos de párrafo del source (p.ej. "Title1") se mapean a los estilos
  del template ("Título 1").
"""

import sys, os, zipfile, re, copy
from lxml import etree

# XML namespace URIs. They are interpolated into Clark-notation names
# ("{uri}localname") for the lxml tag/attribute lookups throughout this file.
# WordprocessingML main namespace (w:body, w:p, w:pPr, w:pStyle, w:t, ...).
w  = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
# Office-document relationships namespace (the r:embed attribute on blips).
r  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
# DrawingML main namespace (the a:blip picture reference element).
a  = 'http://schemas.openxmlformats.org/drawingml/2006/main'

# ======================================================================
# Style mapping: source -> template
# ======================================================================
# Keys are w:pStyle values found in the source document; values are the
# w:pStyle values written in their place by remap_styles().
# NOTE(review): w:pStyle/@w:val holds a style *ID*, not the display name.
# Confirm that 'Título 1' is the actual styleId in the template's styles.xml
# (Word commonly generates IDs without spaces/accents, e.g. 'Ttulo1').
STYLE_MAP = {
    'Title1': 'Título 1',
    'Title2Index': 'Title2Index',  # keep unchanged
    'TableContentEnd': 'TableContentEnd',
}


def parse_xml(content):
    """Parse an XML payload and return its root element.

    Args:
        content: The serialized XML, as read from a zip member.

    Returns:
        The root element produced by ``etree.fromstring``.
    """
    root = etree.fromstring(content)
    return root


def get_image_number(filename):
    """Extract the numeric suffix from an ``imageN.ext`` style name.

    Args:
        filename: A file name or archive path such as
            ``word/media/image12.png``.

    Returns:
        The integer ``N`` of the first ``image<N>.`` occurrence in
        *filename*, or ``0`` when the name does not contain one.
    """
    match = re.search(r'image(\d+)\.', filename)
    if match is None:
        return 0
    return int(match.group(1))


def find_max_image_id(z):
    """Return the highest ``imageN`` number among the archive's member names.

    Args:
        z: An object exposing ``namelist()`` (e.g. an open ``ZipFile``).

    Returns:
        The largest number reported by ``get_image_number`` over all member
        names, or ``0`` when no member carries an image number.
    """
    image_numbers = (get_image_number(member) for member in z.namelist())
    # get_image_number never returns a negative value, so a default of 0
    # matches the "start at zero and only grow" accumulation.
    return max(image_numbers, default=0)


def _paragraph_style_and_text(paragraph):
    """Return ``(style value, concatenated w:t text)`` for a ``w:p`` element."""
    style = paragraph.find(f'.//{{{w}}}pStyle')
    style_val = style.get(f'{{{w}}}val') if style is not None else ''
    text = ''.join(t.text or '' for t in paragraph.findall(f'.//{{{w}}}t'))
    return style_val, text


def find_content_start(children, fallback=69):
    """Find the index of the first real content heading (after the index/TOC).

    The search matches the *source* style values ``'Title2Index'`` and
    ``'Title1'``, so it must run on elements whose styles have not been
    remapped yet.

    Args:
        children: Sequence of the direct children of the source ``w:body``.
        fallback: Index returned when no heading matches. Defaults to 69,
            the value that was previously hard-coded here.

    Returns:
        The position in *children* of the first matching heading, or
        *fallback* if neither pass finds one. Note that a *fallback* past
        the end of *children* makes ``children[result:]`` empty.
    """
    paragraph_tag = f'{{{w}}}p'
    # Inspect every paragraph once; both passes below reuse the result.
    paragraphs = [
        (index, *_paragraph_style_and_text(child))
        for index, child in enumerate(children)
        if child.tag == paragraph_tag
    ]

    # Pass 1: first 'Title1' heading located after a 'Title2Index' paragraph
    # whose text looks numbered (arabic digit or roman I/V/X first char).
    found_toc = False
    for index, style_val, text in paragraphs:
        if style_val == 'Title2Index':
            found_toc = True
            continue
        if not (found_toc and style_val == 'Title1' and text):
            continue
        if not (text[0].isdigit() or text[0] in 'IVX'):
            continue
        if '. ' in text[:6] or text[-1].isdigit():
            return index

    # Pass 2: no index marker helped; accept the first 'Title1' heading that
    # starts with a digit and has ". " within its first six characters.
    for index, style_val, text in paragraphs:
        if style_val == 'Title1' and text and text[0].isdigit() and '. ' in text[:6]:
            return index

    return fallback


def remap_styles(xml_root, style_map):
    """Rewrite paragraph style references according to *style_map*.

    Every ``w:p`` under *xml_root* whose ``w:pPr/w:pStyle`` value is a key
    of *style_map* gets that value replaced, in place, by the mapped one.
    Entries that map to an empty/falsy value are left untouched.

    Args:
        xml_root: Element whose descendant paragraphs are processed.
        style_map: Mapping of source style value -> template style value.

    Returns:
        The number of paragraphs whose style value was rewritten.
    """
    val_attr = f'{{{w}}}val'
    props_tag = f'{{{w}}}pPr'
    style_tag = f'{{{w}}}pStyle'

    rewritten = 0
    for paragraph in xml_root.iter(f'{{{w}}}p'):
        props = paragraph.find(props_tag)
        style_el = props.find(style_tag) if props is not None else None
        if style_el is None:
            continue
        replacement = style_map.get(style_el.get(val_attr))
        if not replacement:
            # Either the style is not mapped or it maps to an empty value.
            continue
        style_el.set(val_attr, replacement)
        rewritten += 1
    return rewritten


def collect_image_refs(xml_root):
    """Collect every ``a:blip`` that carries a non-empty ``r:embed``.

    Args:
        xml_root: Element to search (the element itself and its descendants).

    Returns:
        A list of ``(blip_element, relationship_id)`` tuples in document
        order.
    """
    embed_attr = f'{{{r}}}embed'
    candidates = (
        (blip, blip.get(embed_attr))
        for blip in xml_root.iter(f'{{{a}}}blip')
    )
    return [(blip, rid) for blip, rid in candidates if rid]


def _ensure_image_content_types(tmpl_types_xml, src_types_xml, extensions):
    """Return template ``[Content_Types].xml`` bytes that declare *extensions*.

    Missing ``Default`` entries are copied from the source package's content
    types. Extensions the source does not declare via ``Default`` either are
    left alone. The input bytes are returned unchanged when nothing is added.
    """
    ct_ns = 'http://schemas.openxmlformats.org/package/2006/content-types'
    default_tag = f'{{{ct_ns}}}Default'

    tmpl_types = parse_xml(tmpl_types_xml)
    declared = {d.get('Extension', '').lower() for d in tmpl_types.iter(default_tag)}
    src_defaults = {
        d.get('Extension', '').lower(): d
        for d in parse_xml(src_types_xml).iter(default_tag)
    }

    added = False
    for ext in sorted(extensions):
        if ext in declared or ext not in src_defaults:
            continue
        # Insert at the front so Default entries stay grouped together.
        tmpl_types.insert(0, copy.deepcopy(src_defaults[ext]))
        declared.add(ext)
        added = True

    if not added:
        return tmpl_types_xml
    return etree.tostring(
        tmpl_types, xml_declaration=True, encoding='UTF-8', standalone=True)


def replace_content(template_path, source_docx_path, output_path):
    """Build a document with the template's framing and the source's content.

    The output body is: the template's front matter (up to and including its
    detected index end) + the source's content (from the detected content
    start, with paragraph styles remapped through ``STYLE_MAP``) + the
    template's back matter. Every other part of the package is taken from the
    template. Source images are copied under new names and new relationship
    ids so that template images and relationships are never overwritten.

    Args:
        template_path: Path to the template ``.docx``.
        source_docx_path: Path to the ``.docx`` whose content is inserted.
        output_path: Path where the merged ``.docx`` is written.

    Returns:
        *output_path*.

    Raises:
        KeyError: If an expected part (e.g. ``word/document.xml`` or a
            referenced image) is missing from one of the packages.
        zipfile.BadZipFile: If an input file is not a valid zip archive.
    """
    # Both archives are closed even if any step below raises.
    with zipfile.ZipFile(template_path, 'r') as z_tmpl, \
            zipfile.ZipFile(source_docx_path, 'r') as z_src:
        # ---- Read the XML parts ----
        tmpl_xml = parse_xml(z_tmpl.read('word/document.xml'))
        src_xml = parse_xml(z_src.read('word/document.xml'))
        tmpl_rel = parse_xml(z_tmpl.read('word/_rels/document.xml.rels'))
        src_rel = parse_xml(z_src.read('word/_rels/document.xml.rels'))

        body_tmpl = tmpl_xml.find(f'{{{w}}}body')
        body_src = src_xml.find(f'{{{w}}}body')
        children_tmpl = list(body_tmpl)
        children_src = list(body_src)

        # ---- Locate the source content start BEFORE remapping styles ----
        # find_content_start() matches the source's own 'Title1' style; the
        # remap below rewrites that value in place on these same elements, so
        # running it first would make the search always hit its fallback.
        src_start = find_content_start(children_src)

        # ---- Remap styles in the source XML (before merging) ----
        style_changes = remap_styles(src_xml, STYLE_MAP)
        print(f"  Estilos reasignados: {style_changes}")

        # ---- Detect template boundaries ----
        # End of the template's index: the last 'TableContentEnd' paragraph
        # seen before the first 'Título 1' that follows the current marker.
        # 36 is the built-in default position for the expected template.
        tmpl_idx_end = 36
        for i, child in enumerate(children_tmpl):
            if child.tag == f'{{{w}}}p':
                style = child.find(f'.//{{{w}}}pStyle')
                sval = style.get(f'{{{w}}}val') if style is not None else ''
                if sval == 'TableContentEnd':
                    tmpl_idx_end = i
                elif sval == 'Título 1' and i > tmpl_idx_end:
                    break
        if tmpl_idx_end < 10:
            # An implausibly early marker is ignored in favour of the default.
            tmpl_idx_end = 36

        # Start of the back cover: first paragraph containing the marker
        # text; 47 is the built-in default position.
        tmpl_back = 47
        for i, child in enumerate(children_tmpl):
            if child.tag == f'{{{w}}}p':
                texts = child.findall(f'.//{{{w}}}t')
                if 'RENOVABLES 360' in ''.join(t.text or '' for t in texts):
                    tmpl_back = i
                    break

        print(f"  Template: índice h. {tmpl_idx_end}, contraportada h. {tmpl_back}")
        print(f"  Source: contenido real empieza en hijo {src_start}")

        # ---- Plan new names / relationship ids for the source images ----
        max_tmpl_img = find_max_image_id(z_tmpl)
        print(f"  Max imagen en template: {max_tmpl_img}")

        # Media already present in the template (never touched).
        existing_tmpl_media = {
            name for name in z_tmpl.namelist() if name.startswith('word/media/')
        }
        # Relationship ids already taken in the template (any type, not only
        # images): a new id must never reuse one of these.
        used_rids = {rel.get('Id') for rel in tmpl_rel if rel.get('Id')}

        src_items = []
        for rel in src_rel:
            old_rid = rel.get('Id')
            if not old_rid or 'image' not in rel.get('Type', ''):
                continue
            target = rel.get('Target', '').replace('\\', '/')
            # Targets are relative to word/ ("media/x.png", "../word/...") or
            # package-absolute ("/word/media/x.png"); normalise to a zip name.
            rel_path = target.replace('../', '').lstrip('/')
            old_abs = rel_path if rel_path.startswith('word/') else f'word/{rel_path}'
            src_items.append((get_image_number(old_abs), old_rid, old_abs, rel))
        # Sort on the plain values only; the element itself is not orderable.
        src_items.sort(key=lambda item: item[:3])

        image_rename_map = {}  # old zip name -> new zip name
        rid_rename_map = {}    # old rId -> new rId
        new_rels = []          # Relationship elements to add to the template
        generated = set()
        for old_num, old_rid, old_abs, rel in src_items:
            new_abs = image_rename_map.get(old_abs)
            if new_abs is None:
                ext = os.path.splitext(old_abs)[1]  # includes the dot, or ''
                candidate = max_tmpl_img + old_num
                new_abs = f'word/media/image{candidate}{ext}'
                while new_abs in existing_tmpl_media or new_abs in generated:
                    candidate += 1
                    new_abs = f'word/media/image{candidate}{ext}'
                image_rename_map[old_abs] = new_abs
                generated.add(new_abs)
                print(f"    {old_abs} -> {new_abs}")

            # Pick a relationship id that is free in the template's rels.
            rid_num = max_tmpl_img + old_num
            while f'rId{rid_num}' in used_rids:
                rid_num += 1
            new_rid = f'rId{rid_num}'
            used_rids.add(new_rid)
            rid_rename_map[old_rid] = new_rid

            new_rel = copy.deepcopy(rel)
            new_rel.set('Id', new_rid)
            # Relationship targets are relative to the word/ folder.
            new_rel.set('Target', new_abs[len('word/'):])
            new_rels.append(new_rel)

        # ---- Merge the bodies ----
        # Empty the template body and rebuild it as:
        #   [template cover + disclaimer + index]
        #   + [source content from src_start, styles remapped]
        #   + [template back cover]
        for child in list(body_tmpl):
            body_tmpl.remove(child)

        for child in children_tmpl[:tmpl_idx_end + 1]:
            body_tmpl.append(copy.deepcopy(child))

        embed_attr = f'{{{r}}}embed'
        for child in children_src[src_start:]:
            if child.tag == f'{{{w}}}sectPr':
                continue
            clone = copy.deepcopy(child)
            # Re-point image references ONLY inside source-derived content;
            # template blips keep their own ids even when a source image
            # happened to use the same rId.
            for blip, old_rid in collect_image_refs(clone):
                new_rid = rid_rename_map.get(old_rid)
                if new_rid:
                    blip.set(embed_attr, new_rid)
            body_tmpl.append(clone)

        for child in children_tmpl[tmpl_back:]:
            body_tmpl.append(copy.deepcopy(child))

        # ---- Assemble the output package in memory ----
        # 1. Start from the template (its images are never modified).
        out_data = {
            item.filename: z_tmpl.read(item.filename)
            for item in z_tmpl.infolist()
        }

        # 2. Add the renamed source images.
        for old_abs, new_abs in image_rename_map.items():
            out_data[new_abs] = z_src.read(old_abs)

        # 3. Add the source image relationships under their new ids.
        for new_rel in new_rels:
            tmpl_rel.append(new_rel)

        out_data['word/_rels/document.xml.rels'] = etree.tostring(
            tmpl_rel, xml_declaration=True, encoding='UTF-8', standalone=True)
        out_data['word/document.xml'] = etree.tostring(
            tmpl_xml, xml_declaration=True, encoding='UTF-8', standalone=True)

        # 4. Declare content types for image extensions new to the template.
        new_exts = {
            os.path.splitext(name)[1][1:].lower()
            for name in image_rename_map.values()
        }
        new_exts.discard('')
        types_name = '[Content_Types].xml'
        if new_exts and types_name in out_data and types_name in z_src.namelist():
            out_data[types_name] = _ensure_image_content_types(
                out_data[types_name], z_src.read(types_name), new_exts)

    # ---- Write ----
    # Done after both inputs are closed; everything needed is in memory.
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zout:
        for fname, content in out_data.items():
            zout.writestr(fname, content)

    print(f"  ✅ Convertido: {output_path}")
    return output_path


if __name__ == "__main__":
    # Command-line entry point: <source document> <template>. The result is
    # written next to the source document with an "_r360mx" suffix.
    if len(sys.argv) < 3:
        print("Uso: apply_template.py <documento.docx> <plantilla.docx>")
        sys.exit(1)

    docx_path, template_path = sys.argv[1], sys.argv[2]

    # Output lives in the source document's directory and reuses its stem.
    base_name, _unused_ext = os.path.splitext(os.path.basename(docx_path))
    base_dir = os.path.dirname(os.path.abspath(docx_path))
    output_path = os.path.join(base_dir, f"{base_name}_r360mx.docx")

    print(f"📄 Template: {template_path}")
    print(f"📄 Documento: {docx_path}")
    print(f"📄 Salida:    {output_path}")
    print()

    replace_content(template_path, docx_path, output_path)