apply_template: rewrite con renumeración de imágenes. Parte del template, copia imágenes source renombradas desde max_tmpl+1, evita colisiones

2026-05-04 22:06:49 +02:00
parent 55ca3e1625
commit 69e993e5cf
1 changed files with 292 additions and 103 deletions
@@ -1,20 +1,51 @@
 #!/usr/bin/env python3
 """
 Aplica plantilla portada.docx a un ENERGY REPORT.
-Estrategia: partir del DOCX ORIGINAL (que tiene todas sus imágenes y relaciones intactas)
-y reemplazar solo los primeros hijos del body (portada+disclaimer+índice del original)
-por los del template. La contraportada del template se añade al final.
-
-Así las imágenes del contenido original mantienen sus relaciones intactas.
+Estrategia:
+  1. Partir del DOCX TEMPLATE como base.
+  2. Copiar imágenes del DOCX ORIGINAL al template, renombrándolas
+     desde el último número de imagen del template +1.
+  3. Actualizar las referencias a imágenes en document.xml y
+     document.xml.rels para que apunten a los nuevos nombres.
 """
-import sys, os, shutil, copy, zipfile, re
+
+import sys, os, zipfile, re, copy
 from lxml import etree

-w = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
+# Espacios de nombres OOXML
+w  = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
+r  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
+wp = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
+a  = 'http://schemas.openxmlformats.org/drawingml/2006/main'
+pic = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
+rel_type_image = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'

-def get_xml(path):
-    with zipfile.ZipFile(path, 'r') as z:
-        return z.read('word/document.xml')
+# ======================================================================
+# Funciones auxiliares
+# ======================================================================
+
+def parse_xml(content):  # Parse an in-memory XML document and return its root element.
+    return etree.fromstring(content)
+
+def read_zip_entry(z, path):  # Return the raw content of entry `path` read from the open zip `z`.
+    return z.read(path)
+
+def write_zip_entry(zout, name, content):  # Store `content` as entry `name` in `zout` (presumably a ZipFile opened for writing).
+    zout.writestr(name, content)
+
+def get_image_number(filename):
+    """Return N from the first 'image<N>.' in filename (e.g. 'word/media/image42.png' -> 42), or 0 if absent."""
+    m = re.search(r'image(\d+)\.', filename)
+    return int(m.group(1)) if m else 0
+
+def find_max_image_id(z):
+    """Return the highest image number among ALL entry names of zip z (not only word/media/), or 0 if none match."""
+    max_id = 0
+    for name in z.namelist():
+        n = get_image_number(name)
+        if n > max_id:
+            max_id = n
+    return max_id

 def find_content_start(children):
    """Encuentra primer título de contenido real (después del índice)."""
@@ -25,139 +56,297 @@ def find_content_start(children):
            sval = style.get(f'{{{w}}}val') if style is not None else ''
            texts = child.findall(f'.//{{{w}}}t')
            text = ''.join(t.text or '' for t in texts)
-            if sval == 'Title2Index': found_toc = True; continue
+            if sval == 'Title2Index':
+                found_toc = True
+                continue
            if found_toc and sval == 'Title1' and text and (text[0].isdigit() or text[0] in 'IVX'):
-                if '. ' in text[:6] or text[-1].isdigit(): return i
+                if '. ' in text[:6] or text[-1].isdigit():
+                    return i
+    # Fallback
    for i, child in enumerate(children):
        if child.tag == f'{{{w}}}p':
            style = child.find(f'.//{{{w}}}pStyle')
            sval = style.get(f'{{{w}}}val') if style is not None else ''
            texts = child.findall(f'.//{{{w}}}t')
            text = ''.join(t.text or '' for t in texts)
-            if sval == 'Title1' and text and text[0].isdigit() and '. ' in text[:6]: return i
+            if sval == 'Title1' and text and text[0].isdigit() and '. ' in text[:6]:
+                return i
    return 69

+
+def collect_image_refs(xml_root):
+    """
+    Collect every embedded-image reference in xml_root (the element itself included).
+
+    Only DrawingML ``a:blip`` elements carrying a non-empty ``r:embed``
+    attribute are reported; blips without that attribute are skipped.
+
+    Returns a list of ``(blip_element, rId)`` tuples in document order.
+    """
+    embed_attr = f'{{{r}}}embed'
+    return [
+        (blip, blip.get(embed_attr))
+        for blip in xml_root.iter(f'{{{a}}}blip')
+        if blip.get(embed_attr)
+    ]
+
+
+# ======================================================================
+# Función principal
+# ======================================================================
+
 def replace_content(template_path, source_docx_path, output_path):
-    tmpl_xml = etree.fromstring(get_xml(template_path))
-    src_xml = etree.fromstring(get_xml(source_docx_path))
-    
+    """
+    Apply the template to the source document, write the result to output_path and return that path.
+
+    1. Open template and source as zip archives; parse document.xml and document.xml.rels of both.
+    2. Detect boundaries: end of the template index, start of the template back cover, start of the source content.
+    3. Merge bodies: template cover+index, then source content (sectPr elements skipped), then template back cover.
+    4. Give the source images new file names and rIds; only image relationships are carried over from the source.
+    5. Write the new DOCX using the template package as the base.
+    """
+    z_tmpl = zipfile.ZipFile(template_path, 'r')
+    z_src  = zipfile.ZipFile(source_docx_path, 'r')
+
+    # ---- Leer XMLs ----
+    tmpl_xml = parse_xml(read_zip_entry(z_tmpl, 'word/document.xml'))
+    src_xml  = parse_xml(read_zip_entry(z_src,  'word/document.xml'))
+    tmpl_rel = parse_xml(read_zip_entry(z_tmpl, 'word/_rels/document.xml.rels'))
+    src_rel  = parse_xml(read_zip_entry(z_src,  'word/_rels/document.xml.rels'))
+
    body_tmpl = tmpl_xml.find(f'{{{w}}}body')
-    body_src = src_xml.find(f'{{{w}}}body')
-    
+    body_src  = src_xml.find(f'{{{w}}}body')
+
    children_tmpl = list(body_tmpl)
-    children_src = list(body_src)
-    
-    # ===== DETECTAR LÍMITES =====
-    # Template
+    children_src  = list(body_src)
+
+    # ---- Detectar límites en el template ----
+    # Índice
    tmpl_idx_end = 36
    for i, child in enumerate(children_tmpl):
        if child.tag == f'{{{w}}}p':
            style = child.find(f'.//{{{w}}}pStyle')
            sval = style.get(f'{{{w}}}val') if style is not None else ''
-            if sval == 'TableContentEnd': tmpl_idx_end = i
-            elif sval == 'Ttulo1' and i > tmpl_idx_end: break
-    if tmpl_idx_end < 10: tmpl_idx_end = 36
-    
+            if sval == 'TableContentEnd':
+                tmpl_idx_end = i
+            elif sval == 'Ttulo1' and i > tmpl_idx_end:
+                break
+    if tmpl_idx_end < 10:
+        tmpl_idx_end = 36
+
+    # Contraportada
    tmpl_back = 47
    for i, child in enumerate(children_tmpl):
        if child.tag == f'{{{w}}}p':
            texts = child.findall(f'.//{{{w}}}t')
-            if 'RENOVABLES 360' in ''.join(t.text or '' for t in texts): tmpl_back = i; break
-    
-    # Original: dónde empieza el contenido real
+            if 'RENOVABLES 360' in ''.join(t.text or '' for t in texts):
+                tmpl_back = i
+                break
+
+    # ---- Detectar dónde empieza el contenido real en el source ----
    src_start = find_content_start(children_src)
-    
-    print(f"  Template: índice h. {tmpl_idx_end}, contraportada h. {tmpl_back}")
+
+    print(f"  Template: índice termina en hijo {tmpl_idx_end}, contraportada en hijo {tmpl_back}")
    print(f"  Original: contenido real empieza en hijo {src_start}")
-    
-    # ===== ESTRATEGIA: PARTIR DEL ORIGINAL, REEMPLAZAR PORTADA + AÑADIR CONTRAPORTADA =====
-    # Construir nuevo body:
-    # 1. Portada + Disclaimer + Índice del TEMPLATE
-    # 2. Contenido real del ORIGINAL (desde src_start, sin sectPr)
-    # 3. Contraportada del TEMPLATE
-    
-    for child in list(body_tmpl): body_tmpl.remove(child)
-    
+
+    # ---- Renombrar imágenes del source ----
+    max_tmpl_img = find_max_image_id(z_tmpl)
+    print(f"  Max imagen en template: {max_tmpl_img}")
+
+    # Mapa: old_rel_path -> new_rel_path  (ej: media/image1.png -> media/image5.png)
+    image_rename_map = {}
+    # Mapa: old_rId -> new_rId (para actualizar relaciones)
+    rid_rename_map = {}
+
+    # 1. Construir un set de rIds que YA existen en el template
+    existing_tmpl_rids = set()
+    for rel in tmpl_rel:
+        rid = rel.get('Id')
+        if rid:
+            existing_tmpl_rids.add(rid)
+
+    # 2. Identificar rIds de imágenes en el source
+    src_rids = {}
+    for rel in src_rel:
+        rid = rel.get('Id')
+        target = rel.get('Target', '').replace('\\', '/')
+        rel_type = rel.get('Type', '')
+        if 'image' in rel_type:
+            src_rids[rid] = target
+
+    print(f"  Imágenes en source: {len(src_rids)}")
+
+    # 3. Generar nuevos nombres y rIds para las imágenes del source
+    #    Debemos evitar colisiones: ni con imágenes del template ni entre imágenes
+    #    del source después de renombrar.
+    new_rid_counter = 1
+    all_old_new_rids = {}  # old_rId -> new_rId
+
+    # Pre-calcular: set de nombres de imagen que ya existen en el template
+    existing_tmpl_media = set()
+    for name in z_tmpl.namelist():
+        if name.startswith('word/media/') and get_image_number(name) > 0:
+            existing_tmpl_media.add(name)
+
+    # Pre-calcular: los nuevos nombres que vamos generando (para evitar colisiones internas)
+    generated_new_names = set()
+
+    # Ordenar por old_num ascendente para consistencia
+    src_items = []
+    for old_rid, rel_target in src_rids.items():
+        old_rel_path = rel_target
+        if old_rel_path.startswith('..'):
+            old_rel_path = old_rel_path.replace('../', '')
+        old_abs = f'word/{old_rel_path}' if not old_rel_path.startswith('word/') else old_rel_path
+        old_num = get_image_number(old_abs) if old_abs.startswith('word/media/') else 0
+        src_items.append((old_num, old_rid, old_abs))
+    src_items.sort()
+
+    for old_num, old_rid, old_abs in src_items:
+        if not old_abs.startswith('word/media/'):
+            continue
+
+        ext = old_abs.rsplit('.', 1)[1]
+
+        # Buscar el primer número disponible:
+        # empezar desde max_tmpl_img + old_num, pero si colisiona, incrementar
+        candidate_num = max_tmpl_img + old_num
+        candidate_abs = f'word/media/image{candidate_num}.{ext}'
+
+        while candidate_abs in existing_tmpl_media or candidate_abs in generated_new_names:
+            candidate_num += 1
+            candidate_abs = f'word/media/image{candidate_num}.{ext}'
+
+        new_abs = candidate_abs
+        new_num = candidate_num
+        image_rename_map[old_abs] = new_abs
+        generated_new_names.add(new_abs)
+
+        # Generar nuevo rId (evitar colisión con template)
+        new_rid = f'rIdImage{new_num}'
+        while new_rid in existing_tmpl_rids:
+            new_rid_counter += 1
+            new_rid = f'rIdImage{new_num}_{new_rid_counter}'
+
+        all_old_new_rids[old_rid] = new_rid
+        print(f"    {old_abs} -> {new_abs} (rId: {old_rid} -> {new_rid})")
+
+    # ---- Fusionar bodies ----
+    # Vaciar body del template y reconstruirlo:
+    #   [portada+disclaimer+índice del template]
+    #   + [contenido del source (desde src_start)]
+    #   + [contraportada del template]
+
+    for child in list(body_tmpl):
+        body_tmpl.remove(child)
+
+    # Portada + índice del template
    for child in children_tmpl[:tmpl_idx_end + 1]:
        body_tmpl.append(copy.deepcopy(child))
+
+    # Contenido del source (sin sectPr)
    for child in children_src[src_start:]:
        if child.tag != f'{{{w}}}sectPr':
            body_tmpl.append(copy.deepcopy(child))
+
+    # Contraportada del template
    for child in children_tmpl[tmpl_back:]:
        body_tmpl.append(copy.deepcopy(child))
-    
-    # ===== COPIAR ARCHIVOS =====
-    # Partir del DOCX ORIGINAL (imágenes y relaciones del contenido intactas)
-    with zipfile.ZipFile(source_docx_path, 'r') as z:
-        out_data = {item.filename: z.read(item.filename) for item in z.infolist()}
-    
-    # Añadir archivos del template que no están en el original
-    with zipfile.ZipFile(template_path, 'r') as z:
-        for item in z.infolist():
-            fname = item.filename
-            if fname not in out_data:
-                out_data[fname] = z.read(fname)
-            elif 'media/' in fname:
-                # Las imágenes del template se añaden con sufijo _tmpl para no colisionar
-                base, ext = fname.rsplit('.', 1)
-                new_fname = f"{base}_tmpl.{ext}"
-                if new_fname not in out_data:
-                    out_data[new_fname] = z.read(fname)
-    
-    # Reemplazar document.xml
-    out_data['word/document.xml'] = etree.tostring(tmpl_xml, xml_declaration=True, encoding='UTF-8', standalone=True)
-    
-    # ===== ACTUALIZAR RELACIONES =====
-    # Las imágenes del template ahora tienen _tmpl en el nombre
-    # Necesito actualizar las relaciones del template para que apunten a _tmpl
-    
-    rels_path = 'word/_rels/document.xml.rels'
-    if rels_path in out_data:
-        rels_xml = etree.fromstring(out_data[rels_path])
-        for rel in rels_xml:
-            target = rel.get('Target', '')
-            # Las relaciones de imágenes del template que se sobrescribieron
-            if target.startswith('media/') and not target.startswith('media/image'):
-                pass  # no hay conflictos con nombres no-numéricos
-            # Las imágenes numéricas del template están sobrescritas por las del original
-            # Pero nosotros las copiamos como _tmpl, así que hay que actualizar las relaciones
-            # SÓLO si la imagen original fue sobrescrita
-            m = re.match(r'media/(image\d+)\.(\w+)', target)
-            if m:
-                img_name = m.group(1)
-                ext = m.group(2)
-                # Verificar si esta imagen existe en el original
-                orig_path = f'word/media/{img_name}.{ext}'
-                if orig_path not in dict([(i.filename, None) for i in zipfile.ZipFile(source_docx_path, 'r').infolist() if not hasattr(i, 'filename')]):
-                    pass  # No fue sobrescrita
-                # Es más fácil: simplemente cambiar todas las referencias a imágenes
-                # del template que colisionan a la versión _tmpl
-                new_target = f'media/{img_name}_tmpl.{ext}'
-                # Solo cambiamos si existe la versión _tmpl
-                if f'word/{new_target}' in out_data:
-                    rel.set('Target', new_target)
-        
-        out_data[rels_path] = etree.tostring(rels_xml, xml_declaration=True, encoding='UTF-8', standalone=True)
-    
-    # Escribir
+
+    # ---- Remap image rIds ONLY in children copied from the source (rIds are per-document; template nodes keep theirs) ----
+    src_end = len(body_tmpl) - len(children_tmpl[tmpl_back:])
+    for node in list(body_tmpl)[len(children_tmpl[:tmpl_idx_end + 1]):src_end]:
+        for blip, old_rid in collect_image_refs(node):
+            blip.set(f'{{{r}}}embed', all_old_new_rids.get(old_rid, old_rid))
+
+    # ---- Construir zip de salida ----
+    out_data = {}
+
+    # 1. Copiar todo el template (base)
+    for item in z_tmpl.infolist():
+        out_data[item.filename] = z_tmpl.read(item.filename)
+
+    # 2. Copy every source image into the output under its renamed path.
+    #    The template's media are already in out_data (step 1). Each new name
+    #    was checked against existing_tmpl_media and generated_new_names when
+    #    it was chosen, so a plain copy suffices even when the OLD name also
+    #    exists in the template; no template image is renamed here.
+    #    NOTE(review): z_src.read raises KeyError if the source archive lacks the part.
+    for old_abs, new_abs in image_rename_map.items():
+        out_data[new_abs] = z_src.read(old_abs)
+
+    # 3. Actualizar document.xml.rels: añadir nuevas relaciones de imagen
+    rel_root = parse_xml(out_data['word/_rels/document.xml.rels'])
+
+    for old_rid, new_rid in all_old_new_rids.items():
+        # Buscar la relación original en el source
+        for rel in src_rel:
+            if rel.get('Id') == old_rid:
+                target = rel.get('Target', '')
+                # Actualizar target al nuevo nombre de imagen
+                old_target_abs = target.replace('../', '')
+                if not old_target_abs.startswith('word/'):
+                    old_target_abs = f'word/{old_target_abs}'
+                new_target_abs = image_rename_map.get(old_target_abs, old_target_abs)
+                # Convertir a ruta relativa desde word/
+                if new_target_abs.startswith('word/'):
+                    new_target = new_target_abs[5:]
+                else:
+                    new_target = new_target_abs
+                # Crear nueva relación
+                new_rel = copy.deepcopy(rel)
+                new_rel.set('Id', new_rid)
+                new_rel.set('Target', new_target)
+                rel_root.append(new_rel)
+                break
+
+    out_data['word/_rels/document.xml.rels'] = etree.tostring(
+        rel_root, xml_declaration=True, encoding='UTF-8', standalone=True)
+
+    # 4. Actualizar document.xml
+    out_data['word/document.xml'] = etree.tostring(
+        tmpl_xml, xml_declaration=True, encoding='UTF-8', standalone=True)
+
+    # ---- Escribir ----
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zout:
        for fname, content in out_data.items():
            zout.writestr(fname, content)
-    
+
+    z_tmpl.close()
+    z_src.close()
+
+    print(f"  ✅ Convertido: {output_path}")
    return output_path

+
+# ======================================================================
+# Entry point
+# ======================================================================
+
 if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Uso: apply_template.py <documento.docx> <plantilla.docx>")
+        print("")
+        print("  Salida: <documento>_r360mx.docx (en el mismo directorio)")
        sys.exit(1)
-    docx_path = sys.argv[1]; template_path = sys.argv[2]
-    base_dir = os.path.dirname(docx_path)
+
+    docx_path = sys.argv[1]
+    template_path = sys.argv[2]
+
+    if not os.path.exists(docx_path):
+        print(f"❌ No existe: {docx_path}")
+        sys.exit(1)
+    if not os.path.exists(template_path):
+        print(f"❌ No existe: {template_path}")
+        sys.exit(1)
+
+    base_dir = os.path.dirname(os.path.abspath(docx_path))
    base_name = os.path.splitext(os.path.basename(docx_path))[0]
    output_path = os.path.join(base_dir, f"{base_name}_r360mx.docx")
+
    print(f"📄 Template: {template_path}")
    print(f"📄 Documento: {docx_path}")
-    print(f"📄 Salida: {output_path}")
-    replace_content(template_path, docx_path, output_path)
-    print(f"✅ Convertido: {output_path}")
-    import subprocess
-    subprocess.Popen(['nextcloudcmd', '--non-interactive', '--user', 'JavierBrana', '--password', '%5qJuIrZ^eoq3rFYU$OpuV2aM', '/home/javi/Nextcloud', 'https://cloud.r360mx.com'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    print(f"📄 Salida:    {output_path}")
+    print()
+
+    replace_content(template_path, docx_path, output_path)