apply_template v2: fix sobrescritura imágenes template + mapeo estilos Title1->Título 1

This commit is contained in:
Javier Braña
2026-05-05 01:42:09 +02:00
parent 69e993e5cf
commit d162b7667a
+92 -158
View File
@@ -1,45 +1,41 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Aplica plantilla portada.docx a un ENERGY REPORT. apply_template.py v2 - Conversión de ENERGY REPORT a formato R360MX.
Estrategia:
1. Partir del DOCX TEMPLATE como base. Correcciones respecto a v1:
2. Copiar imágenes del DOCX ORIGINAL al template, renombrándolas - Las imágenes del template NUNCA se sobrescriben (se parte del template y
desde el último número de imagen del template +1. las imágenes del source se renombran con numeración que evita colisiones).
3. Actualizar las referencias a imágenes en document.xml y - Los estilos de párrafo del source (p.ej. "Title1") se mapean a los estilos
document.xml.rels para que apunten a los nuevos nombres. del template ("Título 1").
""" """
import sys, os, zipfile, re, copy import sys, os, zipfile, re, copy
from lxml import etree from lxml import etree
# OOXML XML namespace URIs, used to build Clark-notation tags such as
# f'{{{w}}}p' when searching the document tree.

# WordprocessingML main namespace (w:p, w:pPr, w:pStyle, w:body, ...).
w = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
# Relationship attribute namespace (r:embed on a:blip).
r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
# DrawingML wordprocessing-drawing namespace.
wp = 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
# DrawingML main namespace (a:blip).
a = 'http://schemas.openxmlformats.org/drawingml/2006/main'
# DrawingML picture namespace.
pic = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
# Relationship "Type" URI that identifies an image part.
rel_type_image = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
# ====================================================================== # ======================================================================
# Funciones auxiliares # Mapeo de estilos: source -> template
# ====================================================================== # ======================================================================
# Paragraph-style translation table passed to remap_styles():
# key = w:pStyle value found in the source document,
# value = w:pStyle value to write so the paragraph uses the template's style.
# NOTE(review): w:pStyle/@w:val normally holds a style *ID*, not the display
# name; Word-generated IDs usually drop spaces and non-ASCII characters
# (earlier code compared against 'Ttulo1'). Confirm that 'Título 1' really is
# the styleId declared in the template's styles.xml.
STYLE_MAP = {
    'Title1': 'Título 1',
    'Title2Index': 'Title2Index', # keep unchanged
    'TableContentEnd': 'TableContentEnd',
}
def parse_xml(content):
    """Parse serialized XML and return the root element.

    Args:
        content: XML document as accepted by ``etree.fromstring``.

    Returns:
        The root element produced by ``etree.fromstring``.
    """
    root = etree.fromstring(content)
    return root
def read_zip_entry(z, path):
    """Return the raw contents of the archive member ``path``.

    Args:
        z: Open archive object exposing ``read(name)``.
        path: Member name inside the archive.

    Returns:
        Whatever ``z.read(path)`` returns for that member.
    """
    payload = z.read(path)
    return payload
def write_zip_entry(zout, name, content):
    """Store ``content`` in the output archive under the member name ``name``.

    Args:
        zout: Writable archive object exposing ``writestr(name, data)``.
        name: Member name to create inside the archive.
        content: Data to store for that member.
    """
    # Thin wrapper: delegates directly to the archive's writestr().
    zout.writestr(name, content)
def get_image_number(filename):
    """Extract the number from an image name such as 'word/media/image42.png'.

    Args:
        filename: Path or file name to inspect.

    Returns:
        The integer following the first occurrence of ``image`` that is
        directly followed by digits and a dot, or 0 when there is no such
        occurrence.
    """
    match = re.search(r'image(\d+)\.', filename)
    if not match:
        return 0
    return int(match.group(1))
def find_max_image_id(z): def find_max_image_id(z):
"""Encuentra el número de imagen más alto dentro del zip."""
max_id = 0 max_id = 0
for name in z.namelist(): for name in z.namelist():
n = get_image_number(name) n = get_image_number(name)
@@ -47,6 +43,7 @@ def find_max_image_id(z):
max_id = n max_id = n
return max_id return max_id
def find_content_start(children): def find_content_start(children):
"""Encuentra primer título de contenido real (después del índice).""" """Encuentra primer título de contenido real (después del índice)."""
found_toc = False found_toc = False
@@ -62,7 +59,6 @@ def find_content_start(children):
if found_toc and sval == 'Title1' and text and (text[0].isdigit() or text[0] in 'IVX'): if found_toc and sval == 'Title1' and text and (text[0].isdigit() or text[0] in 'IVX'):
if '. ' in text[:6] or text[-1].isdigit(): if '. ' in text[:6] or text[-1].isdigit():
return i return i
# Fallback
for i, child in enumerate(children): for i, child in enumerate(children):
if child.tag == f'{{{w}}}p': if child.tag == f'{{{w}}}p':
style = child.find(f'.//{{{w}}}pStyle') style = child.find(f'.//{{{w}}}pStyle')
@@ -74,16 +70,31 @@ def find_content_start(children):
return 69 return 69
def remap_styles(xml_root, style_map):
    """Rewrite paragraph style references according to ``style_map``.

    Visits every ``w:p`` element yielded by ``xml_root.iter`` and, when the
    paragraph's direct ``w:pPr`` child contains a ``w:pStyle`` whose ``w:val``
    is a key of ``style_map``, replaces that value with the mapped one. The
    tree is modified in place.

    Args:
        xml_root: Element to scan (must support ``iter`` and ``find``).
        style_map: Mapping of source style value -> template style value.
            Entries that map to an empty value, or to the value already
            present, leave the paragraph untouched.

    Returns:
        Number of paragraphs whose style value was actually changed.
    """
    if not style_map:
        return 0

    p_tag = f'{{{w}}}p'
    ppr_tag = f'{{{w}}}pPr'
    pstyle_tag = f'{{{w}}}pStyle'
    val_attr = f'{{{w}}}val'

    changes = 0
    for paragraph in xml_root.iter(p_tag):
        props = paragraph.find(ppr_tag)
        if props is None:
            continue
        style_el = props.find(pstyle_tag)
        if style_el is None:
            continue
        old_val = style_el.get(val_attr)
        if old_val not in style_map:
            continue
        new_val = style_map[old_val]
        # Identity entries ("keep as is") and empty targets are not
        # reassignments; counting them would inflate the reported total.
        if not new_val or new_val == old_val:
            continue
        style_el.set(val_attr, new_val)
        changes += 1
    return changes
def collect_image_refs(xml_root): def collect_image_refs(xml_root):
""" """Encuentra todos los a:blip con r:embed."""
Escanea el XML y encuentra TODAS las referencias a imágenes:
- wp:extent (tamaño)
- a:blip r:embed (relación)
- pic:blipFill (relleno de imagen)
Devuelve lista de (elemento_blip, rId).
"""
blips = [] blips = []
nsmap = {'a': a, 'r': r, 'pic': pic, 'wp': wp}
for blip in xml_root.iter(f'{{{a}}}blip'): for blip in xml_root.iter(f'{{{a}}}blip'):
rid = blip.get(f'{{{r}}}embed') rid = blip.get(f'{{{r}}}embed')
if rid: if rid:
@@ -91,37 +102,26 @@ def collect_image_refs(xml_root):
return blips return blips
# ======================================================================
# Función principal
# ======================================================================
def replace_content(template_path, source_docx_path, output_path): def replace_content(template_path, source_docx_path, output_path):
"""
Aplica el template al documento fuente y escribe el resultado.
1. Abre template y fuente como zips.
2. Detecta límites en ambos XML (portada, índice, contraportada).
3. Fusiona los bodies: portada+índice del template + contenido del source + contraportada del template.
4. Renombra imágenes del source y actualiza referencias.
5. Escribe el nuevo DOCX.
"""
z_tmpl = zipfile.ZipFile(template_path, 'r') z_tmpl = zipfile.ZipFile(template_path, 'r')
z_src = zipfile.ZipFile(source_docx_path, 'r') z_src = zipfile.ZipFile(source_docx_path, 'r')
# ---- Leer XMLs ---- # ---- Leer XMLs ----
tmpl_xml = parse_xml(read_zip_entry(z_tmpl, 'word/document.xml')) tmpl_xml = parse_xml(z_tmpl.read('word/document.xml'))
src_xml = parse_xml(read_zip_entry(z_src, 'word/document.xml')) src_xml = parse_xml(z_src.read('word/document.xml'))
tmpl_rel = parse_xml(read_zip_entry(z_tmpl, 'word/_rels/document.xml.rels')) tmpl_rel = parse_xml(z_tmpl.read('word/_rels/document.xml.rels'))
src_rel = parse_xml(read_zip_entry(z_src, 'word/_rels/document.xml.rels')) src_rel = parse_xml(z_src.read('word/_rels/document.xml.rels'))
body_tmpl = tmpl_xml.find(f'{{{w}}}body') body_tmpl = tmpl_xml.find(f'{{{w}}}body')
body_src = src_xml.find(f'{{{w}}}body') body_src = src_xml.find(f'{{{w}}}body')
children_tmpl = list(body_tmpl) children_tmpl = list(body_tmpl)
children_src = list(body_src) children_src = list(body_src)
# ---- Detectar límites en el template ---- # ---- Remapear estilos en el XML del source (antes de fusionar) ----
# Índice style_changes = remap_styles(src_xml, STYLE_MAP)
print(f" Estilos reasignados: {style_changes}")
# ---- Detectar límites ----
tmpl_idx_end = 36 tmpl_idx_end = 36
for i, child in enumerate(children_tmpl): for i, child in enumerate(children_tmpl):
if child.tag == f'{{{w}}}p': if child.tag == f'{{{w}}}p':
@@ -129,12 +129,11 @@ def replace_content(template_path, source_docx_path, output_path):
sval = style.get(f'{{{w}}}val') if style is not None else '' sval = style.get(f'{{{w}}}val') if style is not None else ''
if sval == 'TableContentEnd': if sval == 'TableContentEnd':
tmpl_idx_end = i tmpl_idx_end = i
elif sval == 'Ttulo1' and i > tmpl_idx_end: elif sval == 'Título 1' and i > tmpl_idx_end:
break break
if tmpl_idx_end < 10: if tmpl_idx_end < 10:
tmpl_idx_end = 36 tmpl_idx_end = 36
# Contraportada
tmpl_back = 47 tmpl_back = 47
for i, child in enumerate(children_tmpl): for i, child in enumerate(children_tmpl):
if child.tag == f'{{{w}}}p': if child.tag == f'{{{w}}}p':
@@ -143,29 +142,24 @@ def replace_content(template_path, source_docx_path, output_path):
tmpl_back = i tmpl_back = i
break break
# ---- Detectar dónde empieza el contenido real en el source ----
src_start = find_content_start(children_src) src_start = find_content_start(children_src)
print(f" Template: índice h. {tmpl_idx_end}, contraportada h. {tmpl_back}")
print(f" Template: índice termina en hijo {tmpl_idx_end}, contraportada en hijo {tmpl_back}") print(f" Source: contenido real empieza en hijo {src_start}")
print(f" Original: contenido real empieza en hijo {src_start}")
# ---- Renombrar imágenes del source ---- # ---- Renombrar imágenes del source ----
max_tmpl_img = find_max_image_id(z_tmpl) max_tmpl_img = find_max_image_id(z_tmpl)
print(f" Max imagen en template: {max_tmpl_img}") print(f" Max imagen en template: {max_tmpl_img}")
# Mapa: old_rel_path -> new_rel_path (ej: media/image1.png -> media/image5.png) # Imágenes que ya existen en el template (no se tocan)
image_rename_map = {} existing_tmpl_media = set()
# Mapa: old_rId -> new_rId (para actualizar relaciones) for name in z_tmpl.namelist():
rid_rename_map = {} if name.startswith('word/media/'):
existing_tmpl_media.add(name)
# 1. Construir un set de rIds que YA existen en el template image_rename_map = {} # old_abs -> new_abs
existing_tmpl_rids = set() rid_rename_map = {} # old_rId -> new_rId
for rel in tmpl_rel:
rid = rel.get('Id')
if rid:
existing_tmpl_rids.add(rid)
# 2. Identificar rIds de imágenes en el source # Identificar rIds de imágenes en el source
src_rids = {} src_rids = {}
for rel in src_rel: for rel in src_rel:
rid = rel.get('Id') rid = rel.get('Id')
@@ -174,136 +168,91 @@ def replace_content(template_path, source_docx_path, output_path):
if 'image' in rel_type: if 'image' in rel_type:
src_rids[rid] = target src_rids[rid] = target
print(f" Imágenes en source: {len(src_rids)}") # Generar nuevos nombres SIN colisionar con template
generated = set()
# 3. Generar nuevos nombres y rIds para las imágenes del source
# Debemos evitar colisiones: ni con imágenes del template ni entre imágenes
# del source después de renombrar.
new_rid_counter = 1
all_old_new_rids = {} # old_rId -> new_rId
# Pre-calcular: set de nombres de imagen que ya existen en el template
existing_tmpl_media = set()
for name in z_tmpl.namelist():
if name.startswith('word/media/') and get_image_number(name) > 0:
existing_tmpl_media.add(name)
# Pre-calcular: los nuevos nombres que vamos generando (para evitar colisiones internas)
generated_new_names = set()
# Ordenar por old_num ascendente para consistencia
src_items = [] src_items = []
for old_rid, rel_target in src_rids.items(): for old_rid, rel_target in src_rids.items():
old_rel_path = rel_target rel_path = rel_target.replace('../', '')
if old_rel_path.startswith('..'): old_abs = f'word/{rel_path}' if not rel_path.startswith('word/') else rel_path
old_rel_path = old_rel_path.replace('../', '') old_num = get_image_number(old_abs)
old_abs = f'word/{old_rel_path}' if not old_rel_path.startswith('word/') else old_rel_path
old_num = get_image_number(old_abs) if old_abs.startswith('word/media/') else 0
src_items.append((old_num, old_rid, old_abs)) src_items.append((old_num, old_rid, old_abs))
src_items.sort() src_items.sort()
for old_num, old_rid, old_abs in src_items: for old_num, old_rid, old_abs in src_items:
if not old_abs.startswith('word/media/'):
continue
ext = old_abs.rsplit('.', 1)[1] ext = old_abs.rsplit('.', 1)[1]
candidate = max_tmpl_img + old_num
# Buscar el primer número disponible: new_abs = f'word/media/image{candidate}.{ext}'
# empezar desde max_tmpl_img + old_num, pero si colisiona, incrementar while new_abs in existing_tmpl_media or new_abs in generated:
candidate_num = max_tmpl_img + old_num candidate += 1
candidate_abs = f'word/media/image{candidate_num}.{ext}' new_abs = f'word/media/image{candidate}.{ext}'
while candidate_abs in existing_tmpl_media or candidate_abs in generated_new_names:
candidate_num += 1
candidate_abs = f'word/media/image{candidate_num}.{ext}'
new_abs = candidate_abs
new_num = candidate_num
image_rename_map[old_abs] = new_abs image_rename_map[old_abs] = new_abs
generated_new_names.add(new_abs) generated.add(new_abs)
rid_rename_map[old_rid] = f'rId{candidate}'
# Generar nuevo rId (evitar colisión con template) print(f" {old_abs} -> {new_abs}")
new_rid = f'rIdImage{new_num}'
while new_rid in existing_tmpl_rids:
new_rid_counter += 1
new_rid = f'rIdImage{new_num}_{new_rid_counter}'
all_old_new_rids[old_rid] = new_rid
print(f" {old_abs} -> {new_abs} (rId: {old_rid} -> {new_rid})")
# ---- Fusionar bodies ---- # ---- Fusionar bodies ----
# Vaciar body del template y reconstruirlo: # Vaciar template y reconstruir:
# [portada+disclaimer+índice del template] # [portada+disclaimer+índice del template]
# + [contenido del source (desde src_start)] # + [contenido del source (desde src_start, con estilos remapeados)]
# + [contraportada del template] # + [contraportada del template]
for child in list(body_tmpl): for child in list(body_tmpl):
body_tmpl.remove(child) body_tmpl.remove(child)
# Portada + índice del template
for child in children_tmpl[:tmpl_idx_end + 1]: for child in children_tmpl[:tmpl_idx_end + 1]:
body_tmpl.append(copy.deepcopy(child)) body_tmpl.append(copy.deepcopy(child))
# Contenido del source (sin sectPr)
for child in children_src[src_start:]: for child in children_src[src_start:]:
if child.tag != f'{{{w}}}sectPr': if child.tag != f'{{{w}}}sectPr':
body_tmpl.append(copy.deepcopy(child)) body_tmpl.append(copy.deepcopy(child))
# Contraportada del template
for child in children_tmpl[tmpl_back:]: for child in children_tmpl[tmpl_back:]:
body_tmpl.append(copy.deepcopy(child)) body_tmpl.append(copy.deepcopy(child))
# ---- Actualizar referencias de imagen en document.xml ---- # ---- Actualizar rIds en document.xml ----
# Buscar todos los a:blip y actualizar rId
for blip, old_rid in collect_image_refs(tmpl_xml): for blip, old_rid in collect_image_refs(tmpl_xml):
if old_rid in all_old_new_rids: if old_rid in rid_rename_map:
blip.set(f'{{{r}}}embed', all_old_new_rids[old_rid]) blip.set(f'{{{r}}}embed', rid_rename_map[old_rid])
# ---- Construir zip de salida ---- # ---- Construir zip de salida ----
out_data = {} out_data = {}
# 1. Copiar todo el template (base) # 1. PARTIR DEL TEMPLATE (sus imágenes NUNCA se tocan)
for item in z_tmpl.infolist(): for item in z_tmpl.infolist():
out_data[item.filename] = z_tmpl.read(item.filename) out_data[item.filename] = z_tmpl.read(item.filename)
# 2. Copiar imágenes renombradas del source # 2. Añadir imágenes del source renombradas
for old_abs, new_abs in image_rename_map.items(): for old_abs, new_abs in image_rename_map.items():
if old_abs in out_data: content = z_src.read(old_abs)
# La imagen del template tiene el mismo nombre -> la renombramos out_data[new_abs] = content
out_data[new_abs] = z_src.read(old_abs)
else:
# No hay conflicto, copiamos directamente
out_data[new_abs] = z_src.read(old_abs)
# 3. Actualizar document.xml.rels: añadir nuevas relaciones de imagen # 3. Añadir relaciones de imágenes del source
rel_root = parse_xml(out_data['word/_rels/document.xml.rels']) rel_root = parse_xml(out_data['word/_rels/document.xml.rels'])
existing_rids = set()
for rel in rel_root:
rid = rel.get('Id')
if rid:
existing_rids.add(rid)
for old_rid, new_rid in all_old_new_rids.items(): for old_rid, new_rid in rid_rename_map.items():
# Buscar la relación original en el source if new_rid in existing_rids:
continue
for rel in src_rel: for rel in src_rel:
if rel.get('Id') == old_rid: if rel.get('Id') == old_rid:
target = rel.get('Target', '') target = rel.get('Target', '')
# Actualizar target al nuevo nombre de imagen
old_target_abs = target.replace('../', '') old_target_abs = target.replace('../', '')
if not old_target_abs.startswith('word/'): if not old_target_abs.startswith('word/'):
old_target_abs = f'word/{old_target_abs}' old_target_abs = f'word/{old_target_abs}'
new_target_abs = image_rename_map.get(old_target_abs, old_target_abs) new_target_abs = image_rename_map.get(old_target_abs, old_target_abs)
# Convertir a ruta relativa desde word/ new_target = new_target_abs[5:] if new_target_abs.startswith('word/') else new_target_abs
if new_target_abs.startswith('word/'):
new_target = new_target_abs[5:]
else:
new_target = new_target_abs
# Crear nueva relación
new_rel = copy.deepcopy(rel) new_rel = copy.deepcopy(rel)
new_rel.set('Id', new_rid) new_rel.set('Id', new_rid)
new_rel.set('Target', new_target) new_rel.set('Target', new_target)
rel_root.append(new_rel) rel_root.append(new_rel)
existing_rids.add(new_rid)
break break
out_data['word/_rels/document.xml.rels'] = etree.tostring( out_data['word/_rels/document.xml.rels'] = etree.tostring(
rel_root, xml_declaration=True, encoding='UTF-8', standalone=True) rel_root, xml_declaration=True, encoding='UTF-8', standalone=True)
# 4. Actualizar document.xml
out_data['word/document.xml'] = etree.tostring( out_data['word/document.xml'] = etree.tostring(
tmpl_xml, xml_declaration=True, encoding='UTF-8', standalone=True) tmpl_xml, xml_declaration=True, encoding='UTF-8', standalone=True)
@@ -314,32 +263,17 @@ def replace_content(template_path, source_docx_path, output_path):
z_tmpl.close() z_tmpl.close()
z_src.close() z_src.close()
print(f" ✅ Convertido: {output_path}") print(f" ✅ Convertido: {output_path}")
return output_path return output_path
# ======================================================================
# Entry point
# ======================================================================
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) < 3: if len(sys.argv) < 3:
print("Uso: apply_template.py <documento.docx> <plantilla.docx>") print("Uso: apply_template.py <documento.docx> <plantilla.docx>")
print("")
print(" Salida: <documento>_r360mx.docx (en el mismo directorio)")
sys.exit(1) sys.exit(1)
docx_path = sys.argv[1] docx_path = sys.argv[1]
template_path = sys.argv[2] template_path = sys.argv[2]
if not os.path.exists(docx_path):
print(f"❌ No existe: {docx_path}")
sys.exit(1)
if not os.path.exists(template_path):
print(f"❌ No existe: {template_path}")
sys.exit(1)
base_dir = os.path.dirname(os.path.abspath(docx_path)) base_dir = os.path.dirname(os.path.abspath(docx_path))
base_name = os.path.splitext(os.path.basename(docx_path))[0] base_name = os.path.splitext(os.path.basename(docx_path))[0]
output_path = os.path.join(base_dir, f"{base_name}_r360mx.docx") output_path = os.path.join(base_dir, f"{base_name}_r360mx.docx")