#!/usr/bin/env python3
"""
apply_template.py v2 - Convert an ENERGY REPORT document to the R360MX format.

Fixes with respect to v1:
- Template images are NEVER overwritten (the output starts from the template
  and the source images are renamed with numbering that avoids collisions).
- Source paragraph styles (e.g. "Title1") are mapped to the template styles
  ("Título 1").
"""
import copy
import os
import re
import sys
import zipfile

from lxml import etree

# XML namespaces used in the document part and its relationships.
w = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
a = 'http://schemas.openxmlformats.org/drawingml/2006/main'

_P_TAG = f'{{{w}}}p'
_SECT_PR_TAG = f'{{{w}}}sectPr'

# Fallback body-child indices used when the structural markers are not found.
DEFAULT_TMPL_INDEX_END = 36     # last template child kept before the content
MIN_TMPL_INDEX_END = 10         # a detected index end below this is distrusted
DEFAULT_TMPL_BACK_COVER = 47    # first template child of the back cover
DEFAULT_SRC_CONTENT_START = 69  # first source child of the real content

# ======================================================================
# Style mapping: source -> template
# ======================================================================
STYLE_MAP = {
    'Title1': 'Título 1',
    'Title2Index': 'Title2Index',  # keep as is
    'TableContentEnd': 'TableContentEnd',
}


def parse_xml(content):
    """Parse XML bytes/str and return the root element."""
    return etree.fromstring(content)


def get_image_number(filename):
    """Return N for a name containing ``imageN.``; 0 when there is no match."""
    m = re.search(r'image(\d+)\.', filename)
    return int(m.group(1)) if m else 0


def find_max_image_id(z):
    """Return the highest ``imageN`` number among all names in zip *z* (0 if none)."""
    return max((get_image_number(name) for name in z.namelist()), default=0)


def _paragraph_style_and_text(paragraph):
    """Return (style id, concatenated w:t text) of a w:p element."""
    style = paragraph.find(f'.//{{{w}}}pStyle')
    sval = style.get(f'{{{w}}}val') if style is not None else ''
    text = ''.join(t.text or '' for t in paragraph.findall(f'.//{{{w}}}t'))
    return sval, text


def find_content_start(children):
    """Find the first real content heading (after the index).

    First pass: after a ``Title2Index`` paragraph has been seen, return the
    index of the first ``Title1`` paragraph whose text starts with a digit or
    one of ``I``/``V``/``X`` and either has ``'. '`` within its first six
    characters or ends with a digit.
    Second pass: return the first ``Title1`` paragraph starting with a digit
    and having ``'. '`` within its first six characters.
    Otherwise return ``DEFAULT_SRC_CONTENT_START``.

    The styles checked are the *source* style ids, so this must be called
    before ``remap_styles`` rewrites them.
    """
    found_toc = False
    for i, child in enumerate(children):
        if child.tag != _P_TAG:
            continue
        sval, text = _paragraph_style_and_text(child)
        if sval == 'Title2Index':
            found_toc = True
            continue
        if found_toc and sval == 'Title1' and text and (text[0].isdigit() or text[0] in 'IVX'):
            if '. ' in text[:6] or text[-1].isdigit():
                return i

    for i, child in enumerate(children):
        if child.tag != _P_TAG:
            continue
        sval, text = _paragraph_style_and_text(child)
        if sval == 'Title1' and text and text[0].isdigit() and '. ' in text[:6]:
            return i

    return DEFAULT_SRC_CONTENT_START


def remap_styles(xml_root, style_map):
    """Rewrite paragraph style ids in place according to *style_map*.

    Only the ``w:pStyle`` directly under each paragraph's ``w:pPr`` is
    considered. Returns the number of paragraphs whose style was found in the
    map with a non-empty replacement (identity mappings are counted too).
    """
    changes = 0
    for p in xml_root.iter(_P_TAG):
        pPr = p.find(f'{{{w}}}pPr')
        if pPr is None:
            continue
        pStyle = pPr.find(f'{{{w}}}pStyle')
        if pStyle is None:
            continue
        new_val = style_map.get(pStyle.get(f'{{{w}}}val'))
        if new_val:
            pStyle.set(f'{{{w}}}val', new_val)
            changes += 1
    return changes


def collect_image_refs(xml_root):
    """Return ``[(blip, rId)]`` for every ``a:blip`` carrying ``r:embed``."""
    blips = []
    for blip in xml_root.iter(f'{{{a}}}blip'):
        rid = blip.get(f'{{{r}}}embed')
        if rid:
            blips.append((blip, rid))
    return blips


def _find_template_bounds(children_tmpl):
    """Return (index of last front-matter child, index of first back-cover child)."""
    tmpl_idx_end = DEFAULT_TMPL_INDEX_END
    for i, child in enumerate(children_tmpl):
        if child.tag != _P_TAG:
            continue
        sval, _ = _paragraph_style_and_text(child)
        if sval == 'TableContentEnd':
            tmpl_idx_end = i
        elif sval == 'Título 1' and i > tmpl_idx_end:
            break
    if tmpl_idx_end < MIN_TMPL_INDEX_END:
        tmpl_idx_end = DEFAULT_TMPL_INDEX_END

    # NOTE(review): the first paragraph containing this text is taken as the
    # back cover; if the cover page also contains it, the slices would
    # overlap — confirm against the real template.
    tmpl_back = DEFAULT_TMPL_BACK_COVER
    for i, child in enumerate(children_tmpl):
        if child.tag != _P_TAG:
            continue
        _, text = _paragraph_style_and_text(child)
        if 'RENOVABLES 360' in text:
            tmpl_back = i
            break
    return tmpl_idx_end, tmpl_back


def _source_image_relationships(src_rel):
    """Return ``[(image number, rId, archive path, rel element)]`` sorted by the first three."""
    items = []
    for rel in src_rel:
        if 'image' not in rel.get('Type', ''):
            continue
        target = rel.get('Target', '').replace('\\', '/')
        rel_path = target.replace('../', '')
        old_abs = rel_path if rel_path.startswith('word/') else f'word/{rel_path}'
        items.append((get_image_number(old_abs), rel.get('Id'), old_abs, rel))
    # Sort on the comparable fields only (XML elements are not orderable).
    items.sort(key=lambda item: item[:3])
    return items


def replace_content(template_path, source_docx_path, output_path):
    """Build *output_path* from the template, replacing its body content.

    The output body is: template children up to the end of the index, then
    the source children from the detected content start (section properties
    excluded, styles remapped through ``STYLE_MAP``), then the template
    children from the back cover onwards. Every other part is copied from the
    template; source images are added under new, non-colliding names and
    relationship ids.

    Only images referenced via ``a:blip``/``r:embed`` are carried over.
    Returns *output_path*.
    """
    with zipfile.ZipFile(template_path, 'r') as z_tmpl, \
            zipfile.ZipFile(source_docx_path, 'r') as z_src:
        # ---- Read the XML parts ----
        tmpl_xml = parse_xml(z_tmpl.read('word/document.xml'))
        src_xml = parse_xml(z_src.read('word/document.xml'))
        tmpl_rel = parse_xml(z_tmpl.read('word/_rels/document.xml.rels'))
        src_rel = parse_xml(z_src.read('word/_rels/document.xml.rels'))

        body_tmpl = tmpl_xml.find(f'{{{w}}}body')
        body_src = src_xml.find(f'{{{w}}}body')
        children_tmpl = list(body_tmpl)
        children_src = list(body_src)

        # ---- Detect boundaries ----
        # The source start is located BEFORE remapping: find_content_start
        # matches the source style id 'Title1', which the remap rewrites.
        src_start = find_content_start(children_src)
        tmpl_idx_end, tmpl_back = _find_template_bounds(children_tmpl)

        # ---- Remap styles in the source XML (before merging) ----
        style_changes = remap_styles(src_xml, STYLE_MAP)
        print(f" Estilos reasignados: {style_changes}")

        print(f" Template: índice h. {tmpl_idx_end}, contraportada h. {tmpl_back}")
        print(f" Source: contenido real empieza en hijo {src_start}")

        # ---- Rename source images ----
        max_tmpl_img = find_max_image_id(z_tmpl)
        print(f" Max imagen en template: {max_tmpl_img}")

        tmpl_names = z_tmpl.namelist()
        # Images already present in the template (never touched).
        existing_tmpl_media = {n for n in tmpl_names if n.startswith('word/media/')}
        used_numbers = {get_image_number(n) for n in existing_tmpl_media}
        used_rids = {rel.get('Id') for rel in tmpl_rel}

        image_rename_map = {}  # old archive path -> new archive path
        rid_rename_map = {}    # old rId -> new rId
        new_rels = []          # (new rId, new archive path, source rel element)

        for old_num, old_rid, old_abs, rel in _source_image_relationships(src_rel):
            ext = os.path.splitext(old_abs)[1]
            candidate = max_tmpl_img + old_num
            # The same number names both the file and the rId, so it must be
            # free as an image number, as a media path and as a relationship id.
            while (candidate in used_numbers
                   or f'rId{candidate}' in used_rids
                   or f'word/media/image{candidate}{ext}' in existing_tmpl_media):
                candidate += 1
            used_numbers.add(candidate)
            new_rid = f'rId{candidate}'
            used_rids.add(new_rid)
            # A file referenced by several relationships is copied only once.
            new_abs = image_rename_map.setdefault(
                old_abs, f'word/media/image{candidate}{ext}')
            rid_rename_map[old_rid] = new_rid
            new_rels.append((new_rid, new_abs, rel))
            print(f" {old_abs} -> {new_abs}")

        # ---- Merge bodies ----
        # Empty the template body and rebuild it as:
        #   [template cover + disclaimer + index]
        #   + [source content (from src_start, styles remapped)]
        #   + [template back cover]
        for child in list(body_tmpl):
            body_tmpl.remove(child)
        for child in children_tmpl[:tmpl_idx_end + 1]:
            body_tmpl.append(copy.deepcopy(child))
        for child in children_src[src_start:]:
            if child.tag == _SECT_PR_TAG:
                continue
            clone = copy.deepcopy(child)
            # Re-point image references only inside source content; template
            # blips keep their own ids even if they coincide with a source rId.
            for blip, old_rid in collect_image_refs(clone):
                if old_rid in rid_rename_map:
                    blip.set(f'{{{r}}}embed', rid_rename_map[old_rid])
            body_tmpl.append(clone)
        for child in children_tmpl[tmpl_back:]:
            body_tmpl.append(copy.deepcopy(child))

        # ---- Build the output archive contents ----
        # 1. Start from the template (its images are NEVER touched).
        out_data = {name: z_tmpl.read(name) for name in tmpl_names}

        # 2. Add the renamed source images.
        # NOTE(review): [Content_Types].xml is copied from the template as is;
        # a source image extension it does not declare may need adding — confirm.
        for old_abs, new_abs in image_rename_map.items():
            out_data[new_abs] = z_src.read(old_abs)

        # 3. Add the relationships for the source images.
        for new_rid, new_abs, rel in new_rels:
            new_rel = copy.deepcopy(rel)
            new_rel.set('Id', new_rid)
            # Targets are relative to the word/ folder.
            new_rel.set(
                'Target',
                new_abs[len('word/'):] if new_abs.startswith('word/') else new_abs)
            tmpl_rel.append(new_rel)

        out_data['word/_rels/document.xml.rels'] = etree.tostring(
            tmpl_rel, xml_declaration=True, encoding='UTF-8', standalone=True)
        out_data['word/document.xml'] = etree.tostring(
            tmpl_xml, xml_declaration=True, encoding='UTF-8', standalone=True)

    # ---- Write (inputs are closed by now; everything is in memory) ----
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zout:
        for fname, content in out_data.items():
            zout.writestr(fname, content)

    print(f" ✅ Convertido: {output_path}")
    return output_path


def main(argv=None):
    """Command-line entry point: ``apply_template.py <document> <template>``."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        print("Uso: apply_template.py <documento.docx> <template.docx>")
        sys.exit(1)

    docx_path = argv[1]
    template_path = argv[2]

    # The output is written next to the source document.
    base_dir = os.path.dirname(os.path.abspath(docx_path))
    base_name = os.path.splitext(os.path.basename(docx_path))[0]
    output_path = os.path.join(base_dir, f"{base_name}_r360mx.docx")

    print(f"📄 Template: {template_path}")
    print(f"📄 Documento: {docx_path}")
    print(f"📄 Salida: {output_path}")
    print()

    replace_content(template_path, docx_path, output_path)


if __name__ == "__main__":
    main()