#!/usr/bin/env python3
"""
|
|
apply_template.py v2 - Conversión de ENERGY REPORT a formato R360MX.
|
|
|
|
Correcciones respecto a v1:
|
|
- Las imágenes del template NUNCA se sobrescriben (se parte del template y
|
|
las imágenes del source se renombran con numeración que evita colisiones).
|
|
- Los estilos de párrafo del source (p.ej. "Title1") se mapean a los estilos
|
|
del template ("Título 1").
|
|
"""
|
|
|
|
import sys, os, zipfile, re, copy
|
|
from lxml import etree
|
|
|
|
# XML namespace URIs; the code below interpolates them into "{uri}tag" names.
# WordprocessingML main namespace (w:p, w:pPr, w:pStyle, w:t, w:body, ...).
w = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
# Relationships namespace (the r:embed attribute read from blips).
r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
# DrawingML main namespace (a:blip elements).
a = 'http://schemas.openxmlformats.org/drawingml/2006/main'
|
|
|
|
# ======================================================================
# Style mapping: source -> template
# ======================================================================
# Keys are paragraph style values found in the source document; values are
# the style values written in their place.
STYLE_MAP = {
    'Title1': 'Título 1',
    'Title2Index': 'Title2Index',  # keep unchanged
    'TableContentEnd': 'TableContentEnd',
}
|
|
|
|
|
|
def parse_xml(content):
    """Parse a serialized XML payload with lxml and return its root element."""
    root = etree.fromstring(content)
    return root
|
|
|
|
|
|
def get_image_number(filename):
    """Return the number embedded in an ``image<N>.`` file name.

    The first occurrence of ``image`` followed by digits and a dot is used.
    Names that do not contain that pattern yield 0.
    """
    match = re.search(r'image(\d+)\.', filename)
    if match is None:
        return 0
    return int(match.group(1))
|
|
|
|
|
|
def find_max_image_id(z):
    """Return the highest ``image<N>`` number among the archive's entry names.

    ``z`` only needs a ``namelist()`` method. Every entry name is inspected,
    not only those under a media folder. The result is 0 when no entry name
    carries an image number.
    """
    return max((get_image_number(entry) for entry in z.namelist()), default=0)
|
|
|
|
|
|
def find_content_start(children):
    """Find the first real content heading (the one after the index).

    ``children`` is the list of body-level elements. Two passes are made over
    its paragraphs:

    1. Once a ``Title2Index`` paragraph has been seen, return the position of
       the first ``Title1`` paragraph whose text starts with a digit or one
       of ``I``/``V``/``X`` and either contains ``'. '`` within its first six
       characters or ends with a digit.
    2. Otherwise return the position of the first ``Title1`` paragraph whose
       text starts with a digit and contains ``'. '`` within its first six
       characters.

    If neither pass matches, 69 is returned.
    NOTE(review): 69 looks like a position tuned to one specific source
    document - confirm before relying on it.
    """
    paragraph_tag = f'{{{w}}}p'
    style_path = f'.//{{{w}}}pStyle'
    text_path = f'.//{{{w}}}t'
    val_attr = f'{{{w}}}val'

    def describe(node):
        # Style value ('' when the paragraph has no pStyle) and joined text.
        marker = node.find(style_path)
        style_id = '' if marker is None else marker.get(val_attr)
        content = ''.join(piece.text or '' for piece in node.findall(text_path))
        return style_id, content

    paragraphs = [
        (position,) + describe(node)
        for position, node in enumerate(children)
        if node.tag == paragraph_tag
    ]

    # Pass 1: numbered heading located after the table of contents.
    seen_toc = False
    for position, style_id, content in paragraphs:
        if style_id == 'Title2Index':
            seen_toc = True
            continue
        if not (seen_toc and style_id == 'Title1' and content):
            continue
        first_char = content[0]
        if not (first_char.isdigit() or first_char in 'IVX'):
            continue
        if '. ' in content[:6] or content[-1].isdigit():
            return position

    # Pass 2: any heading shaped like "1. Something", regardless of the index.
    return next(
        (
            position
            for position, style_id, content in paragraphs
            if style_id == 'Title1'
            and content
            and content[0].isdigit()
            and '. ' in content[:6]
        ),
        69,
    )
|
|
|
|
|
|
def remap_styles(xml_root, style_map):
    """Rewrite paragraph styles in place according to ``style_map``.

    Every ``w:p`` under ``xml_root`` is visited. Only a ``w:pStyle`` that is
    a direct child of the paragraph's own ``w:pPr`` is considered. When its
    value is a key of ``style_map`` and the mapped value is truthy, the value
    is replaced.

    Returns the number of paragraphs whose style was set. Entries that map a
    style to itself are counted too.
    """
    paragraph_tag = f'{{{w}}}p'
    props_tag = f'{{{w}}}pPr'
    style_tag = f'{{{w}}}pStyle'
    val_attr = f'{{{w}}}val'

    remapped = 0
    for paragraph in xml_root.iter(paragraph_tag):
        props = paragraph.find(props_tag)
        marker = props.find(style_tag) if props is not None else None
        if marker is None:
            continue
        # Missing keys and empty replacements both come back falsy.
        replacement = style_map.get(marker.get(val_attr))
        if replacement:
            marker.set(val_attr, replacement)
            remapped += 1
    return remapped
|
|
|
|
|
|
def collect_image_refs(xml_root):
    """Return ``(blip, rId)`` pairs for every ``a:blip`` with an ``r:embed``.

    Blips without the attribute, or with an empty value, are left out.
    """
    embed_attr = f'{{{r}}}embed'
    candidates = (
        (node, node.get(embed_attr)) for node in xml_root.iter(f'{{{a}}}blip')
    )
    return [(node, rel_id) for node, rel_id in candidates if rel_id]
|
|
|
|
|
|
def _template_boundaries(children_tmpl):
    """Return ``(idx_end, back_start)`` for the template body children.

    ``idx_end`` is the position of the last ``TableContentEnd`` paragraph seen
    before the first ``Título 1`` paragraph that follows it.
    ``back_start`` is the position of the first paragraph whose text contains
    ``'RENOVABLES 360'``.

    NOTE(review): 36, 10 and 47 are hard-coded fallbacks/thresholds that look
    tuned to one specific template - confirm before reusing other templates.
    """
    paragraph_tag = f'{{{w}}}p'

    idx_end = 36
    for i, child in enumerate(children_tmpl):
        if child.tag != paragraph_tag:
            continue
        style = child.find(f'.//{{{w}}}pStyle')
        sval = style.get(f'{{{w}}}val') if style is not None else ''
        if sval == 'TableContentEnd':
            idx_end = i
        elif sval == 'Título 1' and i > idx_end:
            break
    if idx_end < 10:
        idx_end = 36

    back_start = 47
    for i, child in enumerate(children_tmpl):
        if child.tag != paragraph_tag:
            continue
        text = ''.join(t.text or '' for t in child.findall(f'.//{{{w}}}t'))
        if 'RENOVABLES 360' in text:
            back_start = i
            break
    return idx_end, back_start


def _plan_image_renames(src_rel, src_names, tmpl_media, tmpl_rids, max_tmpl_img):
    """Choose collision-free part names and rIds for the source images.

    Returns ``(image_rename_map, rid_rename_map, new_rels)``:
    old part name -> new part name, old rId -> new rId, and ready-to-append
    copies of the source image relationships carrying the new Id and Target.
    """
    items = []
    for rel in src_rel:
        rid = rel.get('Id')
        if not rid or 'image' not in rel.get('Type', ''):
            continue
        target = rel.get('Target', '').replace('\\', '/')
        # Targets may be relative ("media/x.png", "../media/x.png") or
        # package-absolute ("/word/media/x.png").
        rel_path = target.replace('../', '').lstrip('/')
        old_abs = rel_path if rel_path.startswith('word/') else f'word/{rel_path}'
        items.append((get_image_number(old_abs), rid, old_abs, rel))
    # Sort on the plain values only; the element itself is not orderable.
    items.sort(key=lambda item: item[:3])

    image_rename_map = {}
    rid_rename_map = {}
    new_rels = []
    taken_media = set(tmpl_media)
    taken_rids = set(tmpl_rids)

    for old_num, old_rid, old_abs, rel in items:
        if old_abs not in src_names:
            # External or dangling target: there is no part to copy.
            print(f" ⚠ Imagen no encontrada en el source, se omite: {old_abs}")
            continue

        candidate = max_tmpl_img + old_num
        new_abs = image_rename_map.get(old_abs)
        if new_abs is None:
            # Keep the original extension, tolerating names without one.
            _, dot, ext = old_abs.rsplit('/', 1)[-1].rpartition('.')
            suffix = f'.{ext}' if dot else ''
            new_abs = f'word/media/image{candidate}{suffix}'
            while new_abs in taken_media:
                candidate += 1
                new_abs = f'word/media/image{candidate}{suffix}'
            taken_media.add(new_abs)
            image_rename_map[old_abs] = new_abs
            print(f" {old_abs} -> {new_abs}")

        # The rId must be unused among ALL template relationships (styles,
        # headers, ...), not only among images; otherwise the blip would end
        # up pointing at an unrelated part.
        rid_num = candidate
        while f'rId{rid_num}' in taken_rids:
            rid_num += 1
        new_rid = f'rId{rid_num}'
        taken_rids.add(new_rid)
        rid_rename_map[old_rid] = new_rid

        new_rel = copy.deepcopy(rel)
        new_rel.set('Id', new_rid)
        # Relationship targets are relative to the word/ folder.
        new_rel.set('Target', new_abs[len('word/'):])
        new_rels.append(new_rel)

    return image_rename_map, rid_rename_map, new_rels


def _merge_image_content_types(tmpl_ct_bytes, src_ct_bytes, new_media_names):
    """Declare in the template's content types any extension the new images need.

    Missing ``Default`` entries are copied from the source package. The
    template bytes are returned untouched when nothing has to be added.
    """
    default_tag = (
        '{http://schemas.openxmlformats.org/package/2006/content-types}Default'
    )
    tmpl_ct = parse_xml(tmpl_ct_bytes)
    src_ct = parse_xml(src_ct_bytes)
    declared = {
        d.get('Extension', '').lower() for d in tmpl_ct.iter(default_tag)
    }
    src_defaults = {
        d.get('Extension', '').lower(): d for d in src_ct.iter(default_tag)
    }
    needed = sorted({
        name.rsplit('.', 1)[-1].lower()
        for name in new_media_names
        if '.' in name.rsplit('/', 1)[-1]
    })

    added = False
    for ext in needed:
        if ext in declared or ext not in src_defaults:
            continue
        tmpl_ct.insert(0, copy.deepcopy(src_defaults[ext]))
        declared.add(ext)
        added = True
    if not added:
        return tmpl_ct_bytes
    return etree.tostring(
        tmpl_ct, xml_declaration=True, encoding='UTF-8', standalone=True)


def replace_content(template_path, source_docx_path, output_path):
    """Build ``output_path`` from the template shell and the source content.

    The output body is:
    template children up to the end of its index
    + source children from its first real heading (styles remapped through
      ``STYLE_MAP``, body-level ``sectPr`` dropped)
    + template children from its back cover onwards.

    Every part of the template package is copied as is. Source images are
    added under new names and new relationship ids so that nothing in the
    template is overwritten.

    Only image relationships are carried over from the source.
    NOTE(review): other source relationships (hyperlinks, etc.) referenced
    from the copied content are not migrated.

    Returns ``output_path``.
    """
    with zipfile.ZipFile(template_path, 'r') as z_tmpl, \
            zipfile.ZipFile(source_docx_path, 'r') as z_src:
        # ---- Read XML parts ----
        tmpl_xml = parse_xml(z_tmpl.read('word/document.xml'))
        src_xml = parse_xml(z_src.read('word/document.xml'))
        tmpl_rel = parse_xml(z_tmpl.read('word/_rels/document.xml.rels'))
        src_rel = parse_xml(z_src.read('word/_rels/document.xml.rels'))

        body_tmpl = tmpl_xml.find(f'{{{w}}}body')
        body_src = src_xml.find(f'{{{w}}}body')
        children_tmpl = list(body_tmpl)
        children_src = list(body_src)

        # The content start must be located BEFORE remapping: detection keys
        # on the source style 'Title1', which remap_styles rewrites in place.
        src_start = find_content_start(children_src)

        # ---- Remap styles in the source XML (before merging) ----
        style_changes = remap_styles(src_xml, STYLE_MAP)
        print(f" Estilos reasignados: {style_changes}")

        # ---- Detect boundaries ----
        tmpl_idx_end, tmpl_back = _template_boundaries(children_tmpl)
        print(f" Template: índice h. {tmpl_idx_end}, contraportada h. {tmpl_back}")
        print(f" Source: contenido real empieza en hijo {src_start}")

        # ---- Rename the source images ----
        max_tmpl_img = find_max_image_id(z_tmpl)
        print(f" Max imagen en template: {max_tmpl_img}")

        src_names = set(z_src.namelist())
        existing_tmpl_media = {
            name for name in z_tmpl.namelist() if name.startswith('word/media/')
        }
        existing_rids = {rel.get('Id') for rel in tmpl_rel if rel.get('Id')}
        image_rename_map, rid_rename_map, new_rels = _plan_image_renames(
            src_rel, src_names, existing_tmpl_media, existing_rids, max_tmpl_img)

        # ---- Merge bodies ----
        for child in list(body_tmpl):
            body_tmpl.remove(child)

        for child in children_tmpl[:tmpl_idx_end + 1]:
            body_tmpl.append(copy.deepcopy(child))

        embed_attr = f'{{{r}}}embed'
        for child in children_src[src_start:]:
            if child.tag == f'{{{w}}}sectPr':
                continue
            clone = copy.deepcopy(child)
            # Rewrite rIds only inside source content: template blips share
            # the same rId namespace and must keep pointing at their own
            # images.
            for blip, old_rid in collect_image_refs(clone):
                if old_rid in rid_rename_map:
                    blip.set(embed_attr, rid_rename_map[old_rid])
            body_tmpl.append(clone)

        for child in children_tmpl[tmpl_back:]:
            body_tmpl.append(copy.deepcopy(child))

        # ---- Build the output package in memory ----
        # 1. Start from the template (its images are never touched).
        out_data = {
            item.filename: z_tmpl.read(item.filename)
            for item in z_tmpl.infolist()
        }

        # 2. Add the renamed source images.
        for old_abs, new_abs in image_rename_map.items():
            out_data[new_abs] = z_src.read(old_abs)

        # 3. Add the relationships for those images.
        for new_rel in new_rels:
            tmpl_rel.append(new_rel)

        # 4. Make sure every new image extension has a declared content type.
        ct_name = '[Content_Types].xml'
        if image_rename_map and ct_name in out_data and ct_name in src_names:
            out_data[ct_name] = _merge_image_content_types(
                out_data[ct_name], z_src.read(ct_name),
                image_rename_map.values())

        out_data['word/_rels/document.xml.rels'] = etree.tostring(
            tmpl_rel, xml_declaration=True, encoding='UTF-8', standalone=True)
        out_data['word/document.xml'] = etree.tostring(
            tmpl_xml, xml_declaration=True, encoding='UTF-8', standalone=True)

    # ---- Write (inputs are already closed at this point) ----
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zout:
        for fname, content in out_data.items():
            zout.writestr(fname, content)

    print(f" ✅ Convertido: {output_path}")
    return output_path
|
|
|
|
|
|
if __name__ == "__main__":
    # Two positional arguments are required: the document, then the template.
    if len(sys.argv) < 3:
        print("Uso: apply_template.py <documento.docx> <plantilla.docx>")
        sys.exit(1)

    docx_path, template_path = sys.argv[1], sys.argv[2]

    # The result is written next to the input document, with a suffixed name.
    target_dir = os.path.dirname(os.path.abspath(docx_path))
    stem, _ = os.path.splitext(os.path.basename(docx_path))
    output_path = os.path.join(target_dir, f"{stem}_r360mx.docx")

    print(f"📄 Template: {template_path}")
    print(f"📄 Documento: {docx_path}")
    print(f"📄 Salida: {output_path}")
    print()

    replace_content(template_path, docx_path, output_path)