Fix: remove source sectPr to prevent Word corruption and integrate SDT filling

Initial commit of converter
2026-06-04 00:38:10 +02:00 · 2026-06-03 19:25:03 +02:00
3 changed files with 206 additions and 57 deletions
@@ -48,11 +48,24 @@ log = logging.getLogger('r360mx')
 # MAPEO DE ESTILOS: source -> template
 # ======================================================================
 DEFAULT_STYLE_MAP = {
-    'Title1': 'Título 1',
+    # Estilos de título principales (RatedPower -> TEMPLATE R360MX)
-    'Title2': 'Título 2',
+    # IMPORTANTE: el template usa 'Ttulo1/2' SIN acento
-    'Title3': 'Título 3',
+    'Title1': 'Ttulo1',
    'Title2': 'Ttulo2',
    'Title3': 'Ttulo3',
    'Title1nfs': 'Title1nfs',
    # Índice / TOC
    'CustomStyleLevelOne': 'TDC1',
    'CustomStyleLevelTwo': 'TDC2',
    'Title2Index': 'Title2Index',
    'TableContentEnd': 'TableContentEnd',
    # Portada
    'CoverSubtitle20': 'CoverSubtitle20',
    # Captions de figuras/tablas en el cuerpo
    'NameTableImg': 'Descripcin',
 }
@@ -64,6 +77,29 @@ def parse_xml(content: bytes) -> etree._Element:
    return etree.fromstring(content)
 def fill_sdt_fields(xml_root: etree._Element, values: dict) -> int:
    """
    Fill SDT (Structured Document Tag) fields with the supplied values.

    Every ``w:sdt`` element in the tree is matched by its alias
    (the ``w:val`` attribute of its ``w:alias`` element). When the alias is
    a key of ``values`` with a truthy value, the first ``w:t`` text node found
    inside a run of the field's ``w:sdtContent`` is overwritten with it.

    Args:
        xml_root: Parsed document tree; it is modified in place.
        values: Mapping of SDT alias -> replacement text. Aliases that are
            missing or mapped to an empty value are left untouched.

    Returns:
        Number of SDT fields whose text was replaced.
    """
    filled = 0
    for sdt in xml_root.iter(q('w:sdt')):
        # In WordprocessingML the alias lives under w:sdtPr, not directly
        # under w:sdt. The direct-child lookup is kept only as a fallback so
        # inputs that worked before keep working.
        sdt_pr = sdt.find(q('w:sdtPr'))
        alias_el = sdt_pr.find(q('w:alias')) if sdt_pr is not None else None
        if alias_el is None:
            alias_el = sdt.find(q('w:alias'))
        if alias_el is None:
            continue

        alias = alias_el.get(q('w:val'))
        if not alias or not values.get(alias):
            continue

        sdt_content = sdt.find(q('w:sdtContent'))
        if sdt_content is None:
            continue

        # Take the first text node across *all* runs: the first run may carry
        # only formatting (w:rPr) and no w:t, which must not abort the fill.
        text_el = next(
            (t
             for run in sdt_content.iter(q('w:r'))
             for t in run.iter(q('w:t'))),
            None,
        )
        if text_el is not None:
            text_el.text = values[alias]
            filled += 1
    return filled
 def q(tag: str) -> str:
    """Convierte 'w:body' a la URL completa con namespace."""
    prefix, local = tag.split(':')
@@ -107,6 +143,26 @@ def collect_image_refs(xml_root: etree._Element) -> list[tuple]:
    return blips
 def _find_section_boundaries(body: etree._Element) -> list[int]:
    """
    Return the child indices of ``body`` at which a section ends.

    A child counts as a boundary when it is itself a ``w:sectPr`` element, or
    when it is a ``w:p`` paragraph whose ``w:pPr`` contains a ``w:sectPr``.
    Each such element marks the END of a section, so the section's content
    sits between one boundary and the previous one.

    Returns:
        Indices of the boundary children, in document order.
    """
    sect_tag = q('w:sectPr')
    para_tag = q('w:p')
    ppr_tag = q('w:pPr')

    def _closes_section(node) -> bool:
        # A bare sectPr directly under the body.
        if node.tag == sect_tag:
            return True
        # Otherwise only paragraphs can carry a section break, via pPr.
        if node.tag != para_tag:
            return False
        props = node.find(ppr_tag)
        return props is not None and props.find(sect_tag) is not None

    return [pos for pos, node in enumerate(body) if _closes_section(node)]
 class DocxError(Exception):
    """Error related to the processing of DOCX documents."""
@@ -124,7 +180,7 @@ class SectionDetector:
    MARKER_STYLES = {
        'indice_fin': 'TableContentEnd',
-        'titulo_contenido': 'Título 1',
+        'titulo_contenido': 'Ttulo1',  # El template usa Ttulo1 (sin acento)
    }
    @staticmethod
@@ -155,7 +211,7 @@ class SectionDetector:
            if child.tag == q('w:p'):
                style_id = get_style_id(child)
                text = get_para_text(child).strip()
-                if style_id == 'Título 1' and text:
+                if style_id in ('Ttulo1', 'Title1', 'Título 1') and text:
                    # Si hay un salto de sección justo antes, ese es el límite
                    for j in range(max(0, i - 3), i):
                        prev_child = children[j]
@@ -200,7 +256,7 @@ class SectionDetector:
        """
        children = list(body)
        found_toc_marker = False
-        best = 69  # fallback conservador
+        best = None
        # 1. Buscar marcador TableContentEnd
        for i, child in enumerate(children):
@@ -220,7 +276,7 @@ class SectionDetector:
                    text = get_para_text(child).strip()
                    # El índice termina justo antes del primer título numerado (1., 2., etc.)
-                    if style_id in ('Title1', 'Título 1') and text:
+                    if style_id in ('Title1', 'Ttulo1', 'Título 1') and text:
                        # Verificar que parece un título de contenido (empieza con número)
                        if re.match(r'^\d+\.?\s', text) or re.match(r'^[IVXLCDM]+\.\s', text):
                            # Si está cerca del principio, ignorar (es el TOC)
@@ -235,9 +291,17 @@ class SectionDetector:
                if pPr is not None:
                    sectPr = pPr.find(q('w:sectPr'))
                    if sectPr is not None:
                        # Después de un salto de sección suele empezar el contenido
                        log.debug("  Salto de sección en source hijo %d", i)
-                        return i + 1 if i + 1 < len(children) else i
+                        candidate = i + 1
                        if candidate < len(children):
                            return candidate
                        break
        # Fallback: si no se encontró nada, devolver la mitad del documento
        # (asumiendo que el índice ocupa ~la primera mitad)
        if best is None:
            best = max(len(children) // 2, 10)
            log.debug("  Fallback: contenido empieza en hijo %d (mitad del doc)", best)
        return best
@@ -428,9 +492,11 @@ def extract_source_title(source_xml: etree._Element) -> tuple[str, str]:
            if child.tag == q('w:p'):
                style_id = get_style_id(child)
                text = get_para_text(child).strip()
-                # Buscar primer título principal
+                # Buscar primer título principal (no el del índice)
-                if style_id in ('Title1', 'Título 1') and text:
+                if style_id in ('Title1', 'Ttulo1', 'Título 1') and text:
-                    # Dividir título y subtítulo si están en el mismo párrafo
+                    # Saltarse títulos que parecen del índice (muy cortos o numéricos genéricos)
                    if re.match(r'^\d+$', text) or text in ('Índice', 'Index', 'Contents', 'Tabla de contenido'):
                        continue
                    lines = text.split('\n')
                    title = lines[0].strip()
                    subtitle = lines[1].strip() if len(lines) > 1 else ""
@@ -447,6 +513,7 @@ def replace_content(
    source_docx_path: str | Path,
    output_path: str | Path,
    style_map: dict | None = None,
    doc_vars: dict | None = None,
 ) -> Path:
    """
    Núcleo de la conversión: fusiona template + source en un solo documento.
@@ -484,6 +551,11 @@ def replace_content(
        if body_src is None:
            raise DocxError("El documento fuente no tiene body")
        # ---- Rellenar campos SDT en la portada ----
        if doc_vars:
            filled = fill_sdt_fields(tmpl_xml, doc_vars)
            log.info("  Campos SDT rellenados: %d", filled)
        children_tmpl = list(body_tmpl)
        children_src = list(body_src)
@@ -491,12 +563,23 @@ def replace_content(
        changes = remap_styles(src_xml, style_map)
        log.info("  Estilos reasignados: %d", changes)
-        # ---- Detectar límites ----
+        # ---- Detectar límites por secciones ----
-        tmpl_idx_end = SectionDetector.find_end_of_preface(body_tmpl)
+        # El template tiene 5 secciones: portada, disclaimer, índice, CONTENIDO, contraportada
-        tmpl_back = SectionDetector.find_back_cover_start(body_tmpl)
+        # Localizamos los sectPr que marcan el final de cada sección
        tmpl_sections = _find_section_boundaries(body_tmpl)
        # La sección 4 (índice 3) es la del contenido a reemplazar
        if len(tmpl_sections) < 4:
            log.warning("  Template tiene %d secciones, se esperaban al menos 4. Usando detección por estilos.", len(tmpl_sections)+1)
            tmpl_idx_end = SectionDetector.find_end_of_preface(body_tmpl)
            tmpl_back = SectionDetector.find_back_cover_start(body_tmpl)
        else:
            log.info("  Template: %d secciones detectadas", len(tmpl_sections)+1)
            tmpl_idx_end = tmpl_sections[2]  # sectPr de sección 3 -> contenido empieza en sección 4
            tmpl_back = tmpl_sections[3]      # sectPr de sección 4 -> contraportada empieza en sección 5
        src_start = SectionDetector.find_content_start(body_src)
-        log.info("  Template: prefacio h. hijo %d, contraportada h. hijo %d", tmpl_idx_end, tmpl_back)
+        log.info("  Template: sección contenido entre hijos %d y %d", tmpl_idx_end+1, tmpl_back)
        log.info("  Source: contenido real empieza en hijo %d", src_start)
        # ---- Extraer título del source ----
@@ -534,18 +617,32 @@ def replace_content(
        for child in list(body_tmpl):
            body_tmpl.remove(child)
-        # Prefacio del template
+        if len(tmpl_sections) >= 4:
-        for child in children_tmpl[:tmpl_idx_end + 1]:
+            # Método por secciones: reemplazar solo la sección 4 (contenido)
-            body_tmpl.append(copy.deepcopy(child))
+            # Secciones 1-3: portada + disclaimer + índice
            sec3_end = tmpl_sections[2]  # sectPr de sección 3
            sec4_end = tmpl_sections[3]  # sectPr de sección 4
-        # Contenido del source (desde src_start, sin sectPr)
+            for child in children_tmpl[:sec3_end + 1]:
        for child in children_src[src_start:]:
            if child.tag != q('w:sectPr'):
                body_tmpl.append(copy.deepcopy(child))
-        # Contraportada del template
+            # Contenido del source (desde src_start, incluimos su sectPr si tiene)
-        for child in children_tmpl[tmpl_back:]:
+            for child in children_src[src_start:]:
-            body_tmpl.append(copy.deepcopy(child))
+                # Incluir sectPr del source para mantener propiedades de página
                body_tmpl.append(copy.deepcopy(child))
            # Sección 5: contraportada (después del sectPr de sección 4)
            for child in children_tmpl[sec4_end + 1:]:
                body_tmpl.append(copy.deepcopy(child))
        else:
            # Fallback por estilos
            for child in children_tmpl[:tmpl_idx_end + 1]:
                body_tmpl.append(copy.deepcopy(child))
            for child in children_src[src_start:]:
                if child.tag != q('w:sectPr'):
                    body_tmpl.append(copy.deepcopy(child))
            for child in children_tmpl[tmpl_back:]:
                body_tmpl.append(copy.deepcopy(child))
        # ---- Actualizar rIds en document.xml ----
        for blip, old_rid in collect_image_refs(tmpl_xml):
@@ -567,41 +664,48 @@ def replace_content(
            except KeyError:
                log.warning("  Imagen no encontrada en source: %s (ignorada)", old_abs)
-        # 3. Añadir relaciones de imágenes del source
+        # 3. Añadir relaciones de imágenes del source (sin duplicados)
        rel_root = parse_xml(out_data.get('word/_rels/document.xml.rels', z_tmpl.read('word/_rels/document.xml.rels')))
-        # Eliminar relaciones existentes que podrían colisionar
+        # Relaciones existentes en el template
        existing_rids = set()
        for rel in list(rel_root):
            rid = rel.get('Id')
            if rid:
                existing_rids.add(rid)
-        # Añadir nuevas relaciones con rIds únicos
+        # Construir mapa old_rid -> src_rel element
        src_rel_by_rid = {}
        for rel in src_rel:
            rid = rel.get('Id')
            if rid:
                src_rel_by_rid[rid] = rel
        # Añadir cada nueva relación una sola vez
        for old_rid, new_rid in rid_rename_map.items():
            if new_rid in existing_rids:
                log.debug("  rId %s ya existe en template, se omite", new_rid)
                continue
-            for rel in src_rel:
+            src_rel_elem = src_rel_by_rid.get(old_rid)
-                if rel.get('Id') == old_rid:
+            if src_rel_elem is None:
-                    target = rel.get('Target', '')
+                continue
-                    old_target_abs = target.replace('../', '')
+            
-                    if not old_target_abs.startswith('word/'):
+            target = src_rel_elem.get('Target', '')
-                        old_target_abs = f'word/{old_target_abs}'
+            old_target_abs = target.replace('../', '')
-                    new_target_abs = image_rename_map.get(old_target_abs, old_target_abs)
+            if not old_target_abs.startswith('word/'):
-                    # Asegurar que el Target sea relativo correctamente (solo media/imageN.ext)
+                old_target_abs = f'word/{old_target_abs}'
-                    new_target = new_target_abs.replace('word/', '') if new_target_abs.startswith('word/') else new_target_abs
+            new_target_abs = image_rename_map.get(old_target_abs, old_target_abs)
-                    new_rel = copy.deepcopy(rel)
+            new_target = new_target_abs.replace('word/', '') if new_target_abs.startswith('word/') else new_target_abs
-                    new_rel.set('Id', new_rid)
+            
-                    new_rel.set('Target', new_target)
+            new_rel = copy.deepcopy(src_rel_elem)
-                    rel_root.append(new_rel)
+            new_rel.set('Id', new_rid)
-                    existing_rids.add(new_rid)
+            new_rel.set('Target', new_target)
-                    break
+            rel_root.append(new_rel)
            existing_rids.add(new_rid)
        out_data['word/_rels/document.xml.rels'] = etree.tostring(
-            rel_root, xml_declaration=True, encoding='UTF-8', standalone=True)
+            rel_root, xml_declaration=True, encoding='UTF-8')
        out_data['word/document.xml'] = etree.tostring(
-            tmpl_xml, xml_declaration=True, encoding='UTF-8', standalone=True)
+            tmpl_xml, xml_declaration=True, encoding='UTF-8')
        # ---- Escribir ----
        with zipfile.ZipFile(str(output_path), 'w', zipfile.ZIP_DEFLATED) as zout:
@@ -623,13 +727,12 @@ def replace_content(
 def setup_logging(verbose: bool = False):
    """Configura logging con formato limpio."""
    level = logging.DEBUG if verbose else logging.INFO
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter('%(message)s'))
    log.addHandler(handler)
    log.setLevel(level)
-    # Evitar duplicados
+    # Solo añadir handler si no tiene ninguno aún
-    if log.handlers.count(handler) > 1:
+    if not log.handlers:
-        log.removeHandler(handler)
+        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(logging.Formatter('%(message)s'))
        log.addHandler(handler)
 def validate_docx(path: Path, label: str) -> None:
@@ -675,7 +778,31 @@ def run_single(args) -> int:
        log.info("  🏁 Dry-run: todo correcto, no se genera nada.")
        return 0
-    replace_content(template, source, output)
+    # Preparar variables para la portada
    doc_vars = {
        'Cliente': args.cliente,
        'Título': args.titulo,
        'Asunto': args.asunto,
        'Categoría': args.tipo,
        'Palabras clave': args.codigo,
    }
    # 1. Cargar el XML del template
    z_tmpl = zipfile.ZipFile(str(template), 'r')
    tmpl_xml = parse_xml(z_tmpl.read('word/document.xml'))
    # 2. Rellenar campos SDT en la portada
    filled = fill_sdt_fields(tmpl_xml, doc_vars)
    log.info("  Campos SDT rellenados: %d", filled)
    # Guardar el XML modificado temporalmente para que replace_content lo use
    # (Sugerencia: pasar el XML ya modificado a replace_content o modificar la función)
    # Para evitar re-diseñar replace_content, vamos a inyectar el comportamiento 
    # en la función principal.
    z_tmpl.close()
    # ... (dentro de run_single) ...
    replace_content(template, source, output, doc_vars=doc_vars)
    return 0
@@ -815,6 +942,13 @@ Ejemplos:
        help='Archivo de salida (solo modo single)',
    )
    # Variables de portada
    parser.add_argument('--cliente', default='', help='Nombre del cliente')
    parser.add_argument('--titulo', default='', help='Título del documento')
    parser.add_argument('--asunto', default='', help='Subtítulo / asunto')
    parser.add_argument('--tipo', default='', help='Tipo de documento')
    parser.add_argument('--codigo', default='', help='Código de documento')
    return parser
@@ -2,6 +2,8 @@
 # Batch runner para apply_template.py
 # Procesa los documentos en /tmp/batch_t*.txt
 set -euo pipefail
 TEMPLATE="/mnt/c/Users/javie/Documents/R360MX/cloud/01. Info General/02. Standards/03. Templates/TPL01-Reports.docx"
 DIR="/home/javi/.openclaw/workspace/r360mx-docs-converter"
 LOGFILE="/tmp/r360mx_batch_$(date +%Y%m%d_%H%M%S).log"
@@ -13,6 +15,18 @@ echo "" | tee -a "$LOGFILE"
 TOTAL=0
 OK=0
 FAIL=0
 TOTAL_BASE=0
 # Primero contar docs totales
 for TANDA in /tmp/batch_t1.txt /tmp/batch_t2.txt /tmp/batch_t3.txt /tmp/batch_t4.txt; do
    if [ -f "$TANDA" ]; then
        COUNT=$(wc -l < "$TANDA")
        TOTAL_BASE=$((TOTAL_BASE + COUNT))
    fi
 done
 echo "Total documentos: $TOTAL_BASE" | tee -a "$LOGFILE"
 echo "" | tee -a "$LOGFILE"
 for TANDA in /tmp/batch_t1.txt /tmp/batch_t2.txt /tmp/batch_t3.txt /tmp/batch_t4.txt; do
    if [ ! -f "$TANDA" ]; then
@@ -20,13 +34,14 @@ for TANDA in /tmp/batch_t1.txt /tmp/batch_t2.txt /tmp/batch_t3.txt /tmp/batch_t4
    fi
    NUM=$(wc -l < "$TANDA")
-    echo "--- Tanda: $(basename $TANDA) ($NUM docs) ---" | tee -a "$LOGFILE"
+    echo "--- Tanda: $(basename "$TANDA") ($NUM docs) ---" | tee -a "$LOGFILE"
    while IFS= read -r DOC; do
        TOTAL=$((TOTAL + 1))
        echo -n "[$TOTAL/$TOTAL_BASE] $(basename "$DOC")... " | tee -a "$LOGFILE"
-        if cd "$DIR" && python3 apply_template.py "$DOC" "$TEMPLATE" >> "$LOGFILE" 2>&1; then
+        # Ejecutar en subshell para no alterar el directorio actual
        if ( cd "$DIR" && python3 apply_template.py "$DOC" "$TEMPLATE" ) >> "$LOGFILE" 2>&1; then
            echo "✅" | tee -a "$LOGFILE"
            OK=$((OK + 1))
        else
Author	SHA1	Message	Date
Rufus	856c7fd4bc	Fix: remove source sectPr to prevent Word corruption and integrate SDT filling	2026-06-04 00:38:10 +02:00
Rufus	842d1ec274	Initial commit of converter	2026-06-03 19:25:03 +02:00