Fix: remove source sectPr to prevent Word corruption and integrate SDT filling

Initial commit of converter
2026-06-04 00:38:10 +02:00 · 2026-06-03 19:25:03 +02:00
3 changed files with 206 additions and 57 deletions
@@ -48,11 +48,24 @@ log = logging.getLogger('r360mx')
 # MAPEO DE ESTILOS: source -> template
 # ======================================================================
 DEFAULT_STYLE_MAP = {
-    'Title1': 'Título 1',
-    'Title2': 'Título 2',
-    'Title3': 'Título 3',
+    # Main heading styles (RatedPower source -> R360MX template).
+    # IMPORTANT: the template uses 'Ttulo1/2' WITHOUT the accented letter.
+    # NOTE(review): presumably these are style IDs, from which Word drops
+    # non-ASCII characters — confirm against the template's styles.xml.
+    'Title1': 'Ttulo1',
+    'Title2': 'Ttulo2',
+    'Title3': 'Ttulo3',
+    'Title1nfs': 'Title1nfs',
+
+    # Table of contents (TOC)
+    'CustomStyleLevelOne': 'TDC1',
+    'CustomStyleLevelTwo': 'TDC2',
    'Title2Index': 'Title2Index',
    'TableContentEnd': 'TableContentEnd',
+
+    # Cover page
+    'CoverSubtitle20': 'CoverSubtitle20',
+
+    # Figure/table captions in the body
+    'NameTableImg': 'Descripcin',
 }


@@ -64,6 +77,29 @@ def parse_xml(content: bytes) -> etree._Element:
    return etree.fromstring(content)


+def fill_sdt_fields(xml_root: etree._Element, values: dict) -> int:
+    """
+    Fill the SDT (Structured Document Tag) fields found under ``xml_root``.
+
+    Each ``w:sdt`` is matched by the ``w:val`` of its ``w:alias`` element.
+    When ``values`` holds a non-empty entry for that alias, the entry
+    replaces the text of the SDT content.
+
+    Args:
+        xml_root: Root element of the document XML to modify in place.
+        values: Mapping of SDT alias -> replacement text. Empty or missing
+            entries leave the corresponding SDT untouched.
+
+    Returns:
+        Number of SDT fields whose text was replaced.
+    """
+    filled = 0
+    # w:alias is a child of w:sdtPr, not a direct child of w:sdt, so it has
+    # to be looked up through the properties element.
+    alias_path = f"{q('w:sdtPr')}/{q('w:alias')}"
+    for sdt in xml_root.iter(q('w:sdt')):
+        alias_el = sdt.find(alias_path)
+        if alias_el is None:
+            continue
+        alias = alias_el.get(q('w:val'))
+        if not alias or not values.get(alias):
+            continue
+        sdt_content = sdt.find(q('w:sdtContent'))
+        if sdt_content is None:
+            continue
+        # Collect every text node so that a first run without w:t (e.g. one
+        # holding only run properties) does not abort the fill.
+        text_nodes = list(sdt_content.iter(q('w:t')))
+        if not text_nodes:
+            continue
+        text_nodes[0].text = values[alias]
+        # Placeholder text may be split across several runs; blank the rest
+        # so no leftover fragment trails the new value.
+        for leftover in text_nodes[1:]:
+            leftover.text = ''
+        filled += 1
+    return filled
+
+
 def q(tag: str) -> str:
    """Convierte 'w:body' a la URL completa con namespace."""
    prefix, local = tag.split(':')
@@ -107,6 +143,26 @@ def collect_image_refs(xml_root: etree._Element) -> list[tuple]:
    return blips


+def _find_section_boundaries(body: etree._Element) -> list[int]:
+    """
+    Locate the children of ``body`` that carry a ``w:sectPr``.
+
+    A child counts when it is itself a ``w:sectPr`` or when it is a ``w:p``
+    whose ``w:pPr`` contains one. Each such element marks the END of a
+    section, so a section's content lies between consecutive hits.
+
+    Returns:
+        Indices (positions among the children of ``body``) of those
+        elements, in document order.
+    """
+    sect_tag = q('w:sectPr')
+    para_tag = q('w:p')
+    props_tag = q('w:pPr')
+
+    def _closes_section(node) -> bool:
+        # A bare sectPr child of the body closes a section.
+        if node.tag == sect_tag:
+            return True
+        if node.tag != para_tag:
+            return False
+        # Otherwise only a paragraph whose properties embed a sectPr does.
+        props = node.find(props_tag)
+        return props is not None and props.find(sect_tag) is not None
+
+    return [pos for pos, node in enumerate(body) if _closes_section(node)]
+
+
 class DocxError(Exception):
    """Error relacionado con el procesamiento de documentos DOCX."""
    pass
@@ -124,7 +180,7 @@ class SectionDetector:

    MARKER_STYLES = {
        'indice_fin': 'TableContentEnd',
-        'titulo_contenido': 'Título 1',
+        'titulo_contenido': 'Ttulo1',  # El template usa Ttulo1 (sin acento)
    }

    @staticmethod
@@ -155,7 +211,7 @@ class SectionDetector:
            if child.tag == q('w:p'):
                style_id = get_style_id(child)
                text = get_para_text(child).strip()
-                if style_id == 'Título 1' and text:
+                if style_id in ('Ttulo1', 'Title1', 'Título 1') and text:
                    # Si hay un salto de sección justo antes, ese es el límite
                    for j in range(max(0, i - 3), i):
                        prev_child = children[j]
@@ -200,7 +256,7 @@ class SectionDetector:
        """
        children = list(body)
        found_toc_marker = False
-        best = 69  # fallback conservador
+        best = None

        # 1. Buscar marcador TableContentEnd
        for i, child in enumerate(children):
@@ -220,7 +276,7 @@ class SectionDetector:
                    text = get_para_text(child).strip()

                    # El índice termina justo antes del primer título numerado (1., 2., etc.)
-                    if style_id in ('Title1', 'Título 1') and text:
+                    if style_id in ('Title1', 'Ttulo1', 'Título 1') and text:
                        # Verificar que parece un título de contenido (empieza con número)
                        if re.match(r'^\d+\.?\s', text) or re.match(r'^[IVXLCDM]+\.\s', text):
                            # Si está cerca del principio, ignorar (es el TOC)
@@ -235,9 +291,17 @@ class SectionDetector:
                if pPr is not None:
                    sectPr = pPr.find(q('w:sectPr'))
                    if sectPr is not None:
-                        # Después de un salto de sección suele empezar el contenido
                        log.debug("  Salto de sección en source hijo %d", i)
-                        return i + 1 if i + 1 < len(children) else i
+                        candidate = i + 1
+                        if candidate < len(children):
+                            return candidate
+                        break
+
+        # Fallback: si no se encontró nada, devolver la mitad del documento
+        # (asumiendo que el índice ocupa ~la primera mitad)
+        if best is None:
+            best = max(len(children) // 2, 10)
+            log.debug("  Fallback: contenido empieza en hijo %d (mitad del doc)", best)

        return best

@@ -428,9 +492,11 @@ def extract_source_title(source_xml: etree._Element) -> tuple[str, str]:
            if child.tag == q('w:p'):
                style_id = get_style_id(child)
                text = get_para_text(child).strip()
-                # Buscar primer título principal
-                if style_id in ('Title1', 'Título 1') and text:
-                    # Dividir título y subtítulo si están en el mismo párrafo
+                # Buscar primer título principal (no el del índice)
+                if style_id in ('Title1', 'Ttulo1', 'Título 1') and text:
+                    # Saltarse títulos que parecen del índice (muy cortos o numéricos genéricos)
+                    if re.match(r'^\d+$', text) or text in ('Índice', 'Index', 'Contents', 'Tabla de contenido'):
+                        continue
                    lines = text.split('\n')
                    title = lines[0].strip()
                    subtitle = lines[1].strip() if len(lines) > 1 else ""
@@ -447,6 +513,7 @@ def replace_content(
    source_docx_path: str | Path,
    output_path: str | Path,
    style_map: dict | None = None,
+    doc_vars: dict | None = None,
 ) -> Path:
    """
    Núcleo de la conversión: fusiona template + source en un solo documento.
@@ -484,6 +551,11 @@ def replace_content(
        if body_src is None:
            raise DocxError("El documento fuente no tiene body")

+        # ---- Rellenar campos SDT en la portada ----
+        if doc_vars:
+            filled = fill_sdt_fields(tmpl_xml, doc_vars)
+            log.info("  Campos SDT rellenados: %d", filled)
+
        children_tmpl = list(body_tmpl)
        children_src = list(body_src)

@@ -491,12 +563,23 @@ def replace_content(
        changes = remap_styles(src_xml, style_map)
        log.info("  Estilos reasignados: %d", changes)

-        # ---- Detectar límites ----
-        tmpl_idx_end = SectionDetector.find_end_of_preface(body_tmpl)
-        tmpl_back = SectionDetector.find_back_cover_start(body_tmpl)
+        # ---- Detectar límites por secciones ----
+        # El template tiene 5 secciones: portada, disclaimer, índice, CONTENIDO, contraportada
+        # Localizamos los sectPr que marcan el final de cada sección
+        tmpl_sections = _find_section_boundaries(body_tmpl)
+        # La sección 4 (índice 3) es la del contenido a reemplazar
+        if len(tmpl_sections) < 4:
+            log.warning("  Template tiene %d secciones, se esperaban al menos 4. Usando detección por estilos.", len(tmpl_sections)+1)
+            tmpl_idx_end = SectionDetector.find_end_of_preface(body_tmpl)
+            tmpl_back = SectionDetector.find_back_cover_start(body_tmpl)
+        else:
+            log.info("  Template: %d secciones detectadas", len(tmpl_sections)+1)
+            tmpl_idx_end = tmpl_sections[2]  # sectPr de sección 3 -> contenido empieza en sección 4
+            tmpl_back = tmpl_sections[3]      # sectPr de sección 4 -> contraportada empieza en sección 5
+        
        src_start = SectionDetector.find_content_start(body_src)

-        log.info("  Template: prefacio h. hijo %d, contraportada h. hijo %d", tmpl_idx_end, tmpl_back)
+        log.info("  Template: sección contenido entre hijos %d y %d", tmpl_idx_end+1, tmpl_back)
        log.info("  Source: contenido real empieza en hijo %d", src_start)

        # ---- Extraer título del source ----
@@ -534,18 +617,32 @@ def replace_content(
        for child in list(body_tmpl):
            body_tmpl.remove(child)

-        # Prefacio del template
-        for child in children_tmpl[:tmpl_idx_end + 1]:
-            body_tmpl.append(copy.deepcopy(child))
+        if len(tmpl_sections) >= 4:
+            # Método por secciones: reemplazar solo la sección 4 (contenido)
+            # Secciones 1-3: portada + disclaimer + índice
+            sec3_end = tmpl_sections[2]  # sectPr de sección 3
+            sec4_end = tmpl_sections[3]  # sectPr de sección 4
            
-        # Contenido del source (desde src_start, sin sectPr)
-        for child in children_src[src_start:]:
-            if child.tag != q('w:sectPr'):
+            for child in children_tmpl[:sec3_end + 1]:
                body_tmpl.append(copy.deepcopy(child))
            
-        # Contraportada del template
-        for child in children_tmpl[tmpl_back:]:
-            body_tmpl.append(copy.deepcopy(child))
+            # Contenido del source (desde src_start, incluimos su sectPr si tiene)
+            for child in children_src[src_start:]:
+                # Incluir sectPr del source para mantener propiedades de página
+                body_tmpl.append(copy.deepcopy(child))
+            
+            # Sección 5: contraportada (después del sectPr de sección 4)
+            for child in children_tmpl[sec4_end + 1:]:
+                body_tmpl.append(copy.deepcopy(child))
+        else:
+            # Fallback por estilos
+            for child in children_tmpl[:tmpl_idx_end + 1]:
+                body_tmpl.append(copy.deepcopy(child))
+            for child in children_src[src_start:]:
+                if child.tag != q('w:sectPr'):
+                    body_tmpl.append(copy.deepcopy(child))
+            for child in children_tmpl[tmpl_back:]:
+                body_tmpl.append(copy.deepcopy(child))

        # ---- Actualizar rIds en document.xml ----
        for blip, old_rid in collect_image_refs(tmpl_xml):
@@ -567,41 +664,48 @@ def replace_content(
            except KeyError:
                log.warning("  Imagen no encontrada en source: %s (ignorada)", old_abs)

-        # 3. Añadir relaciones de imágenes del source
+        # 3. Añadir relaciones de imágenes del source (sin duplicados)
        rel_root = parse_xml(out_data.get('word/_rels/document.xml.rels', z_tmpl.read('word/_rels/document.xml.rels')))
        
-        # Eliminar relaciones existentes que podrían colisionar
+        # Relaciones existentes en el template
        existing_rids = set()
        for rel in list(rel_root):
            rid = rel.get('Id')
            if rid:
                existing_rids.add(rid)

-        # Añadir nuevas relaciones con rIds únicos
+        # Construir mapa old_rid -> src_rel element
+        src_rel_by_rid = {}
+        for rel in src_rel:
+            rid = rel.get('Id')
+            if rid:
+                src_rel_by_rid[rid] = rel
+
+        # Añadir cada nueva relación una sola vez
        for old_rid, new_rid in rid_rename_map.items():
            if new_rid in existing_rids:
-                log.debug("  rId %s ya existe en template, se omite", new_rid)
                continue
-            for rel in src_rel:
-                if rel.get('Id') == old_rid:
-                    target = rel.get('Target', '')
-                    old_target_abs = target.replace('../', '')
-                    if not old_target_abs.startswith('word/'):
-                        old_target_abs = f'word/{old_target_abs}'
-                    new_target_abs = image_rename_map.get(old_target_abs, old_target_abs)
-                    # Asegurar que el Target sea relativo correctamente (solo media/imageN.ext)
-                    new_target = new_target_abs.replace('word/', '') if new_target_abs.startswith('word/') else new_target_abs
-                    new_rel = copy.deepcopy(rel)
-                    new_rel.set('Id', new_rid)
-                    new_rel.set('Target', new_target)
-                    rel_root.append(new_rel)
-                    existing_rids.add(new_rid)
-                    break
+            src_rel_elem = src_rel_by_rid.get(old_rid)
+            if src_rel_elem is None:
+                continue
+            
+            target = src_rel_elem.get('Target', '')
+            old_target_abs = target.replace('../', '')
+            if not old_target_abs.startswith('word/'):
+                old_target_abs = f'word/{old_target_abs}'
+            new_target_abs = image_rename_map.get(old_target_abs, old_target_abs)
+            new_target = new_target_abs.replace('word/', '') if new_target_abs.startswith('word/') else new_target_abs
+            
+            new_rel = copy.deepcopy(src_rel_elem)
+            new_rel.set('Id', new_rid)
+            new_rel.set('Target', new_target)
+            rel_root.append(new_rel)
+            existing_rids.add(new_rid)

        out_data['word/_rels/document.xml.rels'] = etree.tostring(
-            rel_root, xml_declaration=True, encoding='UTF-8', standalone=True)
+            rel_root, xml_declaration=True, encoding='UTF-8')
        out_data['word/document.xml'] = etree.tostring(
-            tmpl_xml, xml_declaration=True, encoding='UTF-8', standalone=True)
+            tmpl_xml, xml_declaration=True, encoding='UTF-8')

        # ---- Escribir ----
        with zipfile.ZipFile(str(output_path), 'w', zipfile.ZIP_DEFLATED) as zout:
@@ -623,13 +727,12 @@ def replace_content(
 def setup_logging(verbose: bool = False):
    """Configura logging con formato limpio."""
    level = logging.DEBUG if verbose else logging.INFO
-    handler = logging.StreamHandler(sys.stderr)
-    handler.setFormatter(logging.Formatter('%(message)s'))
-    log.addHandler(handler)
    log.setLevel(level)
-    # Evitar duplicados
-    if log.handlers.count(handler) > 1:
-        log.removeHandler(handler)
+    # Only attach a stderr handler when the logger has none yet, so calling
+    # this function more than once does not duplicate every message.
+    if not log.handlers:
+        handler = logging.StreamHandler(sys.stderr)
+        handler.setFormatter(logging.Formatter('%(message)s'))
+        log.addHandler(handler)


 def validate_docx(path: Path, label: str) -> None:
@@ -675,7 +778,31 @@ def run_single(args) -> int:
        log.info("  🏁 Dry-run: todo correcto, no se genera nada.")
        return 0

-    replace_content(template, source, output)
+    # Preparar variables para la portada
+    doc_vars = {
+        'Cliente': args.cliente,
+        'Título': args.titulo,
+        'Asunto': args.asunto,
+        'Categoría': args.tipo,
+        'Palabras clave': args.codigo,
+    }
+
+    # 1. Cargar el XML del template
+    z_tmpl = zipfile.ZipFile(str(template), 'r')
+    tmpl_xml = parse_xml(z_tmpl.read('word/document.xml'))
+    
+    # 2. Rellenar campos SDT en la portada
+    filled = fill_sdt_fields(tmpl_xml, doc_vars)
+    log.info("  Campos SDT rellenados: %d", filled)
+    
+    # Guardar el XML modificado temporalmente para que replace_content lo use
+    # (Sugerencia: pasar el XML ya modificado a replace_content o modificar la función)
+    # Para evitar re-diseñar replace_content, vamos a inyectar el comportamiento 
+    # en la función principal.
+    z_tmpl.close()
+
+    # ... (dentro de run_single) ...
+    replace_content(template, source, output, doc_vars=doc_vars)
    return 0


@@ -815,6 +942,13 @@ Ejemplos:
        help='Archivo de salida (solo modo single)',
    )

+    # Variables de portada
+    parser.add_argument('--cliente', default='', help='Nombre del cliente')
+    parser.add_argument('--titulo', default='', help='Título del documento')
+    parser.add_argument('--asunto', default='', help='Subtítulo / asunto')
+    parser.add_argument('--tipo', default='', help='Tipo de documento')
+    parser.add_argument('--codigo', default='', help='Código de documento')
+
    return parser


@@ -2,6 +2,8 @@
 # Batch runner para apply_template.py
 # Procesa los documentos en /tmp/batch_t*.txt

+set -euo pipefail
+
 TEMPLATE="/mnt/c/Users/javie/Documents/R360MX/cloud/01. Info General/02. Standards/03. Templates/TPL01-Reports.docx"
 DIR="/home/javi/.openclaw/workspace/r360mx-docs-converter"
 LOGFILE="/tmp/r360mx_batch_$(date +%Y%m%d_%H%M%S).log"
@@ -13,6 +15,18 @@ echo "" | tee -a "$LOGFILE"
 TOTAL=0
 OK=0
 FAIL=0
+TOTAL_BASE=0
+
+# Primero contar docs totales
+for TANDA in /tmp/batch_t1.txt /tmp/batch_t2.txt /tmp/batch_t3.txt /tmp/batch_t4.txt; do
+    if [ -f "$TANDA" ]; then
+        COUNT=$(wc -l < "$TANDA")
+        TOTAL_BASE=$((TOTAL_BASE + COUNT))
+    fi
+done
+
+echo "Total documentos: $TOTAL_BASE" | tee -a "$LOGFILE"
+echo "" | tee -a "$LOGFILE"

 for TANDA in /tmp/batch_t1.txt /tmp/batch_t2.txt /tmp/batch_t3.txt /tmp/batch_t4.txt; do
    if [ ! -f "$TANDA" ]; then
@@ -20,13 +34,14 @@ for TANDA in /tmp/batch_t1.txt /tmp/batch_t2.txt /tmp/batch_t3.txt /tmp/batch_t4
    fi
    
    NUM=$(wc -l < "$TANDA")
-    echo "--- Tanda: $(basename $TANDA) ($NUM docs) ---" | tee -a "$LOGFILE"
+    echo "--- Tanda: $(basename "$TANDA") ($NUM docs) ---" | tee -a "$LOGFILE"
    
    while IFS= read -r DOC; do
        TOTAL=$((TOTAL + 1))
        echo -n "[$TOTAL/$TOTAL_BASE] $(basename "$DOC")... " | tee -a "$LOGFILE"
        
-        if cd "$DIR" && python3 apply_template.py "$DOC" "$TEMPLATE" >> "$LOGFILE" 2>&1; then
+        # Ejecutar en subshell para no alterar el directorio actual
+        if ( cd "$DIR" && python3 apply_template.py "$DOC" "$TEMPLATE" ) >> "$LOGFILE" 2>&1; then
            echo "✅" | tee -a "$LOGFILE"
            OK=$((OK + 1))
        else
Author	SHA1	Message	Date
Rufus	856c7fd4bc	Fix: remove source sectPr to prevent Word corruption and integrate SDT filling	2026-06-04 00:38:10 +02:00
Rufus	842d1ec274	Initial commit of converter	2026-06-03 19:25:03 +02:00