fix: switch to pdftohtml for pdf to html conversions (#998)

* fix: switch to pdftohtml for pdf to html conversions
* build: include poppler-utils in dockerfile for pdftohtml
2024-03-29 17:02:33 -04:00
parent 27bbf7a513
commit dfb8c64f5a
37 changed files with 101 additions and 58 deletions
@@ -244,6 +244,6 @@ public class EndpointConfiguration {
            }
        }
    }
-    
+
    private static final String REMOVE_BLANKS = "remove-blanks";
 }
@@ -291,6 +291,6 @@ public class UserController {
        }
        return ResponseEntity.ok(apiKey);
    }
-    
+
    private static final String LOGIN_MESSAGETYPE_CREDSUPDATED = "/login?messageType=credsUpdated";
 }
@@ -29,18 +29,6 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Convert", description = "Convert APIs")
 public class ConvertPDFToOffice {

-    @PostMapping(consumes = "multipart/form-data", value = "/pdf/html")
-    @Operation(
-            summary = "Convert PDF to HTML",
-            description =
-                    "This endpoint converts a PDF file to HTML format. Input:PDF Output:HTML Type:SISO")
-    public ResponseEntity<byte[]> processPdfToHTML(@ModelAttribute PDFFile request)
-            throws Exception {
-        MultipartFile inputFile = request.getFileInput();
-        PDFToFile pdfToFile = new PDFToFile();
-        return pdfToFile.processPdfToOfficeFormat(inputFile, "html", "writer_pdf_import");
-    }
-
    @PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation")
    @Operation(
            summary = "Convert PDF to Presentation format",
@@ -6,8 +6,6 @@ import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;

-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.beans.factory.annotation.Qualifier;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.ModelAttribute;
 import org.springframework.web.bind.annotation.PostMapping;
@@ -219,6 +219,6 @@ public class ExtractImageScansController {
                    });
        }
    }
-    
+
    private static final String REPLACEFIRST = "[.][^.]+$";
 }
@@ -26,7 +26,6 @@ import org.springframework.stereotype.Service;

 import com.fasterxml.jackson.databind.ObjectMapper;

-import stirling.software.SPDF.model.ApplicationProperties;
 import stirling.software.SPDF.model.PipelineConfig;
 import stirling.software.SPDF.model.PipelineOperation;

@@ -3,8 +3,6 @@ package stirling.software.SPDF.repository;
 import java.util.Optional;

 import org.springframework.data.jpa.repository.JpaRepository;
-import org.springframework.data.jpa.repository.Query;
-import org.springframework.data.repository.query.Param;

 import stirling.software.SPDF.model.User;

@@ -25,6 +25,71 @@ import io.github.pixee.security.Filenames;
 import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;

 public class PDFToFile {
+
+    public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
+            throws IOException, InterruptedException {
+        if (!"application/pdf".equals(inputFile.getContentType())) {
+            return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
+        }
+
+        // Get the original PDF file name without the extension
+        String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
+        String pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
+
+        Path tempInputFile = null;
+        Path tempOutputDir = null;
+        byte[] fileBytes;
+        String fileName = "temp.file";
+
+        try {
+            // Save the uploaded file to a temporary location
+            tempInputFile = Files.createTempFile("input_", ".pdf");
+            Files.copy(
+                    inputFile.getInputStream(), tempInputFile, StandardCopyOption.REPLACE_EXISTING);
+
+            // Prepare the output directory
+            tempOutputDir = Files.createTempDirectory("output_");
+
+            // Run the pdftohtml command with complex output
+            List<String> command =
+                    new ArrayList<>(
+                            Arrays.asList(
+                                    "pdftohtml", "-c", tempInputFile.toString(), pdfBaseName));
+
+            ProcessExecutorResult returnCode =
+                    ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
+                            .runCommandWithOutputHandling(command, tempOutputDir.toFile());
+
+            // Get output files
+            List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());
+
+            // Return output files in a ZIP archive
+            fileName = pdfBaseName + "ToHtml.zip";
+            ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+            ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream);
+
+            for (File outputFile : outputFiles) {
+                ZipEntry entry = new ZipEntry(outputFile.getName());
+                zipOutputStream.putNextEntry(entry);
+                FileInputStream fis = new FileInputStream(outputFile);
+                IOUtils.copy(fis, zipOutputStream);
+                fis.close();
+                zipOutputStream.closeEntry();
+            }
+
+            zipOutputStream.close();
+            fileBytes = byteArrayOutputStream.toByteArray();
+
+        } finally {
+            // Clean up the temporary files
+            if (tempInputFile != null) Files.delete(tempInputFile);
+            if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
+        }
+
+        return WebResponseUtils.bytesToWebResponse(
+                fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
+    }
+
    public ResponseEntity<byte[]> processPdfToOfficeFormat(
            MultipartFile inputFile, String outputFormat, String libreOfficeFilter)
            throws IOException, InterruptedException {
@@ -39,17 +104,7 @@ public class PDFToFile {

        // Validate output format
        List<String> allowedFormats =
-                Arrays.asList(
-                        "doc",
-                        "docx",
-                        "odt",
-                        "ppt",
-                        "pptx",
-                        "odp",
-                        "rtf",
-                        "html",
-                        "xml",
-                        "txt:Text");
+                Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text");
        if (!allowedFormats.contains(outputFormat)) {
            return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
        }
@@ -24,6 +24,7 @@ public class ProcessExecutor {

    public enum Processes {
        LIBRE_OFFICE,
+        PDFTOHTML,
        OCR_MY_PDF,
        PYTHON_OPENCV,
        GHOSTSCRIPT,
@@ -45,6 +46,7 @@ public class ProcessExecutor {
                    int semaphoreLimit =
                            switch (key) {
                                case LIBRE_OFFICE -> 1;
+                                case PDFTOHTML -> 1;
                                case OCR_MY_PDF -> 2;
                                case PYTHON_OPENCV -> 8;
                                case GHOSTSCRIPT -> 16;
@@ -56,6 +58,7 @@ public class ProcessExecutor {
                    long timeoutMinutes =
                            switch (key) {
                                case LIBRE_OFFICE -> 30;
+                                case PDFTOHTML -> 5;
                                case OCR_MY_PDF -> 30;
                                case PYTHON_OPENCV -> 30;
                                case GHOSTSCRIPT -> 5;