Removal of Ghostscript to use qpdf and tesseract directly (#2338)

* navbar fix multi tool and compress location * release notes and ghostscript removal * cleanups * formatting * update docs * more * more * docs * release bump * Hardening suggestions for Stirling-PDF / ghostscript (#2339) * Protect `readLine()` against DoS * Sanitized user-provided file names in HTTP multipart uploads --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com> --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
2024-11-26 20:50:35 +00:00
parent 654bc94d44
commit 833b3c45c6
69 changed files with 1106 additions and 665 deletions
--- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java
@@ -1,12 +1,13 @@
 package stirling.software.SPDF.controller.api.converters;

-import java.io.FileOutputStream;
-import java.io.OutputStream;
+import java.io.File;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;

+import org.apache.commons.io.FileUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.http.MediaType;
@@ -37,59 +38,90 @@ public class ConvertPDFToPDFA {
    @Operation(
            summary = "Convert a PDF to a PDF/A",
            description =
-                    "This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO")
+                    "This endpoint converts a PDF file to a PDF/A file using LibreOffice. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO")
    public ResponseEntity<byte[]> pdfToPdfA(@ModelAttribute PdfToPdfARequest request)
            throws Exception {
        MultipartFile inputFile = request.getFileInput();
        String outputFormat = request.getOutputFormat();

-        // Convert MultipartFile to byte[]
-        byte[] pdfBytes = inputFile.getBytes();
-
-        // Save the uploaded file to a temporary location
-        Path tempInputFile = Files.createTempFile("input_", ".pdf");
-        try (OutputStream outputStream = new FileOutputStream(tempInputFile.toFile())) {
-            outputStream.write(pdfBytes);
+        // Validate input file type
+        if (!"application/pdf".equals(inputFile.getContentType())) {
+            logger.error("Invalid input file type: {}", inputFile.getContentType());
+            throw new IllegalArgumentException("Input file must be a PDF");
        }

-        // Prepare the output file path
-        Path tempOutputFile = Files.createTempFile("output_", ".pdf");
-
-        // Prepare the ghostscript command
-        List<String> command = new ArrayList<>();
-        command.add("gs");
-        command.add("-dPDFA=" + ("pdfa".equals(outputFormat) ? "2" : "1"));
-        command.add("-dNOPAUSE");
-        command.add("-dBATCH");
-        command.add("-sColorConversionStrategy=sRGB");
-        command.add("-sDEVICE=pdfwrite");
-        command.add("-dPDFACompatibilityPolicy=2");
-        command.add("-o");
-        command.add(tempOutputFile.toString());
-        command.add(tempInputFile.toString());
-
-        ProcessExecutorResult returnCode =
-                ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
-                        .runCommandWithOutputHandling(command);
-
-        if (returnCode.getRc() != 0) {
-            logger.info(
-                    outputFormat + " conversion failed with return code: " + returnCode.getRc());
+        // Get the original filename without extension
+        String originalFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
+        if (originalFileName == null || originalFileName.trim().isEmpty()) {
+            originalFileName = "output.pdf";
        }
+        String baseFileName =
+                originalFileName.contains(".")
+                        ? originalFileName.substring(0, originalFileName.lastIndexOf('.'))
+                        : originalFileName;
+
+        Path tempInputFile = null;
+        Path tempOutputDir = null;
+        byte[] fileBytes;

        try {
-            byte[] pdfBytesOutput = Files.readAllBytes(tempOutputFile);
-            // Return the optimized PDF as a response
-            String outputFilename =
-                    Filenames.toSimpleFileName(inputFile.getOriginalFilename())
-                                    .replaceFirst("[.][^.]+$", "")
-                            + "_PDFA.pdf";
+            // Save uploaded file to temp location
+            tempInputFile = Files.createTempFile("input_", ".pdf");
+            inputFile.transferTo(tempInputFile);
+
+            // Create temp output directory
+            tempOutputDir = Files.createTempDirectory("output_");
+
+            // Determine PDF/A filter based on requested format
+            String pdfFilter =
+                    "pdfa".equals(outputFormat)
+                            ? "writer_pdf_Export:{'SelectPdfVersion':{'Value':'2'}}:writer_pdf_Export"
+                            : "writer_pdf_Export:{'SelectPdfVersion':{'Value':'1'}}:writer_pdf_Export";
+
+            // Prepare LibreOffice command
+            List<String> command =
+                    new ArrayList<>(
+                            Arrays.asList(
+                                    "soffice",
+                                    "--headless",
+                                    "--nologo",
+                                    "--convert-to",
+                                    "pdf:" + pdfFilter,
+                                    "--outdir",
+                                    tempOutputDir.toString(),
+                                    tempInputFile.toString()));
+
+            ProcessExecutorResult returnCode =
+                    ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
+                            .runCommandWithOutputHandling(command);
+
+            if (returnCode.getRc() != 0) {
+                logger.error("PDF/A conversion failed with return code: {}", returnCode.getRc());
+                throw new RuntimeException("PDF/A conversion failed");
+            }
+
+            // Get the output file
+            File[] outputFiles = tempOutputDir.toFile().listFiles();
+            if (outputFiles == null || outputFiles.length != 1) {
+                throw new RuntimeException(
+                        "Expected exactly one output file but found "
+                                + (outputFiles == null ? "none" : outputFiles.length));
+            }
+
+            fileBytes = FileUtils.readFileToByteArray(outputFiles[0]);
+            String outputFilename = baseFileName + "_PDFA.pdf";
+
            return WebResponseUtils.bytesToWebResponse(
-                    pdfBytesOutput, outputFilename, MediaType.APPLICATION_PDF);
+                    fileBytes, outputFilename, MediaType.APPLICATION_PDF);
+
        } finally {
-            // Clean up the temporary files
-            Files.deleteIfExists(tempInputFile);
-            Files.deleteIfExists(tempOutputFile);
+            // Clean up temporary files
+            if (tempInputFile != null) {
+                Files.deleteIfExists(tempInputFile);
+            }
+            if (tempOutputDir != null) {
+                FileUtils.deleteDirectory(tempOutputDir.toFile());
+            }
        }
    }
 }
--- a/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java
@@ -20,7 +20,7 @@ import org.springframework.web.bind.annotation.RestController;

 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;
-import stirling.software.SPDF.controller.api.CropController;
+
 import stirling.software.SPDF.model.api.extract.PDFFilePage;
 import stirling.software.SPDF.pdf.FlexibleCSVWriter;
 import technology.tabula.ObjectExtractor;
@@ -37,11 +37,15 @@ public class ExtractCSVController {
    private static final Logger logger = LoggerFactory.getLogger(ExtractCSVController.class);

    @PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
-    @Operation(summary = "Extracts a CSV document from a PDF", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
+    @Operation(
+            summary = "Extracts a CSV document from a PDF",
+            description =
+                    "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
    public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception {
        StringWriter writer = new StringWriter();
        try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
-            CSVFormat format = CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
+            CSVFormat format =
+                    CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
            Writer csvWriter = new FlexibleCSVWriter(format);
            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
            try (ObjectExtractor extractor = new ObjectExtractor(document)) {
@@ -56,8 +60,8 @@ public class ExtractCSVController {
                ContentDisposition.builder("attachment")
                        .filename(
                                form.getFileInput()
-                                        .getOriginalFilename()
-                                        .replaceFirst("[.][^.]+$", "")
+                                                .getOriginalFilename()
+                                                .replaceFirst("[.][^.]+$", "")
                                        + "_extracted.csv")
                        .build());
        headers.setContentType(MediaType.parseMediaType("text/csv"));