Removal of Ghostscript to use qpdf and tesseract directly (#2338)

* navbar fix multi tool and compress location * release notes and ghostscript removal * cleanups * formatting * update docs * more * more * docs * release bump * Hardening suggestions for Stirling-PDF / ghostscript (#2339) * Protect `readLine()` against DoS * Sanitized user-provided file names in HTTP multipart uploads --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com> --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
2024-11-26 20:50:35 +00:00
parent 654bc94d44
commit 833b3c45c6
69 changed files with 1106 additions and 665 deletions
--- a/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/misc/CompressController.java
@@ -10,7 +10,6 @@ import java.util.List;

 import javax.imageio.ImageIO;

-import org.apache.commons.io.FileUtils;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.PDDocument;
@@ -53,6 +52,54 @@ public class CompressController {
        this.pdfDocumentFactory = pdfDocumentFactory;
    }

+    private void compressImagesInPDF(Path pdfFile, double initialScaleFactor) throws Exception {
+        byte[] fileBytes = Files.readAllBytes(pdfFile);
+        try (PDDocument doc = Loader.loadPDF(fileBytes)) {
+            double scaleFactor = initialScaleFactor;
+
+            for (PDPage page : doc.getPages()) {
+                PDResources res = page.getResources();
+                if (res != null && res.getXObjectNames() != null) {
+                    for (COSName name : res.getXObjectNames()) {
+                        PDXObject xobj = res.getXObject(name);
+                        if (xobj instanceof PDImageXObject) {
+                            PDImageXObject image = (PDImageXObject) xobj;
+                            BufferedImage bufferedImage = image.getImage();
+
+                            int newWidth = (int) (bufferedImage.getWidth() * scaleFactor);
+                            int newHeight = (int) (bufferedImage.getHeight() * scaleFactor);
+
+                            if (newWidth == 0 || newHeight == 0) {
+                                continue;
+                            }
+
+                            Image scaledImage =
+                                    bufferedImage.getScaledInstance(
+                                            newWidth, newHeight, Image.SCALE_SMOOTH);
+
+                            BufferedImage scaledBufferedImage =
+                                    new BufferedImage(
+                                            newWidth, newHeight, BufferedImage.TYPE_INT_RGB);
+                            scaledBufferedImage.getGraphics().drawImage(scaledImage, 0, 0, null);
+
+                            ByteArrayOutputStream compressedImageStream =
+                                    new ByteArrayOutputStream();
+                            ImageIO.write(scaledBufferedImage, "jpeg", compressedImageStream);
+                            byte[] imageBytes = compressedImageStream.toByteArray();
+                            compressedImageStream.close();
+
+                            PDImageXObject compressedImage =
+                                    PDImageXObject.createFromByteArray(
+                                            doc, imageBytes, image.getCOSObject().toString());
+                            res.put(name, compressedImage);
+                        }
+                    }
+                }
+            }
+            doc.save(pdfFile.toString());
+        }
+    }
+
    @PostMapping(consumes = "multipart/form-data", value = "/compress-pdf")
    @Operation(
            summary = "Optimize PDF file",
@@ -75,209 +122,92 @@ public class CompressController {
            autoMode = true;
        }

-        // Save the uploaded file to a temporary location
        Path tempInputFile = Files.createTempFile("input_", ".pdf");
        inputFile.transferTo(tempInputFile.toFile());

        long inputFileSize = Files.size(tempInputFile);

-        // Prepare the output file path
-
        Path tempOutputFile = null;
        byte[] pdfBytes;
        try {
            tempOutputFile = Files.createTempFile("output_", ".pdf");
-            // Determine initial optimization level based on expected size reduction, only if in
-            // autoMode
+
            if (autoMode) {
                double sizeReductionRatio = expectedOutputSize / (double) inputFileSize;
-                if (sizeReductionRatio > 0.7) {
-                    optimizeLevel = 1;
-                } else if (sizeReductionRatio > 0.5) {
-                    optimizeLevel = 2;
-                } else if (sizeReductionRatio > 0.35) {
-                    optimizeLevel = 3;
-                } else {
-                    optimizeLevel = 3;
-                }
+                optimizeLevel = determineOptimizeLevel(sizeReductionRatio);
            }

            boolean sizeMet = false;
-            while (!sizeMet && optimizeLevel <= 4) {
-                // Prepare the Ghostscript command
-                List<String> command = new ArrayList<>();
-                command.add("gs");
-                command.add("-sDEVICE=pdfwrite");
-                command.add("-dCompatibilityLevel=1.5");
+            while (!sizeMet && optimizeLevel <= 9) {

-                switch (optimizeLevel) {
-                    case 1:
-                        command.add("-dPDFSETTINGS=/prepress");
-                        break;
-                    case 2:
-                        command.add("-dPDFSETTINGS=/printer");
-                        break;
-                    case 3:
-                        command.add("-dPDFSETTINGS=/ebook");
-                        break;
-                    case 4:
-                        command.add("-dPDFSETTINGS=/screen");
-                        break;
-                    default:
-                        command.add("-dPDFSETTINGS=/default");
+                // Apply additional image compression for levels 6-9
+                if (optimizeLevel >= 6) {
+                    // Calculate scale factor based on optimization level
+                    double scaleFactor =
+                            switch (optimizeLevel) {
+                                case 6 -> 0.9; // 90% of original size
+                                case 7 -> 0.8; // 80% of original size
+                                case 8 -> 0.65; // 65% of original size
+                                case 9 -> 0.5; // 50% of original size
+                                default -> 1.0;
+                            };
+                    compressImagesInPDF(tempInputFile, scaleFactor);
                }

-                command.add("-dNOPAUSE");
-                command.add("-dQUIET");
-                command.add("-dBATCH");
-                command.add("-sOutputFile=" + tempOutputFile.toString());
+                // Run QPDF optimization
+                List<String> command = new ArrayList<>();
+                command.add("qpdf");
+                if (request.getNormalize()) {
+                    command.add("--normalize-content=y");
+                }
+                if (request.getLinearize()) {
+                    command.add("--linearize");
+                }
+                command.add("--optimize-images");
+                command.add("--recompress-flate");
+                command.add("--compression-level=" + optimizeLevel);
+                command.add("--compress-streams=y");
+                command.add("--object-streams=generate");
                command.add(tempInputFile.toString());
+                command.add(tempOutputFile.toString());

-                ProcessExecutorResult returnCode =
-                        ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
-                                .runCommandWithOutputHandling(command);
+                ProcessExecutorResult returnCode = null;
+                try {
+                    returnCode =
+                            ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
+                                    .runCommandWithOutputHandling(command);
+                } catch (Exception e) {
+                    if (returnCode != null && returnCode.getRc() != 3) {
+                        throw e;
+                    }
+                }

-                // Check if file size is within expected size or not auto mode so instantly finish
+                // Check if file size is within expected size or not auto mode
                long outputFileSize = Files.size(tempOutputFile);
                if (outputFileSize <= expectedOutputSize || !autoMode) {
                    sizeMet = true;
                } else {
-                    // Increase optimization level for next iteration
-                    optimizeLevel++;
-                    if (autoMode && optimizeLevel > 4) {
-                        logger.info("Skipping level 5 due to bad results in auto mode");
+                    optimizeLevel =
+                            incrementOptimizeLevel(
+                                    optimizeLevel, outputFileSize, expectedOutputSize);
+                    if (autoMode && optimizeLevel > 9) {
+                        logger.info("Maximum compression level reached in auto mode");
                        sizeMet = true;
-                    } else {
-                        logger.info(
-                                "Increasing ghostscript optimisation level to " + optimizeLevel);
                    }
                }
            }

-            if (expectedOutputSize != null && autoMode) {
-                long outputFileSize = Files.size(tempOutputFile);
-                byte[] fileBytes = Files.readAllBytes(tempOutputFile);
-                if (outputFileSize > expectedOutputSize) {
-                    try (PDDocument doc = Loader.loadPDF(fileBytes)) {
-                        long previousFileSize = 0;
-                        double scaleFactorConst = 0.9f;
-                        double scaleFactor = 0.9f;
-                        while (true) {
-                            for (PDPage page : doc.getPages()) {
-                                PDResources res = page.getResources();
-                                if (res != null && res.getXObjectNames() != null) {
-                                    for (COSName name : res.getXObjectNames()) {
-                                        PDXObject xobj = res.getXObject(name);
-                                        if (xobj != null && xobj instanceof PDImageXObject) {
-                                            PDImageXObject image = (PDImageXObject) xobj;
-
-                                            // Get the image in BufferedImage format
-                                            BufferedImage bufferedImage = image.getImage();
-
-                                            // Calculate the new dimensions
-                                            int newWidth =
-                                                    (int)
-                                                            (bufferedImage.getWidth()
-                                                                    * scaleFactorConst);
-                                            int newHeight =
-                                                    (int)
-                                                            (bufferedImage.getHeight()
-                                                                    * scaleFactorConst);
-
-                                            // If the new dimensions are zero, skip this iteration
-                                            if (newWidth == 0 || newHeight == 0) {
-                                                continue;
-                                            }
-
-                                            // Otherwise, proceed with the scaling
-                                            Image scaledImage =
-                                                    bufferedImage.getScaledInstance(
-                                                            newWidth,
-                                                            newHeight,
-                                                            Image.SCALE_SMOOTH);
-
-                                            // Convert the scaled image back to a BufferedImage
-                                            BufferedImage scaledBufferedImage =
-                                                    new BufferedImage(
-                                                            newWidth,
-                                                            newHeight,
-                                                            BufferedImage.TYPE_INT_RGB);
-                                            scaledBufferedImage
-                                                    .getGraphics()
-                                                    .drawImage(scaledImage, 0, 0, null);
-
-                                            // Compress the scaled image
-                                            ByteArrayOutputStream compressedImageStream =
-                                                    new ByteArrayOutputStream();
-                                            ImageIO.write(
-                                                    scaledBufferedImage,
-                                                    "jpeg",
-                                                    compressedImageStream);
-                                            byte[] imageBytes = compressedImageStream.toByteArray();
-                                            compressedImageStream.close();
-
-                                            PDImageXObject compressedImage =
-                                                    PDImageXObject.createFromByteArray(
-                                                            doc,
-                                                            imageBytes,
-                                                            image.getCOSObject().toString());
-
-                                            // Replace the image in the resources with the
-                                            // compressed
-                                            // version
-                                            res.put(name, compressedImage);
-                                        }
-                                    }
-                                }
-                            }
-
-                            // save the document to tempOutputFile again
-                            doc.save(tempOutputFile.toString());
-
-                            long currentSize = Files.size(tempOutputFile);
-                            // Check if the overall PDF size is still larger than expectedOutputSize
-                            if (currentSize > expectedOutputSize) {
-                                // Log the current file size and scaleFactor
-
-                                logger.info(
-                                        "Current file size: "
-                                                + FileUtils.byteCountToDisplaySize(currentSize));
-                                logger.info("Current scale factor: " + scaleFactor);
-
-                                // The file is still too large, reduce scaleFactor and try again
-                                scaleFactor *= 0.9f; // reduce scaleFactor by 10%
-                                // Avoid scaleFactor being too small, causing the image to shrink to
-                                // 0
-                                if (scaleFactor < 0.2f || previousFileSize == currentSize) {
-                                    throw new RuntimeException(
-                                            "Could not reach the desired size without excessively degrading image quality, lowest size recommended is "
-                                                    + FileUtils.byteCountToDisplaySize(currentSize)
-                                                    + ", "
-                                                    + currentSize
-                                                    + " bytes");
-                                }
-                                previousFileSize = currentSize;
-                            } else {
-                                // The file is small enough, break the loop
-                                break;
-                            }
-                        }
-                    }
-                }
-            }
            // Read the optimized PDF file
            pdfBytes = Files.readAllBytes(tempOutputFile);
            Path finalFile = tempOutputFile;
+
            // Check if optimized file is larger than the original
            if (pdfBytes.length > inputFileSize) {
-                // Log the occurrence
                logger.warn(
                        "Optimized file is larger than the original. Returning the original file instead.");
-
-                // Read the original file again
                finalFile = tempInputFile;
            }
-            // Return the optimized PDF as a response
+
            String outputFilename =
                    Filenames.toSimpleFileName(inputFile.getOriginalFilename())
                                    .replaceFirst("[.][^.]+$", "")
@@ -286,10 +216,31 @@ public class CompressController {
                    pdfDocumentFactory.load(finalFile.toFile()), outputFilename);

        } finally {
-            // Clean up the temporary files
-            // deleted by multipart file handler deu to transferTo?
-            // Files.deleteIfExists(tempInputFile);
            Files.deleteIfExists(tempOutputFile);
        }
    }
+
+    private int determineOptimizeLevel(double sizeReductionRatio) {
+        if (sizeReductionRatio > 0.9) return 1;
+        if (sizeReductionRatio > 0.8) return 2;
+        if (sizeReductionRatio > 0.7) return 3;
+        if (sizeReductionRatio > 0.6) return 4;
+        if (sizeReductionRatio > 0.5) return 5;
+        if (sizeReductionRatio > 0.4) return 6;
+        if (sizeReductionRatio > 0.3) return 7;
+        if (sizeReductionRatio > 0.2) return 8;
+        return 9;
+    }
+
+    private int incrementOptimizeLevel(int currentLevel, long currentSize, long targetSize) {
+        double currentRatio = currentSize / (double) targetSize;
+        logger.info("Current compression ratio: {}", String.format("%.2f", currentRatio));
+
+        if (currentRatio > 2.0) {
+            return Math.min(9, currentLevel + 3);
+        } else if (currentRatio > 1.5) {
+            return Math.min(9, currentLevel + 2);
+        }
+        return Math.min(9, currentLevel + 1);
+    }
 }
--- a/src/main/java/stirling/software/SPDF/controller/api/misc/FakeScanControllerWIP.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/misc/FakeScanControllerWIP.java
@@ -58,7 +58,7 @@ public class FakeScanControllerWIP {
    @Operation(
            summary = "Repair a PDF file",
            description =
-                    "This endpoint repairs a given PDF file by running Ghostscript command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response.")
+                    "This endpoint repairs a given PDF file by running qpdf command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response.")
    public ResponseEntity<byte[]> fakeScan(@ModelAttribute PDFFile request) throws IOException {
        MultipartFile inputFile = request.getFileInput();

--- a/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java
@@ -1,19 +1,31 @@
 package stirling.software.SPDF.controller.api.misc;

-import java.io.ByteArrayInputStream;
+import io.github.pixee.security.BoundedLineReader;
+import io.github.pixee.security.Filenames;
+import java.awt.image.BufferedImage;
+import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileOutputStream;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.List;
 import java.util.stream.Collectors;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipOutputStream;

+import javax.imageio.ImageIO;
+
+import org.apache.pdfbox.multipdf.PDFMergerUtility;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
@@ -23,24 +35,29 @@ import org.springframework.web.bind.annotation.RequestMapping;
 import org.springframework.web.bind.annotation.RestController;
 import org.springframework.web.multipart.MultipartFile;

-import io.github.pixee.security.Filenames;
-import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;

+import lombok.extern.slf4j.Slf4j;
 import stirling.software.SPDF.model.ApplicationProperties;
 import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
 import stirling.software.SPDF.service.CustomPDDocumentFactory;
-import stirling.software.SPDF.utils.ProcessExecutor;
-import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
-import stirling.software.SPDF.utils.WebResponseUtils;

@RestController
@RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs")
+@Slf4j
 public class OCRController {

-    @Autowired ApplicationProperties applicationProperties;
+    @Autowired private ApplicationProperties applicationProperties;

+    private final CustomPDDocumentFactory pdfDocumentFactory;
+
+    @Autowired
+    public OCRController(CustomPDDocumentFactory pdfDocumentFactory) {
+        this.pdfDocumentFactory = pdfDocumentFactory;
+    }
+
+    /** Gets the list of available Tesseract languages from the tessdata directory */
    public List<String> getAvailableTesseractLanguages() {
        String tessdataDir = applicationProperties.getSystem().getTessdataDir();
        File[] files = new File(tessdataDir).listFiles();
@@ -54,196 +71,161 @@ public class OCRController {
                .collect(Collectors.toList());
    }

-    private final CustomPDDocumentFactory pdfDocumentFactory;
-
-    @Autowired
-    public OCRController(CustomPDDocumentFactory pdfDocumentFactory) {
-        this.pdfDocumentFactory = pdfDocumentFactory;
-    }
-
    @PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
-    @Operation(
-            summary = "Process a PDF file with OCR",
-            description =
-                    "This endpoint processes a PDF file using OCR (Optical Character Recognition). Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. Input:PDF Output:PDF Type:SI-Conditional")
    public ResponseEntity<byte[]> processPdfWithOCR(
            @ModelAttribute ProcessPdfWithOcrRequest request)
            throws IOException, InterruptedException {
        MultipartFile inputFile = request.getFileInput();
-        List<String> selectedLanguages = request.getLanguages();
-        Boolean sidecar = request.isSidecar();
-        Boolean deskew = request.isDeskew();
-        Boolean clean = request.isClean();
-        Boolean cleanFinal = request.isCleanFinal();
+        List<String> languages = request.getLanguages();
        String ocrType = request.getOcrType();
-        String ocrRenderType = request.getOcrRenderType();
-        Boolean removeImagesAfter = request.isRemoveImagesAfter();
-        // --output-type pdfa
-        if (selectedLanguages == null || selectedLanguages.isEmpty()) {
-            throw new IOException("Please select at least one language.");
-        }

-        if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) {
-            throw new IOException("ocrRenderType wrong");
-        }
+        Path tempDir = Files.createTempDirectory("ocr_process");
+        Path tempInputFile = tempDir.resolve("input.pdf");
+        Path tempOutputDir = tempDir.resolve("output");
+        Path tempImagesDir = tempDir.resolve("images");
+        Path finalOutputFile = tempDir.resolve("final_output.pdf");

-        // Get available Tesseract languages
-        List<String> availableLanguages = getAvailableTesseractLanguages();
-
-        // Validate selected languages
-        selectedLanguages =
-                selectedLanguages.stream().filter(availableLanguages::contains).toList();
-
-        if (selectedLanguages.isEmpty()) {
-            throw new IOException("None of the selected languages are valid.");
-        }
-        // Save the uploaded file to a temporary location
-        Path tempInputFile = Files.createTempFile("input_", ".pdf");
-        Path tempOutputFile = Files.createTempFile("output_", ".pdf");
-        Path sidecarTextPath = null;
+        Files.createDirectories(tempOutputDir);
+        Files.createDirectories(tempImagesDir);

        try {
+            // Save input file
            inputFile.transferTo(tempInputFile.toFile());
+            PDFMergerUtility merger = new PDFMergerUtility();
+            merger.setDestinationFileName(finalOutputFile.toString());

-            // Run OCR Command
-            String languageOption = String.join("+", selectedLanguages);
+            try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
+                PDFRenderer pdfRenderer = new PDFRenderer(document);
+                int pageCount = document.getNumberOfPages();

-            List<String> command =
-                    new ArrayList<>(
-                            Arrays.asList(
-                                    "ocrmypdf",
-                                    "--verbose",
-                                    "2",
-                                    "--output-type",
-                                    "pdf",
-                                    "--pdf-renderer",
-                                    ocrRenderType));
+                for (int pageNum = 0; pageNum < pageCount; pageNum++) {
+                    PDPage page = document.getPage(pageNum);
+                    boolean hasText = false;

-            if (sidecar != null && sidecar) {
-                sidecarTextPath = Files.createTempFile("sidecar", ".txt");
-                command.add("--sidecar");
-                command.add(sidecarTextPath.toString());
-            }
+                    // Check for existing text
+                    try (PDDocument tempDoc = new PDDocument()) {
+                        tempDoc.addPage(page);
+                        PDFTextStripper stripper = new PDFTextStripper();
+                        hasText = !stripper.getText(tempDoc).trim().isEmpty();
+                    }

-            if (deskew != null && deskew) {
-                command.add("--deskew");
-            }
-            if (clean != null && clean) {
-                command.add("--clean");
-            }
-            if (cleanFinal != null && cleanFinal) {
-                command.add("--clean-final");
-            }
-            if (ocrType != null && !"".equals(ocrType)) {
-                if ("skip-text".equals(ocrType)) {
-                    command.add("--skip-text");
-                } else if ("force-ocr".equals(ocrType)) {
-                    command.add("--force-ocr");
-                } else if ("Normal".equals(ocrType)) {
+                    boolean shouldOcr =
+                            switch (ocrType) {
+                                case "skip-text" -> !hasText;
+                                case "force-ocr" -> true;
+                                default -> true;
+                            };

-                }
-            }
+                    Path pageOutputPath =
+                            tempOutputDir.resolve(String.format("page_%d.pdf", pageNum));

-            command.addAll(
-                    Arrays.asList(
-                            "--language",
-                            languageOption,
-                            tempInputFile.toString(),
-                            tempOutputFile.toString()));
+                    if (shouldOcr) {
+                        // Convert page to image
+                        BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
+                        Path imagePath =
+                                tempImagesDir.resolve(String.format("page_%d.png", pageNum));
+                        ImageIO.write(image, "png", imagePath.toFile());

-            // Run CLI command
-            ProcessExecutorResult result =
-                    ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
-                            .runCommandWithOutputHandling(command);
-            if (result.getRc() != 0
-                    && result.getMessages().contains("multiprocessing/synchronize.py")
-                    && result.getMessages()
-                            .contains("OSError: [Errno 38] Function not implemented")) {
-                command.add("--jobs");
-                command.add("1");
-                result =
-                        ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
-                                .runCommandWithOutputHandling(command);
-            }
+                        // Build OCR command
+                        List<String> command = new ArrayList<>();
+                        command.add("tesseract");
+                        command.add(imagePath.toString());
+                        command.add(
+                                tempOutputDir
+                                        .resolve(String.format("page_%d", pageNum))
+                                        .toString());
+                        command.add("-l");
+                        command.add(String.join("+", languages));
+                        command.add("pdf"); // Always output PDF

-            // Remove images from the OCR processed PDF if the flag is set to true
-            if (removeImagesAfter != null && removeImagesAfter) {
-                Path tempPdfWithoutImages = Files.createTempFile("output_", "_no_images.pdf");
+                        ProcessBuilder pb = new ProcessBuilder(command);
+                        Process process = pb.start();

-                List<String> gsCommand =
-                        Arrays.asList(
-                                "gs",
-                                "-sDEVICE=pdfwrite",
-                                "-dFILTERIMAGE",
-                                "-o",
-                                tempPdfWithoutImages.toString(),
-                                tempOutputFile.toString());
+                        // Capture any error output
+                        try (BufferedReader reader =
+                                new BufferedReader(
+                                        new InputStreamReader(process.getErrorStream()))) {
+                            String line;
+                            while ((line = BoundedLineReader.readLine(reader, 5_000_000)) != null) {
+                                log.debug("Tesseract: {}", line);
+                            }
+                        }

-                ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
-                        .runCommandWithOutputHandling(gsCommand);
-                tempOutputFile = tempPdfWithoutImages;
-            }
-            // Read the OCR processed PDF file
-            byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.toFile());
+                        int exitCode = process.waitFor();
+                        if (exitCode != 0) {
+                            throw new RuntimeException(
+                                    "Tesseract failed with exit code: " + exitCode);
+                        }

-            // Return the OCR processed PDF as a response
-            String outputFilename =
-                    Filenames.toSimpleFileName(inputFile.getOriginalFilename())
-                                    .replaceFirst("[.][^.]+$", "")
-                            + "_OCR.pdf";
-
-            if (sidecar != null && sidecar) {
-                // Create a zip file containing both the PDF and the text file
-                String outputZipFilename =
-                        Filenames.toSimpleFileName(inputFile.getOriginalFilename())
-                                        .replaceFirst("[.][^.]+$", "")
-                                + "_OCR.zip";
-                Path tempZipFile = Files.createTempFile("output_", ".zip");
-
-                try (ZipOutputStream zipOut =
-                        new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
-                    // Add PDF file to the zip
-                    ZipEntry pdfEntry = new ZipEntry(outputFilename);
-                    zipOut.putNextEntry(pdfEntry);
-                    try (ByteArrayInputStream pdfInputStream = new ByteArrayInputStream(pdfBytes)) {
-                        byte[] buffer = new byte[1024];
-                        int length;
-                        while ((length = pdfInputStream.read(buffer)) != -1) {
-                            zipOut.write(buffer, 0, length);
+                        // Add OCR'd PDF to merger
+                        merger.addSource(pageOutputPath.toFile());
+                    } else {
+                        // Save original page without OCR
+                        try (PDDocument pageDoc = new PDDocument()) {
+                            pageDoc.addPage(page);
+                            pageDoc.save(pageOutputPath.toFile());
+                            merger.addSource(pageOutputPath.toFile());
                        }
                    }
-                    zipOut.closeEntry();
-
-                    // Add text file to the zip
-                    ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
-                    zipOut.putNextEntry(txtEntry);
-                    Files.copy(sidecarTextPath, zipOut);
-                    zipOut.closeEntry();
                }
-
-                byte[] zipBytes = Files.readAllBytes(tempZipFile);
-
-                // Clean up the temporary zip file
-                Files.deleteIfExists(tempZipFile);
-                Files.deleteIfExists(tempOutputFile);
-                Files.deleteIfExists(sidecarTextPath);
-
-                // Return the zip file containing both the PDF and the text file
-                return WebResponseUtils.bytesToWebResponse(
-                        zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
-            } else {
-                // Return the OCR processed PDF as a response
-                Files.deleteIfExists(tempOutputFile);
-                return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
            }
+
+            // Merge all pages into final PDF
+            merger.mergeDocuments(null);
+
+            // Read the final PDF file
+            byte[] pdfContent = Files.readAllBytes(finalOutputFile);
+            String outputFilename =
+                    Filenames.toSimpleFileName(inputFile.getOriginalFilename()).replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
+
+            return ResponseEntity.ok()
+                    .header(
+                            "Content-Disposition",
+                            "attachment; filename=\"" + outputFilename + "\"")
+                    .contentType(MediaType.APPLICATION_PDF)
+                    .body(pdfContent);
+
        } finally {
-            // Clean up the temporary files
-            Files.deleteIfExists(tempOutputFile);
-            // Comment out as transferTo makes multipart handle cleanup
-            // Files.deleteIfExists(tempInputFile);
-            if (sidecarTextPath != null) {
-                Files.deleteIfExists(sidecarTextPath);
+            // Clean up temporary files
+            deleteDirectory(tempDir);
+        }
+    }
+
+    /**
+     * Writes the contents of {@code file} into {@code zipOut} as a single entry.
+     *
+     * <p>If the file does not exist, a warning is logged and nothing is written to the archive.
+     *
+     * @param file the file whose bytes are copied into the archive
+     * @param filename the name given to the new zip entry
+     * @param zipOut the open zip stream to append to; it is not closed by this method
+     * @throws IOException if the file cannot be read or the zip stream cannot be written
+     */
+    private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
+            throws IOException {
+        // A missing input is skipped with a warning instead of failing the whole archive.
+        final boolean present = file.exists();
+        if (!present) {
+            log.warn("File {} does not exist, skipping", file);
+            return;
+        }
+
+        try (FileInputStream source = new FileInputStream(file)) {
+            // The entry is only opened once the source file has been opened successfully.
+            zipOut.putNextEntry(new ZipEntry(filename));
+
+            final byte[] chunk = new byte[1024];
+            for (int count = source.read(chunk); count >= 0; count = source.read(chunk)) {
+                zipOut.write(chunk, 0, count);
             }
+
+            zipOut.closeEntry();
+        }
+    }
+
+    /**
+     * Recursively deletes {@code directory} and everything beneath it on a best-effort basis.
+     *
+     * <p>Failures are logged and never propagated, so this method is safe to call from a
+     * {@code finally} block without masking the result or exception of the guarded code.
+     *
+     * @param directory root of the tree to delete; a {@code null} value is ignored
+     */
+    private void deleteDirectory(Path directory) {
+        if (directory == null) {
+            // Nothing was created, so there is nothing to clean up.
+            return;
+        }
+        // Files.walk keeps directory handles open until the stream is closed, so the stream
+        // is scoped with try-with-resources. Types are fully qualified to avoid new imports.
+        try (java.util.stream.Stream<Path> paths = Files.walk(directory)) {
+            // Reverse ordering deletes the entries of a directory before the directory itself.
+            paths.sorted(Comparator.reverseOrder())
+                    .forEach(
+                            path -> {
+                                try {
+                                    Files.delete(path);
+                                } catch (IOException e) {
+                                    log.error("Error deleting {}: {}", path, e.getMessage());
+                                }
+                            });
+        } catch (IOException | java.io.UncheckedIOException e) {
+            // Traversal errors raised lazily by the stream arrive as UncheckedIOException.
+            log.error("Error walking directory {}: {}", directory, e.getMessage());
         }
     }
 }
--- a/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/misc/RepairController.java
@@ -44,7 +44,7 @@ public class RepairController {
    @Operation(
            summary = "Repair a PDF file",
            description =
-                    "This endpoint repairs a given PDF file by running Ghostscript command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response. Input:PDF Output:PDF Type:SISO")
+                    "This endpoint repairs a given PDF file by running qpdf command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response. Input:PDF Output:PDF Type:SISO")
    public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile request)
            throws IOException, InterruptedException {
        MultipartFile inputFile = request.getFileInput();
@@ -56,14 +56,15 @@ public class RepairController {
        try {

            List<String> command = new ArrayList<>();
-            command.add("gs");
-            command.add("-o");
-            command.add(tempOutputFile.toString());
-            command.add("-sDEVICE=pdfwrite");
+            command.add("qpdf");
+            command.add("--replace-input"); // Automatically fixes problems it can
+            command.add("--qdf"); // Linearizes and normalizes PDF structure
+            command.add("--object-streams=disable"); // Can help with some corruptions
            command.add(tempInputFile.toString());
+            command.add(tempOutputFile.toString());

            ProcessExecutorResult returnCode =
-                    ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
+                    ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
                            .runCommandWithOutputHandling(command);

            // Read the optimized PDF file