Removal of Ghostscript to use qpdf and tesseract directly (#2338)
* navbar fix multi tool and compress location * release notes and ghostscript removal * cleanups * formatting * update docs * more * more * docs * release bump * Hardening suggestions for Stirling-PDF / ghostscript (#2339) * Protect `readLine()` against DoS * Sanitized user-provided file names in HTTP multipart uploads --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com> --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
This commit is contained in:
@@ -10,7 +10,6 @@ import java.util.List;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
@@ -53,6 +52,54 @@ public class CompressController {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
private void compressImagesInPDF(Path pdfFile, double initialScaleFactor) throws Exception {
|
||||
byte[] fileBytes = Files.readAllBytes(pdfFile);
|
||||
try (PDDocument doc = Loader.loadPDF(fileBytes)) {
|
||||
double scaleFactor = initialScaleFactor;
|
||||
|
||||
for (PDPage page : doc.getPages()) {
|
||||
PDResources res = page.getResources();
|
||||
if (res != null && res.getXObjectNames() != null) {
|
||||
for (COSName name : res.getXObjectNames()) {
|
||||
PDXObject xobj = res.getXObject(name);
|
||||
if (xobj instanceof PDImageXObject) {
|
||||
PDImageXObject image = (PDImageXObject) xobj;
|
||||
BufferedImage bufferedImage = image.getImage();
|
||||
|
||||
int newWidth = (int) (bufferedImage.getWidth() * scaleFactor);
|
||||
int newHeight = (int) (bufferedImage.getHeight() * scaleFactor);
|
||||
|
||||
if (newWidth == 0 || newHeight == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Image scaledImage =
|
||||
bufferedImage.getScaledInstance(
|
||||
newWidth, newHeight, Image.SCALE_SMOOTH);
|
||||
|
||||
BufferedImage scaledBufferedImage =
|
||||
new BufferedImage(
|
||||
newWidth, newHeight, BufferedImage.TYPE_INT_RGB);
|
||||
scaledBufferedImage.getGraphics().drawImage(scaledImage, 0, 0, null);
|
||||
|
||||
ByteArrayOutputStream compressedImageStream =
|
||||
new ByteArrayOutputStream();
|
||||
ImageIO.write(scaledBufferedImage, "jpeg", compressedImageStream);
|
||||
byte[] imageBytes = compressedImageStream.toByteArray();
|
||||
compressedImageStream.close();
|
||||
|
||||
PDImageXObject compressedImage =
|
||||
PDImageXObject.createFromByteArray(
|
||||
doc, imageBytes, image.getCOSObject().toString());
|
||||
res.put(name, compressedImage);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
doc.save(pdfFile.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/compress-pdf")
|
||||
@Operation(
|
||||
summary = "Optimize PDF file",
|
||||
@@ -75,209 +122,92 @@ public class CompressController {
|
||||
autoMode = true;
|
||||
}
|
||||
|
||||
// Save the uploaded file to a temporary location
|
||||
Path tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
inputFile.transferTo(tempInputFile.toFile());
|
||||
|
||||
long inputFileSize = Files.size(tempInputFile);
|
||||
|
||||
// Prepare the output file path
|
||||
|
||||
Path tempOutputFile = null;
|
||||
byte[] pdfBytes;
|
||||
try {
|
||||
tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||
// Determine initial optimization level based on expected size reduction, only if in
|
||||
// autoMode
|
||||
|
||||
if (autoMode) {
|
||||
double sizeReductionRatio = expectedOutputSize / (double) inputFileSize;
|
||||
if (sizeReductionRatio > 0.7) {
|
||||
optimizeLevel = 1;
|
||||
} else if (sizeReductionRatio > 0.5) {
|
||||
optimizeLevel = 2;
|
||||
} else if (sizeReductionRatio > 0.35) {
|
||||
optimizeLevel = 3;
|
||||
} else {
|
||||
optimizeLevel = 3;
|
||||
}
|
||||
optimizeLevel = determineOptimizeLevel(sizeReductionRatio);
|
||||
}
|
||||
|
||||
boolean sizeMet = false;
|
||||
while (!sizeMet && optimizeLevel <= 4) {
|
||||
// Prepare the Ghostscript command
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("gs");
|
||||
command.add("-sDEVICE=pdfwrite");
|
||||
command.add("-dCompatibilityLevel=1.5");
|
||||
while (!sizeMet && optimizeLevel <= 9) {
|
||||
|
||||
switch (optimizeLevel) {
|
||||
case 1:
|
||||
command.add("-dPDFSETTINGS=/prepress");
|
||||
break;
|
||||
case 2:
|
||||
command.add("-dPDFSETTINGS=/printer");
|
||||
break;
|
||||
case 3:
|
||||
command.add("-dPDFSETTINGS=/ebook");
|
||||
break;
|
||||
case 4:
|
||||
command.add("-dPDFSETTINGS=/screen");
|
||||
break;
|
||||
default:
|
||||
command.add("-dPDFSETTINGS=/default");
|
||||
// Apply additional image compression for levels 6-9
|
||||
if (optimizeLevel >= 6) {
|
||||
// Calculate scale factor based on optimization level
|
||||
double scaleFactor =
|
||||
switch (optimizeLevel) {
|
||||
case 6 -> 0.9; // 90% of original size
|
||||
case 7 -> 0.8; // 80% of original size
|
||||
case 8 -> 0.65; // 70% of original size
|
||||
case 9 -> 0.5; // 60% of original size
|
||||
default -> 1.0;
|
||||
};
|
||||
compressImagesInPDF(tempInputFile, scaleFactor);
|
||||
}
|
||||
|
||||
command.add("-dNOPAUSE");
|
||||
command.add("-dQUIET");
|
||||
command.add("-dBATCH");
|
||||
command.add("-sOutputFile=" + tempOutputFile.toString());
|
||||
// Run QPDF optimization
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("qpdf");
|
||||
if (request.getNormalize()) {
|
||||
command.add("--normalize-content=y");
|
||||
}
|
||||
if (request.getLinearize()) {
|
||||
command.add("--linearize");
|
||||
}
|
||||
command.add("--optimize-images");
|
||||
command.add("--recompress-flate");
|
||||
command.add("--compression-level=" + optimizeLevel);
|
||||
command.add("--compress-streams=y");
|
||||
command.add("--object-streams=generate");
|
||||
command.add(tempInputFile.toString());
|
||||
command.add(tempOutputFile.toString());
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
.runCommandWithOutputHandling(command);
|
||||
ProcessExecutorResult returnCode = null;
|
||||
try {
|
||||
returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
} catch (Exception e) {
|
||||
if (returnCode != null && returnCode.getRc() != 3) {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if file size is within expected size or not auto mode so instantly finish
|
||||
// Check if file size is within expected size or not auto mode
|
||||
long outputFileSize = Files.size(tempOutputFile);
|
||||
if (outputFileSize <= expectedOutputSize || !autoMode) {
|
||||
sizeMet = true;
|
||||
} else {
|
||||
// Increase optimization level for next iteration
|
||||
optimizeLevel++;
|
||||
if (autoMode && optimizeLevel > 4) {
|
||||
logger.info("Skipping level 5 due to bad results in auto mode");
|
||||
optimizeLevel =
|
||||
incrementOptimizeLevel(
|
||||
optimizeLevel, outputFileSize, expectedOutputSize);
|
||||
if (autoMode && optimizeLevel > 9) {
|
||||
logger.info("Maximum compression level reached in auto mode");
|
||||
sizeMet = true;
|
||||
} else {
|
||||
logger.info(
|
||||
"Increasing ghostscript optimisation level to " + optimizeLevel);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (expectedOutputSize != null && autoMode) {
|
||||
long outputFileSize = Files.size(tempOutputFile);
|
||||
byte[] fileBytes = Files.readAllBytes(tempOutputFile);
|
||||
if (outputFileSize > expectedOutputSize) {
|
||||
try (PDDocument doc = Loader.loadPDF(fileBytes)) {
|
||||
long previousFileSize = 0;
|
||||
double scaleFactorConst = 0.9f;
|
||||
double scaleFactor = 0.9f;
|
||||
while (true) {
|
||||
for (PDPage page : doc.getPages()) {
|
||||
PDResources res = page.getResources();
|
||||
if (res != null && res.getXObjectNames() != null) {
|
||||
for (COSName name : res.getXObjectNames()) {
|
||||
PDXObject xobj = res.getXObject(name);
|
||||
if (xobj != null && xobj instanceof PDImageXObject) {
|
||||
PDImageXObject image = (PDImageXObject) xobj;
|
||||
|
||||
// Get the image in BufferedImage format
|
||||
BufferedImage bufferedImage = image.getImage();
|
||||
|
||||
// Calculate the new dimensions
|
||||
int newWidth =
|
||||
(int)
|
||||
(bufferedImage.getWidth()
|
||||
* scaleFactorConst);
|
||||
int newHeight =
|
||||
(int)
|
||||
(bufferedImage.getHeight()
|
||||
* scaleFactorConst);
|
||||
|
||||
// If the new dimensions are zero, skip this iteration
|
||||
if (newWidth == 0 || newHeight == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Otherwise, proceed with the scaling
|
||||
Image scaledImage =
|
||||
bufferedImage.getScaledInstance(
|
||||
newWidth,
|
||||
newHeight,
|
||||
Image.SCALE_SMOOTH);
|
||||
|
||||
// Convert the scaled image back to a BufferedImage
|
||||
BufferedImage scaledBufferedImage =
|
||||
new BufferedImage(
|
||||
newWidth,
|
||||
newHeight,
|
||||
BufferedImage.TYPE_INT_RGB);
|
||||
scaledBufferedImage
|
||||
.getGraphics()
|
||||
.drawImage(scaledImage, 0, 0, null);
|
||||
|
||||
// Compress the scaled image
|
||||
ByteArrayOutputStream compressedImageStream =
|
||||
new ByteArrayOutputStream();
|
||||
ImageIO.write(
|
||||
scaledBufferedImage,
|
||||
"jpeg",
|
||||
compressedImageStream);
|
||||
byte[] imageBytes = compressedImageStream.toByteArray();
|
||||
compressedImageStream.close();
|
||||
|
||||
PDImageXObject compressedImage =
|
||||
PDImageXObject.createFromByteArray(
|
||||
doc,
|
||||
imageBytes,
|
||||
image.getCOSObject().toString());
|
||||
|
||||
// Replace the image in the resources with the
|
||||
// compressed
|
||||
// version
|
||||
res.put(name, compressedImage);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// save the document to tempOutputFile again
|
||||
doc.save(tempOutputFile.toString());
|
||||
|
||||
long currentSize = Files.size(tempOutputFile);
|
||||
// Check if the overall PDF size is still larger than expectedOutputSize
|
||||
if (currentSize > expectedOutputSize) {
|
||||
// Log the current file size and scaleFactor
|
||||
|
||||
logger.info(
|
||||
"Current file size: "
|
||||
+ FileUtils.byteCountToDisplaySize(currentSize));
|
||||
logger.info("Current scale factor: " + scaleFactor);
|
||||
|
||||
// The file is still too large, reduce scaleFactor and try again
|
||||
scaleFactor *= 0.9f; // reduce scaleFactor by 10%
|
||||
// Avoid scaleFactor being too small, causing the image to shrink to
|
||||
// 0
|
||||
if (scaleFactor < 0.2f || previousFileSize == currentSize) {
|
||||
throw new RuntimeException(
|
||||
"Could not reach the desired size without excessively degrading image quality, lowest size recommended is "
|
||||
+ FileUtils.byteCountToDisplaySize(currentSize)
|
||||
+ ", "
|
||||
+ currentSize
|
||||
+ " bytes");
|
||||
}
|
||||
previousFileSize = currentSize;
|
||||
} else {
|
||||
// The file is small enough, break the loop
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Read the optimized PDF file
|
||||
pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||
Path finalFile = tempOutputFile;
|
||||
|
||||
// Check if optimized file is larger than the original
|
||||
if (pdfBytes.length > inputFileSize) {
|
||||
// Log the occurrence
|
||||
logger.warn(
|
||||
"Optimized file is larger than the original. Returning the original file instead.");
|
||||
|
||||
// Read the original file again
|
||||
finalFile = tempInputFile;
|
||||
}
|
||||
// Return the optimized PDF as a response
|
||||
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
@@ -286,10 +216,31 @@ public class CompressController {
|
||||
pdfDocumentFactory.load(finalFile.toFile()), outputFilename);
|
||||
|
||||
} finally {
|
||||
// Clean up the temporary files
|
||||
// deleted by multipart file handler deu to transferTo?
|
||||
// Files.deleteIfExists(tempInputFile);
|
||||
Files.deleteIfExists(tempOutputFile);
|
||||
}
|
||||
}
|
||||
|
||||
private int determineOptimizeLevel(double sizeReductionRatio) {
|
||||
if (sizeReductionRatio > 0.9) return 1;
|
||||
if (sizeReductionRatio > 0.8) return 2;
|
||||
if (sizeReductionRatio > 0.7) return 3;
|
||||
if (sizeReductionRatio > 0.6) return 4;
|
||||
if (sizeReductionRatio > 0.5) return 5;
|
||||
if (sizeReductionRatio > 0.4) return 6;
|
||||
if (sizeReductionRatio > 0.3) return 7;
|
||||
if (sizeReductionRatio > 0.2) return 8;
|
||||
return 9;
|
||||
}
|
||||
|
||||
private int incrementOptimizeLevel(int currentLevel, long currentSize, long targetSize) {
|
||||
double currentRatio = currentSize / (double) targetSize;
|
||||
logger.info("Current compression ratio: {}", String.format("%.2f", currentRatio));
|
||||
|
||||
if (currentRatio > 2.0) {
|
||||
return Math.min(9, currentLevel + 3);
|
||||
} else if (currentRatio > 1.5) {
|
||||
return Math.min(9, currentLevel + 2);
|
||||
}
|
||||
return Math.min(9, currentLevel + 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,7 +58,7 @@ public class FakeScanControllerWIP {
|
||||
@Operation(
|
||||
summary = "Repair a PDF file",
|
||||
description =
|
||||
"This endpoint repairs a given PDF file by running Ghostscript command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response.")
|
||||
"This endpoint repairs a given PDF file by running qpdf command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response.")
|
||||
public ResponseEntity<byte[]> fakeScan(@ModelAttribute PDFFile request) throws IOException {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
|
||||
|
||||
@@ -1,19 +1,31 @@
|
||||
package stirling.software.SPDF.controller.api.misc;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import io.github.pixee.security.BoundedLineReader;
|
||||
import io.github.pixee.security.Filenames;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
@@ -23,24 +35,29 @@ import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import io.github.pixee.security.Filenames;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import stirling.software.SPDF.model.ApplicationProperties;
|
||||
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/misc")
|
||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||
@Slf4j
|
||||
public class OCRController {
|
||||
|
||||
@Autowired ApplicationProperties applicationProperties;
|
||||
@Autowired private ApplicationProperties applicationProperties;
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@Autowired
|
||||
public OCRController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
/** Gets the list of available Tesseract languages from the tessdata directory */
|
||||
public List<String> getAvailableTesseractLanguages() {
|
||||
String tessdataDir = applicationProperties.getSystem().getTessdataDir();
|
||||
File[] files = new File(tessdataDir).listFiles();
|
||||
@@ -54,196 +71,161 @@ public class OCRController {
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@Autowired
|
||||
public OCRController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
|
||||
@Operation(
|
||||
summary = "Process a PDF file with OCR",
|
||||
description =
|
||||
"This endpoint processes a PDF file using OCR (Optical Character Recognition). Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. Input:PDF Output:PDF Type:SI-Conditional")
|
||||
public ResponseEntity<byte[]> processPdfWithOCR(
|
||||
@ModelAttribute ProcessPdfWithOcrRequest request)
|
||||
throws IOException, InterruptedException {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
List<String> selectedLanguages = request.getLanguages();
|
||||
Boolean sidecar = request.isSidecar();
|
||||
Boolean deskew = request.isDeskew();
|
||||
Boolean clean = request.isClean();
|
||||
Boolean cleanFinal = request.isCleanFinal();
|
||||
List<String> languages = request.getLanguages();
|
||||
String ocrType = request.getOcrType();
|
||||
String ocrRenderType = request.getOcrRenderType();
|
||||
Boolean removeImagesAfter = request.isRemoveImagesAfter();
|
||||
// --output-type pdfa
|
||||
if (selectedLanguages == null || selectedLanguages.isEmpty()) {
|
||||
throw new IOException("Please select at least one language.");
|
||||
}
|
||||
|
||||
if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) {
|
||||
throw new IOException("ocrRenderType wrong");
|
||||
}
|
||||
Path tempDir = Files.createTempDirectory("ocr_process");
|
||||
Path tempInputFile = tempDir.resolve("input.pdf");
|
||||
Path tempOutputDir = tempDir.resolve("output");
|
||||
Path tempImagesDir = tempDir.resolve("images");
|
||||
Path finalOutputFile = tempDir.resolve("final_output.pdf");
|
||||
|
||||
// Get available Tesseract languages
|
||||
List<String> availableLanguages = getAvailableTesseractLanguages();
|
||||
|
||||
// Validate selected languages
|
||||
selectedLanguages =
|
||||
selectedLanguages.stream().filter(availableLanguages::contains).toList();
|
||||
|
||||
if (selectedLanguages.isEmpty()) {
|
||||
throw new IOException("None of the selected languages are valid.");
|
||||
}
|
||||
// Save the uploaded file to a temporary location
|
||||
Path tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||
Path sidecarTextPath = null;
|
||||
Files.createDirectories(tempOutputDir);
|
||||
Files.createDirectories(tempImagesDir);
|
||||
|
||||
try {
|
||||
// Save input file
|
||||
inputFile.transferTo(tempInputFile.toFile());
|
||||
PDFMergerUtility merger = new PDFMergerUtility();
|
||||
merger.setDestinationFileName(finalOutputFile.toString());
|
||||
|
||||
// Run OCR Command
|
||||
String languageOption = String.join("+", selectedLanguages);
|
||||
try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
int pageCount = document.getNumberOfPages();
|
||||
|
||||
List<String> command =
|
||||
new ArrayList<>(
|
||||
Arrays.asList(
|
||||
"ocrmypdf",
|
||||
"--verbose",
|
||||
"2",
|
||||
"--output-type",
|
||||
"pdf",
|
||||
"--pdf-renderer",
|
||||
ocrRenderType));
|
||||
for (int pageNum = 0; pageNum < pageCount; pageNum++) {
|
||||
PDPage page = document.getPage(pageNum);
|
||||
boolean hasText = false;
|
||||
|
||||
if (sidecar != null && sidecar) {
|
||||
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
|
||||
command.add("--sidecar");
|
||||
command.add(sidecarTextPath.toString());
|
||||
}
|
||||
// Check for existing text
|
||||
try (PDDocument tempDoc = new PDDocument()) {
|
||||
tempDoc.addPage(page);
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
hasText = !stripper.getText(tempDoc).trim().isEmpty();
|
||||
}
|
||||
|
||||
if (deskew != null && deskew) {
|
||||
command.add("--deskew");
|
||||
}
|
||||
if (clean != null && clean) {
|
||||
command.add("--clean");
|
||||
}
|
||||
if (cleanFinal != null && cleanFinal) {
|
||||
command.add("--clean-final");
|
||||
}
|
||||
if (ocrType != null && !"".equals(ocrType)) {
|
||||
if ("skip-text".equals(ocrType)) {
|
||||
command.add("--skip-text");
|
||||
} else if ("force-ocr".equals(ocrType)) {
|
||||
command.add("--force-ocr");
|
||||
} else if ("Normal".equals(ocrType)) {
|
||||
boolean shouldOcr =
|
||||
switch (ocrType) {
|
||||
case "skip-text" -> !hasText;
|
||||
case "force-ocr" -> true;
|
||||
default -> true;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
Path pageOutputPath =
|
||||
tempOutputDir.resolve(String.format("page_%d.pdf", pageNum));
|
||||
|
||||
command.addAll(
|
||||
Arrays.asList(
|
||||
"--language",
|
||||
languageOption,
|
||||
tempInputFile.toString(),
|
||||
tempOutputFile.toString()));
|
||||
if (shouldOcr) {
|
||||
// Convert page to image
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
|
||||
Path imagePath =
|
||||
tempImagesDir.resolve(String.format("page_%d.png", pageNum));
|
||||
ImageIO.write(image, "png", imagePath.toFile());
|
||||
|
||||
// Run CLI command
|
||||
ProcessExecutorResult result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
if (result.getRc() != 0
|
||||
&& result.getMessages().contains("multiprocessing/synchronize.py")
|
||||
&& result.getMessages()
|
||||
.contains("OSError: [Errno 38] Function not implemented")) {
|
||||
command.add("--jobs");
|
||||
command.add("1");
|
||||
result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
}
|
||||
// Build OCR command
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("tesseract");
|
||||
command.add(imagePath.toString());
|
||||
command.add(
|
||||
tempOutputDir
|
||||
.resolve(String.format("page_%d", pageNum))
|
||||
.toString());
|
||||
command.add("-l");
|
||||
command.add(String.join("+", languages));
|
||||
command.add("pdf"); // Always output PDF
|
||||
|
||||
// Remove images from the OCR processed PDF if the flag is set to true
|
||||
if (removeImagesAfter != null && removeImagesAfter) {
|
||||
Path tempPdfWithoutImages = Files.createTempFile("output_", "_no_images.pdf");
|
||||
ProcessBuilder pb = new ProcessBuilder(command);
|
||||
Process process = pb.start();
|
||||
|
||||
List<String> gsCommand =
|
||||
Arrays.asList(
|
||||
"gs",
|
||||
"-sDEVICE=pdfwrite",
|
||||
"-dFILTERIMAGE",
|
||||
"-o",
|
||||
tempPdfWithoutImages.toString(),
|
||||
tempOutputFile.toString());
|
||||
// Capture any error output
|
||||
try (BufferedReader reader =
|
||||
new BufferedReader(
|
||||
new InputStreamReader(process.getErrorStream()))) {
|
||||
String line;
|
||||
while ((line = BoundedLineReader.readLine(reader, 5_000_000)) != null) {
|
||||
log.debug("Tesseract: {}", line);
|
||||
}
|
||||
}
|
||||
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
.runCommandWithOutputHandling(gsCommand);
|
||||
tempOutputFile = tempPdfWithoutImages;
|
||||
}
|
||||
// Read the OCR processed PDF file
|
||||
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.toFile());
|
||||
int exitCode = process.waitFor();
|
||||
if (exitCode != 0) {
|
||||
throw new RuntimeException(
|
||||
"Tesseract failed with exit code: " + exitCode);
|
||||
}
|
||||
|
||||
// Return the OCR processed PDF as a response
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_OCR.pdf";
|
||||
|
||||
if (sidecar != null && sidecar) {
|
||||
// Create a zip file containing both the PDF and the text file
|
||||
String outputZipFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_OCR.zip";
|
||||
Path tempZipFile = Files.createTempFile("output_", ".zip");
|
||||
|
||||
try (ZipOutputStream zipOut =
|
||||
new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
|
||||
// Add PDF file to the zip
|
||||
ZipEntry pdfEntry = new ZipEntry(outputFilename);
|
||||
zipOut.putNextEntry(pdfEntry);
|
||||
try (ByteArrayInputStream pdfInputStream = new ByteArrayInputStream(pdfBytes)) {
|
||||
byte[] buffer = new byte[1024];
|
||||
int length;
|
||||
while ((length = pdfInputStream.read(buffer)) != -1) {
|
||||
zipOut.write(buffer, 0, length);
|
||||
// Add OCR'd PDF to merger
|
||||
merger.addSource(pageOutputPath.toFile());
|
||||
} else {
|
||||
// Save original page without OCR
|
||||
try (PDDocument pageDoc = new PDDocument()) {
|
||||
pageDoc.addPage(page);
|
||||
pageDoc.save(pageOutputPath.toFile());
|
||||
merger.addSource(pageOutputPath.toFile());
|
||||
}
|
||||
}
|
||||
zipOut.closeEntry();
|
||||
|
||||
// Add text file to the zip
|
||||
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
|
||||
zipOut.putNextEntry(txtEntry);
|
||||
Files.copy(sidecarTextPath, zipOut);
|
||||
zipOut.closeEntry();
|
||||
}
|
||||
|
||||
byte[] zipBytes = Files.readAllBytes(tempZipFile);
|
||||
|
||||
// Clean up the temporary zip file
|
||||
Files.deleteIfExists(tempZipFile);
|
||||
Files.deleteIfExists(tempOutputFile);
|
||||
Files.deleteIfExists(sidecarTextPath);
|
||||
|
||||
// Return the zip file containing both the PDF and the text file
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
|
||||
} else {
|
||||
// Return the OCR processed PDF as a response
|
||||
Files.deleteIfExists(tempOutputFile);
|
||||
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
||||
}
|
||||
|
||||
// Merge all pages into final PDF
|
||||
merger.mergeDocuments(null);
|
||||
|
||||
// Read the final PDF file
|
||||
byte[] pdfContent = Files.readAllBytes(finalOutputFile);
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename()).replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
|
||||
|
||||
return ResponseEntity.ok()
|
||||
.header(
|
||||
"Content-Disposition",
|
||||
"attachment; filename=\"" + outputFilename + "\"")
|
||||
.contentType(MediaType.APPLICATION_PDF)
|
||||
.body(pdfContent);
|
||||
|
||||
} finally {
|
||||
// Clean up the temporary files
|
||||
Files.deleteIfExists(tempOutputFile);
|
||||
// Comment out as transferTo makes multipart handle cleanup
|
||||
// Files.deleteIfExists(tempInputFile);
|
||||
if (sidecarTextPath != null) {
|
||||
Files.deleteIfExists(sidecarTextPath);
|
||||
// Clean up temporary files
|
||||
deleteDirectory(tempDir);
|
||||
}
|
||||
}
|
||||
|
||||
private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
|
||||
throws IOException {
|
||||
if (!file.exists()) {
|
||||
log.warn("File {} does not exist, skipping", file);
|
||||
return;
|
||||
}
|
||||
|
||||
try (FileInputStream fis = new FileInputStream(file)) {
|
||||
ZipEntry zipEntry = new ZipEntry(filename);
|
||||
zipOut.putNextEntry(zipEntry);
|
||||
|
||||
byte[] buffer = new byte[1024];
|
||||
int length;
|
||||
while ((length = fis.read(buffer)) >= 0) {
|
||||
zipOut.write(buffer, 0, length);
|
||||
}
|
||||
|
||||
zipOut.closeEntry();
|
||||
}
|
||||
}
|
||||
|
||||
private void deleteDirectory(Path directory) {
|
||||
try {
|
||||
Files.walk(directory)
|
||||
.sorted(Comparator.reverseOrder())
|
||||
.forEach(
|
||||
path -> {
|
||||
try {
|
||||
Files.delete(path);
|
||||
} catch (IOException e) {
|
||||
log.error("Error deleting {}: {}", path, e.getMessage());
|
||||
}
|
||||
});
|
||||
} catch (IOException e) {
|
||||
log.error("Error walking directory {}: {}", directory, e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,7 +44,7 @@ public class RepairController {
|
||||
@Operation(
|
||||
summary = "Repair a PDF file",
|
||||
description =
|
||||
"This endpoint repairs a given PDF file by running Ghostscript command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response. Input:PDF Output:PDF Type:SISO")
|
||||
"This endpoint repairs a given PDF file by running qpdf command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response. Input:PDF Output:PDF Type:SISO")
|
||||
public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile request)
|
||||
throws IOException, InterruptedException {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
@@ -56,14 +56,15 @@ public class RepairController {
|
||||
try {
|
||||
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("gs");
|
||||
command.add("-o");
|
||||
command.add(tempOutputFile.toString());
|
||||
command.add("-sDEVICE=pdfwrite");
|
||||
command.add("qpdf");
|
||||
command.add("--replace-input"); // Automatically fixes problems it can
|
||||
command.add("--qdf"); // Linearizes and normalizes PDF structure
|
||||
command.add("--object-streams=disable"); // Can help with some corruptions
|
||||
command.add(tempInputFile.toString());
|
||||
command.add(tempOutputFile.toString());
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
// Read the optimized PDF file
|
||||
|
||||
Reference in New Issue
Block a user