Removal of Ghostscript to use qpdf and tesseract directly (#2338)
* navbar fix multi tool and compress location * release notes and ghostscript removal * cleanups * formatting * update docs * more * more * docs * release bump * Hardening suggestions for Stirling-PDF / ghostscript (#2339) * Protect `readLine()` against DoS * Sanitized user-provided file names in HTTP multipart uploads --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com> --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
This commit is contained in:
@@ -188,7 +188,7 @@ public class EndpointConfiguration {
|
||||
addEndpointToGroup("OpenCV", "extract-image-scans");
|
||||
|
||||
// LibreOffice
|
||||
addEndpointToGroup("LibreOffice", "repair");
|
||||
addEndpointToGroup("qpdf", "repair");
|
||||
addEndpointToGroup("LibreOffice", "file-to-pdf");
|
||||
addEndpointToGroup("LibreOffice", "pdf-to-word");
|
||||
addEndpointToGroup("LibreOffice", "pdf-to-presentation");
|
||||
@@ -199,10 +199,11 @@ public class EndpointConfiguration {
|
||||
// Unoconv
|
||||
addEndpointToGroup("Unoconv", "file-to-pdf");
|
||||
|
||||
// OCRmyPDF
|
||||
addEndpointToGroup("OCRmyPDF", "compress-pdf");
|
||||
addEndpointToGroup("OCRmyPDF", "pdf-to-pdfa");
|
||||
addEndpointToGroup("OCRmyPDF", "ocr-pdf");
|
||||
// qpdf
|
||||
addEndpointToGroup("qpdf", "compress-pdf");
|
||||
addEndpointToGroup("qpdf", "pdf-to-pdfa");
|
||||
|
||||
addEndpointToGroup("tesseract", "ocr-pdf");
|
||||
|
||||
// Java
|
||||
addEndpointToGroup("Java", "merge-pdfs");
|
||||
@@ -248,10 +249,10 @@ public class EndpointConfiguration {
|
||||
addEndpointToGroup("Javascript", "compare");
|
||||
addEndpointToGroup("Javascript", "adjust-contrast");
|
||||
|
||||
// Ghostscript dependent endpoints
|
||||
addEndpointToGroup("Ghostscript", "compress-pdf");
|
||||
addEndpointToGroup("Ghostscript", "pdf-to-pdfa");
|
||||
addEndpointToGroup("Ghostscript", "repair");
|
||||
// qpdf dependent endpoints
|
||||
addEndpointToGroup("qpdf", "compress-pdf");
|
||||
addEndpointToGroup("qpdf", "pdf-to-pdfa");
|
||||
addEndpointToGroup("qpdf", "repair");
|
||||
|
||||
// Weasyprint dependent endpoints
|
||||
addEndpointToGroup("Weasyprint", "html-to-pdf");
|
||||
|
||||
@@ -37,12 +37,13 @@ public class ExternalAppDepConfig {
|
||||
private final Map<String, List<String>> commandToGroupMapping =
|
||||
new HashMap<>() {
|
||||
{
|
||||
put("gs", List.of("Ghostscript"));
|
||||
put("soffice", List.of("LibreOffice"));
|
||||
put("ocrmypdf", List.of("OCRmyPDF"));
|
||||
put("weasyprint", List.of("Weasyprint"));
|
||||
put("pdftohtml", List.of("Pdftohtml"));
|
||||
put("unoconv", List.of("Unoconv"));
|
||||
put("qpdf", List.of("qpdf"));
|
||||
put("tesseract", List.of("tesseract"));
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
@@ -97,9 +98,9 @@ public class ExternalAppDepConfig {
|
||||
public void checkDependencies() {
|
||||
|
||||
// Check core dependencies
|
||||
checkDependencyAndDisableGroup("gs");
|
||||
checkDependencyAndDisableGroup("tesseract");
|
||||
checkDependencyAndDisableGroup("soffice");
|
||||
checkDependencyAndDisableGroup("ocrmypdf");
|
||||
checkDependencyAndDisableGroup("qpdf");
|
||||
checkDependencyAndDisableGroup("weasyprint");
|
||||
checkDependencyAndDisableGroup("pdftohtml");
|
||||
checkDependencyAndDisableGroup("unoconv");
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
package stirling.software.SPDF.controller.api.converters;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.http.MediaType;
|
||||
@@ -37,59 +38,90 @@ public class ConvertPDFToPDFA {
|
||||
@Operation(
|
||||
summary = "Convert a PDF to a PDF/A",
|
||||
description =
|
||||
"This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO")
|
||||
"This endpoint converts a PDF file to a PDF/A file using LibreOffice. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO")
|
||||
public ResponseEntity<byte[]> pdfToPdfA(@ModelAttribute PdfToPdfARequest request)
|
||||
throws Exception {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
String outputFormat = request.getOutputFormat();
|
||||
|
||||
// Convert MultipartFile to byte[]
|
||||
byte[] pdfBytes = inputFile.getBytes();
|
||||
|
||||
// Save the uploaded file to a temporary location
|
||||
Path tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
try (OutputStream outputStream = new FileOutputStream(tempInputFile.toFile())) {
|
||||
outputStream.write(pdfBytes);
|
||||
// Validate input file type
|
||||
if (!"application/pdf".equals(inputFile.getContentType())) {
|
||||
logger.error("Invalid input file type: {}", inputFile.getContentType());
|
||||
throw new IllegalArgumentException("Input file must be a PDF");
|
||||
}
|
||||
|
||||
// Prepare the output file path
|
||||
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||
|
||||
// Prepare the ghostscript command
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("gs");
|
||||
command.add("-dPDFA=" + ("pdfa".equals(outputFormat) ? "2" : "1"));
|
||||
command.add("-dNOPAUSE");
|
||||
command.add("-dBATCH");
|
||||
command.add("-sColorConversionStrategy=sRGB");
|
||||
command.add("-sDEVICE=pdfwrite");
|
||||
command.add("-dPDFACompatibilityPolicy=2");
|
||||
command.add("-o");
|
||||
command.add(tempOutputFile.toString());
|
||||
command.add(tempInputFile.toString());
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
if (returnCode.getRc() != 0) {
|
||||
logger.info(
|
||||
outputFormat + " conversion failed with return code: " + returnCode.getRc());
|
||||
// Get the original filename without extension
|
||||
String originalFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
|
||||
if (originalFileName == null || originalFileName.trim().isEmpty()) {
|
||||
originalFileName = "output.pdf";
|
||||
}
|
||||
String baseFileName =
|
||||
originalFileName.contains(".")
|
||||
? originalFileName.substring(0, originalFileName.lastIndexOf('.'))
|
||||
: originalFileName;
|
||||
|
||||
Path tempInputFile = null;
|
||||
Path tempOutputDir = null;
|
||||
byte[] fileBytes;
|
||||
|
||||
try {
|
||||
byte[] pdfBytesOutput = Files.readAllBytes(tempOutputFile);
|
||||
// Return the optimized PDF as a response
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_PDFA.pdf";
|
||||
// Save uploaded file to temp location
|
||||
tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
inputFile.transferTo(tempInputFile);
|
||||
|
||||
// Create temp output directory
|
||||
tempOutputDir = Files.createTempDirectory("output_");
|
||||
|
||||
// Determine PDF/A filter based on requested format
|
||||
String pdfFilter =
|
||||
"pdfa".equals(outputFormat)
|
||||
? "writer_pdf_Export:{'SelectPdfVersion':{'Value':'2'}}:writer_pdf_Export"
|
||||
: "writer_pdf_Export:{'SelectPdfVersion':{'Value':'1'}}:writer_pdf_Export";
|
||||
|
||||
// Prepare LibreOffice command
|
||||
List<String> command =
|
||||
new ArrayList<>(
|
||||
Arrays.asList(
|
||||
"soffice",
|
||||
"--headless",
|
||||
"--nologo",
|
||||
"--convert-to",
|
||||
"pdf:" + pdfFilter,
|
||||
"--outdir",
|
||||
tempOutputDir.toString(),
|
||||
tempInputFile.toString()));
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
if (returnCode.getRc() != 0) {
|
||||
logger.error("PDF/A conversion failed with return code: {}", returnCode.getRc());
|
||||
throw new RuntimeException("PDF/A conversion failed");
|
||||
}
|
||||
|
||||
// Get the output file
|
||||
File[] outputFiles = tempOutputDir.toFile().listFiles();
|
||||
if (outputFiles == null || outputFiles.length != 1) {
|
||||
throw new RuntimeException(
|
||||
"Expected exactly one output file but found "
|
||||
+ (outputFiles == null ? "none" : outputFiles.length));
|
||||
}
|
||||
|
||||
fileBytes = FileUtils.readFileToByteArray(outputFiles[0]);
|
||||
String outputFilename = baseFileName + "_PDFA.pdf";
|
||||
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
pdfBytesOutput, outputFilename, MediaType.APPLICATION_PDF);
|
||||
fileBytes, outputFilename, MediaType.APPLICATION_PDF);
|
||||
|
||||
} finally {
|
||||
// Clean up the temporary files
|
||||
Files.deleteIfExists(tempInputFile);
|
||||
Files.deleteIfExists(tempOutputFile);
|
||||
// Clean up temporary files
|
||||
if (tempInputFile != null) {
|
||||
Files.deleteIfExists(tempInputFile);
|
||||
}
|
||||
if (tempOutputDir != null) {
|
||||
FileUtils.deleteDirectory(tempOutputDir.toFile());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import stirling.software.SPDF.controller.api.CropController;
|
||||
|
||||
import stirling.software.SPDF.model.api.extract.PDFFilePage;
|
||||
import stirling.software.SPDF.pdf.FlexibleCSVWriter;
|
||||
import technology.tabula.ObjectExtractor;
|
||||
@@ -37,11 +37,15 @@ public class ExtractCSVController {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExtractCSVController.class);
|
||||
|
||||
@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
|
||||
@Operation(summary = "Extracts a CSV document from a PDF", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
|
||||
@Operation(
|
||||
summary = "Extracts a CSV document from a PDF",
|
||||
description =
|
||||
"This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
|
||||
public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception {
|
||||
StringWriter writer = new StringWriter();
|
||||
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
|
||||
CSVFormat format = CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
|
||||
CSVFormat format =
|
||||
CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
|
||||
Writer csvWriter = new FlexibleCSVWriter(format);
|
||||
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
|
||||
try (ObjectExtractor extractor = new ObjectExtractor(document)) {
|
||||
@@ -56,8 +60,8 @@ public class ExtractCSVController {
|
||||
ContentDisposition.builder("attachment")
|
||||
.filename(
|
||||
form.getFileInput()
|
||||
.getOriginalFilename()
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
.getOriginalFilename()
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_extracted.csv")
|
||||
.build());
|
||||
headers.setContentType(MediaType.parseMediaType("text/csv"));
|
||||
|
||||
@@ -10,7 +10,6 @@ import java.util.List;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
@@ -53,6 +52,54 @@ public class CompressController {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
private void compressImagesInPDF(Path pdfFile, double initialScaleFactor) throws Exception {
|
||||
byte[] fileBytes = Files.readAllBytes(pdfFile);
|
||||
try (PDDocument doc = Loader.loadPDF(fileBytes)) {
|
||||
double scaleFactor = initialScaleFactor;
|
||||
|
||||
for (PDPage page : doc.getPages()) {
|
||||
PDResources res = page.getResources();
|
||||
if (res != null && res.getXObjectNames() != null) {
|
||||
for (COSName name : res.getXObjectNames()) {
|
||||
PDXObject xobj = res.getXObject(name);
|
||||
if (xobj instanceof PDImageXObject) {
|
||||
PDImageXObject image = (PDImageXObject) xobj;
|
||||
BufferedImage bufferedImage = image.getImage();
|
||||
|
||||
int newWidth = (int) (bufferedImage.getWidth() * scaleFactor);
|
||||
int newHeight = (int) (bufferedImage.getHeight() * scaleFactor);
|
||||
|
||||
if (newWidth == 0 || newHeight == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Image scaledImage =
|
||||
bufferedImage.getScaledInstance(
|
||||
newWidth, newHeight, Image.SCALE_SMOOTH);
|
||||
|
||||
BufferedImage scaledBufferedImage =
|
||||
new BufferedImage(
|
||||
newWidth, newHeight, BufferedImage.TYPE_INT_RGB);
|
||||
scaledBufferedImage.getGraphics().drawImage(scaledImage, 0, 0, null);
|
||||
|
||||
ByteArrayOutputStream compressedImageStream =
|
||||
new ByteArrayOutputStream();
|
||||
ImageIO.write(scaledBufferedImage, "jpeg", compressedImageStream);
|
||||
byte[] imageBytes = compressedImageStream.toByteArray();
|
||||
compressedImageStream.close();
|
||||
|
||||
PDImageXObject compressedImage =
|
||||
PDImageXObject.createFromByteArray(
|
||||
doc, imageBytes, image.getCOSObject().toString());
|
||||
res.put(name, compressedImage);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
doc.save(pdfFile.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/compress-pdf")
|
||||
@Operation(
|
||||
summary = "Optimize PDF file",
|
||||
@@ -75,209 +122,92 @@ public class CompressController {
|
||||
autoMode = true;
|
||||
}
|
||||
|
||||
// Save the uploaded file to a temporary location
|
||||
Path tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
inputFile.transferTo(tempInputFile.toFile());
|
||||
|
||||
long inputFileSize = Files.size(tempInputFile);
|
||||
|
||||
// Prepare the output file path
|
||||
|
||||
Path tempOutputFile = null;
|
||||
byte[] pdfBytes;
|
||||
try {
|
||||
tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||
// Determine initial optimization level based on expected size reduction, only if in
|
||||
// autoMode
|
||||
|
||||
if (autoMode) {
|
||||
double sizeReductionRatio = expectedOutputSize / (double) inputFileSize;
|
||||
if (sizeReductionRatio > 0.7) {
|
||||
optimizeLevel = 1;
|
||||
} else if (sizeReductionRatio > 0.5) {
|
||||
optimizeLevel = 2;
|
||||
} else if (sizeReductionRatio > 0.35) {
|
||||
optimizeLevel = 3;
|
||||
} else {
|
||||
optimizeLevel = 3;
|
||||
}
|
||||
optimizeLevel = determineOptimizeLevel(sizeReductionRatio);
|
||||
}
|
||||
|
||||
boolean sizeMet = false;
|
||||
while (!sizeMet && optimizeLevel <= 4) {
|
||||
// Prepare the Ghostscript command
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("gs");
|
||||
command.add("-sDEVICE=pdfwrite");
|
||||
command.add("-dCompatibilityLevel=1.5");
|
||||
while (!sizeMet && optimizeLevel <= 9) {
|
||||
|
||||
switch (optimizeLevel) {
|
||||
case 1:
|
||||
command.add("-dPDFSETTINGS=/prepress");
|
||||
break;
|
||||
case 2:
|
||||
command.add("-dPDFSETTINGS=/printer");
|
||||
break;
|
||||
case 3:
|
||||
command.add("-dPDFSETTINGS=/ebook");
|
||||
break;
|
||||
case 4:
|
||||
command.add("-dPDFSETTINGS=/screen");
|
||||
break;
|
||||
default:
|
||||
command.add("-dPDFSETTINGS=/default");
|
||||
// Apply additional image compression for levels 6-9
|
||||
if (optimizeLevel >= 6) {
|
||||
// Calculate scale factor based on optimization level
|
||||
double scaleFactor =
|
||||
switch (optimizeLevel) {
|
||||
case 6 -> 0.9; // 90% of original size
|
||||
case 7 -> 0.8; // 80% of original size
|
||||
case 8 -> 0.65; // 70% of original size
|
||||
case 9 -> 0.5; // 60% of original size
|
||||
default -> 1.0;
|
||||
};
|
||||
compressImagesInPDF(tempInputFile, scaleFactor);
|
||||
}
|
||||
|
||||
command.add("-dNOPAUSE");
|
||||
command.add("-dQUIET");
|
||||
command.add("-dBATCH");
|
||||
command.add("-sOutputFile=" + tempOutputFile.toString());
|
||||
// Run QPDF optimization
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("qpdf");
|
||||
if (request.getNormalize()) {
|
||||
command.add("--normalize-content=y");
|
||||
}
|
||||
if (request.getLinearize()) {
|
||||
command.add("--linearize");
|
||||
}
|
||||
command.add("--optimize-images");
|
||||
command.add("--recompress-flate");
|
||||
command.add("--compression-level=" + optimizeLevel);
|
||||
command.add("--compress-streams=y");
|
||||
command.add("--object-streams=generate");
|
||||
command.add(tempInputFile.toString());
|
||||
command.add(tempOutputFile.toString());
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
.runCommandWithOutputHandling(command);
|
||||
ProcessExecutorResult returnCode = null;
|
||||
try {
|
||||
returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
} catch (Exception e) {
|
||||
if (returnCode != null && returnCode.getRc() != 3) {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if file size is within expected size or not auto mode so instantly finish
|
||||
// Check if file size is within expected size or not auto mode
|
||||
long outputFileSize = Files.size(tempOutputFile);
|
||||
if (outputFileSize <= expectedOutputSize || !autoMode) {
|
||||
sizeMet = true;
|
||||
} else {
|
||||
// Increase optimization level for next iteration
|
||||
optimizeLevel++;
|
||||
if (autoMode && optimizeLevel > 4) {
|
||||
logger.info("Skipping level 5 due to bad results in auto mode");
|
||||
optimizeLevel =
|
||||
incrementOptimizeLevel(
|
||||
optimizeLevel, outputFileSize, expectedOutputSize);
|
||||
if (autoMode && optimizeLevel > 9) {
|
||||
logger.info("Maximum compression level reached in auto mode");
|
||||
sizeMet = true;
|
||||
} else {
|
||||
logger.info(
|
||||
"Increasing ghostscript optimisation level to " + optimizeLevel);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (expectedOutputSize != null && autoMode) {
|
||||
long outputFileSize = Files.size(tempOutputFile);
|
||||
byte[] fileBytes = Files.readAllBytes(tempOutputFile);
|
||||
if (outputFileSize > expectedOutputSize) {
|
||||
try (PDDocument doc = Loader.loadPDF(fileBytes)) {
|
||||
long previousFileSize = 0;
|
||||
double scaleFactorConst = 0.9f;
|
||||
double scaleFactor = 0.9f;
|
||||
while (true) {
|
||||
for (PDPage page : doc.getPages()) {
|
||||
PDResources res = page.getResources();
|
||||
if (res != null && res.getXObjectNames() != null) {
|
||||
for (COSName name : res.getXObjectNames()) {
|
||||
PDXObject xobj = res.getXObject(name);
|
||||
if (xobj != null && xobj instanceof PDImageXObject) {
|
||||
PDImageXObject image = (PDImageXObject) xobj;
|
||||
|
||||
// Get the image in BufferedImage format
|
||||
BufferedImage bufferedImage = image.getImage();
|
||||
|
||||
// Calculate the new dimensions
|
||||
int newWidth =
|
||||
(int)
|
||||
(bufferedImage.getWidth()
|
||||
* scaleFactorConst);
|
||||
int newHeight =
|
||||
(int)
|
||||
(bufferedImage.getHeight()
|
||||
* scaleFactorConst);
|
||||
|
||||
// If the new dimensions are zero, skip this iteration
|
||||
if (newWidth == 0 || newHeight == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Otherwise, proceed with the scaling
|
||||
Image scaledImage =
|
||||
bufferedImage.getScaledInstance(
|
||||
newWidth,
|
||||
newHeight,
|
||||
Image.SCALE_SMOOTH);
|
||||
|
||||
// Convert the scaled image back to a BufferedImage
|
||||
BufferedImage scaledBufferedImage =
|
||||
new BufferedImage(
|
||||
newWidth,
|
||||
newHeight,
|
||||
BufferedImage.TYPE_INT_RGB);
|
||||
scaledBufferedImage
|
||||
.getGraphics()
|
||||
.drawImage(scaledImage, 0, 0, null);
|
||||
|
||||
// Compress the scaled image
|
||||
ByteArrayOutputStream compressedImageStream =
|
||||
new ByteArrayOutputStream();
|
||||
ImageIO.write(
|
||||
scaledBufferedImage,
|
||||
"jpeg",
|
||||
compressedImageStream);
|
||||
byte[] imageBytes = compressedImageStream.toByteArray();
|
||||
compressedImageStream.close();
|
||||
|
||||
PDImageXObject compressedImage =
|
||||
PDImageXObject.createFromByteArray(
|
||||
doc,
|
||||
imageBytes,
|
||||
image.getCOSObject().toString());
|
||||
|
||||
// Replace the image in the resources with the
|
||||
// compressed
|
||||
// version
|
||||
res.put(name, compressedImage);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// save the document to tempOutputFile again
|
||||
doc.save(tempOutputFile.toString());
|
||||
|
||||
long currentSize = Files.size(tempOutputFile);
|
||||
// Check if the overall PDF size is still larger than expectedOutputSize
|
||||
if (currentSize > expectedOutputSize) {
|
||||
// Log the current file size and scaleFactor
|
||||
|
||||
logger.info(
|
||||
"Current file size: "
|
||||
+ FileUtils.byteCountToDisplaySize(currentSize));
|
||||
logger.info("Current scale factor: " + scaleFactor);
|
||||
|
||||
// The file is still too large, reduce scaleFactor and try again
|
||||
scaleFactor *= 0.9f; // reduce scaleFactor by 10%
|
||||
// Avoid scaleFactor being too small, causing the image to shrink to
|
||||
// 0
|
||||
if (scaleFactor < 0.2f || previousFileSize == currentSize) {
|
||||
throw new RuntimeException(
|
||||
"Could not reach the desired size without excessively degrading image quality, lowest size recommended is "
|
||||
+ FileUtils.byteCountToDisplaySize(currentSize)
|
||||
+ ", "
|
||||
+ currentSize
|
||||
+ " bytes");
|
||||
}
|
||||
previousFileSize = currentSize;
|
||||
} else {
|
||||
// The file is small enough, break the loop
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Read the optimized PDF file
|
||||
pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||
Path finalFile = tempOutputFile;
|
||||
|
||||
// Check if optimized file is larger than the original
|
||||
if (pdfBytes.length > inputFileSize) {
|
||||
// Log the occurrence
|
||||
logger.warn(
|
||||
"Optimized file is larger than the original. Returning the original file instead.");
|
||||
|
||||
// Read the original file again
|
||||
finalFile = tempInputFile;
|
||||
}
|
||||
// Return the optimized PDF as a response
|
||||
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
@@ -286,10 +216,31 @@ public class CompressController {
|
||||
pdfDocumentFactory.load(finalFile.toFile()), outputFilename);
|
||||
|
||||
} finally {
|
||||
// Clean up the temporary files
|
||||
// deleted by multipart file handler deu to transferTo?
|
||||
// Files.deleteIfExists(tempInputFile);
|
||||
Files.deleteIfExists(tempOutputFile);
|
||||
}
|
||||
}
|
||||
|
||||
private int determineOptimizeLevel(double sizeReductionRatio) {
|
||||
if (sizeReductionRatio > 0.9) return 1;
|
||||
if (sizeReductionRatio > 0.8) return 2;
|
||||
if (sizeReductionRatio > 0.7) return 3;
|
||||
if (sizeReductionRatio > 0.6) return 4;
|
||||
if (sizeReductionRatio > 0.5) return 5;
|
||||
if (sizeReductionRatio > 0.4) return 6;
|
||||
if (sizeReductionRatio > 0.3) return 7;
|
||||
if (sizeReductionRatio > 0.2) return 8;
|
||||
return 9;
|
||||
}
|
||||
|
||||
private int incrementOptimizeLevel(int currentLevel, long currentSize, long targetSize) {
|
||||
double currentRatio = currentSize / (double) targetSize;
|
||||
logger.info("Current compression ratio: {}", String.format("%.2f", currentRatio));
|
||||
|
||||
if (currentRatio > 2.0) {
|
||||
return Math.min(9, currentLevel + 3);
|
||||
} else if (currentRatio > 1.5) {
|
||||
return Math.min(9, currentLevel + 2);
|
||||
}
|
||||
return Math.min(9, currentLevel + 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,7 +58,7 @@ public class FakeScanControllerWIP {
|
||||
@Operation(
|
||||
summary = "Repair a PDF file",
|
||||
description =
|
||||
"This endpoint repairs a given PDF file by running Ghostscript command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response.")
|
||||
"This endpoint repairs a given PDF file by running qpdf command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response.")
|
||||
public ResponseEntity<byte[]> fakeScan(@ModelAttribute PDFFile request) throws IOException {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
|
||||
|
||||
@@ -1,19 +1,31 @@
|
||||
package stirling.software.SPDF.controller.api.misc;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import io.github.pixee.security.BoundedLineReader;
|
||||
import io.github.pixee.security.Filenames;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
@@ -23,24 +35,29 @@ import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import io.github.pixee.security.Filenames;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import stirling.software.SPDF.model.ApplicationProperties;
|
||||
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
|
||||
import stirling.software.SPDF.service.CustomPDDocumentFactory;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/misc")
|
||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||
@Slf4j
|
||||
public class OCRController {
|
||||
|
||||
@Autowired ApplicationProperties applicationProperties;
|
||||
@Autowired private ApplicationProperties applicationProperties;
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@Autowired
|
||||
public OCRController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
/** Gets the list of available Tesseract languages from the tessdata directory */
|
||||
public List<String> getAvailableTesseractLanguages() {
|
||||
String tessdataDir = applicationProperties.getSystem().getTessdataDir();
|
||||
File[] files = new File(tessdataDir).listFiles();
|
||||
@@ -54,196 +71,161 @@ public class OCRController {
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@Autowired
|
||||
public OCRController(CustomPDDocumentFactory pdfDocumentFactory) {
|
||||
this.pdfDocumentFactory = pdfDocumentFactory;
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
|
||||
@Operation(
|
||||
summary = "Process a PDF file with OCR",
|
||||
description =
|
||||
"This endpoint processes a PDF file using OCR (Optical Character Recognition). Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. Input:PDF Output:PDF Type:SI-Conditional")
|
||||
public ResponseEntity<byte[]> processPdfWithOCR(
|
||||
@ModelAttribute ProcessPdfWithOcrRequest request)
|
||||
throws IOException, InterruptedException {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
List<String> selectedLanguages = request.getLanguages();
|
||||
Boolean sidecar = request.isSidecar();
|
||||
Boolean deskew = request.isDeskew();
|
||||
Boolean clean = request.isClean();
|
||||
Boolean cleanFinal = request.isCleanFinal();
|
||||
List<String> languages = request.getLanguages();
|
||||
String ocrType = request.getOcrType();
|
||||
String ocrRenderType = request.getOcrRenderType();
|
||||
Boolean removeImagesAfter = request.isRemoveImagesAfter();
|
||||
// --output-type pdfa
|
||||
if (selectedLanguages == null || selectedLanguages.isEmpty()) {
|
||||
throw new IOException("Please select at least one language.");
|
||||
}
|
||||
|
||||
if (!"hocr".equals(ocrRenderType) && !"sandwich".equals(ocrRenderType)) {
|
||||
throw new IOException("ocrRenderType wrong");
|
||||
}
|
||||
Path tempDir = Files.createTempDirectory("ocr_process");
|
||||
Path tempInputFile = tempDir.resolve("input.pdf");
|
||||
Path tempOutputDir = tempDir.resolve("output");
|
||||
Path tempImagesDir = tempDir.resolve("images");
|
||||
Path finalOutputFile = tempDir.resolve("final_output.pdf");
|
||||
|
||||
// Get available Tesseract languages
|
||||
List<String> availableLanguages = getAvailableTesseractLanguages();
|
||||
|
||||
// Validate selected languages
|
||||
selectedLanguages =
|
||||
selectedLanguages.stream().filter(availableLanguages::contains).toList();
|
||||
|
||||
if (selectedLanguages.isEmpty()) {
|
||||
throw new IOException("None of the selected languages are valid.");
|
||||
}
|
||||
// Save the uploaded file to a temporary location
|
||||
Path tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||
Path sidecarTextPath = null;
|
||||
Files.createDirectories(tempOutputDir);
|
||||
Files.createDirectories(tempImagesDir);
|
||||
|
||||
try {
|
||||
// Save input file
|
||||
inputFile.transferTo(tempInputFile.toFile());
|
||||
PDFMergerUtility merger = new PDFMergerUtility();
|
||||
merger.setDestinationFileName(finalOutputFile.toString());
|
||||
|
||||
// Run OCR Command
|
||||
String languageOption = String.join("+", selectedLanguages);
|
||||
try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
int pageCount = document.getNumberOfPages();
|
||||
|
||||
List<String> command =
|
||||
new ArrayList<>(
|
||||
Arrays.asList(
|
||||
"ocrmypdf",
|
||||
"--verbose",
|
||||
"2",
|
||||
"--output-type",
|
||||
"pdf",
|
||||
"--pdf-renderer",
|
||||
ocrRenderType));
|
||||
for (int pageNum = 0; pageNum < pageCount; pageNum++) {
|
||||
PDPage page = document.getPage(pageNum);
|
||||
boolean hasText = false;
|
||||
|
||||
if (sidecar != null && sidecar) {
|
||||
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
|
||||
command.add("--sidecar");
|
||||
command.add(sidecarTextPath.toString());
|
||||
}
|
||||
// Check for existing text
|
||||
try (PDDocument tempDoc = new PDDocument()) {
|
||||
tempDoc.addPage(page);
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
hasText = !stripper.getText(tempDoc).trim().isEmpty();
|
||||
}
|
||||
|
||||
if (deskew != null && deskew) {
|
||||
command.add("--deskew");
|
||||
}
|
||||
if (clean != null && clean) {
|
||||
command.add("--clean");
|
||||
}
|
||||
if (cleanFinal != null && cleanFinal) {
|
||||
command.add("--clean-final");
|
||||
}
|
||||
if (ocrType != null && !"".equals(ocrType)) {
|
||||
if ("skip-text".equals(ocrType)) {
|
||||
command.add("--skip-text");
|
||||
} else if ("force-ocr".equals(ocrType)) {
|
||||
command.add("--force-ocr");
|
||||
} else if ("Normal".equals(ocrType)) {
|
||||
boolean shouldOcr =
|
||||
switch (ocrType) {
|
||||
case "skip-text" -> !hasText;
|
||||
case "force-ocr" -> true;
|
||||
default -> true;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
Path pageOutputPath =
|
||||
tempOutputDir.resolve(String.format("page_%d.pdf", pageNum));
|
||||
|
||||
command.addAll(
|
||||
Arrays.asList(
|
||||
"--language",
|
||||
languageOption,
|
||||
tempInputFile.toString(),
|
||||
tempOutputFile.toString()));
|
||||
if (shouldOcr) {
|
||||
// Convert page to image
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
|
||||
Path imagePath =
|
||||
tempImagesDir.resolve(String.format("page_%d.png", pageNum));
|
||||
ImageIO.write(image, "png", imagePath.toFile());
|
||||
|
||||
// Run CLI command
|
||||
ProcessExecutorResult result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
if (result.getRc() != 0
|
||||
&& result.getMessages().contains("multiprocessing/synchronize.py")
|
||||
&& result.getMessages()
|
||||
.contains("OSError: [Errno 38] Function not implemented")) {
|
||||
command.add("--jobs");
|
||||
command.add("1");
|
||||
result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
}
|
||||
// Build OCR command
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("tesseract");
|
||||
command.add(imagePath.toString());
|
||||
command.add(
|
||||
tempOutputDir
|
||||
.resolve(String.format("page_%d", pageNum))
|
||||
.toString());
|
||||
command.add("-l");
|
||||
command.add(String.join("+", languages));
|
||||
command.add("pdf"); // Always output PDF
|
||||
|
||||
// Remove images from the OCR processed PDF if the flag is set to true
|
||||
if (removeImagesAfter != null && removeImagesAfter) {
|
||||
Path tempPdfWithoutImages = Files.createTempFile("output_", "_no_images.pdf");
|
||||
ProcessBuilder pb = new ProcessBuilder(command);
|
||||
Process process = pb.start();
|
||||
|
||||
List<String> gsCommand =
|
||||
Arrays.asList(
|
||||
"gs",
|
||||
"-sDEVICE=pdfwrite",
|
||||
"-dFILTERIMAGE",
|
||||
"-o",
|
||||
tempPdfWithoutImages.toString(),
|
||||
tempOutputFile.toString());
|
||||
// Capture any error output
|
||||
try (BufferedReader reader =
|
||||
new BufferedReader(
|
||||
new InputStreamReader(process.getErrorStream()))) {
|
||||
String line;
|
||||
while ((line = BoundedLineReader.readLine(reader, 5_000_000)) != null) {
|
||||
log.debug("Tesseract: {}", line);
|
||||
}
|
||||
}
|
||||
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
.runCommandWithOutputHandling(gsCommand);
|
||||
tempOutputFile = tempPdfWithoutImages;
|
||||
}
|
||||
// Read the OCR processed PDF file
|
||||
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempOutputFile.toFile());
|
||||
int exitCode = process.waitFor();
|
||||
if (exitCode != 0) {
|
||||
throw new RuntimeException(
|
||||
"Tesseract failed with exit code: " + exitCode);
|
||||
}
|
||||
|
||||
// Return the OCR processed PDF as a response
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_OCR.pdf";
|
||||
|
||||
if (sidecar != null && sidecar) {
|
||||
// Create a zip file containing both the PDF and the text file
|
||||
String outputZipFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_OCR.zip";
|
||||
Path tempZipFile = Files.createTempFile("output_", ".zip");
|
||||
|
||||
try (ZipOutputStream zipOut =
|
||||
new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
|
||||
// Add PDF file to the zip
|
||||
ZipEntry pdfEntry = new ZipEntry(outputFilename);
|
||||
zipOut.putNextEntry(pdfEntry);
|
||||
try (ByteArrayInputStream pdfInputStream = new ByteArrayInputStream(pdfBytes)) {
|
||||
byte[] buffer = new byte[1024];
|
||||
int length;
|
||||
while ((length = pdfInputStream.read(buffer)) != -1) {
|
||||
zipOut.write(buffer, 0, length);
|
||||
// Add OCR'd PDF to merger
|
||||
merger.addSource(pageOutputPath.toFile());
|
||||
} else {
|
||||
// Save original page without OCR
|
||||
try (PDDocument pageDoc = new PDDocument()) {
|
||||
pageDoc.addPage(page);
|
||||
pageDoc.save(pageOutputPath.toFile());
|
||||
merger.addSource(pageOutputPath.toFile());
|
||||
}
|
||||
}
|
||||
zipOut.closeEntry();
|
||||
|
||||
// Add text file to the zip
|
||||
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
|
||||
zipOut.putNextEntry(txtEntry);
|
||||
Files.copy(sidecarTextPath, zipOut);
|
||||
zipOut.closeEntry();
|
||||
}
|
||||
|
||||
byte[] zipBytes = Files.readAllBytes(tempZipFile);
|
||||
|
||||
// Clean up the temporary zip file
|
||||
Files.deleteIfExists(tempZipFile);
|
||||
Files.deleteIfExists(tempOutputFile);
|
||||
Files.deleteIfExists(sidecarTextPath);
|
||||
|
||||
// Return the zip file containing both the PDF and the text file
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
|
||||
} else {
|
||||
// Return the OCR processed PDF as a response
|
||||
Files.deleteIfExists(tempOutputFile);
|
||||
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
||||
}
|
||||
|
||||
// Merge all pages into final PDF
|
||||
merger.mergeDocuments(null);
|
||||
|
||||
// Read the final PDF file
|
||||
byte[] pdfContent = Files.readAllBytes(finalOutputFile);
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename()).replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
|
||||
|
||||
return ResponseEntity.ok()
|
||||
.header(
|
||||
"Content-Disposition",
|
||||
"attachment; filename=\"" + outputFilename + "\"")
|
||||
.contentType(MediaType.APPLICATION_PDF)
|
||||
.body(pdfContent);
|
||||
|
||||
} finally {
|
||||
// Clean up the temporary files
|
||||
Files.deleteIfExists(tempOutputFile);
|
||||
// Comment out as transferTo makes multipart handle cleanup
|
||||
// Files.deleteIfExists(tempInputFile);
|
||||
if (sidecarTextPath != null) {
|
||||
Files.deleteIfExists(sidecarTextPath);
|
||||
// Clean up temporary files
|
||||
deleteDirectory(tempDir);
|
||||
}
|
||||
}
|
||||
|
||||
private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
|
||||
throws IOException {
|
||||
if (!file.exists()) {
|
||||
log.warn("File {} does not exist, skipping", file);
|
||||
return;
|
||||
}
|
||||
|
||||
try (FileInputStream fis = new FileInputStream(file)) {
|
||||
ZipEntry zipEntry = new ZipEntry(filename);
|
||||
zipOut.putNextEntry(zipEntry);
|
||||
|
||||
byte[] buffer = new byte[1024];
|
||||
int length;
|
||||
while ((length = fis.read(buffer)) >= 0) {
|
||||
zipOut.write(buffer, 0, length);
|
||||
}
|
||||
|
||||
zipOut.closeEntry();
|
||||
}
|
||||
}
|
||||
|
||||
private void deleteDirectory(Path directory) {
|
||||
try {
|
||||
Files.walk(directory)
|
||||
.sorted(Comparator.reverseOrder())
|
||||
.forEach(
|
||||
path -> {
|
||||
try {
|
||||
Files.delete(path);
|
||||
} catch (IOException e) {
|
||||
log.error("Error deleting {}: {}", path, e.getMessage());
|
||||
}
|
||||
});
|
||||
} catch (IOException e) {
|
||||
log.error("Error walking directory {}: {}", directory, e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,7 +44,7 @@ public class RepairController {
|
||||
@Operation(
|
||||
summary = "Repair a PDF file",
|
||||
description =
|
||||
"This endpoint repairs a given PDF file by running Ghostscript command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response. Input:PDF Output:PDF Type:SISO")
|
||||
"This endpoint repairs a given PDF file by running qpdf command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response. Input:PDF Output:PDF Type:SISO")
|
||||
public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile request)
|
||||
throws IOException, InterruptedException {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
@@ -56,14 +56,15 @@ public class RepairController {
|
||||
try {
|
||||
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("gs");
|
||||
command.add("-o");
|
||||
command.add(tempOutputFile.toString());
|
||||
command.add("-sDEVICE=pdfwrite");
|
||||
command.add("qpdf");
|
||||
command.add("--replace-input"); // Automatically fixes problems it can
|
||||
command.add("--qdf"); // Linearizes and normalizes PDF structure
|
||||
command.add("--object-streams=disable"); // Can help with some corruptions
|
||||
command.add(tempInputFile.toString());
|
||||
command.add(tempOutputFile.toString());
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
// Read the optimized PDF file
|
||||
|
||||
@@ -98,10 +98,10 @@ public class CertSignController {
|
||||
|
||||
public CreateSignature(KeyStore keystore, char[] pin)
|
||||
throws KeyStoreException,
|
||||
UnrecoverableKeyException,
|
||||
NoSuchAlgorithmException,
|
||||
IOException,
|
||||
CertificateException {
|
||||
UnrecoverableKeyException,
|
||||
NoSuchAlgorithmException,
|
||||
IOException,
|
||||
CertificateException {
|
||||
super(keystore, pin);
|
||||
ClassPathResource resource = new ClassPathResource("static/images/signature.png");
|
||||
try (InputStream is = resource.getInputStream()) {
|
||||
@@ -160,7 +160,8 @@ public class CertSignController {
|
||||
extState.setNonStrokingAlphaConstant(0.5f);
|
||||
cs.setGraphicsStateParameters(extState);
|
||||
cs.transform(Matrix.getScaleInstance(0.08f, 0.08f));
|
||||
PDImageXObject img = PDImageXObject.createFromFileByExtension(logoFile, doc);
|
||||
PDImageXObject img =
|
||||
PDImageXObject.createFromFileByExtension(logoFile, doc);
|
||||
cs.drawImage(img, 100, 0);
|
||||
cs.restoreGraphicsState();
|
||||
}
|
||||
@@ -208,7 +209,10 @@ public class CertSignController {
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/cert-sign")
|
||||
@Operation(summary = "Sign PDF with a Digital Certificate", description = "This endpoint accepts a PDF file, a digital certificate and related information to sign the PDF. It then returns the digitally signed PDF file. Input:PDF Output:PDF Type:SISO")
|
||||
@Operation(
|
||||
summary = "Sign PDF with a Digital Certificate",
|
||||
description =
|
||||
"This endpoint accepts a PDF file, a digital certificate and related information to sign the PDF. It then returns the digitally signed PDF file. Input:PDF Output:PDF Type:SISO")
|
||||
public ResponseEntity<byte[]> signPDFWithCert(@ModelAttribute SignPDFWithCertRequest request)
|
||||
throws Exception {
|
||||
MultipartFile pdf = request.getFileInput();
|
||||
@@ -238,7 +242,7 @@ public class CertSignController {
|
||||
PrivateKey privateKey = getPrivateKeyFromPEM(privateKeyFile.getBytes(), password);
|
||||
Certificate cert = (Certificate) getCertificateFromPEM(certFile.getBytes());
|
||||
ks.setKeyEntry(
|
||||
"alias", privateKey, password.toCharArray(), new Certificate[] { cert });
|
||||
"alias", privateKey, password.toCharArray(), new Certificate[] {cert});
|
||||
break;
|
||||
case "PKCS12":
|
||||
ks = KeyStore.getInstance("PKCS12");
|
||||
@@ -310,19 +314,22 @@ public class CertSignController {
|
||||
|
||||
private PrivateKey getPrivateKeyFromPEM(byte[] pemBytes, String password)
|
||||
throws IOException, OperatorCreationException, PKCSException {
|
||||
try (PEMParser pemParser = new PEMParser(new InputStreamReader(new ByteArrayInputStream(pemBytes)))) {
|
||||
try (PEMParser pemParser =
|
||||
new PEMParser(new InputStreamReader(new ByteArrayInputStream(pemBytes)))) {
|
||||
Object pemObject = pemParser.readObject();
|
||||
JcaPEMKeyConverter converter = new JcaPEMKeyConverter().setProvider("BC");
|
||||
PrivateKeyInfo pkInfo;
|
||||
if (pemObject instanceof PKCS8EncryptedPrivateKeyInfo) {
|
||||
InputDecryptorProvider decProv = new JceOpenSSLPKCS8DecryptorProviderBuilder()
|
||||
.build(password.toCharArray());
|
||||
InputDecryptorProvider decProv =
|
||||
new JceOpenSSLPKCS8DecryptorProviderBuilder().build(password.toCharArray());
|
||||
pkInfo = ((PKCS8EncryptedPrivateKeyInfo) pemObject).decryptPrivateKeyInfo(decProv);
|
||||
} else if (pemObject instanceof PEMEncryptedKeyPair) {
|
||||
PEMDecryptorProvider decProv = new JcePEMDecryptorProviderBuilder().build(password.toCharArray());
|
||||
pkInfo = ((PEMEncryptedKeyPair) pemObject)
|
||||
.decryptKeyPair(decProv)
|
||||
.getPrivateKeyInfo();
|
||||
PEMDecryptorProvider decProv =
|
||||
new JcePEMDecryptorProviderBuilder().build(password.toCharArray());
|
||||
pkInfo =
|
||||
((PEMEncryptedKeyPair) pemObject)
|
||||
.decryptKeyPair(decProv)
|
||||
.getPrivateKeyInfo();
|
||||
} else {
|
||||
pkInfo = ((PEMKeyPair) pemObject).getPrivateKeyInfo();
|
||||
}
|
||||
|
||||
@@ -55,6 +55,11 @@ public class HomeWebController {
|
||||
return "licenses";
|
||||
}
|
||||
|
||||
@GetMapping("/releases")
|
||||
public String getReleaseNotes(Model model) {
|
||||
return "releases";
|
||||
}
|
||||
|
||||
@GetMapping("/")
|
||||
public String home(Model model) {
|
||||
model.addAttribute("currentPage", "home");
|
||||
|
||||
@@ -320,12 +320,20 @@ public class ApplicationProperties {
|
||||
public static class SessionLimit {
|
||||
private int libreOfficeSessionLimit;
|
||||
private int pdfToHtmlSessionLimit;
|
||||
private int ocrMyPdfSessionLimit;
|
||||
private int pythonOpenCvSessionLimit;
|
||||
private int ghostScriptSessionLimit;
|
||||
private int weasyPrintSessionLimit;
|
||||
private int installAppSessionLimit;
|
||||
private int calibreSessionLimit;
|
||||
private int qpdfSessionLimit;
|
||||
private int tesseractSessionLimit;
|
||||
|
||||
public int getQpdfSessionLimit() {
|
||||
return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2;
|
||||
}
|
||||
|
||||
public int getTesseractSessionLimit() {
|
||||
return tesseractSessionLimit > 0 ? tesseractSessionLimit : 1;
|
||||
}
|
||||
|
||||
public int getLibreOfficeSessionLimit() {
|
||||
return libreOfficeSessionLimit > 0 ? libreOfficeSessionLimit : 1;
|
||||
@@ -335,18 +343,10 @@ public class ApplicationProperties {
|
||||
return pdfToHtmlSessionLimit > 0 ? pdfToHtmlSessionLimit : 1;
|
||||
}
|
||||
|
||||
public int getOcrMyPdfSessionLimit() {
|
||||
return ocrMyPdfSessionLimit > 0 ? ocrMyPdfSessionLimit : 2;
|
||||
}
|
||||
|
||||
public int getPythonOpenCvSessionLimit() {
|
||||
return pythonOpenCvSessionLimit > 0 ? pythonOpenCvSessionLimit : 8;
|
||||
}
|
||||
|
||||
public int getGhostScriptSessionLimit() {
|
||||
return ghostScriptSessionLimit > 0 ? ghostScriptSessionLimit : 16;
|
||||
}
|
||||
|
||||
public int getWeasyPrintSessionLimit() {
|
||||
return weasyPrintSessionLimit > 0 ? weasyPrintSessionLimit : 16;
|
||||
}
|
||||
@@ -364,12 +364,20 @@ public class ApplicationProperties {
|
||||
public static class TimeoutMinutes {
|
||||
private long libreOfficeTimeoutMinutes;
|
||||
private long pdfToHtmlTimeoutMinutes;
|
||||
private long ocrMyPdfTimeoutMinutes;
|
||||
private long pythonOpenCvTimeoutMinutes;
|
||||
private long ghostScriptTimeoutMinutes;
|
||||
private long weasyPrintTimeoutMinutes;
|
||||
private long installAppTimeoutMinutes;
|
||||
private long calibreTimeoutMinutes;
|
||||
private long tesseractTimeoutMinutes;
|
||||
private long qpdfTimeoutMinutes;
|
||||
|
||||
public long getTesseractTimeoutMinutes() {
|
||||
return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30;
|
||||
}
|
||||
|
||||
public long getQpdfTimeoutMinutes() {
|
||||
return qpdfTimeoutMinutes > 0 ? qpdfTimeoutMinutes : 30;
|
||||
}
|
||||
|
||||
public long getLibreOfficeTimeoutMinutes() {
|
||||
return libreOfficeTimeoutMinutes > 0 ? libreOfficeTimeoutMinutes : 30;
|
||||
@@ -379,18 +387,10 @@ public class ApplicationProperties {
|
||||
return pdfToHtmlTimeoutMinutes > 0 ? pdfToHtmlTimeoutMinutes : 20;
|
||||
}
|
||||
|
||||
public long getOcrMyPdfTimeoutMinutes() {
|
||||
return ocrMyPdfTimeoutMinutes > 0 ? ocrMyPdfTimeoutMinutes : 30;
|
||||
}
|
||||
|
||||
public long getPythonOpenCvTimeoutMinutes() {
|
||||
return pythonOpenCvTimeoutMinutes > 0 ? pythonOpenCvTimeoutMinutes : 30;
|
||||
}
|
||||
|
||||
public long getGhostScriptTimeoutMinutes() {
|
||||
return ghostScriptTimeoutMinutes > 0 ? ghostScriptTimeoutMinutes : 30;
|
||||
}
|
||||
|
||||
public long getWeasyPrintTimeoutMinutes() {
|
||||
return weasyPrintTimeoutMinutes > 0 ? weasyPrintTimeoutMinutes : 30;
|
||||
}
|
||||
|
||||
@@ -18,4 +18,15 @@ public class OptimizePdfRequest extends PDFFile {
|
||||
|
||||
@Schema(description = "The expected output size, e.g. '100MB', '25KB', etc.")
|
||||
private String expectedOutputSize;
|
||||
|
||||
@Schema(
|
||||
description = "Whether to linearize the PDF for faster web viewing. Default is false.",
|
||||
defaultValue = "false")
|
||||
private Boolean linearize = false;
|
||||
|
||||
@Schema(
|
||||
description =
|
||||
"Whether to normalize the PDF content for better compatibility. Default is true.",
|
||||
defaultValue = "true")
|
||||
private Boolean normalize = true;
|
||||
}
|
||||
|
||||
@@ -15,18 +15,6 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
|
||||
@Schema(description = "List of languages to use in OCR processing")
|
||||
private List<String> languages;
|
||||
|
||||
@Schema(description = "Include OCR text in a sidecar text file if set to true")
|
||||
private boolean sidecar;
|
||||
|
||||
@Schema(description = "Deskew the input file if set to true")
|
||||
private boolean deskew;
|
||||
|
||||
@Schema(description = "Clean the input file if set to true")
|
||||
private boolean clean;
|
||||
|
||||
@Schema(description = "Clean the final output if set to true")
|
||||
private boolean cleanFinal;
|
||||
|
||||
@Schema(
|
||||
description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'",
|
||||
allowableValues = {"skip-text", "force-ocr", "Normal"})
|
||||
@@ -37,7 +25,4 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
|
||||
allowableValues = {"hocr", "sandwich"},
|
||||
defaultValue = "hocr")
|
||||
private String ocrRenderType = "hocr";
|
||||
|
||||
@Schema(description = "Remove images from the output PDF if set to true")
|
||||
private boolean removeImagesAfter;
|
||||
}
|
||||
|
||||
@@ -34,17 +34,15 @@ public class MetricsAggregatorService {
|
||||
counter -> {
|
||||
String method = counter.getId().getTag("method");
|
||||
String uri = counter.getId().getTag("uri");
|
||||
|
||||
|
||||
// Skip if either method or uri is null
|
||||
if (method == null || uri == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
String key = String.format(
|
||||
"http_requests_%s_%s",
|
||||
method,
|
||||
uri.replace("/", "_")
|
||||
);
|
||||
|
||||
String key =
|
||||
String.format(
|
||||
"http_requests_%s_%s", method, uri.replace("/", "_"));
|
||||
|
||||
double currentCount = counter.count();
|
||||
double lastCount = lastSentMetrics.getOrDefault(key, 0.0);
|
||||
|
||||
@@ -31,7 +31,7 @@ public class PostHogService {
|
||||
private final ApplicationProperties applicationProperties;
|
||||
private final UserServiceInterface userService;
|
||||
private final Environment env;
|
||||
|
||||
|
||||
@Autowired
|
||||
public PostHogService(
|
||||
PostHog postHog,
|
||||
@@ -71,16 +71,16 @@ public class PostHogService {
|
||||
Map<String, Object> metrics = new HashMap<>();
|
||||
|
||||
try {
|
||||
//Application version
|
||||
metrics.put("app_version", appVersion);
|
||||
String deploymentType = "JAR"; // default
|
||||
if ("true".equalsIgnoreCase(env.getProperty("BROWSER_OPEN"))) {
|
||||
deploymentType = "EXE";
|
||||
} else if (isRunningInDocker()) {
|
||||
deploymentType = "DOCKER";
|
||||
}
|
||||
metrics.put("deployment_type", deploymentType);
|
||||
|
||||
// Application version
|
||||
metrics.put("app_version", appVersion);
|
||||
String deploymentType = "JAR"; // default
|
||||
if ("true".equalsIgnoreCase(env.getProperty("BROWSER_OPEN"))) {
|
||||
deploymentType = "EXE";
|
||||
} else if (isRunningInDocker()) {
|
||||
deploymentType = "DOCKER";
|
||||
}
|
||||
metrics.put("deployment_type", deploymentType);
|
||||
|
||||
// System info
|
||||
metrics.put("os_name", System.getProperty("os.name"));
|
||||
metrics.put("os_version", System.getProperty("os.version"));
|
||||
|
||||
@@ -29,12 +29,12 @@ public class ProcessExecutor {
|
||||
public enum Processes {
|
||||
LIBRE_OFFICE,
|
||||
PDFTOHTML,
|
||||
OCR_MY_PDF,
|
||||
PYTHON_OPENCV,
|
||||
GHOSTSCRIPT,
|
||||
WEASYPRINT,
|
||||
INSTALL_APP,
|
||||
CALIBRE
|
||||
CALIBRE,
|
||||
TESSERACT,
|
||||
QPDF
|
||||
}
|
||||
|
||||
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
|
||||
@@ -59,21 +59,11 @@ public class ProcessExecutor {
|
||||
.getProcessExecutor()
|
||||
.getSessionLimit()
|
||||
.getPdfToHtmlSessionLimit();
|
||||
case OCR_MY_PDF ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getSessionLimit()
|
||||
.getOcrMyPdfSessionLimit();
|
||||
case PYTHON_OPENCV ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getSessionLimit()
|
||||
.getPythonOpenCvSessionLimit();
|
||||
case GHOSTSCRIPT ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getSessionLimit()
|
||||
.getGhostScriptSessionLimit();
|
||||
case WEASYPRINT ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
@@ -84,6 +74,16 @@ public class ProcessExecutor {
|
||||
.getProcessExecutor()
|
||||
.getSessionLimit()
|
||||
.getInstallAppSessionLimit();
|
||||
case TESSERACT ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getSessionLimit()
|
||||
.getTesseractSessionLimit();
|
||||
case QPDF ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getSessionLimit()
|
||||
.getQpdfSessionLimit();
|
||||
case CALIBRE ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
@@ -103,21 +103,11 @@ public class ProcessExecutor {
|
||||
.getProcessExecutor()
|
||||
.getTimeoutMinutes()
|
||||
.getPdfToHtmlTimeoutMinutes();
|
||||
case OCR_MY_PDF ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getTimeoutMinutes()
|
||||
.getOcrMyPdfTimeoutMinutes();
|
||||
case PYTHON_OPENCV ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getTimeoutMinutes()
|
||||
.getPythonOpenCvTimeoutMinutes();
|
||||
case GHOSTSCRIPT ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getTimeoutMinutes()
|
||||
.getGhostScriptTimeoutMinutes();
|
||||
case WEASYPRINT ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
@@ -128,6 +118,16 @@ public class ProcessExecutor {
|
||||
.getProcessExecutor()
|
||||
.getTimeoutMinutes()
|
||||
.getInstallAppTimeoutMinutes();
|
||||
case TESSERACT ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getTimeoutMinutes()
|
||||
.getTesseractTimeoutMinutes();
|
||||
case QPDF ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
.getTimeoutMinutes()
|
||||
.getQpdfTimeoutMinutes();
|
||||
case CALIBRE ->
|
||||
applicationProperties
|
||||
.getProcessExecutor()
|
||||
|
||||
Reference in New Issue
Block a user