Removal of Ghostscript to use qpdf and tesseract directly (#2338)

* navbar fix multi tool and compress location

* release notes and ghostscript removal

* cleanups

* formatting

* update docs

* more

* more

* docs

* release bump

* Hardening suggestions for Stirling-PDF / ghostscript (#2339)

* Protect `readLine()` against DoS

* Sanitized user-provided file names in HTTP multipart uploads

---------

Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>

---------

Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
This commit is contained in:
Anthony Stirling
2024-11-26 20:50:35 +00:00
committed by GitHub
parent 654bc94d44
commit 833b3c45c6
69 changed files with 1106 additions and 665 deletions

View File

@@ -1,12 +1,13 @@
package stirling.software.SPDF.controller.api.converters;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.MediaType;
@@ -37,59 +38,90 @@ public class ConvertPDFToPDFA {
@Operation(
summary = "Convert a PDF to a PDF/A",
description =
"This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO")
"This endpoint converts a PDF file to a PDF/A file using LibreOffice. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> pdfToPdfA(@ModelAttribute PdfToPdfARequest request)
throws Exception {
MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat();
// Convert MultipartFile to byte[]
byte[] pdfBytes = inputFile.getBytes();
// Save the uploaded file to a temporary location
Path tempInputFile = Files.createTempFile("input_", ".pdf");
try (OutputStream outputStream = new FileOutputStream(tempInputFile.toFile())) {
outputStream.write(pdfBytes);
// Validate input file type
if (!"application/pdf".equals(inputFile.getContentType())) {
logger.error("Invalid input file type: {}", inputFile.getContentType());
throw new IllegalArgumentException("Input file must be a PDF");
}
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
// Prepare the ghostscript command
List<String> command = new ArrayList<>();
command.add("gs");
command.add("-dPDFA=" + ("pdfa".equals(outputFormat) ? "2" : "1"));
command.add("-dNOPAUSE");
command.add("-dBATCH");
command.add("-sColorConversionStrategy=sRGB");
command.add("-sDEVICE=pdfwrite");
command.add("-dPDFACompatibilityPolicy=2");
command.add("-o");
command.add(tempOutputFile.toString());
command.add(tempInputFile.toString());
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
.runCommandWithOutputHandling(command);
if (returnCode.getRc() != 0) {
logger.info(
outputFormat + " conversion failed with return code: " + returnCode.getRc());
// Get the original filename without extension
String originalFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
if (originalFileName == null || originalFileName.trim().isEmpty()) {
originalFileName = "output.pdf";
}
String baseFileName =
originalFileName.contains(".")
? originalFileName.substring(0, originalFileName.lastIndexOf('.'))
: originalFileName;
Path tempInputFile = null;
Path tempOutputDir = null;
byte[] fileBytes;
try {
byte[] pdfBytesOutput = Files.readAllBytes(tempOutputFile);
// Return the optimized PDF as a response
String outputFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "")
+ "_PDFA.pdf";
// Save uploaded file to temp location
tempInputFile = Files.createTempFile("input_", ".pdf");
inputFile.transferTo(tempInputFile);
// Create temp output directory
tempOutputDir = Files.createTempDirectory("output_");
// Determine PDF/A filter based on requested format
String pdfFilter =
"pdfa".equals(outputFormat)
? "writer_pdf_Export:{'SelectPdfVersion':{'Value':'2'}}:writer_pdf_Export"
: "writer_pdf_Export:{'SelectPdfVersion':{'Value':'1'}}:writer_pdf_Export";
// Prepare LibreOffice command
List<String> command =
new ArrayList<>(
Arrays.asList(
"soffice",
"--headless",
"--nologo",
"--convert-to",
"pdf:" + pdfFilter,
"--outdir",
tempOutputDir.toString(),
tempInputFile.toString()));
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(command);
if (returnCode.getRc() != 0) {
logger.error("PDF/A conversion failed with return code: {}", returnCode.getRc());
throw new RuntimeException("PDF/A conversion failed");
}
// Get the output file
File[] outputFiles = tempOutputDir.toFile().listFiles();
if (outputFiles == null || outputFiles.length != 1) {
throw new RuntimeException(
"Expected exactly one output file but found "
+ (outputFiles == null ? "none" : outputFiles.length));
}
fileBytes = FileUtils.readFileToByteArray(outputFiles[0]);
String outputFilename = baseFileName + "_PDFA.pdf";
return WebResponseUtils.bytesToWebResponse(
pdfBytesOutput, outputFilename, MediaType.APPLICATION_PDF);
fileBytes, outputFilename, MediaType.APPLICATION_PDF);
} finally {
// Clean up the temporary files
Files.deleteIfExists(tempInputFile);
Files.deleteIfExists(tempOutputFile);
// Clean up temporary files
if (tempInputFile != null) {
Files.deleteIfExists(tempInputFile);
}
if (tempOutputDir != null) {
FileUtils.deleteDirectory(tempOutputDir.toFile());
}
}
}
}

View File

@@ -20,7 +20,7 @@ import org.springframework.web.bind.annotation.RestController;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.controller.api.CropController;
import stirling.software.SPDF.model.api.extract.PDFFilePage;
import stirling.software.SPDF.pdf.FlexibleCSVWriter;
import technology.tabula.ObjectExtractor;
@@ -37,11 +37,15 @@ public class ExtractCSVController {
private static final Logger logger = LoggerFactory.getLogger(ExtractCSVController.class);
@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
@Operation(summary = "Extracts a CSV document from a PDF", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
@Operation(
summary = "Extracts a CSV document from a PDF",
description =
"This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception {
StringWriter writer = new StringWriter();
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
CSVFormat format = CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
CSVFormat format =
CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
Writer csvWriter = new FlexibleCSVWriter(format);
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
try (ObjectExtractor extractor = new ObjectExtractor(document)) {
@@ -56,8 +60,8 @@ public class ExtractCSVController {
ContentDisposition.builder("attachment")
.filename(
form.getFileInput()
.getOriginalFilename()
.replaceFirst("[.][^.]+$", "")
.getOriginalFilename()
.replaceFirst("[.][^.]+$", "")
+ "_extracted.csv")
.build());
headers.setContentType(MediaType.parseMediaType("text/csv"));