Removal of Ghostscript to use qpdf and tesseract directly (#2338)
* navbar fix multi tool and compress location * release notes and ghostscript removal * cleanups * formatting * update docs * more * more * docs * release bump * Hardening suggestions for Stirling-PDF / ghostscript (#2339) * Protect `readLine()` against DoS * Sanitized user-provided file names in HTTP multipart uploads --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com> --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
This commit is contained in:
@@ -1,12 +1,13 @@
|
||||
package stirling.software.SPDF.controller.api.converters;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.http.MediaType;
|
||||
@@ -37,59 +38,90 @@ public class ConvertPDFToPDFA {
|
||||
@Operation(
|
||||
summary = "Convert a PDF to a PDF/A",
|
||||
description =
|
||||
"This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO")
|
||||
"This endpoint converts a PDF file to a PDF/A file using LibreOffice. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO")
|
||||
public ResponseEntity<byte[]> pdfToPdfA(@ModelAttribute PdfToPdfARequest request)
|
||||
throws Exception {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
String outputFormat = request.getOutputFormat();
|
||||
|
||||
// Convert MultipartFile to byte[]
|
||||
byte[] pdfBytes = inputFile.getBytes();
|
||||
|
||||
// Save the uploaded file to a temporary location
|
||||
Path tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
try (OutputStream outputStream = new FileOutputStream(tempInputFile.toFile())) {
|
||||
outputStream.write(pdfBytes);
|
||||
// Validate input file type
|
||||
if (!"application/pdf".equals(inputFile.getContentType())) {
|
||||
logger.error("Invalid input file type: {}", inputFile.getContentType());
|
||||
throw new IllegalArgumentException("Input file must be a PDF");
|
||||
}
|
||||
|
||||
// Prepare the output file path
|
||||
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||
|
||||
// Prepare the ghostscript command
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("gs");
|
||||
command.add("-dPDFA=" + ("pdfa".equals(outputFormat) ? "2" : "1"));
|
||||
command.add("-dNOPAUSE");
|
||||
command.add("-dBATCH");
|
||||
command.add("-sColorConversionStrategy=sRGB");
|
||||
command.add("-sDEVICE=pdfwrite");
|
||||
command.add("-dPDFACompatibilityPolicy=2");
|
||||
command.add("-o");
|
||||
command.add(tempOutputFile.toString());
|
||||
command.add(tempInputFile.toString());
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
if (returnCode.getRc() != 0) {
|
||||
logger.info(
|
||||
outputFormat + " conversion failed with return code: " + returnCode.getRc());
|
||||
// Get the original filename without extension
|
||||
String originalFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
|
||||
if (originalFileName == null || originalFileName.trim().isEmpty()) {
|
||||
originalFileName = "output.pdf";
|
||||
}
|
||||
String baseFileName =
|
||||
originalFileName.contains(".")
|
||||
? originalFileName.substring(0, originalFileName.lastIndexOf('.'))
|
||||
: originalFileName;
|
||||
|
||||
Path tempInputFile = null;
|
||||
Path tempOutputDir = null;
|
||||
byte[] fileBytes;
|
||||
|
||||
try {
|
||||
byte[] pdfBytesOutput = Files.readAllBytes(tempOutputFile);
|
||||
// Return the optimized PDF as a response
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_PDFA.pdf";
|
||||
// Save uploaded file to temp location
|
||||
tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
inputFile.transferTo(tempInputFile);
|
||||
|
||||
// Create temp output directory
|
||||
tempOutputDir = Files.createTempDirectory("output_");
|
||||
|
||||
// Determine PDF/A filter based on requested format
|
||||
String pdfFilter =
|
||||
"pdfa".equals(outputFormat)
|
||||
? "writer_pdf_Export:{'SelectPdfVersion':{'Value':'2'}}:writer_pdf_Export"
|
||||
: "writer_pdf_Export:{'SelectPdfVersion':{'Value':'1'}}:writer_pdf_Export";
|
||||
|
||||
// Prepare LibreOffice command
|
||||
List<String> command =
|
||||
new ArrayList<>(
|
||||
Arrays.asList(
|
||||
"soffice",
|
||||
"--headless",
|
||||
"--nologo",
|
||||
"--convert-to",
|
||||
"pdf:" + pdfFilter,
|
||||
"--outdir",
|
||||
tempOutputDir.toString(),
|
||||
tempInputFile.toString()));
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
if (returnCode.getRc() != 0) {
|
||||
logger.error("PDF/A conversion failed with return code: {}", returnCode.getRc());
|
||||
throw new RuntimeException("PDF/A conversion failed");
|
||||
}
|
||||
|
||||
// Get the output file
|
||||
File[] outputFiles = tempOutputDir.toFile().listFiles();
|
||||
if (outputFiles == null || outputFiles.length != 1) {
|
||||
throw new RuntimeException(
|
||||
"Expected exactly one output file but found "
|
||||
+ (outputFiles == null ? "none" : outputFiles.length));
|
||||
}
|
||||
|
||||
fileBytes = FileUtils.readFileToByteArray(outputFiles[0]);
|
||||
String outputFilename = baseFileName + "_PDFA.pdf";
|
||||
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
pdfBytesOutput, outputFilename, MediaType.APPLICATION_PDF);
|
||||
fileBytes, outputFilename, MediaType.APPLICATION_PDF);
|
||||
|
||||
} finally {
|
||||
// Clean up the temporary files
|
||||
Files.deleteIfExists(tempInputFile);
|
||||
Files.deleteIfExists(tempOutputFile);
|
||||
// Clean up temporary files
|
||||
if (tempInputFile != null) {
|
||||
Files.deleteIfExists(tempInputFile);
|
||||
}
|
||||
if (tempOutputDir != null) {
|
||||
FileUtils.deleteDirectory(tempOutputDir.toFile());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import stirling.software.SPDF.controller.api.CropController;
|
||||
|
||||
import stirling.software.SPDF.model.api.extract.PDFFilePage;
|
||||
import stirling.software.SPDF.pdf.FlexibleCSVWriter;
|
||||
import technology.tabula.ObjectExtractor;
|
||||
@@ -37,11 +37,15 @@ public class ExtractCSVController {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExtractCSVController.class);
|
||||
|
||||
@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
|
||||
@Operation(summary = "Extracts a CSV document from a PDF", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
|
||||
@Operation(
|
||||
summary = "Extracts a CSV document from a PDF",
|
||||
description =
|
||||
"This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
|
||||
public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception {
|
||||
StringWriter writer = new StringWriter();
|
||||
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
|
||||
CSVFormat format = CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
|
||||
CSVFormat format =
|
||||
CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
|
||||
Writer csvWriter = new FlexibleCSVWriter(format);
|
||||
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
|
||||
try (ObjectExtractor extractor = new ObjectExtractor(document)) {
|
||||
@@ -56,8 +60,8 @@ public class ExtractCSVController {
|
||||
ContentDisposition.builder("attachment")
|
||||
.filename(
|
||||
form.getFileInput()
|
||||
.getOriginalFilename()
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
.getOriginalFilename()
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_extracted.csv")
|
||||
.build());
|
||||
headers.setContentType(MediaType.parseMediaType("text/csv"));
|
||||
|
||||
Reference in New Issue
Block a user