fix: switch to pdftohtml for pdf to html conversions (#998)
* fix: switch to pdftohtml for PDF-to-HTML conversions
* build: include poppler-utils in the Dockerfile for pdftohtml
This commit is contained in:
@@ -244,6 +244,6 @@ public class EndpointConfiguration {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static final String REMOVE_BLANKS = "remove-blanks";
|
||||
}
|
||||
|
||||
@@ -291,6 +291,6 @@ public class UserController {
|
||||
}
|
||||
return ResponseEntity.ok(apiKey);
|
||||
}
|
||||
|
||||
|
||||
private static final String LOGIN_MESSAGETYPE_CREDSUPDATED = "/login?messageType=credsUpdated";
|
||||
}
|
||||
|
||||
@@ -29,18 +29,6 @@ import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
@Tag(name = "Convert", description = "Convert APIs")
|
||||
public class ConvertPDFToOffice {
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/pdf/html")
|
||||
@Operation(
|
||||
summary = "Convert PDF to HTML",
|
||||
description =
|
||||
"This endpoint converts a PDF file to HTML format. Input:PDF Output:HTML Type:SISO")
|
||||
public ResponseEntity<byte[]> processPdfToHTML(@ModelAttribute PDFFile request)
|
||||
throws Exception {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
PDFToFile pdfToFile = new PDFToFile();
|
||||
return pdfToFile.processPdfToOfficeFormat(inputFile, "html", "writer_pdf_import");
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation")
|
||||
@Operation(
|
||||
summary = "Convert PDF to Presentation format",
|
||||
|
||||
@@ -6,8 +6,6 @@ import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Qualifier;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
|
||||
@@ -219,6 +219,6 @@ public class ExtractImageScansController {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static final String REPLACEFIRST = "[.][^.]+$";
|
||||
}
|
||||
|
||||
@@ -26,7 +26,6 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import stirling.software.SPDF.model.ApplicationProperties;
|
||||
import stirling.software.SPDF.model.PipelineConfig;
|
||||
import stirling.software.SPDF.model.PipelineOperation;
|
||||
|
||||
|
||||
@@ -3,8 +3,6 @@ package stirling.software.SPDF.repository;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.data.jpa.repository.Query;
|
||||
import org.springframework.data.repository.query.Param;
|
||||
|
||||
import stirling.software.SPDF.model.User;
|
||||
|
||||
|
||||
@@ -25,6 +25,71 @@ import io.github.pixee.security.Filenames;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
|
||||
|
||||
public class PDFToFile {
|
||||
|
||||
public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
|
||||
throws IOException, InterruptedException {
|
||||
if (!"application/pdf".equals(inputFile.getContentType())) {
|
||||
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
||||
}
|
||||
|
||||
// Get the original PDF file name without the extension
|
||||
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
|
||||
String pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
|
||||
|
||||
Path tempInputFile = null;
|
||||
Path tempOutputDir = null;
|
||||
byte[] fileBytes;
|
||||
String fileName = "temp.file";
|
||||
|
||||
try {
|
||||
// Save the uploaded file to a temporary location
|
||||
tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
Files.copy(
|
||||
inputFile.getInputStream(), tempInputFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
|
||||
// Prepare the output directory
|
||||
tempOutputDir = Files.createTempDirectory("output_");
|
||||
|
||||
// Run the pdftohtml command with complex output
|
||||
List<String> command =
|
||||
new ArrayList<>(
|
||||
Arrays.asList(
|
||||
"pdftohtml", "-c", tempInputFile.toString(), pdfBaseName));
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
|
||||
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
|
||||
|
||||
// Get output files
|
||||
List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());
|
||||
|
||||
// Return output files in a ZIP archive
|
||||
fileName = pdfBaseName + "ToHtml.zip";
|
||||
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream);
|
||||
|
||||
for (File outputFile : outputFiles) {
|
||||
ZipEntry entry = new ZipEntry(outputFile.getName());
|
||||
zipOutputStream.putNextEntry(entry);
|
||||
FileInputStream fis = new FileInputStream(outputFile);
|
||||
IOUtils.copy(fis, zipOutputStream);
|
||||
fis.close();
|
||||
zipOutputStream.closeEntry();
|
||||
}
|
||||
|
||||
zipOutputStream.close();
|
||||
fileBytes = byteArrayOutputStream.toByteArray();
|
||||
|
||||
} finally {
|
||||
// Clean up the temporary files
|
||||
if (tempInputFile != null) Files.delete(tempInputFile);
|
||||
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
|
||||
}
|
||||
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
|
||||
}
|
||||
|
||||
public ResponseEntity<byte[]> processPdfToOfficeFormat(
|
||||
MultipartFile inputFile, String outputFormat, String libreOfficeFilter)
|
||||
throws IOException, InterruptedException {
|
||||
@@ -39,17 +104,7 @@ public class PDFToFile {
|
||||
|
||||
// Validate output format
|
||||
List<String> allowedFormats =
|
||||
Arrays.asList(
|
||||
"doc",
|
||||
"docx",
|
||||
"odt",
|
||||
"ppt",
|
||||
"pptx",
|
||||
"odp",
|
||||
"rtf",
|
||||
"html",
|
||||
"xml",
|
||||
"txt:Text");
|
||||
Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text");
|
||||
if (!allowedFormats.contains(outputFormat)) {
|
||||
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
||||
}
|
||||
|
||||
@@ -24,6 +24,7 @@ public class ProcessExecutor {
|
||||
|
||||
public enum Processes {
|
||||
LIBRE_OFFICE,
|
||||
PDFTOHTML,
|
||||
OCR_MY_PDF,
|
||||
PYTHON_OPENCV,
|
||||
GHOSTSCRIPT,
|
||||
@@ -45,6 +46,7 @@ public class ProcessExecutor {
|
||||
int semaphoreLimit =
|
||||
switch (key) {
|
||||
case LIBRE_OFFICE -> 1;
|
||||
case PDFTOHTML -> 1;
|
||||
case OCR_MY_PDF -> 2;
|
||||
case PYTHON_OPENCV -> 8;
|
||||
case GHOSTSCRIPT -> 16;
|
||||
@@ -56,6 +58,7 @@ public class ProcessExecutor {
|
||||
long timeoutMinutes =
|
||||
switch (key) {
|
||||
case LIBRE_OFFICE -> 30;
|
||||
case PDFTOHTML -> 5;
|
||||
case OCR_MY_PDF -> 30;
|
||||
case PYTHON_OPENCV -> 30;
|
||||
case GHOSTSCRIPT -> 5;
|
||||
|
||||
Reference in New Issue
Block a user