formatting

This commit is contained in:
Anthony Stirling
2023-12-30 19:11:27 +00:00
parent 7b43fca6fc
commit 5f771b7851
155 changed files with 5539 additions and 4767 deletions

View File

@@ -9,6 +9,7 @@ import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.GeneralFile;
import stirling.software.SPDF.utils.FileToPdf;
import stirling.software.SPDF.utils.WebResponseUtils;
@@ -18,35 +19,30 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@RequestMapping("/api/v1/convert")
public class ConvertHtmlToPDF {
@PostMapping(consumes = "multipart/form-data", value = "/html/pdf")
@Operation(
summary = "Convert an HTML or ZIP (containing HTML and CSS) to PDF",
description =
"This endpoint takes an HTML or ZIP file input and converts it to a PDF format.")
public ResponseEntity<byte[]> HtmlToPdf(@ModelAttribute GeneralFile request) throws Exception {
MultipartFile fileInput = request.getFileInput();
@PostMapping(consumes = "multipart/form-data", value = "/html/pdf")
@Operation(
summary = "Convert an HTML or ZIP (containing HTML and CSS) to PDF",
description = "This endpoint takes an HTML or ZIP file input and converts it to a PDF format."
)
public ResponseEntity<byte[]> HtmlToPdf(
@ModelAttribute GeneralFile request)
throws Exception {
MultipartFile fileInput = request.getFileInput();
if (fileInput == null) {
throw new IllegalArgumentException(
"Please provide an HTML or ZIP file for conversion.");
}
if (fileInput == null) {
throw new IllegalArgumentException("Please provide an HTML or ZIP file for conversion.");
}
String originalFilename = fileInput.getOriginalFilename();
if (originalFilename == null
|| (!originalFilename.endsWith(".html") && !originalFilename.endsWith(".zip"))) {
throw new IllegalArgumentException("File must be either .html or .zip format.");
}
byte[] pdfBytes = FileToPdf.convertHtmlToPdf(fileInput.getBytes(), originalFilename);
String originalFilename = fileInput.getOriginalFilename();
if (originalFilename == null || (!originalFilename.endsWith(".html") && !originalFilename.endsWith(".zip"))) {
throw new IllegalArgumentException("File must be either .html or .zip format.");
}byte[] pdfBytes = FileToPdf.convertHtmlToPdf( fileInput.getBytes(), originalFilename);
String outputFilename = originalFilename.replaceFirst("[.][^.]+$", "") + ".pdf"; // Remove file extension and append .pdf
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
String outputFilename =
originalFilename.replaceFirst("[.][^.]+$", "")
+ ".pdf"; // Remove file extension and append .pdf
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
}

View File

@@ -20,6 +20,7 @@ import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.converters.ConvertToImageRequest;
import stirling.software.SPDF.model.api.converters.ConvertToPdfRequest;
import stirling.software.SPDF.utils.PdfUtils;
@@ -33,15 +34,18 @@ public class ConvertImgPDFController {
private static final Logger logger = LoggerFactory.getLogger(ConvertImgPDFController.class);
@PostMapping(consumes = "multipart/form-data", value = "/pdf/img")
@Operation(summary = "Convert PDF to image(s)",
description = "This endpoint converts a PDF file to image(s) with the specified image format, color type, and DPI. Users can choose to get a single image or multiple images. Input:PDF Output:Image Type:SI-Conditional")
public ResponseEntity<Resource> convertToImage(@ModelAttribute ConvertToImageRequest request) throws IOException {
@Operation(
summary = "Convert PDF to image(s)",
description =
"This endpoint converts a PDF file to image(s) with the specified image format, color type, and DPI. Users can choose to get a single image or multiple images. Input:PDF Output:Image Type:SI-Conditional")
public ResponseEntity<Resource> convertToImage(@ModelAttribute ConvertToImageRequest request)
throws IOException {
MultipartFile file = request.getFileInput();
String imageFormat = request.getImageFormat();
String singleOrMultiple = request.getSingleOrMultiple();
String colorType = request.getColorType();
String dpi = request.getDpi();
byte[] pdfBytes = file.getBytes();
ImageType colorTypeResult = ImageType.RGB;
if ("greyscale".equals(colorType)) {
@@ -54,7 +58,14 @@ public class ConvertImgPDFController {
byte[] result = null;
String filename = file.getOriginalFilename().replaceFirst("[.][^.]+$", "");
try {
result = PdfUtils.convertFromPdf(pdfBytes, imageFormat.toUpperCase(), colorTypeResult, singleImage, Integer.valueOf(dpi), filename);
result =
PdfUtils.convertFromPdf(
pdfBytes,
imageFormat.toUpperCase(),
colorTypeResult,
singleImage,
Integer.valueOf(dpi),
filename);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
@@ -65,29 +76,39 @@ public class ConvertImgPDFController {
if (singleImage) {
HttpHeaders headers = new HttpHeaders();
headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat)));
ResponseEntity<Resource> response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK);
ResponseEntity<Resource> response =
new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK);
return response;
} else {
ByteArrayResource resource = new ByteArrayResource(result);
// return the Resource in the response
return ResponseEntity.ok()
.header(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=" + filename + "_convertedToImages.zip")
.contentType(MediaType.APPLICATION_OCTET_STREAM).contentLength(resource.contentLength()).body(resource);
.header(
HttpHeaders.CONTENT_DISPOSITION,
"attachment; filename=" + filename + "_convertedToImages.zip")
.contentType(MediaType.APPLICATION_OCTET_STREAM)
.contentLength(resource.contentLength())
.body(resource);
}
}
@PostMapping(consumes = "multipart/form-data", value = "/img/pdf")
@Operation(summary = "Convert images to a PDF file",
description = "This endpoint converts one or more images to a PDF file. Users can specify whether to stretch the images to fit the PDF page, and whether to automatically rotate the images. Input:Image Output:PDF Type:SISO?")
public ResponseEntity<byte[]> convertToPdf(@ModelAttribute ConvertToPdfRequest request) throws IOException {
@Operation(
summary = "Convert images to a PDF file",
description =
"This endpoint converts one or more images to a PDF file. Users can specify whether to stretch the images to fit the PDF page, and whether to automatically rotate the images. Input:Image Output:PDF Type:SISO?")
public ResponseEntity<byte[]> convertToPdf(@ModelAttribute ConvertToPdfRequest request)
throws IOException {
MultipartFile[] file = request.getFileInput();
String fitOption = request.getFitOption();
String colorType = request.getColorType();
boolean autoRotate = request.isAutoRotate();
// Convert the file to PDF and get the resulting bytes
byte[] bytes = PdfUtils.imageToPdf(file, fitOption, autoRotate, colorType);
return WebResponseUtils.bytesToWebResponse(bytes, file[0].getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_converted.pdf");
return WebResponseUtils.bytesToWebResponse(
bytes,
file[0].getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_converted.pdf");
}
private String getMediaType(String imageFormat) {

View File

@@ -12,6 +12,7 @@ import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.GeneralFile;
import stirling.software.SPDF.utils.FileToPdf;
import stirling.software.SPDF.utils.WebResponseUtils;
@@ -20,17 +21,16 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Convert", description = "Convert APIs")
@RequestMapping("/api/v1/convert")
public class ConvertMarkdownToPdf {
@PostMapping(consumes = "multipart/form-data", value = "/markdown/pdf")
@PostMapping(consumes = "multipart/form-data", value = "/markdown/pdf")
@Operation(
summary = "Convert a Markdown file to PDF",
description = "This endpoint takes a Markdown file input, converts it to HTML, and then to PDF format."
)
public ResponseEntity<byte[]> markdownToPdf(
@ModelAttribute GeneralFile request)
throws Exception {
MultipartFile fileInput = request.getFileInput();
summary = "Convert a Markdown file to PDF",
description =
"This endpoint takes a Markdown file input, converts it to HTML, and then to PDF format.")
public ResponseEntity<byte[]> markdownToPdf(@ModelAttribute GeneralFile request)
throws Exception {
MultipartFile fileInput = request.getFileInput();
if (fileInput == null) {
throw new IllegalArgumentException("Please provide a Markdown file for conversion.");
}
@@ -45,10 +45,12 @@ public class ConvertMarkdownToPdf {
Node document = parser.parse(new String(fileInput.getBytes()));
HtmlRenderer renderer = HtmlRenderer.builder().build();
String htmlContent = renderer.render(document);
byte[] pdfBytes = FileToPdf.convertHtmlToPdf(htmlContent.getBytes(), "converted.html");
String outputFilename = originalFilename.replaceFirst("[.][^.]+$", "") + ".pdf"; // Remove file extension and append .pdf
byte[] pdfBytes = FileToPdf.convertHtmlToPdf(htmlContent.getBytes(), "converted.html");
String outputFilename =
originalFilename.replaceFirst("[.][^.]+$", "")
+ ".pdf"; // Remove file extension and append .pdf
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
}

View File

@@ -18,6 +18,7 @@ import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.GeneralFile;
import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
@@ -31,20 +32,33 @@ public class ConvertOfficeController {
public byte[] convertToPdf(MultipartFile inputFile) throws IOException, InterruptedException {
// Check for valid file extension
String originalFilename = inputFile.getOriginalFilename();
if (originalFilename == null || !isValidFileExtension(FilenameUtils.getExtension(originalFilename))) {
if (originalFilename == null
|| !isValidFileExtension(FilenameUtils.getExtension(originalFilename))) {
throw new IllegalArgumentException("Invalid file extension");
}
// Save the uploaded file to a temporary location
Path tempInputFile = Files.createTempFile("input_", "." + FilenameUtils.getExtension(originalFilename));
Path tempInputFile =
Files.createTempFile("input_", "." + FilenameUtils.getExtension(originalFilename));
Files.copy(inputFile.getInputStream(), tempInputFile, StandardCopyOption.REPLACE_EXISTING);
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
// Run the LibreOffice command
List<String> command = new ArrayList<>(Arrays.asList("unoconv", "-vvv", "-f", "pdf", "-o", tempOutputFile.toString(), tempInputFile.toString()));
ProcessExecutorResult returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE).runCommandWithOutputHandling(command);
List<String> command =
new ArrayList<>(
Arrays.asList(
"unoconv",
"-vvv",
"-f",
"pdf",
"-o",
tempOutputFile.toString(),
tempInputFile.toString()));
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(command);
// Read the converted PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
@@ -55,6 +69,7 @@ public class ConvertOfficeController {
return pdfBytes;
}
private boolean isValidFileExtension(String fileExtension) {
String extensionPattern = "^(?i)[a-z0-9]{2,4}$";
return fileExtension.matches(extensionPattern);
@@ -62,17 +77,19 @@ public class ConvertOfficeController {
@PostMapping(consumes = "multipart/form-data", value = "/file/pdf")
@Operation(
summary = "Convert a file to a PDF using LibreOffice",
description = "This endpoint converts a given file to a PDF using LibreOffice API Input:Any Output:PDF Type:SISO"
)
public ResponseEntity<byte[]> processFileToPDF(@ModelAttribute GeneralFile request)
throws Exception {
MultipartFile inputFile = request.getFileInput();
summary = "Convert a file to a PDF using LibreOffice",
description =
"This endpoint converts a given file to a PDF using LibreOffice API Input:Any Output:PDF Type:SISO")
public ResponseEntity<byte[]> processFileToPDF(@ModelAttribute GeneralFile request)
throws Exception {
MultipartFile inputFile = request.getFileInput();
// unused but can start server instance if startup time is to long
// LibreOfficeListener.getInstance().start();
byte[] pdfByteArray = convertToPdf(inputFile);
return WebResponseUtils.bytesToWebResponse(pdfByteArray, inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_convertedToPDF.pdf");
return WebResponseUtils.bytesToWebResponse(
pdfByteArray,
inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "")
+ "_convertedToPDF.pdf");
}
}

View File

@@ -11,6 +11,7 @@ import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.model.api.converters.PdfToPresentationRequest;
import stirling.software.SPDF.model.api.converters.PdfToTextOrRTFRequest;
@@ -22,51 +23,70 @@ import stirling.software.SPDF.utils.PDFToFile;
@Tag(name = "Convert", description = "Convert APIs")
public class ConvertPDFToOffice {
@PostMapping(consumes = "multipart/form-data", value = "/pdf/html")
@Operation(summary = "Convert PDF to HTML", description = "This endpoint converts a PDF file to HTML format. Input:PDF Output:HTML Type:SISO")
public ResponseEntity<byte[]> processPdfToHTML(@ModelAttribute PDFFile request)
throws Exception {
MultipartFile inputFile = request.getFileInput();
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, "html", "writer_pdf_import");
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/html")
@Operation(
summary = "Convert PDF to HTML",
description =
"This endpoint converts a PDF file to HTML format. Input:PDF Output:HTML Type:SISO")
public ResponseEntity<byte[]> processPdfToHTML(@ModelAttribute PDFFile request)
throws Exception {
MultipartFile inputFile = request.getFileInput();
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, "html", "writer_pdf_import");
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation")
@Operation(summary = "Convert PDF to Presentation format", description = "This endpoint converts a given PDF file to a Presentation format. Input:PDF Output:PPT Type:SISO")
public ResponseEntity<byte[]> processPdfToPresentation(@ModelAttribute PdfToPresentationRequest request) throws IOException, InterruptedException {
MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat();
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "impress_pdf_import");
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation")
@Operation(
summary = "Convert PDF to Presentation format",
description =
"This endpoint converts a given PDF file to a Presentation format. Input:PDF Output:PPT Type:SISO")
public ResponseEntity<byte[]> processPdfToPresentation(
@ModelAttribute PdfToPresentationRequest request)
throws IOException, InterruptedException {
MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat();
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "impress_pdf_import");
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/text")
@Operation(summary = "Convert PDF to Text or RTF format", description = "This endpoint converts a given PDF file to Text or RTF format. Input:PDF Output:TXT Type:SISO")
public ResponseEntity<byte[]> processPdfToRTForTXT(@ModelAttribute PdfToTextOrRTFRequest request) throws IOException, InterruptedException {
MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat();
@PostMapping(consumes = "multipart/form-data", value = "/pdf/text")
@Operation(
summary = "Convert PDF to Text or RTF format",
description =
"This endpoint converts a given PDF file to Text or RTF format. Input:PDF Output:TXT Type:SISO")
public ResponseEntity<byte[]> processPdfToRTForTXT(
@ModelAttribute PdfToTextOrRTFRequest request)
throws IOException, InterruptedException {
MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat();
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "writer_pdf_import");
}
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "writer_pdf_import");
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/word")
@Operation(summary = "Convert PDF to Word document", description = "This endpoint converts a given PDF file to a Word document format. Input:PDF Output:WORD Type:SISO")
public ResponseEntity<byte[]> processPdfToWord(@ModelAttribute PdfToWordRequest request) throws IOException, InterruptedException {
MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat();
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "writer_pdf_import");
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/word")
@Operation(
summary = "Convert PDF to Word document",
description =
"This endpoint converts a given PDF file to a Word document format. Input:PDF Output:WORD Type:SISO")
public ResponseEntity<byte[]> processPdfToWord(@ModelAttribute PdfToWordRequest request)
throws IOException, InterruptedException {
MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat();
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "writer_pdf_import");
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/xml")
@Operation(summary = "Convert PDF to XML", description = "This endpoint converts a PDF file to an XML file. Input:PDF Output:XML Type:SISO")
public ResponseEntity<byte[]> processPdfToXML(@ModelAttribute PDFFile request)
throws Exception {
MultipartFile inputFile = request.getFileInput();
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, "xml", "writer_pdf_import");
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/xml")
@Operation(
summary = "Convert PDF to XML",
description =
"This endpoint converts a PDF file to an XML file. Input:PDF Output:XML Type:SISO")
public ResponseEntity<byte[]> processPdfToXML(@ModelAttribute PDFFile request)
throws Exception {
MultipartFile inputFile = request.getFileInput();
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, "xml", "writer_pdf_import");
}
}

View File

@@ -14,6 +14,7 @@ import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
@@ -24,14 +25,13 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Convert", description = "Convert APIs")
public class ConvertPDFToPDFA {
@PostMapping(consumes = "multipart/form-data", value = "/pdf/pdfa")
@Operation(
summary = "Convert a PDF to a PDF/A",
description = "This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO"
)
public ResponseEntity<byte[]> pdfToPdfA(@ModelAttribute PDFFile request)
throws Exception {
MultipartFile inputFile = request.getFileInput();
@PostMapping(consumes = "multipart/form-data", value = "/pdf/pdfa")
@Operation(
summary = "Convert a PDF to a PDF/A",
description =
"This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> pdfToPdfA(@ModelAttribute PDFFile request) throws Exception {
MultipartFile inputFile = request.getFileInput();
// Save the uploaded file to a temporary location
Path tempInputFile = Files.createTempFile("input_", ".pdf");
@@ -50,7 +50,9 @@ public class ConvertPDFToPDFA {
command.add(tempInputFile.toString());
command.add(tempOutputFile.toString());
ProcessExecutorResult returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
.runCommandWithOutputHandling(command);
// Read the optimized PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
@@ -60,8 +62,8 @@ public class ConvertPDFToPDFA {
Files.delete(tempOutputFile);
// Return the optimized PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
String outputFilename =
inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
}

View File

@@ -14,6 +14,7 @@ import org.springframework.web.bind.annotation.RestController;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.converters.UrlToPdfRequest;
import stirling.software.SPDF.utils.GeneralUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
@@ -25,52 +26,52 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@RequestMapping("/api/v1/convert")
public class ConvertWebsiteToPDF {
@PostMapping(consumes = "multipart/form-data", value = "/url/pdf")
@Operation(
summary = "Convert a URL to a PDF",
description = "This endpoint fetches content from a URL and converts it to a PDF format. Input:N/A Output:PDF Type:SISO"
)
public ResponseEntity<byte[]> urlToPdf(@ModelAttribute UrlToPdfRequest request) throws IOException, InterruptedException {
String URL = request.getUrlInput();
@PostMapping(consumes = "multipart/form-data", value = "/url/pdf")
@Operation(
summary = "Convert a URL to a PDF",
description =
"This endpoint fetches content from a URL and converts it to a PDF format. Input:N/A Output:PDF Type:SISO")
public ResponseEntity<byte[]> urlToPdf(@ModelAttribute UrlToPdfRequest request)
throws IOException, InterruptedException {
String URL = request.getUrlInput();
// Validate the URL format
if(!URL.matches("^https?://.*") || !GeneralUtils.isValidURL(URL)) {
throw new IllegalArgumentException("Invalid URL format provided.");
}
Path tempOutputFile = null;
byte[] pdfBytes;
try {
// Prepare the output file path
tempOutputFile = Files.createTempFile("output_", ".pdf");
// Prepare the OCRmyPDF command
List<String> command = new ArrayList<>();
command.add("weasyprint");
command.add(URL);
command.add(tempOutputFile.toString());
ProcessExecutorResult returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT).runCommandWithOutputHandling(command);
// Read the optimized PDF file
pdfBytes = Files.readAllBytes(tempOutputFile);
}
finally {
// Clean up the temporary files
Files.delete(tempOutputFile);
}
// Convert URL to a safe filename
String outputFilename = convertURLToFileName(URL);
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
// Validate the URL format
if (!URL.matches("^https?://.*") || !GeneralUtils.isValidURL(URL)) {
throw new IllegalArgumentException("Invalid URL format provided.");
}
Path tempOutputFile = null;
byte[] pdfBytes;
try {
// Prepare the output file path
tempOutputFile = Files.createTempFile("output_", ".pdf");
private String convertURLToFileName(String url) {
String safeName = url.replaceAll("[^a-zA-Z0-9]", "_");
if(safeName.length() > 50) {
safeName = safeName.substring(0, 50); // restrict to 50 characters
}
return safeName + ".pdf";
}
// Prepare the OCRmyPDF command
List<String> command = new ArrayList<>();
command.add("weasyprint");
command.add(URL);
command.add(tempOutputFile.toString());
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
.runCommandWithOutputHandling(command);
// Read the optimized PDF file
pdfBytes = Files.readAllBytes(tempOutputFile);
} finally {
// Clean up the temporary files
Files.delete(tempOutputFile);
}
// Convert URL to a safe filename
String outputFilename = convertURLToFileName(URL);
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
private String convertURLToFileName(String url) {
String safeName = url.replaceAll("[^a-zA-Z0-9]", "_");
if (safeName.length() > 50) {
safeName = safeName.substring(0, 50); // restrict to 50 characters
}
return safeName + ".pdf";
}
}

View File

@@ -22,6 +22,7 @@ import com.opencsv.CSVWriter;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.controller.api.CropController;
import stirling.software.SPDF.controller.api.strippers.PDFTableStripper;
import stirling.software.SPDF.model.api.extract.PDFFilePage;
@@ -34,21 +35,24 @@ public class ExtractController {
private static final Logger logger = LoggerFactory.getLogger(CropController.class);
@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
@Operation(summary = "Extracts a PDF document to csv", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form)
throws Exception {
@Operation(
summary = "Extracts a PDF document to csv",
description =
"This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception {
ArrayList<String> tableData = new ArrayList<>();
int columnsCount = 0;
try (PDDocument document = PDDocument.load(new ByteArrayInputStream(form.getFileInput().getBytes()))) {
try (PDDocument document =
PDDocument.load(new ByteArrayInputStream(form.getFileInput().getBytes()))) {
final double res = 72; // PDF units are at 72 DPI
PDFTableStripper stripper = new PDFTableStripper();
PDPage pdPage = document.getPage(form.getPageId() - 1);
stripper.extractTable(pdPage);
columnsCount = stripper.getColumns();
for (int c = 0; c < columnsCount; ++c) {
for(int r=0; r<stripper.getRows(); ++r) {
for (int r = 0; r < stripper.getRows(); ++r) {
tableData.add(stripper.getText(r, c));
}
}
@@ -56,26 +60,33 @@ public class ExtractController {
ArrayList<String> notEmptyColumns = new ArrayList<>();
for (String item: tableData) {
if(!item.trim().isEmpty()){
for (String item : tableData) {
if (!item.trim().isEmpty()) {
notEmptyColumns.add(item);
}else{
} else {
columnsCount--;
}
}
List<String> fullTable = notEmptyColumns.stream().map((entity)->
entity.replace('\n',' ').replace('\r',' ').trim().replaceAll("\\s{2,}", "|")).toList();
List<String> fullTable =
notEmptyColumns.stream()
.map(
(entity) ->
entity.replace('\n', ' ')
.replace('\r', ' ')
.trim()
.replaceAll("\\s{2,}", "|"))
.toList();
int rowsCount = fullTable.get(0).split("\\|").length;
ArrayList<String> headersList = getTableHeaders(columnsCount,fullTable);
ArrayList<String> recordList = getRecordsList(rowsCount,fullTable);
if(headersList.size() == 0 && recordList.size() == 0) {
throw new Exception("No table detected, no headers or records found");
ArrayList<String> headersList = getTableHeaders(columnsCount, fullTable);
ArrayList<String> recordList = getRecordsList(rowsCount, fullTable);
if (headersList.size() == 0 && recordList.size() == 0) {
throw new Exception("No table detected, no headers or records found");
}
StringWriter writer = new StringWriter();
try (CSVWriter csvWriter = new CSVWriter(writer)) {
csvWriter.writeNext(headersList.toArray(new String[0]));
@@ -85,35 +96,41 @@ public class ExtractController {
}
HttpHeaders headers = new HttpHeaders();
headers.setContentDisposition(ContentDisposition.builder("attachment").filename(form.getFileInput().getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_extracted.csv").build());
headers.setContentDisposition(
ContentDisposition.builder("attachment")
.filename(
form.getFileInput()
.getOriginalFilename()
.replaceFirst("[.][^.]+$", "")
+ "_extracted.csv")
.build());
headers.setContentType(MediaType.parseMediaType("text/csv"));
return ResponseEntity.ok()
.headers(headers)
.body(writer.toString());
return ResponseEntity.ok().headers(headers).body(writer.toString());
}
private ArrayList<String> getRecordsList( int rowsCounts ,List<String> items){
private ArrayList<String> getRecordsList(int rowsCounts, List<String> items) {
ArrayList<String> recordsList = new ArrayList<>();
for (int b=1; b<rowsCounts;b++) {
StringBuilder strbldr = new StringBuilder();
for (int b = 1; b < rowsCounts; b++) {
StringBuilder strbldr = new StringBuilder();
for (int i=0;i<items.size();i++){
String[] parts = items.get(i).split("\\|");
strbldr.append(parts[b]);
if (i!= items.size()-1){
strbldr.append("|");
}
for (int i = 0; i < items.size(); i++) {
String[] parts = items.get(i).split("\\|");
strbldr.append(parts[b]);
if (i != items.size() - 1) {
strbldr.append("|");
}
recordsList.add(strbldr.toString());
}
recordsList.add(strbldr.toString());
}
return recordsList;
}
private ArrayList<String> getTableHeaders(int columnsCount, List<String> items){
private ArrayList<String> getTableHeaders(int columnsCount, List<String> items) {
ArrayList<String> resultList = new ArrayList<>();
for (int i=0;i<columnsCount;i++){
for (int i = 0; i < columnsCount; i++) {
String[] parts = items.get(i).split("\\|");
resultList.add(parts[0]);
}