Fixes and others (#83)

Features
-------------
Custom application name via APP_NAME docker env
(These next 3 are done with OCRMyPDF)
Extra features to OCR for scanned page cleanup (tilt/noise fixing)
Adding OCR ability to read and output to text file
Added Dedicated PDF/A conversion page 

Bug fixes
--------------
Fix concurrent calls on Libre and OCRMyPDF
jbig fix for compressions
Fix for compression metadata issues due to forced conversions to PDF/A


Other
--------
Removal of UK US language and just using "English" due to extra development time
Still issue with concurrent files for PDF to image... will fix later sorry
This commit is contained in:
Anthony Stirling
2023-04-01 21:02:54 +01:00
committed by GitHub
parent 0b4e3de455
commit 6d5dbd9729
23 changed files with 531 additions and 537 deletions

View File

@@ -54,6 +54,9 @@ public class CompressController {
command.add("--tesseract-timeout=0");
command.add("--optimize");
command.add(String.valueOf(optimizeLevel));
command.add("--output-type");
command.add("pdf");
if (fastWebView != null && fastWebView) {
long fileSize = inputFile.getSize();
@@ -69,7 +72,7 @@ public class CompressController {
command.add(tempInputFile.toString());
command.add(tempOutputFile.toString());
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
// Read the optimized PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);

View File

@@ -28,6 +28,7 @@ import org.springframework.web.servlet.ModelAndView;
import stirling.software.SPDF.utils.ProcessExecutor;
//import com.spire.pdf.*;
import java.util.concurrent.Semaphore;
@Controller
public class OCRController {
@@ -41,11 +42,18 @@ public class OCRController {
return modelAndView;
}
private final Semaphore semaphore = new Semaphore(2);
@PostMapping("/ocr-pdf")
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
@RequestParam("languages") List<String> selectedLanguages,
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
@RequestParam(name = "sidecar", required = false) Boolean sidecar,
@RequestParam(name = "deskew", required = false) Boolean deskew,
@RequestParam(name = "clean", required = false) Boolean clean,
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal,
@RequestParam(name = "ocrType", required = false) String ocrType) throws IOException, InterruptedException {
//--output-type pdfa
if (selectedLanguages == null || selectedLanguages.size() < 1) {
throw new IOException("Please select at least one language.");
@@ -58,20 +66,50 @@ public class OCRController {
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
// Prepare the output file path
Path sidecarTextPath = null;
// Run OCR Command
String languageOption = String.join("+", selectedLanguages);
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption,
tempInputFile.toString(), tempOutputFile.toString()));
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--output-type", "pdf"));
if (sidecar != null && sidecar) {
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
command.add("--sidecar");
command.add(sidecarFile);
command.add(sidecarTextPath.toString());
}
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
if (deskew != null && deskew) {
command.add("--deskew");
}
if (clean != null && clean) {
command.add("--clean");
}
if (cleanFinal != null && cleanFinal) {
command.add("--clean-final");
}
if (ocrType != null && !ocrType.equals("")) {
if("skip-text".equals(ocrType)) {
command.add("--skip-text");
} else if("force-ocr".equals(ocrType)) {
command.add("--force-ocr");
} else if("Normal".equals(ocrType)) {
}
}
command.addAll(Arrays.asList("--language", languageOption,
tempInputFile.toString(), tempOutputFile.toString()));
//Run CLI command
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
// Read the OCR processed PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
// Clean up the temporary files
Files.delete(tempInputFile);
// Return the OCR processed PDF as a response
@@ -92,9 +130,9 @@ public class OCRController {
zipOut.closeEntry();
// Add text file to the zip
ZipEntry txtEntry = new ZipEntry(sidecarFile);
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
zipOut.putNextEntry(txtEntry);
Files.copy(Paths.get(sidecarFile), zipOut);
Files.copy(sidecarTextPath, zipOut);
zipOut.closeEntry();
}
@@ -103,7 +141,7 @@ public class OCRController {
// Clean up the temporary zip file
Files.delete(tempZipFile);
Files.delete(tempOutputFile);
Files.delete(Paths.get(sidecarFile));
Files.delete(sidecarTextPath);
// Return the zip file containing both the PDF and the text file
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);

View File

@@ -73,7 +73,6 @@ public class ConvertImgPDFController {
if (singleImage) {
HttpHeaders headers = new HttpHeaders();
headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat)));
headers.setCacheControl("must-revalidate, post-check=0, pre-check=0");
ResponseEntity<Resource> response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK);
return response;
} else {

View File

@@ -53,7 +53,7 @@ public byte[] convertToPdf(MultipartFile inputFile) throws IOException, Interrup
"-o",
tempOutputFile.toString(),
tempInputFile.toString()));
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE).runCommandWithOutputHandling(command);
// Read the converted PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);

View File

@@ -0,0 +1,75 @@
package stirling.software.SPDF.controller.converters;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import com.itextpdf.xmp.XMPException;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
@Controller
public class ConvertPDFToPDFA {
@GetMapping("/pdf-to-pdfa")
public String pdfToPdfAForm(Model model) {
model.addAttribute("currentPage", "pdf-to-pdfa");
return "convert/pdf-to-pdfa";
}
@PostMapping("/pdf-to-pdfa")
public ResponseEntity<byte[]> pdfToPdfA(
@RequestParam("fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
// Save the uploaded file to a temporary location
Path tempInputFile = Files.createTempFile("input_", ".pdf");
inputFile.transferTo(tempInputFile.toFile());
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
// Prepare the OCRmyPDF command
List<String> command = new ArrayList<>();
command.add("ocrmypdf");
command.add("--skip-text");
command.add("--tesseract-timeout=0");
command.add("--output-type");
command.add("pdfa");
command.add(tempInputFile.toString());
command.add(tempOutputFile.toString());
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
// Read the optimized PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
// Clean up the temporary files
Files.delete(tempInputFile);
Files.delete(tempOutputFile);
// Return the optimized PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
HttpHeaders headers = new HttpHeaders();
headers.setContentType(MediaType.APPLICATION_PDF);
headers.setContentDispositionFormData("attachment", outputFilename);
return ResponseEntity.ok().headers(headers).body(pdfBytes);
}
}