Fixes and others (#83)

Features
--------
- Custom application name via the APP_NAME Docker environment variable
(The next three are done with OCRmyPDF)
- Extra OCR features for scanned-page cleanup (tilt/noise fixing)
- Added the ability for OCR to read text and output it to a text file
- Added a dedicated PDF/A conversion page

Bug fixes
---------
- Fix concurrent calls to LibreOffice and OCRmyPDF
- JBIG fix for compression
- Fix compression metadata issues caused by forced conversions to PDF/A

Other
-----
- Removed the separate UK/US language options; only "English" is used now, to avoid extra development time
- Known issue: concurrent files still cause a problem for PDF to image; this will be fixed later
2023-04-01 21:02:54 +01:00
parent 0b4e3de455
commit 6d5dbd9729
23 changed files with 531 additions and 537 deletions
@@ -54,6 +54,9 @@ public class CompressController {
        command.add("--tesseract-timeout=0");
        command.add("--optimize");
        command.add(String.valueOf(optimizeLevel));
+        command.add("--output-type");
+        command.add("pdf");
+

        if (fastWebView != null && fastWebView) {
            long fileSize = inputFile.getSize();
@@ -69,7 +72,7 @@ public class CompressController {
        command.add(tempInputFile.toString());
        command.add(tempOutputFile.toString());

-        int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
+        int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
        
        // Read the optimized PDF file
        byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
@@ -28,6 +28,7 @@ import org.springframework.web.servlet.ModelAndView;

 import stirling.software.SPDF.utils.ProcessExecutor;
 //import com.spire.pdf.*;
+import java.util.concurrent.Semaphore;
@Controller
 public class OCRController {

@@ -41,11 +42,18 @@ public class OCRController {
 		return modelAndView;
 	}

+	private final Semaphore semaphore = new Semaphore(2);
+	
 	@PostMapping("/ocr-pdf")
 	public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
 			@RequestParam("languages") List<String> selectedLanguages,
-			@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
+			@RequestParam(name = "sidecar", required = false) Boolean sidecar,
+			@RequestParam(name = "deskew", required = false) Boolean deskew,
+			@RequestParam(name = "clean", required = false) Boolean clean,
+			@RequestParam(name = "clean-final", required = false) Boolean cleanFinal,
+			@RequestParam(name = "ocrType", required = false) String ocrType) throws IOException, InterruptedException {

+	    
 		//--output-type pdfa
 		if (selectedLanguages == null || selectedLanguages.size() < 1) {
 			throw new IOException("Please select at least one language.");
@@ -58,20 +66,50 @@ public class OCRController {
 		// Prepare the output file path
 		Path tempOutputFile = Files.createTempFile("output_", ".pdf");

+		// Path of the sidecar text file; stays null unless the sidecar option is requested
+        Path sidecarTextPath = null;
+        
 		// Run OCR Command
 	    String languageOption = String.join("+", selectedLanguages);
-	    List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption,
-	            tempInputFile.toString(), tempOutputFile.toString()));
-	    String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
+	    
+	    List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--output-type", "pdf"));
+	    		
+	    
 	    if (sidecar != null && sidecar) {
+	        sidecarTextPath = Files.createTempFile("sidecar", ".txt");
 	        command.add("--sidecar");
-	        command.add(sidecarFile);
+	        command.add(sidecarTextPath.toString());
 	    }
-	    int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
+	    
+	    if (deskew != null && deskew) {
+            command.add("--deskew");
+	    }
+	    if (clean != null && clean) {
+            command.add("--clean");
+        }
+	    if (cleanFinal != null && cleanFinal) {
+            command.add("--clean-final");
+        }
+	    if (ocrType != null && !ocrType.equals("")) {
+            if("skip-text".equals(ocrType)) {
+                command.add("--skip-text");
+            } else if("force-ocr".equals(ocrType)) {
+                command.add("--force-ocr");
+            } else if("Normal".equals(ocrType)) {
+                
+            }
+        }

+	    command.addAll(Arrays.asList("--language", languageOption,
+	            tempInputFile.toString(), tempOutputFile.toString()));
+	    
+	    //Run CLI command
+	    int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
+        
 		// Read the OCR processed PDF file
 		byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
-
+		
+	    
 		// Clean up the temporary files
 		Files.delete(tempInputFile);
 		// Return the OCR processed PDF as a response
@@ -92,9 +130,9 @@ public class OCRController {
 	            zipOut.closeEntry();

 	            // Add text file to the zip
-	            ZipEntry txtEntry = new ZipEntry(sidecarFile);
+	            ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
 	            zipOut.putNextEntry(txtEntry);
-	            Files.copy(Paths.get(sidecarFile), zipOut);
+	            Files.copy(sidecarTextPath, zipOut);
 	            zipOut.closeEntry();
 	        }

@@ -103,7 +141,7 @@ public class OCRController {
 	        // Clean up the temporary zip file
 	        Files.delete(tempZipFile);
 	        Files.delete(tempOutputFile);
-	        Files.delete(Paths.get(sidecarFile));
+	        Files.delete(sidecarTextPath);
 	        
 	        // Return the zip file containing both the PDF and the text file
 	        headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
@@ -73,7 +73,6 @@ public class ConvertImgPDFController {
        if (singleImage) {
            HttpHeaders headers = new HttpHeaders();
            headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat)));
-            headers.setCacheControl("must-revalidate, post-check=0, pre-check=0");
            ResponseEntity<Resource> response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK);
            return response;
        } else {
@@ -53,7 +53,7 @@ public byte[] convertToPdf(MultipartFile inputFile) throws IOException, Interrup
            "-o",
            tempOutputFile.toString(),
            tempInputFile.toString()));
-    int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
+    int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE).runCommandWithOutputHandling(command);

    // Read the converted PDF file
    byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
@@ -0,0 +1,75 @@
+package stirling.software.SPDF.controller.converters;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.springframework.http.HttpHeaders;
+import org.springframework.http.MediaType;
+import org.springframework.http.ResponseEntity;
+import org.springframework.stereotype.Controller;
+import org.springframework.ui.Model;
+import org.springframework.web.bind.annotation.GetMapping;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.multipart.MultipartFile;
+
+import com.itextpdf.xmp.XMPException;
+
+import stirling.software.SPDF.utils.PdfUtils;
+import stirling.software.SPDF.utils.ProcessExecutor;
+/** Serves the PDF to PDF/A page and converts uploads by invoking the {@code ocrmypdf} command. */
+@Controller
+public class ConvertPDFToPDFA {
+    /** Registers "pdf-to-pdfa" as the current page and returns the conversion form view. */
+    @GetMapping("/pdf-to-pdfa")
+    public String pdfToPdfAForm(Model model) {
+        model.addAttribute("currentPage", "pdf-to-pdfa");
+        return "convert/pdf-to-pdfa";
+    }
+
+    /**
+     * Converts the uploaded PDF to PDF/A and returns it as an attachment named {@code <name>_PDFA.pdf}.
+     *
+     * @param inputFile the uploaded PDF
+     * @return the converted document as the response body
+     * @throws IOException if temp-file handling fails or ocrmypdf exits with a non-zero code
+     * @throws InterruptedException as declared; presumably from waiting on the external command
+     */
+    @PostMapping("/pdf-to-pdfa")
+    public ResponseEntity<byte[]> pdfToPdfA(
+            @RequestParam("fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
+        Path tempInputFile = Files.createTempFile("input_", ".pdf");
+        Path tempOutputFile = null;
+        byte[] pdfBytes;
+        try {
+            inputFile.transferTo(tempInputFile.toFile());
+            tempOutputFile = Files.createTempFile("output_", ".pdf");
+            List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf", "--skip-text", "--tesseract-timeout=0",
+                    "--output-type", "pdfa", tempInputFile.toString(), tempOutputFile.toString()));
+            int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
+            if (returnCode != 0) {
+                // Without this check a failed run would be answered with the empty temp file.
+                throw new IOException("ocrmypdf exited with code " + returnCode);
+            }
+            pdfBytes = Files.readAllBytes(tempOutputFile);
+        } finally {
+            // Remove the temp files on every path, including failures.
+            Files.deleteIfExists(tempInputFile);
+            if (tempOutputFile != null) {
+                Files.deleteIfExists(tempOutputFile);
+            }
+        }
+        // getOriginalFilename() may be null; fall back to a generic base name.
+        String originalName = inputFile.getOriginalFilename();
+        String baseName = originalName == null ? "document" : originalName.replaceFirst("[.][^.]+$", "");
+        HttpHeaders headers = new HttpHeaders();
+        headers.setContentType(MediaType.APPLICATION_PDF);
+        headers.setContentDispositionFormData("attachment", baseName + "_PDFA.pdf");
+        return ResponseEntity.ok().headers(headers).body(pdfBytes);
+    }
+}