Fixes and others (#83)
Features
-------------
- Custom application name via the APP_NAME Docker environment variable
(The next three are done with OCRmyPDF)
- Extra OCR options for cleaning up scanned pages (tilt/noise fixing)
- OCR can now read text and output it to a text file
- Added a dedicated PDF/A conversion page

Bug fixes
--------------
- Fix concurrent calls to LibreOffice and OCRmyPDF
- JBIG fix for compression
- Fix compression metadata issues caused by forced conversion to PDF/A

Other
--------
- Removed the separate UK and US language options in favour of a single "English", because of the extra development time they required
- There is still an issue with concurrent files for PDF to image; this will be fixed later, sorry
This commit is contained in:
@@ -5,9 +5,7 @@ import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
|
||||
@SpringBootApplication
|
||||
public class SPdfApplication {
|
||||
|
||||
public static void main(String[] args) {
|
||||
SpringApplication.run(SPdfApplication.class, args);
|
||||
SpringApplication.run(SPdfApplication.class, args);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
@@ -11,4 +11,12 @@ public class AppConfig {
|
||||
String version = getClass().getPackage().getImplementationVersion();
|
||||
return (version != null) ? version : "0.3.3";
|
||||
}
|
||||
|
||||
@Bean(name = "appName")
|
||||
public String appName() {
|
||||
String appName = System.getProperty("AppName");
|
||||
if(appName == null)
|
||||
appName = System.getenv("APP_NAME");
|
||||
return (appName != null) ? appName : "Stirling PDF";
|
||||
}
|
||||
}
|
||||
@@ -16,7 +16,7 @@ public class Beans implements WebMvcConfigurer {
|
||||
@Bean
|
||||
public LocaleResolver localeResolver() {
|
||||
SessionLocaleResolver slr = new SessionLocaleResolver();
|
||||
slr.setDefaultLocale(Locale.US);
|
||||
slr.setDefaultLocale(Locale.UK);
|
||||
return slr;
|
||||
}
|
||||
|
||||
|
||||
@@ -54,6 +54,9 @@ public class CompressController {
|
||||
command.add("--tesseract-timeout=0");
|
||||
command.add("--optimize");
|
||||
command.add(String.valueOf(optimizeLevel));
|
||||
command.add("--output-type");
|
||||
command.add("pdf");
|
||||
|
||||
|
||||
if (fastWebView != null && fastWebView) {
|
||||
long fileSize = inputFile.getSize();
|
||||
@@ -69,7 +72,7 @@ public class CompressController {
|
||||
command.add(tempInputFile.toString());
|
||||
command.add(tempOutputFile.toString());
|
||||
|
||||
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
||||
|
||||
// Read the optimized PDF file
|
||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||
|
||||
@@ -28,6 +28,7 @@ import org.springframework.web.servlet.ModelAndView;
|
||||
|
||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||
//import com.spire.pdf.*;
|
||||
import java.util.concurrent.Semaphore;
|
||||
@Controller
|
||||
public class OCRController {
|
||||
|
||||
@@ -41,11 +42,18 @@ public class OCRController {
|
||||
return modelAndView;
|
||||
}
|
||||
|
||||
private final Semaphore semaphore = new Semaphore(2);
|
||||
|
||||
@PostMapping("/ocr-pdf")
|
||||
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
|
||||
@RequestParam("languages") List<String> selectedLanguages,
|
||||
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
|
||||
@RequestParam(name = "sidecar", required = false) Boolean sidecar,
|
||||
@RequestParam(name = "deskew", required = false) Boolean deskew,
|
||||
@RequestParam(name = "clean", required = false) Boolean clean,
|
||||
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal,
|
||||
@RequestParam(name = "ocrType", required = false) String ocrType) throws IOException, InterruptedException {
|
||||
|
||||
|
||||
//--output-type pdfa
|
||||
if (selectedLanguages == null || selectedLanguages.size() < 1) {
|
||||
throw new IOException("Please select at least one language.");
|
||||
@@ -58,20 +66,50 @@ public class OCRController {
|
||||
// Prepare the output file path
|
||||
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||
|
||||
// Prepare the output file path
|
||||
Path sidecarTextPath = null;
|
||||
|
||||
// Run OCR Command
|
||||
String languageOption = String.join("+", selectedLanguages);
|
||||
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption,
|
||||
tempInputFile.toString(), tempOutputFile.toString()));
|
||||
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
|
||||
|
||||
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--output-type", "pdf"));
|
||||
|
||||
|
||||
if (sidecar != null && sidecar) {
|
||||
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
|
||||
command.add("--sidecar");
|
||||
command.add(sidecarFile);
|
||||
command.add(sidecarTextPath.toString());
|
||||
}
|
||||
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
|
||||
|
||||
if (deskew != null && deskew) {
|
||||
command.add("--deskew");
|
||||
}
|
||||
if (clean != null && clean) {
|
||||
command.add("--clean");
|
||||
}
|
||||
if (cleanFinal != null && cleanFinal) {
|
||||
command.add("--clean-final");
|
||||
}
|
||||
if (ocrType != null && !ocrType.equals("")) {
|
||||
if("skip-text".equals(ocrType)) {
|
||||
command.add("--skip-text");
|
||||
} else if("force-ocr".equals(ocrType)) {
|
||||
command.add("--force-ocr");
|
||||
} else if("Normal".equals(ocrType)) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
command.addAll(Arrays.asList("--language", languageOption,
|
||||
tempInputFile.toString(), tempOutputFile.toString()));
|
||||
|
||||
//Run CLI command
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
||||
|
||||
// Read the OCR processed PDF file
|
||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||
|
||||
|
||||
|
||||
// Clean up the temporary files
|
||||
Files.delete(tempInputFile);
|
||||
// Return the OCR processed PDF as a response
|
||||
@@ -92,9 +130,9 @@ public class OCRController {
|
||||
zipOut.closeEntry();
|
||||
|
||||
// Add text file to the zip
|
||||
ZipEntry txtEntry = new ZipEntry(sidecarFile);
|
||||
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
|
||||
zipOut.putNextEntry(txtEntry);
|
||||
Files.copy(Paths.get(sidecarFile), zipOut);
|
||||
Files.copy(sidecarTextPath, zipOut);
|
||||
zipOut.closeEntry();
|
||||
}
|
||||
|
||||
@@ -103,7 +141,7 @@ public class OCRController {
|
||||
// Clean up the temporary zip file
|
||||
Files.delete(tempZipFile);
|
||||
Files.delete(tempOutputFile);
|
||||
Files.delete(Paths.get(sidecarFile));
|
||||
Files.delete(sidecarTextPath);
|
||||
|
||||
// Return the zip file containing both the PDF and the text file
|
||||
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
|
||||
|
||||
@@ -73,7 +73,6 @@ public class ConvertImgPDFController {
|
||||
if (singleImage) {
|
||||
HttpHeaders headers = new HttpHeaders();
|
||||
headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat)));
|
||||
headers.setCacheControl("must-revalidate, post-check=0, pre-check=0");
|
||||
ResponseEntity<Resource> response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK);
|
||||
return response;
|
||||
} else {
|
||||
|
||||
@@ -53,7 +53,7 @@ public byte[] convertToPdf(MultipartFile inputFile) throws IOException, Interrup
|
||||
"-o",
|
||||
tempOutputFile.toString(),
|
||||
tempInputFile.toString()));
|
||||
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE).runCommandWithOutputHandling(command);
|
||||
|
||||
// Read the converted PDF file
|
||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
package stirling.software.SPDF.controller.converters;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.http.HttpHeaders;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.stereotype.Controller;
|
||||
import org.springframework.ui.Model;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import com.itextpdf.xmp.XMPException;
|
||||
|
||||
import stirling.software.SPDF.utils.PdfUtils;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||
@Controller
|
||||
public class ConvertPDFToPDFA {
|
||||
|
||||
@GetMapping("/pdf-to-pdfa")
|
||||
public String pdfToPdfAForm(Model model) {
|
||||
model.addAttribute("currentPage", "pdf-to-pdfa");
|
||||
return "convert/pdf-to-pdfa";
|
||||
}
|
||||
|
||||
|
||||
@PostMapping("/pdf-to-pdfa")
|
||||
public ResponseEntity<byte[]> pdfToPdfA(
|
||||
@RequestParam("fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
|
||||
|
||||
|
||||
// Save the uploaded file to a temporary location
|
||||
Path tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
inputFile.transferTo(tempInputFile.toFile());
|
||||
|
||||
// Prepare the output file path
|
||||
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||
|
||||
// Prepare the OCRmyPDF command
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("ocrmypdf");
|
||||
command.add("--skip-text");
|
||||
command.add("--tesseract-timeout=0");
|
||||
command.add("--output-type");
|
||||
command.add("pdfa");
|
||||
command.add(tempInputFile.toString());
|
||||
command.add(tempOutputFile.toString());
|
||||
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
||||
|
||||
// Read the optimized PDF file
|
||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||
|
||||
// Clean up the temporary files
|
||||
Files.delete(tempInputFile);
|
||||
Files.delete(tempOutputFile);
|
||||
|
||||
// Return the optimized PDF as a response
|
||||
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
|
||||
HttpHeaders headers = new HttpHeaders();
|
||||
headers.setContentType(MediaType.APPLICATION_PDF);
|
||||
headers.setContentDispositionFormData("attachment", outputFilename);
|
||||
return ResponseEntity.ok().headers(headers).body(pdfBytes);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@@ -1,65 +1,100 @@
|
||||
package stirling.software.SPDF.utils;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.Semaphore;
|
||||
public class ProcessExecutor {
|
||||
public static int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
|
||||
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
||||
Process process = processBuilder.start();
|
||||
|
||||
/**
 * External tools that are run through a {@code ProcessExecutor}.
 * Each constant maps to its own executor instance in {@code getInstance}.
 */
public enum Processes {
    LIBRE_OFFICE,
    OCR_MY_PDF
}
|
||||
|
||||
// Read the error stream and standard output stream concurrently
|
||||
List<String> errorLines = new ArrayList<>();
|
||||
List<String> outputLines = new ArrayList<>();
|
||||
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
|
||||
|
||||
Thread errorReaderThread = new Thread(() -> {
|
||||
try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = errorReader.readLine()) != null) {
|
||||
errorLines.add(line);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
|
||||
Thread outputReaderThread = new Thread(() -> {
|
||||
try (BufferedReader outputReader = new BufferedReader(new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = outputReader.readLine()) != null) {
|
||||
outputLines.add(line);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
private final Semaphore semaphore;
|
||||
|
||||
errorReaderThread.start();
|
||||
outputReaderThread.start();
|
||||
|
||||
// Wait for the conversion process to complete
|
||||
int exitCode = process.waitFor();
|
||||
|
||||
// Wait for the reader threads to finish
|
||||
errorReaderThread.join();
|
||||
outputReaderThread.join();
|
||||
|
||||
if (outputLines.size() > 0) {
|
||||
String outputMessage = String.join("\n", outputLines);
|
||||
System.out.println("Command output:\n" + outputMessage);
|
||||
}
|
||||
|
||||
if (errorLines.size() > 0) {
|
||||
String errorMessage = String.join("\n", errorLines);
|
||||
System.out.println("Command error output:\n" + errorMessage);
|
||||
if (exitCode != 0) {
|
||||
throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage);
|
||||
}
|
||||
}
|
||||
/**
 * Creates an executor whose commands are gated by a semaphore.
 *
 * @param semaphoreLimit number of permits the semaphore is created with,
 *        i.e. how many commands this instance lets run at the same time
 */
private ProcessExecutor(int semaphoreLimit) {
    this.semaphore = new Semaphore(semaphoreLimit);
}
|
||||
|
||||
public static ProcessExecutor getInstance(Processes processType) {
|
||||
return instances.computeIfAbsent(processType, key -> {
|
||||
int semaphoreLimit = switch (key) {
|
||||
case LIBRE_OFFICE -> 1;
|
||||
case OCR_MY_PDF -> 2;
|
||||
};
|
||||
return new ProcessExecutor(semaphoreLimit);
|
||||
});
|
||||
}
|
||||
|
||||
public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
|
||||
int exitCode = 1;
|
||||
semaphore.acquire();
|
||||
try {
|
||||
|
||||
System.out.print("Running command: " + String.join(" ", command));
|
||||
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
||||
Process process = processBuilder.start();
|
||||
|
||||
// Read the error stream and standard output stream concurrently
|
||||
List<String> errorLines = new ArrayList<>();
|
||||
List<String> outputLines = new ArrayList<>();
|
||||
|
||||
Thread errorReaderThread = new Thread(() -> {
|
||||
try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = errorReader.readLine()) != null) {
|
||||
errorLines.add(line);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
|
||||
Thread outputReaderThread = new Thread(() -> {
|
||||
try (BufferedReader outputReader = new BufferedReader(new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = outputReader.readLine()) != null) {
|
||||
outputLines.add(line);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
|
||||
errorReaderThread.start();
|
||||
outputReaderThread.start();
|
||||
|
||||
// Wait for the conversion process to complete
|
||||
exitCode = process.waitFor();
|
||||
|
||||
// Wait for the reader threads to finish
|
||||
errorReaderThread.join();
|
||||
outputReaderThread.join();
|
||||
|
||||
if (outputLines.size() > 0) {
|
||||
String outputMessage = String.join("\n", outputLines);
|
||||
System.out.println("Command output:\n" + outputMessage);
|
||||
}
|
||||
|
||||
if (errorLines.size() > 0) {
|
||||
String errorMessage = String.join("\n", errorLines);
|
||||
System.out.println("Command error output:\n" + errorMessage);
|
||||
if (exitCode != 0) {
|
||||
throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
semaphore.release();
|
||||
}
|
||||
return exitCode;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user