bug fixes and image scan

This commit is contained in:
Anthony Stirling
2023-04-26 13:18:24 +01:00
parent ab4aea315a
commit 4327af5133
13 changed files with 397 additions and 60 deletions

View File

@@ -6,8 +6,6 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
@@ -16,6 +14,7 @@ import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
@Controller
@@ -52,10 +51,7 @@ public class ConvertPDFToPDFA {
// Return the optimized PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
HttpHeaders headers = new HttpHeaders();
headers.setContentType(MediaType.APPLICATION_PDF);
headers.setContentDispositionFormData("attachment", outputFilename);
return ResponseEntity.ok().headers(headers).body(pdfBytes);
return PdfUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
@GetMapping("/pdf-to-pdfa")

View File

@@ -8,8 +8,6 @@ import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
@@ -18,6 +16,7 @@ import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
@Controller
@@ -78,10 +77,7 @@ public class CompressController {
// Return the optimized PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_Optimized.pdf";
HttpHeaders headers = new HttpHeaders();
headers.setContentType(MediaType.APPLICATION_PDF);
headers.setContentDispositionFormData("attachment", outputFilename);
return ResponseEntity.ok().headers(headers).body(pdfBytes);
return PdfUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
}

View File

@@ -0,0 +1,145 @@
package stirling.software.SPDF.controller.other;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import javax.imageio.ImageIO;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.ModelAndView;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
/**
 * Web controller for the "extract image scans" tool: takes an uploaded image or PDF,
 * runs the {@code split_photos.py} script on every page/image and returns the
 * resulting photos (a single PNG, or a zip when more than one was produced).
 */
@Controller
public class ExtractImageScansController {

    private static final Logger logger = LoggerFactory.getLogger(ExtractImageScansController.class);

    /**
     * Serves the upload form.
     *
     * @return the {@code other/extract-image-scans} view with {@code currentPage} set
     */
    @GetMapping("/extract-image-scans")
    public ModelAndView extractImageScansForm() {
        ModelAndView modelAndView = new ModelAndView("other/extract-image-scans");
        modelAndView.addObject("currentPage", "extract-image-scans");
        return modelAndView;
    }

    /**
     * Splits the uploaded scan into individual photos.
     *
     * <p>A PDF upload is rendered page by page to PNG at 300 DPI; any other upload is
     * passed to the script as-is. The four numeric parameters are forwarded verbatim
     * to the script as command-line arguments.
     *
     * @param inputFile      the uploaded image or PDF
     * @param angleThreshold forwarded to the script (3rd argument)
     * @param tolerance      forwarded to the script (4th argument)
     * @param minArea        forwarded to the script (5th argument)
     * @param minContourArea forwarded to the script (6th argument)
     * @return a PNG when exactly one photo was produced, otherwise a zip of PNG entries
     * @throws IOException          on any I/O failure, or when the script produced no output
     * @throws InterruptedException if interrupted while waiting for the script
     */
    @PostMapping("/extract-image-scans")
    public ResponseEntity<byte[]> extractImageScans(@RequestParam("fileInput") MultipartFile inputFile,
            @RequestParam(name = "angle_threshold", defaultValue = "5") int angleThreshold,
            @RequestParam(name = "tolerance", defaultValue = "20") int tolerance,
            @RequestParam(name = "min_area", defaultValue = "8000") int minArea,
            @RequestParam(name = "min_contour_area", defaultValue = "500") int minContourArea) throws IOException, InterruptedException {

        // getOriginalFilename() may be null; treat that as an empty name instead of failing with an NPE.
        String originalName = inputFile.getOriginalFilename();
        String fileName = originalName == null ? "" : originalName;
        String baseName = fileName.replaceFirst("[.][^.]+$", "");
        String extension = extractExtension(fileName);

        // Every temp file handed to the script is tracked here so it can be removed afterwards.
        List<Path> inputImages = new ArrayList<>();
        try {
            if (extension.equalsIgnoreCase("pdf")) {
                renderPdfPages(inputFile, inputImages);
            } else {
                // A null suffix makes createTempFile fall back to ".tmp".
                Path tempInputFile = Files.createTempFile("input_", extension.isEmpty() ? null : "." + extension);
                inputImages.add(tempInputFile);
                try (InputStream upload = inputFile.getInputStream()) {
                    Files.copy(upload, tempInputFile, StandardCopyOption.REPLACE_EXISTING);
                }
            }

            List<byte[]> processedImageBytes = new ArrayList<>();
            for (Path image : inputImages) {
                processedImageBytes.addAll(splitPhotos(image, angleThreshold, tolerance, minArea, minContourArea));
            }

            if (processedImageBytes.isEmpty()) {
                // Previously this surfaced as an IndexOutOfBoundsException from get(0).
                throw new IOException("No images could be extracted from " + fileName);
            }

            if (processedImageBytes.size() > 1) {
                byte[] zipBytes = zipImages(processedImageBytes, baseName);
                return PdfUtils.bytesToWebResponse(zipBytes, baseName + "_processed.zip", MediaType.APPLICATION_OCTET_STREAM);
            }
            return PdfUtils.bytesToWebResponse(processedImageBytes.get(0), baseName + ".png", MediaType.IMAGE_PNG);
        } finally {
            deleteTempFiles(inputImages);
        }
    }

    /**
     * Returns the text after the last dot of {@code fileName}, reduced to ASCII letters
     * and digits so it is safe to use as a temp-file suffix; empty when there is no dot.
     */
    private static String extractExtension(String fileName) {
        int dot = fileName.lastIndexOf('.');
        if (dot < 0) {
            return "";
        }
        return fileName.substring(dot + 1).replaceAll("[^A-Za-z0-9]", "");
    }

    /** Renders every page of the uploaded PDF to a temp PNG (300 DPI) and appends each path to {@code sink}. */
    private static void renderPdfPages(MultipartFile inputFile, List<Path> sink) throws IOException {
        try (PDDocument document = PDDocument.load(new ByteArrayInputStream(inputFile.getBytes()))) {
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            int pageCount = document.getNumberOfPages();
            for (int page = 0; page < pageCount; page++) {
                Path tempFile = Files.createTempFile("image_", ".png");
                // Register before rendering so the file is cleaned up even if rendering fails.
                sink.add(tempFile);
                BufferedImage image = pdfRenderer.renderImageWithDPI(page, 300);
                ImageIO.write(image, "png", tempFile.toFile());
            }
        }
    }

    /** Runs the split script on one image and returns the bytes of every file it wrote, in sorted path order. */
    private List<byte[]> splitPhotos(Path image, int angleThreshold, int tolerance, int minArea, int minContourArea)
            throws IOException, InterruptedException {
        Path tempDir = Files.createTempDirectory("openCV_output");
        try {
            List<String> command = new ArrayList<>(Arrays.asList("python3", "/pythonScripts/split_photos.py",
                    image.toString(), tempDir.toString(), String.valueOf(angleThreshold), String.valueOf(tolerance),
                    String.valueOf(minArea), String.valueOf(minContourArea)));

            int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
            if (returnCode != 0) {
                // NOTE(review): exit-code semantics of the script are not visible here, so only log.
                logger.warn("split_photos.py exited with code {} for {}", returnCode, image);
            }

            List<Path> tempOutputFiles;
            // Files.list holds an open directory handle until the stream is closed.
            try (java.util.stream.Stream<Path> listing = Files.list(tempDir)) {
                tempOutputFiles = listing.sorted().collect(Collectors.toList());
            }

            List<byte[]> photos = new ArrayList<>(tempOutputFiles.size());
            for (Path tempOutputFile : tempOutputFiles) {
                photos.add(Files.readAllBytes(tempOutputFile));
            }
            return photos;
        } finally {
            FileUtils.deleteDirectory(tempDir.toFile());
        }
    }

    /** Packs {@code images} into a zip with entries named {@code <baseName>_<n>.png} (n starts at 1). */
    private static byte[] zipImages(List<byte[]> images, String baseName) throws IOException {
        Path tempZipFile = Files.createTempFile("output_", ".zip");
        try {
            try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
                for (int i = 0; i < images.size(); i++) {
                    zipOut.putNextEntry(new ZipEntry(baseName + "_" + (i + 1) + ".png"));
                    zipOut.write(images.get(i));
                    zipOut.closeEntry();
                }
            }
            return Files.readAllBytes(tempZipFile);
        } finally {
            Files.deleteIfExists(tempZipFile);
        }
    }

    /** Best-effort removal of temp files; failures are logged so they never mask the real outcome. */
    private static void deleteTempFiles(List<Path> files) {
        for (Path file : files) {
            try {
                Files.deleteIfExists(file);
            } catch (IOException e) {
                logger.warn("Could not delete temporary file {}", file, e);
            }
        }
    }
}

View File

@@ -18,10 +18,6 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.core.io.Resource;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
@@ -31,13 +27,15 @@ import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import stirling.software.SPDF.utils.PdfUtils;
@Controller
public class ExtractImagesController {
private static final Logger logger = LoggerFactory.getLogger(ExtractImagesController.class);
@PostMapping("/extract-images")
public ResponseEntity<Resource> extractImages(@RequestParam("fileInput") MultipartFile file, @RequestParam("format") String format) throws IOException {
public ResponseEntity<byte[]> extractImages(@RequestParam("fileInput") MultipartFile file, @RequestParam("format") String format) throws IOException {
System.out.println(System.currentTimeMillis() + "file=" + file.getName() + ", format=" + format);
PDDocument document = PDDocument.load(file.getBytes());
@@ -98,18 +96,8 @@ public class ExtractImagesController {
// Create ByteArrayResource from byte array
byte[] zipContents = baos.toByteArray();
ByteArrayResource resource = new ByteArrayResource(zipContents);
// Set content disposition header to indicate that the response should be
// downloaded as a file
HttpHeaders headers = new HttpHeaders();
headers.setContentLength(zipContents.length);
headers.add(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=" + file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_extracted-images.zip");
// Return ResponseEntity with ByteArrayResource and headers
return ResponseEntity.status(HttpStatus.OK).headers(headers)
.header("Cache-Control", "no-cache").contentType(MediaType.APPLICATION_OCTET_STREAM).body(resource);
return PdfUtils.boasToWebResponse(baos, file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_extracted-images.zip", MediaType.APPLICATION_OCTET_STREAM);
}
@GetMapping("/extract-images")

View File

@@ -17,7 +17,6 @@ import java.util.zip.ZipOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
@@ -27,6 +26,7 @@ import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.ModelAndView;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
@Controller
@@ -123,8 +123,6 @@ public class OCRController {
// Return the OCR processed PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
HttpHeaders headers = new HttpHeaders();
if (sidecar != null && sidecar) {
// Create a zip file containing both the PDF and the text file
String outputZipFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.zip";
@@ -150,17 +148,13 @@ public class OCRController {
Files.delete(tempZipFile);
Files.delete(tempOutputFile);
Files.delete(sidecarTextPath);
// Return the zip file containing both the PDF and the text file
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
headers.setContentDispositionFormData("attachment", outputZipFilename);
return ResponseEntity.ok().headers(headers).body(zipBytes);
// Sidecar branch: the download is the zip (PDF + text file), so send zipBytes, not the bare PDF bytes.
return PdfUtils.bytesToWebResponse(zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
} else {
// Return the OCR processed PDF as a response
Files.delete(tempOutputFile);
headers.setContentType(MediaType.APPLICATION_PDF);
headers.setContentDispositionFormData("attachment", outputFilename);
return ResponseEntity.ok().headers(headers).body(pdfBytes);
return PdfUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
}

View File

@@ -15,7 +15,6 @@ import java.util.zip.ZipOutputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
@@ -41,8 +40,7 @@ public class PDFToFile {
Path tempInputFile = null;
Path tempOutputDir = null;
byte[] fileBytes;
// Prepare response
HttpHeaders headers = new HttpHeaders();
String fileName = "temp.file";
try {
// Save the uploaded file to a temporary location
@@ -60,19 +58,18 @@ public class PDFToFile {
// Get output files
List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());
if (outputFiles.size() == 1) {
// Return single output file
File outputFile = outputFiles.get(0);
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
if (outputFormat.equals("txt:Text")) {
outputFormat = "txt";
}
headers.setContentDispositionFormData("attachment", pdfBaseName + "." + outputFormat);
fileName = pdfBaseName + "." + outputFormat;
fileBytes = FileUtils.readFileToByteArray(outputFile);
} else {
// Return output files in a ZIP archive
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
headers.setContentDispositionFormData("attachment", pdfBaseName + "To" + outputFormat + ".zip");
fileName = pdfBaseName + "To" + outputFormat + ".zip";
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream);
@@ -96,6 +93,6 @@ public class PDFToFile {
if (tempOutputDir != null)
FileUtils.deleteDirectory(tempOutputDir.toFile());
}
return new ResponseEntity<>(fileBytes, headers, HttpStatus.OK);
return PdfUtils.bytesToWebResponse(fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
}
}

View File

@@ -8,6 +8,8 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.security.KeyPair;
import java.security.KeyStore;
@@ -43,18 +45,26 @@ public class PdfUtils {
/**
 * Builds a download response from the bytes buffered in {@code baos}.
 * Delegates to {@code bytesToWebResponse(byte[], String)}.
 *
 * @param baos    buffer holding the document bytes
 * @param docName file name passed on to the byte-array overload
 * @return the response produced by the byte-array overload
 * @throws IOException declared by the delegate
 */
public static ResponseEntity<byte[]> boasToWebResponse(ByteArrayOutputStream baos, String docName) throws IOException {
    byte[] content = baos.toByteArray();
    return PdfUtils.bytesToWebResponse(content, docName);
}
public static ResponseEntity<byte[]> bytesToWebResponse(byte[] bytes, String docName) throws IOException {
/**
 * Builds a download response with an explicit media type from the bytes buffered in {@code baos}.
 * Delegates to {@code bytesToWebResponse(byte[], String, MediaType)}.
 *
 * @param baos      buffer holding the payload bytes
 * @param docName   file name passed on to the byte-array overload
 * @param mediaType content type passed on to the byte-array overload
 * @return the response produced by the byte-array overload
 * @throws IOException declared by the delegate
 */
public static ResponseEntity<byte[]> boasToWebResponse(ByteArrayOutputStream baos, String docName, MediaType mediaType) throws IOException {
    byte[] content = baos.toByteArray();
    return PdfUtils.bytesToWebResponse(content, docName, mediaType);
}
/**
 * Wraps {@code bytes} in a 200 OK response carrying content-type, content-length and
 * content-disposition headers.
 *
 * @param bytes     response body
 * @param docName   file name; it is URL-encoded (UTF-8) before being put in the header
 * @param mediaType value used for the Content-Type header
 * @return a {@code ResponseEntity} with the body and headers described above
 * @throws IOException declared on the signature
 */
public static ResponseEntity<byte[]> bytesToWebResponse(byte[] bytes, String docName, MediaType mediaType ) throws IOException {
// Return the PDF as a response
HttpHeaders headers = new HttpHeaders();
// NOTE(review): the content type is set twice in this hunk; the later call (mediaType) is the
// one that takes effect. Likewise the content disposition below is set twice and the encoded
// name wins. The first of each pair looks like the pre-change line of the diff.
headers.setContentType(MediaType.APPLICATION_PDF);
headers.setContentType(mediaType);
headers.setContentLength(bytes.length);
headers.setContentDispositionFormData("attachment", docName);
// URLEncoder encodes spaces as '+'; they are rewritten to "%20" here.
// NOTE(review): presumably so a client-side decodeURIComponent restores the spaces — confirm.
String encodedDocName = URLEncoder.encode(docName, StandardCharsets.UTF_8.toString()).replaceAll("\\+", "%20");
headers.setContentDispositionFormData("attachment", encodedDocName);
return new ResponseEntity<>(bytes, headers, HttpStatus.OK);
}
/**
 * Convenience overload that serves {@code bytes} with the PDF media type.
 *
 * @param bytes   response body
 * @param docName file name passed on to the three-argument overload
 * @return the response produced by the three-argument overload
 * @throws IOException declared by the delegate
 */
public static ResponseEntity<byte[]> bytesToWebResponse(byte[] bytes, String docName) throws IOException {
    MediaType pdfType = MediaType.APPLICATION_PDF;
    return bytesToWebResponse(bytes, docName, pdfType);
}
public static byte[] convertFromPdf(byte[] inputStream, String imageType, ImageType colorType, boolean singleImage, int DPI) throws IOException, Exception {
try (PDDocument document = PDDocument.load(new ByteArrayInputStream(inputStream))) {

View File

@@ -13,7 +13,7 @@ import java.util.concurrent.Semaphore;
public class ProcessExecutor {
/**
 * Keys of the per-process {@code ProcessExecutor} instances held in the {@code instances} map.
 * Each key is assigned its own semaphore limit when its executor is created.
 */
public enum Processes {
LIBRE_OFFICE, OCR_MY_PDF
LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV
}
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
@@ -23,6 +23,7 @@ public class ProcessExecutor {
int semaphoreLimit = switch (key) {
case LIBRE_OFFICE -> 1;
case OCR_MY_PDF -> 2;
case PYTHON_OPENCV -> 8;
};
return new ProcessExecutor(semaphoreLimit);
});

View File

@@ -136,7 +136,7 @@ document.addEventListener("DOMContentLoaded", function () {
const contentDispositionHeader = response.headers.get('Content-Disposition');
console.log(contentDispositionHeader)
if (contentDispositionHeader && contentDispositionHeader.indexOf('attachment') !== -1) {
filename = contentDispositionHeader.split('filename=')[1].replace(/"/g, '');
filename = decodeURIComponent(contentDispositionHeader.split('filename=')[1].replace(/"/g, ''));
} else {
// If the Content-Disposition header is not present or does not contain the filename, use a default filename
filename = 'download';

View File

@@ -0,0 +1,28 @@
<!DOCTYPE html>
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
<!--/* Upload page for the extract-image-scans tool; title, header and button text come from the extractImageScans.* message keys. */-->
<th:block th:insert="~{fragments/common :: head(title=#{extractImageScans.title})}"></th:block>
<body>
<div id="page-container">
<div id="content-wrap">
<div th:insert="~{fragments/navbar.html :: navbar}"></div>
<br> <br>
<div class="container">
<div class="row justify-content-center">
<div class="col-md-6">
<h2 th:text="#{extractImageScans.header}"></h2>
<!--/* Multipart POST to extract-image-scans. Only the file (field "fileInput", a single image or PDF) is submitted; the form has no inputs for other request parameters. */-->
<form id="multiPdfForm" th:action="@{extract-image-scans}" method="post" enctype="multipart/form-data">
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='image/*, application/pdf')}"></div>
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{extractImageScans.submit}"></button>
</form>
</div>
</div>
</div>
</div>
<div th:insert="~{fragments/footer.html :: footer}"></div>
</div>
</body>
</html>