remove blanks

This commit is contained in:
Anthony Stirling
2023-05-08 12:18:48 +01:00
parent cc919ea614
commit 0b1cdf6a68
5 changed files with 116 additions and 35 deletions

View File

@@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.other;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
@@ -10,38 +11,30 @@ import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import stirling.software.SPDF.utils.ImageFinder;
import stirling.software.SPDF.utils.ProcessExecutor;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import javax.imageio.ImageIO;
@RestController
public class BlankPageController {
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile, @RequestPart(required = true, value = "processType") int processType) throws IOException, InterruptedException {
boolean removeNoText = false;
boolean removeNoTextOrImages = false;
if(processType == 0) {
removeNoText = true;
} else if (processType == 1) {
removeNoTextOrImages = true;
} else if (processType == 2) {
//run OCR
OCRController ocr = new OCRController();
ocr.processPdfWithOCR(inputFile, Arrays.asList("eng"), false, false, true, false, "type", "hocr", false);
removeNoText = true;
}
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
try {
PDDocument document = PDDocument.load(inputFile.getInputStream());
PDPageTree pages = document.getDocumentCatalog().getPages();
PDFTextStripper textStripper = new PDFTextStripper();
List<PDPage> pagesToKeep = new ArrayList<>();
List<Integer> pagesToKeepIndex = new ArrayList<>();
int pageIndex = 0;
for (PDPage page : pages) {
@@ -50,28 +43,40 @@ public class BlankPageController {
textStripper.setEndPage(pageIndex);
String pageText = textStripper.getText(document);
boolean hasText = !pageText.trim().isEmpty();
if (hasText) {
pagesToKeepIndex.add(pageIndex);
System.out.print("page " + pageIndex + " has text");
continue;
}
boolean hasImages = hasImagesOnPage(page);
if (removeNoText && removeNoTextOrImages) {
if (hasText || hasImages) {
pagesToKeep.add(page);
}
} else if (removeNoText) {
if (hasText) {
pagesToKeep.add(page);
}
} else if (removeNoTextOrImages) {
if (hasText && hasImages) {
pagesToKeep.add(page);
}
} else {
pagesToKeep.add(page);
if (hasImages) {
pagesToKeepIndex.add(pageIndex);
System.out.print("page " + pageIndex + " has image");
continue;
}
}
System.out.print(pagesToKeepIndex.size());
PDDocument outputDocument = new PDDocument();
for (PDPage page : pagesToKeep) {
outputDocument.addPage(page);
PDFRenderer pdfRenderer = new PDFRenderer(document);
for (Integer i : pagesToKeepIndex) {
// Create temp file to save the image
Path tempFile = Files.createTempFile("image_", ".png");
// Render image and save as temp file
BufferedImage image = pdfRenderer.renderImageWithDPI(i - 1, 300);
ImageIO.write(image, "png", tempFile.toFile());
List<String> command = new ArrayList<>(Arrays.asList("python3", "/scripts/detect-blank-pages.py", tempFile.toString()));
// Run CLI command
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
//does contain data
if(returnCode ==0) {
outputDocument.addPage(document.getPage(i - 1));
} else {
System.out.print("Found blank page skipping, page #" + i);
}
}
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

View File

@@ -95,4 +95,11 @@ public class OtherWebController {
return "other/repair";
}
/**
 * Handles GET requests for {@code /remove-blanks} and selects the page template.
 *
 * <p>Stores {@code "remove-blanks"} in the model under the {@code "currentPage"}
 * key before handing back the view name. NOTE(review): {@code @Hidden} presumably
 * keeps this endpoint out of the generated API documentation; confirm against the
 * annotation's source package.
 *
 * @param model the model that receives the {@code "currentPage"} attribute
 * @return the view name {@code "other/remove-blanks"}
 */
@GetMapping("/remove-blanks")
@Hidden
public String removeBlanksForm(Model model) {
    final String viewName = "other/remove-blanks";
    model.addAttribute("currentPage", "remove-blanks");
    return viewName;
}
}