remove blanks
This commit is contained in:
@@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.other;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageTree;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
@@ -10,38 +11,30 @@ import org.springframework.web.bind.annotation.RequestPart;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import stirling.software.SPDF.utils.ImageFinder;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
@RestController
|
||||
public class BlankPageController {
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
|
||||
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile, @RequestPart(required = true, value = "processType") int processType) throws IOException, InterruptedException {
|
||||
boolean removeNoText = false;
|
||||
boolean removeNoTextOrImages = false;
|
||||
|
||||
if(processType == 0) {
|
||||
removeNoText = true;
|
||||
} else if (processType == 1) {
|
||||
removeNoTextOrImages = true;
|
||||
} else if (processType == 2) {
|
||||
//run OCR
|
||||
OCRController ocr = new OCRController();
|
||||
ocr.processPdfWithOCR(inputFile, Arrays.asList("eng"), false, false, true, false, "type", "hocr", false);
|
||||
|
||||
removeNoText = true;
|
||||
}
|
||||
|
||||
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
|
||||
try {
|
||||
PDDocument document = PDDocument.load(inputFile.getInputStream());
|
||||
PDPageTree pages = document.getDocumentCatalog().getPages();
|
||||
PDFTextStripper textStripper = new PDFTextStripper();
|
||||
|
||||
List<PDPage> pagesToKeep = new ArrayList<>();
|
||||
List<Integer> pagesToKeepIndex = new ArrayList<>();
|
||||
int pageIndex = 0;
|
||||
|
||||
for (PDPage page : pages) {
|
||||
@@ -50,28 +43,40 @@ public class BlankPageController {
|
||||
textStripper.setEndPage(pageIndex);
|
||||
String pageText = textStripper.getText(document);
|
||||
boolean hasText = !pageText.trim().isEmpty();
|
||||
if (hasText) {
|
||||
pagesToKeepIndex.add(pageIndex);
|
||||
System.out.print("page " + pageIndex + " has text");
|
||||
continue;
|
||||
}
|
||||
boolean hasImages = hasImagesOnPage(page);
|
||||
|
||||
if (removeNoText && removeNoTextOrImages) {
|
||||
if (hasText || hasImages) {
|
||||
pagesToKeep.add(page);
|
||||
}
|
||||
} else if (removeNoText) {
|
||||
if (hasText) {
|
||||
pagesToKeep.add(page);
|
||||
}
|
||||
} else if (removeNoTextOrImages) {
|
||||
if (hasText && hasImages) {
|
||||
pagesToKeep.add(page);
|
||||
}
|
||||
} else {
|
||||
pagesToKeep.add(page);
|
||||
if (hasImages) {
|
||||
pagesToKeepIndex.add(pageIndex);
|
||||
System.out.print("page " + pageIndex + " has image");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
System.out.print(pagesToKeepIndex.size());
|
||||
PDDocument outputDocument = new PDDocument();
|
||||
for (PDPage page : pagesToKeep) {
|
||||
outputDocument.addPage(page);
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
for (Integer i : pagesToKeepIndex) {
|
||||
// Create temp file to save the image
|
||||
Path tempFile = Files.createTempFile("image_", ".png");
|
||||
|
||||
// Render image and save as temp file
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(i - 1, 300);
|
||||
ImageIO.write(image, "png", tempFile.toFile());
|
||||
|
||||
List<String> command = new ArrayList<>(Arrays.asList("python3", "/scripts/detect-blank-pages.py", tempFile.toString()));
|
||||
|
||||
// Run CLI command
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
||||
|
||||
//does contain data
|
||||
if(returnCode ==0) {
|
||||
outputDocument.addPage(document.getPage(i - 1));
|
||||
} else {
|
||||
System.out.print("Found blank page skipping, page #" + i);
|
||||
}
|
||||
}
|
||||
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
|
||||
@@ -95,4 +95,11 @@ public class OtherWebController {
|
||||
return "other/repair";
|
||||
}
|
||||
|
||||
/**
 * Handles GET requests for "/remove-blanks" and selects the page to render.
 *
 * Stores the string "remove-blanks" in the model under the key "currentPage",
 * then returns the view name "other/remove-blanks".
 *
 * NOTE(review): the "currentPage" attribute is presumably read by the template
 * (e.g. to mark the active page) - confirm against the view.
 * NOTE(review): @Hidden presumably keeps this endpoint out of the generated API
 * docs - confirm which Hidden annotation this controller imports.
 *
 * @param model the model that receives the "currentPage" attribute
 * @return the view name "other/remove-blanks"
 */
@GetMapping("/remove-blanks")
|
||||
@Hidden
|
||||
public String removeBlanksForm(Model model) {
|
||||
model.addAttribute("currentPage", "remove-blanks");
|
||||
return "other/remove-blanks";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user