Detect-Empty-Pages node (PDFJS, OpenCV)
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
// PDFLib gets imported via the index.html script tag
|
||||
// TODO: OpenCV
|
||||
import * as pdfcpuWraopper from "./wasm/pdfcpu-wrapper-browser.js";
|
||||
|
||||
import { extractPages as dependantExtractPages } from "./functions/extractPages.js";
|
||||
@@ -10,6 +11,7 @@ import { scalePage as dependantScalePage } from './functions/scalePage.js';
|
||||
import { splitPDF as dependantSplitPDF } from './functions/splitPDF.js';
|
||||
import { editMetadata as dependantEditMetadata} from "./functions/editMetadata.js";
|
||||
import { organizePages as dependantOrganizePages} from "./functions/organizePages.js";
|
||||
import { removeBlankPages as dependantRemoveBlankPages} from "./functions/removeBlankPages.js";
|
||||
|
||||
export async function extractPages(snapshot, pagesToExtractArray) {
|
||||
return dependantExtractPages(snapshot, pagesToExtractArray, PDFLib);
|
||||
@@ -45,4 +47,8 @@ export async function editMetadata(snapshot, metadata) {
|
||||
|
||||
/**
 * Browser-side entry point for the "organize pages" operation.
 *
 * Forwards its arguments unchanged to the shared implementation imported
 * from ./functions/organizePages.js and supplies the PDFLib global (loaded
 * via a script tag) as the final argument.
 *
 * @param snapshot - PDF data, passed through untouched.
 * @param operation - Passed through untouched to the implementation.
 * @param customOrderString - Passed through untouched to the implementation.
 * @returns Whatever the shared implementation returns.
 */
export async function organizePages(snapshot, operation, customOrderString) {
    const organizedResult = dependantOrganizePages(
        snapshot,
        operation,
        customOrderString,
        PDFLib
    );
    return organizedResult;
}
|
||||
|
||||
/**
 * Browser-side entry point for the "remove blank pages" operation.
 *
 * Forwards its arguments unchanged to the shared implementation imported
 * from ./functions/removeBlankPages.js, followed by the PDFLib and OpenCV
 * globals.
 *
 * NOTE(review): the shared implementation names its third parameter `PDFJS`
 * and calls `PDFJS.getDocument(...)` on it, while this wrapper hands it
 * PDFLib - confirm which library is intended here.
 * NOTE(review): no import for `OpenCV` is visible in this file (see the
 * "TODO: OpenCV" at the top) - presumably it has to be provided as a global.
 *
 * @param snapshot - PDF data, passed through untouched.
 * @param whiteThreashold - Passed through untouched to the implementation.
 * @returns Whatever the shared implementation returns.
 */
export async function removeBlankPages(snapshot, whiteThreashold) {
    const injectedLibraries = [PDFLib, OpenCV];
    return dependantRemoveBlankPages(snapshot, whiteThreashold, ...injectedLibraries);
}
|
||||
67
public/functions/removeBlankPages.js
Normal file
67
public/functions/removeBlankPages.js
Normal file
@@ -0,0 +1,67 @@
|
||||
/**
 * Scans a PDF and collects the (1-based) numbers of pages that are empty:
 * pages with no text items whose painted images (if any) are all blank.
 *
 * Work in progress: the detected page numbers are currently only logged.
 * Removing them and returning the resulting PDF is still a TODO, so the
 * returned promise resolves to `undefined`.
 *
 * @param snapshot - PDF data, handed straight to `PDFJS.getDocument`.
 * @param whiteThreashold - An image counts as blank when the mean of its
 *     grayscale pixels is less than or equal to this value.
 * @param PDFJS - Object providing `getDocument` and the `OPS` operator table
 *     (presumably the PDF.js library - confirm against the caller).
 * @param OpenCV - Object exposing OpenCV.js as `OpenCV.cv`.
 * @returns {Promise<undefined>}
 */
export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV) {

    const pdfDoc = await PDFJS.getDocument(snapshot).promise;

    const emptyPages = [];
    for (let i = 1; i <= pdfDoc.numPages; i++) {
        const page = await pdfDoc.getPage(i);

        if (await hasText(page)) {
            console.log("Found text on Page, page is not empty");
            continue;
        }

        if (!await areImagesBlank(page, whiteThreashold)) {
            console.log("Found image on Page, page is not empty");
            continue;
        }

        // Was `emptyPages.push[i]` (a property lookup), which never recorded anything.
        emptyPages.push(i);
    }

    console.log(emptyPages);

    // TODO: Remove emptyPages using pdflib
    // return pdf;

    /**
     * Returns true when every image painted on the page is blank.
     * A page without images is trivially blank. Images that expose no
     * `data` are skipped, i.e. they do not make the page non-empty.
     */
    async function areImagesBlank(page, threshold) {
        const ops = await page.getOperatorList();

        for (let j = 0; j < ops.fnArray.length; j++) {
            const fn = ops.fnArray[j];
            const paintsImage = fn === PDFJS.OPS.paintJpegXObject || fn === PDFJS.OPS.paintImageXObject;
            if (!paintsImage) {
                continue;
            }

            const image = page.objs.get(ops.argsArray[j][0]);
            // Keep scanning after a blank image: one later non-blank image
            // is enough to make the page non-empty.
            if (image.data && !await isImageBlank(image, threshold)) {
                return false;
            }
        }
        return true;
    }

    /** Returns true when the page has at least one text item. */
    async function hasText(page) {
        const textContent = await page.getTextContent();
        return textContent.items.length > 0;
    }

    /**
     * Returns true when the mean grayscale value of the image is less than
     * or equal to `threshold`.
     *
     * NOTE(review): assumes `image.data` is 4-channel RGBA - confirm for
     * images delivered in other pixel formats.
     * NOTE(review): for a "white" threshold one might expect `>=` here -
     * confirm the intended direction of the comparison.
     */
    async function isImageBlank(image, threshold) {
        // cv.Mat takes (rows, cols, type): rows is the height, cols the width.
        const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
        let gray = null;
        try {
            src.data.set(image.data);

            // Convert the image to grayscale
            gray = new OpenCV.cv.Mat();
            OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);

            // Calculate the mean value of the grayscale image
            const meanValue = OpenCV.cv.mean(gray);

            // Blank when the mean value does not exceed the threshold
            return meanValue[0] <= threshold;
        } finally {
            // Mats are not released automatically; free them
            // explicitly even when an OpenCV call throws.
            src.delete();
            if (gray) {
                gray.delete();
            }
        }
    }
};
|
||||
@@ -109,6 +109,12 @@ export async function * traverseOperations(operations, input, Functions) {
|
||||
input.buffer = await Functions.organizePages(input.buffer, operation.values["operation"], operation.values["customOrderString"]);
|
||||
});
|
||||
break;
|
||||
case "removeBlankPages":
|
||||
yield* nToN(input, operation, async (input) => {
|
||||
input.fileName += "_removedBlanks";
|
||||
input.buffer = await Functions.removeBlankPages(input.buffer, operation.values["whiteThreashold"]);
|
||||
});
|
||||
break;
|
||||
default:
|
||||
throw new Error(`${operation.type} not implemented yet.`);
|
||||
break;
|
||||
|
||||
Reference in New Issue
Block a user