Merge branch 'stirling-pdf-rewrite' into version-2

2023-11-08 02:11:49 +03:00
parent f80297fbcc 55bab60e89
commit 0a43660e55
36 changed files with 22647 additions and 100 deletions
--- a/shared-operations/functions/createSubDocument.js
+++ b/shared-operations/functions/createSubDocument.js
@@ -0,0 +1,19 @@
+
+import { PDFDocument } from 'pdf-lib';
+
+/**
+ * Builds a new PDF that contains only the requested pages of `pdfDoc`.
+ *
+ * @param pdfDoc - Loaded pdf-lib document the pages are copied from.
+ * @param {number[]} pagesToExtractArray - Zero-based page indices, in the order they should appear in the result.
+ * @returns {Promise<Uint8Array>} The result of `save()` on the newly created document.
+ * @throws {Error} If an index is negative or lies past the last page of `pdfDoc`.
+ */
+export async function createSubDocument(pdfDoc, pagesToExtractArray) {
+    const pageCount = pdfDoc.getPageCount();
+    const highestIndex = Math.max(...pagesToExtractArray);
+    const lowestIndex = Math.min(...pagesToExtractArray);
+
+    // Validate before allocating the output document so bad input fails fast with a clear message.
+    // Indices are zero-based, so the largest valid index is pageCount - 1.
+    if (highestIndex >= pageCount) {
+        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${highestIndex}`);
+    }
+    if (lowestIndex < 0) {
+        throw new Error(`Page indices must not be negative, but got ${lowestIndex}`);
+    }
+
+    const subDocument = await PDFDocument.create();
+    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
+
+    for (const copiedPage of copiedPages) {
+        subDocument.addPage(copiedPage);
+    }
+
+    return subDocument.save();
+}
--- a/shared-operations/functions/detectEmptyPages.js
+++ b/shared-operations/functions/detectEmptyPages.js
@@ -0,0 +1,62 @@
+import { getImagesOnPage } from "./getImagesOnPage.js";
+import PDFJS from 'pdfjs-dist';
+
+/**
+ * Finds the pages of a PDF that contain no text items and whose images are all considered blank.
+ *
+ * @param snapshot - PDF data; handed to `PDFJS.getDocument` unchanged.
+ * @param {number} whiteThreashold - Limit the mean grayscale value of each image is compared against.
+ * @param OpenCV - Object exposing the OpenCV.js API as `OpenCV.cv`.
+ * @returns {Promise<number[]>} Zero-based indices of the empty pages, in ascending order.
+ */
+export async function detectEmptyPages(snapshot, whiteThreashold, OpenCV) {
+    const pdfDoc = await PDFJS.getDocument(snapshot).promise;
+
+    const emptyPages = [];
+    // pdf.js page numbers start at 1; the indices collected below are zero-based.
+    for (let i = 1; i <= pdfDoc.numPages; i++) {
+        const page = await pdfDoc.getPage(i);
+        console.log("Checking page " + i);
+
+        if(await hasText(page)) {
+            console.log(`Found text on Page ${i}, page is not empty`);
+            continue;
+        }
+
+        if(!await areImagesBlank(page, whiteThreashold)) {
+            console.log(`Found non white image on Page ${i}, page is not empty`);
+            continue;
+        }
+
+        console.log(`Page ${i} is empty.`);
+        emptyPages.push(i - 1);
+    }
+    return emptyPages;
+
+    // True when the page contains at least one text item.
+    async function hasText(page) {
+        const textContent = await page.getTextContent();
+        return textContent.items.length > 0;
+    }
+
+    // True when every image on the page is blank (also true for a page without images).
+    async function areImagesBlank(page, threshold) {
+        const images = await getImagesOnPage(page);
+        for (const image of images) {
+            if(!isImageBlank(image, threshold))
+                return false;
+        }
+        return true;
+    }
+
+    // Compares the mean grayscale value of one image against the threshold.
+    function isImageBlank(image, threshold) {
+        const cv = OpenCV.cv;
+
+        // Image data may hold 3 bytes per pixel (RGB) instead of 4 (RGBA); pick the matching
+        // layout so the pixel data is not misaligned inside the matrix.
+        const isRgb = image.data.length === image.width * image.height * 3;
+
+        // cv.Mat takes (rows, cols, type): height comes before width.
+        const src = new cv.Mat(image.height, image.width, isRgb ? cv.CV_8UC3 : cv.CV_8UC4);
+        const gray = new cv.Mat();
+        try {
+            src.data.set(image.data);
+            // Convert the image to grayscale
+            cv.cvtColor(src, gray, isRgb ? cv.COLOR_RGB2GRAY : cv.COLOR_RGBA2GRAY);
+
+            // Calculate the mean value of the grayscale image
+            const meanValue = cv.mean(gray);
+
+            // NOTE(review): an image counts as blank when its mean is at or BELOW the threshold.
+            // Given the parameter name "whiteThreashold" the comparison may be inverted — confirm
+            // the intended direction before changing it.
+            return meanValue[0] <= threshold;
+        } finally {
+            // Free the WASM-side memory even if one of the OpenCV calls throws.
+            src.delete();
+            gray.delete();
+        }
+    }
+}
--- a/shared-operations/functions/extractPages.js
+++ b/shared-operations/functions/extractPages.js
@@ -1,5 +1,6 @@

 import { PDFDocument } from 'pdf-lib';
+import { createSubDocument } from './createSubDocument';

 export async function extractPages(snapshot, pagesToExtractArray) {
    const pdfDoc = await PDFDocument.load(snapshot)
@@ -7,20 +8,3 @@ export async function extractPages(snapshot, pagesToExtractArray) {
    // TODO: invent a better format for pagesToExtractArray and convert it.
    return createSubDocument(pdfDoc, pagesToExtractArray);
 };
-
-export async function createSubDocument(pdfDoc, pagesToExtractArray) {
-    const subDocument = await PDFDocument.create();
-
-    // Check that array max number is not larger pdf pages number
-    if(Math.max(...pagesToExtractArray) >= pdfDoc.getPageCount()) {
-        throw new Error(`The PDF document only has ${pdfDoc.getPageCount()} pages and you tried to extract page ${Math.max(...pagesToExtractArray)}`);
-    }
-
-    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
-
-    for (let i = 0; i < copiedPages.length; i++) {
-        subDocument.addPage(copiedPages[i]);
-    }
-
-    return subDocument.save();
-}
--- a/shared-operations/functions/getImagesOnPage.js
+++ b/shared-operations/functions/getImagesOnPage.js
@@ -0,0 +1,14 @@
+
+import PDFJS from 'pdfjs-dist';
+
+/**
+ * Collects the image objects that a pdf.js page paints.
+ *
+ * @param page - pdf.js page; must provide `getOperatorList()` and `objs`.
+ * @returns {Promise<Array>} The image objects, in the order they are painted.
+ */
+export async function getImagesOnPage(page) {
+    const ops = await page.getOperatorList();
+
+    // Operator codes that paint an image. An operator that this pdf.js build does not define
+    // is dropped so it can never match by accident.
+    const imageOperators = [PDFJS.OPS.paintJpegXObject, PDFJS.OPS.paintImageXObject]
+        .filter(operator => operator !== undefined);
+
+    const images = [];
+    ops.fnArray.forEach((operator, index) => {
+        if (!imageOperators.includes(operator)) {
+            return;
+        }
+        // The first argument of an image operator is the id of the image object.
+        const imageId = ops.argsArray[index][0];
+
+        // NOTE(review): pdf.js keeps images shared across pages (ids prefixed with "g_") in
+        // `page.commonObjs` instead of `page.objs` — confirm against the pdfjs-dist version in use.
+        const isSharedImage = typeof imageId === "string" && imageId.startsWith("g_") && Boolean(page.commonObjs);
+        const store = isSharedImage ? page.commonObjs : page.objs;
+        images.push(store.get(imageId));
+    });
+    return images;
+}
--- a/shared-operations/functions/impose.js
+++ b/shared-operations/functions/impose.js
@@ -0,0 +1,12 @@
+/**
+ * Runs the pdfcpu "nup" command on a PDF through the given wrapper.
+ *
+ * @param snapshot - PDF data; forwarded to the wrapper unchanged.
+ * @param nup - N-up value; converted with `String()` before it is passed on.
+ * @param format - Value appended to the `f:` argument of the command.
+ * @param pdfcpuWraopper - Object whose `oneToOne(args, snapshot)` method executes the command.
+ * @returns Whatever `pdfcpuWraopper.oneToOne` resolves to.
+ */
+export async function impose(snapshot, nup, format, pdfcpuWraopper) {
+    const formatArgument = 'f:' + format;
+    const nupArgument = String(nup);
+
+    const commandLine = ["pdfcpu.wasm", "nup", "-c", "disable", formatArgument, "/output.pdf", nupArgument, "input.pdf"];
+
+    const result = await pdfcpuWraopper.oneToOne(commandLine, snapshot);
+    return result;
+}
--- a/shared-operations/functions/removeBlankPages.js
+++ b/shared-operations/functions/removeBlankPages.js
@@ -0,0 +1,18 @@
+import { PDFDocument } from 'pdf-lib';
+import { detectEmptyPages } from "./detectEmptyPages.js";
+
+/**
+ * Removes every page reported by `detectEmptyPages` from a PDF.
+ *
+ * @param snapshot - PDF data; passed to `detectEmptyPages` and to `PDFDocument.load`.
+ * @param whiteThreashold - Forwarded to `detectEmptyPages`.
+ * @param OpenCV - Forwarded to `detectEmptyPages`.
+ * @returns The result of `save()` on the document after the pages were removed.
+ */
+export async function removeBlankPages(snapshot, whiteThreashold, OpenCV) {
+    const blankPageIndices = await detectEmptyPages(snapshot, whiteThreashold, OpenCV);
+    console.log("Empty Pages: ", blankPageIndices);
+
+    const pdfDoc = await PDFDocument.load(snapshot);
+
+    // Walk the indices from last to first: removing a page shifts every later page down by one,
+    // so deleting back to front keeps the remaining indices pointing at the right pages.
+    for (let position = blankPageIndices.length - 1; position >= 0; position--) {
+        pdfDoc.removePage(blankPageIndices[position]);
+    }
+
+    return pdfDoc.save();
+};
--- a/shared-operations/functions/splitOn.js
+++ b/shared-operations/functions/splitOn.js
@@ -0,0 +1,121 @@
+import { detectEmptyPages } from "./shared/detectEmptyPages.js";
+import { getImagesOnPage } from "./shared/getImagesOnPage.js";
+import { createSubDocument } from "./shared/createSubDocument.js";
+import PDFJS from 'pdfjs-dist';
+
+/**
+ * @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType
+ */
+
+/**
+ * Splits a PDF into several documents, using detected separator pages as split points.
+ * The separator pages themselves are not included in any of the resulting documents.
+ *
+ * @param {Uint16Array} snapshot - PDF data; passed to `PDFJS.getDocument`, `detectEmptyPages` and `PDFLib.PDFDocument.load`.
+ *                                 NOTE(review): the annotated type is presumably meant to be a byte array — confirm.
+ * @param {SplitType} type - How separator pages are detected.
+ * @param whiteThreashold - Forwarded to `detectEmptyPages`; only used for "BLANK_PAGE".
+ * @param {import('opencv-wasm')} OpenCV - Forwarded to `detectEmptyPages`; only used for "BLANK_PAGE".
+ * @param PDFLib - Object providing `PDFDocument.load`, used to open the PDF for splitting.
+ * @param jsQR - QR decoder, called as `jsQR(data, width, height)`; only used for "QR_CODE".
+ * @returns {Promise<Array>} One entry per resulting document, each as returned by `createSubDocument`.
+ * @throws {Error} For "BAR_CODE" (not implemented yet) and for any unknown split type.
+ */
+export async function splitOn(snapshot, type, whiteThreashold, OpenCV, PDFLib, jsQR) {
+    // QR code content that marks a page as a separator page.
+    const separatorQRContent = "https://github.com/Frooodle/Stirling-PDF";
+
+    let splitAtPages = [];
+
+    switch (type) {
+        case "BAR_CODE":
+            // TODO: Implement
+            throw new Error("This split-type has not been implemented yet");
+
+        case "QR_CODE":
+            splitAtPages = await getPagesWithQRCode(snapshot);
+            break;
+
+        case "BLANK_PAGE":
+            splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, OpenCV);
+            break;
+
+        default:
+            throw new Error("An invalid split-type was provided.");
+    }
+
+    console.log("Split At Pages: ", splitAtPages);
+
+    // Remove detected Pages & Split
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
+    const numberOfPages = pdfDoc.getPages().length;
+
+    // A set makes the split independent of ordering and of duplicate entries in splitAtPages.
+    const separatorPages = new Set(splitAtPages);
+    const subDocuments = [];
+    let pagesArray = [];
+
+    for (let i = 0; i < numberOfPages; i++) {
+        if (!separatorPages.has(i)) {
+            pagesArray.push(i);
+            continue;
+        }
+        // Separator page: close the current run (if any) and skip the separator itself.
+        if (pagesArray.length > 0) {
+            subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
+            pagesArray = [];
+        }
+    }
+    if (pagesArray.length > 0) {
+        subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
+    }
+
+    return subDocuments;
+
+    // Returns the zero-based indices of all pages that carry the separator QR code.
+    async function getPagesWithQRCode(snapshot) {
+        const pdfDoc = await PDFJS.getDocument(snapshot).promise;
+
+        const pagesWithQR = [];
+        for (let i = 0; i < pdfDoc.numPages; i++) {
+            console.log("Page:", i, "/", pdfDoc.numPages);
+            // pdf.js page numbers start at 1.
+            const page = await pdfDoc.getPage(i + 1);
+
+            const images = await getImagesOnPage(page);
+            for (const image of images) {
+                if (checkForQROnImage(image) === separatorQRContent) {
+                    pagesWithQR.push(i);
+                    // One match is enough; recording the same page twice must be avoided.
+                    break;
+                }
+            }
+        }
+        if (pagesWithQR.length === 0) {
+            console.warn("Could not find any QR Codes in the provided PDF.")
+        }
+        return pagesWithQR;
+    }
+
+    // Returns the decoded QR content of the image, or null when jsQR finds no code.
+    function checkForQROnImage(image) {
+        // TODO: There is an issue with the jsQR package (The package expects rgba but sometimes we have rgb), and the package seems to be stale, we could create a fork and fix the issue. In the meanwhile we just force rgba:
+        const pixelCount = image.width * image.height;
+        let rgbaData = image.data;
+
+        // Check for rgb and convert to rgba. The conversion goes into a separate buffer so the
+        // image object handed out by pdf.js is not modified.
+        if (image.data.length === pixelCount * 3) {
+            rgbaData = new Uint8ClampedArray(pixelCount * 4);
+            for (let source = 0, target = 0; source < image.data.length; source += 3, target += 4) {
+                rgbaData[target] = image.data[source];         // Red channel
+                rgbaData[target + 1] = image.data[source + 1]; // Green channel
+                rgbaData[target + 2] = image.data[source + 2]; // Blue channel
+                rgbaData[target + 3] = 255;                    // Alpha channel (fully opaque)
+            }
+        }
+
+        const code = jsQR(rgbaData, image.width, image.height);
+        return code ? code.data : null;
+    }
+};
--- a/shared-operations/functions/splitPDF.js
+++ b/shared-operations/functions/splitPDF.js
@@ -1,7 +1,7 @@

 import { PDFDocument } from 'pdf-lib';

-import { createSubDocument } from "./extractPages.js";
+import { createSubDocument } from "./shared/extractPages.js";

 export async function splitPDF(snapshot, splitAfterPageArray) {
    const pdfDoc = await PDFDocument.load(snapshot)
@@ -18,7 +18,7 @@ export async function splitPDF(snapshot, splitAfterPageArray) {
            splitAfter = splitAfterPageArray.shift();
            pagesArray = [];
        }
-        pagesArray.push(i);        
+        pagesArray.push(i);
    }
    subDocuments.push(await createSubDocument(pdfDoc, pagesArray));
    pagesArray = [];