restructured project (shared-operations)

2023-11-07 01:40:00 +01:00
parent 7186c6c3e0
commit 47e0092378
36 changed files with 36 additions and 23 deletions
--- a/shared-operations/functions/editMetadata.js
+++ b/shared-operations/functions/editMetadata.js
@@ -0,0 +1,52 @@
+/**
+ * @typedef {Object} Metadata
+ * @property {string | null | undefined} Title - The title of the document.
+ * @property {string | null | undefined} Author - The author of the document.
+ * @property {string | null | undefined} Subject - The subject of the document.
+ * @property {string[] | null | undefined} Keywords - An array of keywords associated with the document.
+ * @property {string | null | undefined} Producer - The producer of the document.
+ * @property {string | null | undefined} Creator - The creator of the document.
+ * @property {Date | null | undefined} CreationDate - The date when the document was created.
+ * @property {Date | null | undefined} ModificationDate - The date when the document was last modified.
+ */
+
+/**
+ * Writes document-information metadata into a PDF and returns the re-serialized file.
+ *
+ * Properties that are `undefined` are skipped. For the text fields (Title, Author,
+ * Subject, Producer, Creator) `null` is converted to "" and for Keywords `null` is
+ * converted to [] before the pdf-lib setter is called, because those setters only
+ * accept a string / an array. CreationDate and ModificationDate are forwarded
+ * unchanged.
+ * NOTE(review): pdf-lib's date setters expect a Date instance, so `null` presumably
+ * cannot be used to clear a date through this function - confirm.
+ *
+ * @param {Uint8Array | ArrayBuffer | string} snapshot - The PDF, in a form accepted by `PDFLib.PDFDocument.load`.
+ * @param {Metadata} metadata - Set property to null or "" to clear, undefined properties will be skipped.
+ * @param {import('pdf-lib')} PDFLib
+ * @returns {Promise<Uint8Array>} The result of `pdfDoc.save()`.
+ */
+export async function editMetadata(snapshot, metadata, PDFLib) {
+    // Load the original PDF file
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot, {
+        parseSpeed: PDFLib.ParseSpeeds.Fastest,
+    });
+
+    // null means "clear": hand the setter an empty value of the type it expects.
+    const textOrEmpty = (value) => (value === null ? "" : value);
+
+    if (metadata.Title !== undefined)
+        pdfDoc.setTitle(textOrEmpty(metadata.Title));
+
+    if (metadata.Author !== undefined)
+        pdfDoc.setAuthor(textOrEmpty(metadata.Author));
+
+    if (metadata.Subject !== undefined)
+        pdfDoc.setSubject(textOrEmpty(metadata.Subject));
+
+    if (metadata.Keywords !== undefined)
+        pdfDoc.setKeywords(metadata.Keywords === null ? [] : metadata.Keywords);
+
+    if (metadata.Producer !== undefined)
+        pdfDoc.setProducer(textOrEmpty(metadata.Producer));
+
+    if (metadata.Creator !== undefined)
+        pdfDoc.setCreator(textOrEmpty(metadata.Creator));
+
+    if (metadata.CreationDate !== undefined)
+        pdfDoc.setCreationDate(metadata.CreationDate);
+
+    if (metadata.ModificationDate !== undefined)
+        pdfDoc.setModificationDate(metadata.ModificationDate);
+
+    // Serialize the modified document
+    return pdfDoc.save();
+};
--- a/shared-operations/functions/extractPages.js
+++ b/shared-operations/functions/extractPages.js
@@ -0,0 +1,8 @@
+import { createSubDocument } from "./shared/createSubDocument.js";
+
+/**
+ * Loads a PDF and delegates to `createSubDocument` to build a document from the selected pages.
+ *
+ * @param snapshot - PDF data handed to `PDFLib.PDFDocument.load`.
+ * @param pagesToExtractArray - Page selection, forwarded unchanged to `createSubDocument`.
+ * @param PDFLib - Object providing `PDFDocument.load`; also forwarded to `createSubDocument`.
+ * @returns Whatever `createSubDocument` returns for the loaded document.
+ */
+export async function extractPages(snapshot, pagesToExtractArray, PDFLib) {
+    // TODO: invent a better format for pagesToExtractArray and convert it.
+    const sourceDocument = await PDFLib.PDFDocument.load(snapshot);
+
+    return createSubDocument(sourceDocument, pagesToExtractArray, PDFLib);
+};
--- a/shared-operations/functions/impose.js
+++ b/shared-operations/functions/impose.js
@@ -0,0 +1,12 @@
+/**
+ * Invokes the `nup` command of the supplied pdfcpu wrapper on a PDF.
+ *
+ * @param snapshot - Input data, passed as the second argument of `oneToOne`.
+ * @param nup - Value placed (as a string) in the argument list after "/output.pdf".
+ * @param format - Appended to "f:" to form the description argument.
+ * @param pdfcpuWraopper - Object exposing `oneToOne(args, snapshot)`.
+ * @returns The awaited result of `pdfcpuWraopper.oneToOne`.
+ */
+export async function impose(snapshot, nup, format, pdfcpuWraopper) {
+    const description = 'f:' + format;
+    const commandLine = [
+        "pdfcpu.wasm",
+        "nup",
+        "-c",
+        "disable",
+        description,
+        "/output.pdf",
+        String(nup),
+        "input.pdf",
+    ];
+
+    const output = await pdfcpuWraopper.oneToOne(commandLine, snapshot);
+    return output;
+}
--- a/shared-operations/functions/mergePDFs.js
+++ b/shared-operations/functions/mergePDFs.js
@@ -0,0 +1,13 @@
+/**
+ * Concatenates several PDFs into one new document, in the order given.
+ *
+ * @param snapshots - List of PDFs, each handed to `PDFLib.PDFDocument.load`.
+ * @param PDFLib - Object providing `PDFDocument.create` and `PDFDocument.load`.
+ * @returns The result of calling `save()` on the merged document.
+ */
+export const mergePDFs = async (snapshots, PDFLib) => {
+    const target = await PDFLib.PDFDocument.create();
+
+    for (const snapshot of snapshots) {
+        const source = await PDFLib.PDFDocument.load(snapshot);
+
+        // Pages have to be copied into the target document before they can be added to it.
+        const transferred = await target.copyPages(source, source.getPageIndices());
+        for (const page of transferred) {
+            target.addPage(page);
+        }
+    }
+
+    return target.save();
+};
--- a/shared-operations/functions/organizePages.js
+++ b/shared-operations/functions/organizePages.js
@@ -0,0 +1,115 @@
+/**
+ * @typedef {"CUSTOM_PAGE_ORDER"|"REVERSE_ORDER"|"DUPLEX_SORT"|"BOOKLET_SORT"|"ODD_EVEN_SPLIT"|"REMOVE_FIRST"|"REMOVE_LAST"|"REMOVE_FIRST_AND_LAST"} OrderOperation
+ */
+
+/**
+ * Reorders or removes pages of a PDF and returns the re-serialized result.
+ *
+ * The reordering operations add copies of the source pages to a fresh document;
+ * the REMOVE_* operations delete pages from the loaded document itself and save that.
+ *
+ * Resulting order (zero-based source indices, n = page count):
+ *  - CUSTOM_PAGE_ORDER: the indices produced by `parseCustomPageOrder(customOrderString, n)`.
+ *  - REVERSE_ORDER: n-1, n-2, ..., 0.
+ *  - DUPLEX_SORT / BOOKLET_SORT: 0, n-1, 1, n-2, ... ; for odd n the middle page comes last, once.
+ *  - ODD_EVEN_SPLIT: pages at odd indices (2nd, 4th, ...) first, then pages at even indices.
+ *
+ * @param {Uint8Array | ArrayBuffer | string} snapshot - The PDF, in a form accepted by `PDFLib.PDFDocument.load`.
+ * @param {OrderOperation} operation
+ * @param {string} customOrderString - Only read for "CUSTOM_PAGE_ORDER".
+ * @param {import('pdf-lib')} PDFLib
+ * @returns The result of `save()` on the resulting document.
+ * @throws {Error} "Operation not supported" for an unknown operation.
+ */
+export async function organizePages(snapshot, operation, customOrderString, PDFLib) {
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
+    let subDocument = await PDFLib.PDFDocument.create();
+    const copiedPages = await subDocument.copyPages(pdfDoc, pdfDoc.getPageIndices());
+
+    const pageCount = pdfDoc.getPages().length;
+
+    switch (operation) {
+        case "CUSTOM_PAGE_ORDER": {
+            console.log("Custom Order");
+            const pageOrderArray = parseCustomPageOrder(customOrderString, pageCount);
+            console.log(pageOrderArray);
+
+            pageOrderArray.forEach((pageIndex) => subDocument.addPage(copiedPages[pageIndex]));
+            break;
+        }
+        case "REVERSE_ORDER": {
+            for (let i = pageCount - 1; i >= 0; i--) {
+                subDocument.addPage(copiedPages[i]);
+            }
+            break;
+        }
+        case 'DUPLEX_SORT': { //TODO: Needs to be checked by someone who knows more about duplex printing.
+            // Must be a whole number: with a fractional half (even page counts) the
+            // back-page condition below fails one iteration early and a page is lost.
+            const half = Math.floor((pageCount + 1) / 2);
+            for (let i = 1; i <= half; i++) {
+                subDocument.addPage(copiedPages[i - 1]);
+                if (i <= pageCount - half) {
+                    subDocument.addPage(copiedPages[pageCount - i]);
+                }
+            }
+            break;
+        }
+        case 'BOOKLET_SORT': {
+            // Pair pages from the outside in ...
+            const pairCount = Math.floor(pageCount / 2);
+            for (let i = 0; i < pairCount; i++) {
+                subDocument.addPage(copiedPages[i]);
+                subDocument.addPage(copiedPages[pageCount - i - 1]);
+            }
+            // ... and add the unpaired middle page of an odd-length document exactly once.
+            if (pageCount % 2 === 1) {
+                subDocument.addPage(copiedPages[pairCount]);
+            }
+            break;
+        }
+        case 'ODD_EVEN_SPLIT': {
+            const oddIndexPages = [];
+            const evenIndexPages = [];
+            for (let i = 0; i < pageCount; i++) {
+                if (i % 2 === 0) {
+                    evenIndexPages.push(copiedPages[i]);
+                } else {
+                    oddIndexPages.push(copiedPages[i]);
+                }
+            }
+            oddIndexPages.forEach((page) => subDocument.addPage(page));
+            evenIndexPages.forEach((page) => subDocument.addPage(page));
+            break;
+        }
+        case 'REMOVE_FIRST':
+            pdfDoc.removePage(0);
+            subDocument = pdfDoc;
+            break;
+        case 'REMOVE_LAST':
+            pdfDoc.removePage(pageCount - 1);
+            subDocument = pdfDoc;
+            break;
+        case 'REMOVE_FIRST_AND_LAST':
+            pdfDoc.removePage(0);
+            // One page is already gone, so the last page now sits at pageCount - 2.
+            pdfDoc.removePage(pageCount - 2);
+            subDocument = pdfDoc;
+            break;
+        default:
+            throw new Error("Operation not supported");
+    }
+
+    return subDocument.save();
+};
+
+/**
+ * Turns a comma-separated page-order string into a list of zero-based page indices.
+ *
+ * Each comma-separated token is handled by the first rule that matches:
+ *  - contains "-": treated as "start-end"; every page from start to end (inclusive) is added.
+ *  - contains "n": the numbers before and after the "n" are used as multipliers for
+ *    even and odd page numbers respectively, for every page number 1..pageCount.
+ *    NOTE(review): this can yield indices outside the document (e.g. -1 or >= pageCount);
+ *    the intended formula semantics should be confirmed.
+ *  - otherwise: a single one-based page number.
+ *
+ * No validation is performed; tokens that do not parse as numbers produce NaN-based results.
+ *
+ * @param {string} customOrder
+ * @param {number} pageCount
+ * @returns {number[]} Zero-based indices in the requested order.
+ */
+function parseCustomPageOrder(customOrder, pageCount) {
+    const indices = [];
+
+    for (const token of customOrder.split(',')) {
+        if (token.includes('-')) {
+            const [firstPage, lastPage] = token.split('-').map(Number);
+            for (let page = firstPage; page <= lastPage; page++) {
+                indices.push(page - 1);
+            }
+            continue;
+        }
+
+        if (token.includes('n')) {
+            const [evenFactor, oddFactor] = token.split('n').map(Number);
+            for (let page = 1; page <= pageCount; page++) {
+                const factor = page % 2 === 0 ? evenFactor : oddFactor;
+                indices.push((page * factor) - 1);
+            }
+            continue;
+        }
+
+        indices.push(Number(token) - 1);
+    }
+
+    return indices;
+}
--- a/shared-operations/functions/removeBlankPages.js
+++ b/shared-operations/functions/removeBlankPages.js
@@ -0,0 +1,17 @@
+import { detectEmptyPages } from "./shared/detectEmptyPages.js";
+
+/**
+ * Removes every page that `detectEmptyPages` reports as empty and returns the re-serialized PDF.
+ *
+ * @param snapshot - PDF data; passed to `detectEmptyPages` and to `PDFLib.PDFDocument.load`.
+ * @param whiteThreashold - Forwarded unchanged to `detectEmptyPages`.
+ * @param PDFJS - Forwarded unchanged to `detectEmptyPages`.
+ * @param OpenCV - Forwarded unchanged to `detectEmptyPages`.
+ * @param PDFLib - Object providing `PDFDocument.load`.
+ * @returns The result of `save()` on the document after removal.
+ */
+export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) {
+    const emptyPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
+    console.log("Empty Pages: ", emptyPages);
+
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
+
+    // Delete from the back of the list so the indices still to be processed keep
+    // pointing at the right pages: removing page 7 leaves page 5 at index 5,
+    // whereas removing page 5 first would shift page 7 to index 6.
+    for (const pageIndex of emptyPages.reverse()) {
+        pdfDoc.removePage(pageIndex);
+    }
+
+    return pdfDoc.save();
+};
--- a/shared-operations/functions/rotatePages.js
+++ b/shared-operations/functions/rotatePages.js
@@ -0,0 +1,16 @@
+/**
+ * Sets the same rotation on every page of a PDF and returns the re-serialized file.
+ *
+ * @param snapshot - PDF data handed to `PDFLib.PDFDocument.load`.
+ * @param rotation - Angle passed to `PDFLib.degrees` before it is applied to each page.
+ * @param PDFLib - Object providing `PDFDocument.load`, `ParseSpeeds` and `degrees`.
+ * @returns The result of `save()` on the modified document.
+ */
+export async function rotatePages (snapshot, rotation, PDFLib) {
+    const loadOptions = { parseSpeed: PDFLib.ParseSpeeds.Fastest };
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot, loadOptions);
+
+    for (const page of pdfDoc.getPages()) {
+        // The rotation is set to this value; it is not added to an existing rotation.
+        page.setRotation(PDFLib.degrees(rotation));
+    }
+
+    return pdfDoc.save();
+};
--- a/shared-operations/functions/scaleContent.js
+++ b/shared-operations/functions/scaleContent.js
@@ -0,0 +1,27 @@
+/**
+ * Scales the content of every page by a uniform factor, keeps the page size, and
+ * shifts the content by half of the size difference on each axis to re-center it.
+ *
+ * @param snapshot - PDF data handed to `PDFLib.PDFDocument.load`.
+ * @param scaleFactor - Factor applied to both axes.
+ * @param PDFLib - Object providing `PDFDocument.load` and `ParseSpeeds`.
+ * @returns The result of `save()` on the modified document.
+ */
+export async function scaleContent(snapshot, scaleFactor, PDFLib) {
+    const loadOptions = { parseSpeed: PDFLib.ParseSpeeds.Fastest };
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot, loadOptions);
+
+    for (const page of pdfDoc.getPages()) {
+        // Read the page size before touching the content.
+        const pageWidth = page.getWidth();
+        const pageHeight = page.getHeight();
+
+        page.scaleContent(scaleFactor, scaleFactor);
+
+        // Difference between the page and the scaled content, in whole units.
+        const spareWidth = Math.round(pageWidth - scaleFactor * pageWidth);
+        const spareHeight = Math.round(pageHeight - scaleFactor * pageHeight);
+
+        // Half of the difference on each side centers the content.
+        page.translateContent(Math.round(spareWidth / 2), Math.round(spareHeight / 2));
+    }
+
+    return pdfDoc.save();
+};
--- a/shared-operations/functions/scalePage.js
+++ b/shared-operations/functions/scalePage.js
@@ -0,0 +1,29 @@
+/**
+ * Sets every page of a PDF to the given size and returns the re-serialized file.
+ * Only `setSize` is called on each page; the page content is not scaled here.
+ *
+ * @param snapshot - PDF data handed to `PDFLib.PDFDocument.load`.
+ * @param pageSize - Object with `width` and `height`, applied to each page.
+ * @param PDFLib - Object providing `PDFDocument.load` and `ParseSpeeds`.
+ * @returns The result of `save()` on the modified document.
+ */
+export async function scalePage(snapshot, pageSize, PDFLib) {
+    const loadOptions = { parseSpeed: PDFLib.ParseSpeeds.Fastest };
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot, loadOptions);
+
+    for (const page of pdfDoc.getPages()) {
+        page.setSize(pageSize.width, pageSize.height);
+    }
+
+    return pdfDoc.save();
+};
+
+/**
+ * Predefined `{ width, height }` page dimensions.
+ * The letter entry equals 8.5 x 11 inches at 72 units per inch.
+ * NOTE(review): presumably meant as the `pageSize` argument of `scalePage`, in PDF points - confirm.
+ */
+export const PageSize = {
+    a4: { width: 594.96, height: 841.92 },
+    letter: { width: 612, height: 792 },
+};
--- a/shared-operations/functions/shared/createSubDocument.js
+++ b/shared-operations/functions/shared/createSubDocument.js
@@ -0,0 +1,16 @@
+/**
+ * Builds a new PDF containing copies of the selected pages of `pdfDoc`, in the order given.
+ *
+ * The selection is validated before any document is created, so an invalid index
+ * fails with a descriptive error instead of failing later inside `copyPages`.
+ *
+ * @param pdfDoc - Loaded source document; must provide `getPageCount()`.
+ * @param pagesToExtractArray - Zero-based page indices to copy. May be empty.
+ * @param PDFLib - Object providing `PDFDocument.create`.
+ * @returns The result of `save()` on the new document.
+ * @throws {Error} If an index is negative or not a number, or is not smaller than the page count.
+ */
+export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
+    const pageCount = pdfDoc.getPageCount();
+
+    // A plain loop instead of Math.max(...array): no argument-count limit for very
+    // long selections, and the lower bound is checked in the same pass.
+    let highestIndex = -Infinity;
+    for (const pageIndex of pagesToExtractArray) {
+        // Written as a negated ">=" so that NaN and undefined are rejected as well.
+        if (!(pageIndex >= 0)) {
+            throw new Error(`Invalid page index ${pageIndex}: page indices must be numbers greater than or equal to 0`);
+        }
+        highestIndex = Math.max(highestIndex, pageIndex);
+    }
+
+    // Check that array max number is not larger pdf pages number
+    if (highestIndex >= pageCount) {
+        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${highestIndex}`);
+    }
+
+    const subDocument = await PDFLib.PDFDocument.create();
+    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
+
+    for (const page of copiedPages) {
+        subDocument.addPage(page);
+    }
+
+    return subDocument.save();
+}
--- a/shared-operations/functions/shared/detectEmptyPages.js
+++ b/shared-operations/functions/shared/detectEmptyPages.js
@@ -0,0 +1,61 @@
+import { getImagesOnPage } from "./getImagesOnPage.js";
+
+/**
+ * Returns the zero-based indices of all pages considered empty.
+ *
+ * A page counts as empty when its text content has no items and every image on it
+ * (as returned by `getImagesOnPage`) is judged blank; a page without images and
+ * without text is therefore empty.
+ *
+ * @param snapshot - PDF data handed to `PDFJS.getDocument`.
+ * @param whiteThreashold - Threshold the mean gray value of each image is compared against.
+ * @param PDFJS - Object providing `getDocument`; also forwarded to `getImagesOnPage`.
+ * @param OpenCV - Object exposing the OpenCV API as `OpenCV.cv`.
+ * @returns {Promise<number[]>} Zero-based indices of the empty pages, ascending.
+ */
+export async function detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV) {
+    const pdfDoc = await PDFJS.getDocument(snapshot).promise;
+
+    const emptyPages = [];
+    // Pages are requested with one-based numbers; the result holds zero-based indices.
+    for (let i = 1; i <= pdfDoc.numPages; i++) {
+        const page = await pdfDoc.getPage(i);
+        console.log("Checking page " + i);
+
+        if (await hasText(page)) {
+            console.log(`Found text on Page ${i}, page is not empty`);
+            continue;
+        }
+
+        if (!await areImagesBlank(page, whiteThreashold)) {
+            console.log(`Found non white image on Page ${i}, page is not empty`);
+            continue;
+        }
+
+        console.log(`Page ${i} is empty.`);
+        emptyPages.push(i - 1);
+    }
+    return emptyPages;
+
+    // True when the page's text content contains at least one item.
+    async function hasText(page) {
+        const textContent = await page.getTextContent();
+        return textContent.items.length > 0;
+    }
+
+    // True when no image on the page is judged non-blank (also true for pages without images).
+    async function areImagesBlank(page, threshold) {
+        const images = await getImagesOnPage(page, PDFJS);
+        for (const image of images) {
+            if (!isImageBlank(image, threshold))
+                return false;
+        }
+        return true;
+    }
+
+    // Converts the image to grayscale and compares its mean value with the threshold.
+    function isImageBlank(image, threshold) {
+        // cv.Mat takes (rows, cols, type): rows correspond to the image height.
+        // NOTE(review): assumes image.data holds 4 bytes per pixel (RGBA) - confirm for RGB images.
+        const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
+        let gray;
+        try {
+            src.data.set(image.data);
+            // Convert the image to grayscale
+            gray = new OpenCV.cv.Mat();
+            OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);
+
+            // Calculate the mean value of the grayscale image
+            const meanValue = OpenCV.cv.mean(gray);
+
+            // NOTE(review): an image counts as blank when its mean is at or BELOW the
+            // threshold; for a "white" threshold one might expect the opposite - confirm.
+            return meanValue[0] <= threshold;
+        } finally {
+            // Mats live outside the JS garbage collector; free them even if a step above threw.
+            src.delete();
+            if (gray) {
+                gray.delete();
+            }
+        }
+    }
+}
--- a/shared-operations/functions/shared/getImagesOnPage.js
+++ b/shared-operations/functions/shared/getImagesOnPage.js
@@ -0,0 +1,11 @@
+/**
+ * Collects the image objects referenced by the image-painting operators of a page.
+ *
+ * For every entry of the page's operator list that equals `PDFJS.OPS.paintJpegXObject`
+ * or `PDFJS.OPS.paintImageXObject`, the operator's first argument is looked up in `page.objs`.
+ *
+ * @param page - Page object providing `getOperatorList()` and `objs.get(name)`.
+ * @param PDFJS - Object providing the `OPS` operator constants.
+ * @returns List of the looked-up image objects, in operator order.
+ */
+export async function getImagesOnPage(page, PDFJS) {
+    const operatorList = await page.getOperatorList();
+    const found = [];
+
+    operatorList.fnArray.forEach((operator, position) => {
+        const paintsImage = operator == PDFJS.OPS.paintJpegXObject || operator == PDFJS.OPS.paintImageXObject;
+        if (paintsImage) {
+            const objectName = operatorList.argsArray[position][0];
+            found.push(page.objs.get(objectName));
+        }
+    });
+
+    return found;
+}
--- a/shared-operations/functions/splitOn.js
+++ b/shared-operations/functions/splitOn.js
@@ -0,0 +1,120 @@
+import { detectEmptyPages } from "./shared/detectEmptyPages.js";
+import { getImagesOnPage } from "./shared/getImagesOnPage.js";
+import { createSubDocument } from "./shared/createSubDocument.js";
+
+/**
+ * @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType
+ */
+
+/**
+ * Splits a PDF into several documents at automatically detected separator pages.
+ *
+ * Separator pages are detected according to `type`, dropped from the output, and the
+ * runs of pages between them are each turned into a document via `createSubDocument`.
+ * Empty runs (for example two adjacent separator pages) produce no document.
+ *
+ * @param {Uint8Array | ArrayBuffer | string} snapshot - PDF data; handed to `PDFLib.PDFDocument.load` and to the detection step.
+ * @param {SplitType} type
+ * @param whiteThreashold - Forwarded to `detectEmptyPages`; only used for "BLANK_PAGE".
+ * @param PDFJS - Object providing `getDocument`; also forwarded to the image helpers.
+ * @param {import('opencv-wasm')} OpenCV - Forwarded to `detectEmptyPages`; only used for "BLANK_PAGE".
+ * @param PDFLib - Object providing `PDFDocument.load`; also forwarded to `createSubDocument`.
+ * @param jsQR - QR decoder called as `jsQR(data, width, height)`; only used for "QR_CODE".
+ * @returns {Promise<Array>} One `createSubDocument` result per run of non-separator pages.
+ * @throws {Error} For "BAR_CODE" (not implemented) and for unknown split types.
+ */
+export async function splitOn(snapshot, type, whiteThreashold, PDFJS, OpenCV, PDFLib, jsQR) {
+
+    let splitAtPages = [];
+
+    switch (type) {
+        case "BAR_CODE":
+            // TODO: Implement
+            throw new Error("This split-type has not been implemented yet");
+
+        case "QR_CODE":
+            splitAtPages = await getPagesWithQRCode(snapshot);
+            break;
+
+        case "BLANK_PAGE":
+            splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
+            break;
+
+        default:
+            throw new Error("An invalid split-type was provided.");
+    }
+
+    console.log("Split At Pages: ", splitAtPages);
+
+    // Remove detected Pages & Split
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
+
+    const numberOfPages = pdfDoc.getPages().length;
+
+    // A set lookup stays correct even if the detection step reports a page twice
+    // or out of order; a single "next separator" cursor would stall in that case.
+    const separatorPages = new Set(splitAtPages);
+    const subDocuments = [];
+    let pagesArray = [];
+
+    for (let i = 0; i < numberOfPages; i++) {
+        if (!separatorPages.has(i)) {
+            pagesArray.push(i);
+            continue;
+        }
+
+        // Separator page: it is skipped itself and closes the current run, if any.
+        if (pagesArray.length > 0) {
+            subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
+            pagesArray = [];
+        }
+    }
+    if (pagesArray.length > 0) {
+        subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
+    }
+
+    return subDocuments;
+
+    // Returns the zero-based indices of pages carrying an image with the separator QR code.
+    async function getPagesWithQRCode(snapshot) {
+        const pdfDoc = await PDFJS.getDocument(snapshot).promise;
+
+        const pagesWithQR = [];
+        for (let i = 0; i < pdfDoc.numPages; i++) {
+            const page = await pdfDoc.getPage(i + 1);
+
+            const images = await getImagesOnPage(page, PDFJS);
+            for (const image of images) {
+                const data = checkForQROnImage(image);
+                if (data === "https://github.com/Frooodle/Stirling-PDF") {
+                    pagesWithQR.push(i);
+                    // One match marks the page; further images must not add it again.
+                    break;
+                }
+            }
+        }
+        if (pagesWithQR.length === 0) {
+            console.warn("Could not find any QR Codes in the provided PDF.")
+        }
+        return pagesWithQR;
+    }
+
+    // Returns the decoded QR payload of the image, or null when none is found.
+    function checkForQROnImage(image) {
+        // TODO: There is an issue with the jsQR package (The package expects rgba but sometimes we have rgb), and the package seems to be stale, we could create a fork and fix the issue. In the meanwhile we just force rgba:
+        // Check for rgb and convert to rgba. The conversion works on a copy so the
+        // image object itself is left untouched.
+        let pixels = image.data;
+        const pixelCount = image.width * image.height;
+
+        if (pixels.length === pixelCount * 3) {
+            const rgba = new Uint8ClampedArray(pixelCount * 4);
+
+            // Iterate through the original array and add an alpha channel
+            for (let i = 0, j = 0; i < pixels.length; i += 3, j += 4) {
+                rgba[j] = pixels[i];         // Red channel
+                rgba[j + 1] = pixels[i + 1]; // Green channel
+                rgba[j + 2] = pixels[i + 2]; // Blue channel
+                rgba[j + 3] = 255;           // Alpha channel (fully opaque)
+            }
+
+            pixels = rgba;
+        }
+
+        const code = jsQR(pixels, image.width, image.height);
+        return code ? code.data : null;
+    }
+};
--- a/shared-operations/functions/splitPDF.js
+++ b/shared-operations/functions/splitPDF.js
@@ -0,0 +1,24 @@
+import { createSubDocument } from "./shared/createSubDocument.js";
+
+/**
+ * Splits a PDF into consecutive documents after the given page indices.
+ *
+ * Pages are collected in order; once the current page index exceeds the current
+ * split point, the collected pages are turned into a document via `createSubDocument`
+ * and the next split point becomes active. The remaining pages form the last document.
+ * The input array is only read, never modified.
+ *
+ * @param snapshot - PDF data handed to `PDFLib.PDFDocument.load`.
+ * @param {number[]} splitAfterPageArray - Zero-based page indices after which to split;
+ *        NOTE(review): presumably expected in ascending order - confirm with callers.
+ * @param PDFLib - Object providing `PDFDocument.load`; also forwarded to `createSubDocument`.
+ * @returns {Promise<Array>} One `createSubDocument` result per part, in page order.
+ */
+export async function splitPDF(snapshot, splitAfterPageArray, PDFLib) {
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
+
+    const numberOfPages = pdfDoc.getPages().length;
+
+    // Walk the split points with a cursor instead of shift() so the caller's array
+    // stays intact. Reading past the end yields undefined, which never compares as
+    // smaller than a page index, so no further splits happen.
+    let nextSplitPosition = 0;
+    let splitAfter = splitAfterPageArray[nextSplitPosition++];
+
+    let pagesArray = [];
+    const subDocuments = [];
+
+    for (let i = 0; i < numberOfPages; i++) {
+        if (i > splitAfter && pagesArray.length > 0) {
+            subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
+            splitAfter = splitAfterPageArray[nextSplitPosition++];
+            pagesArray = [];
+        }
+        pagesArray.push(i);
+    }
+    subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
+
+    return subDocuments;
+};