restructured project (shared-operations)

This commit is contained in:
Felix Kaspar
2023-11-07 01:40:00 +01:00
parent 7186c6c3e0
commit 47e0092378
36 changed files with 36 additions and 23 deletions

View File

@@ -0,0 +1,52 @@
/**
* @typedef {Object} Metadata
* @property {string | null | undefined} Title - The title of the document.
* @property {string | null | undefined} Author - The author of the document.
* @property {string | null | undefined} Subject - The subject of the document.
* @property {string[] | null | undefined} Keywords - An array of keywords associated with the document.
* @property {string | null | undefined} Producer - The producer of the document.
* @property {string | null | undefined} Creator - The creator of the document.
* @property {Date | null | undefined} CreationDate - The date when the document was created.
* @property {Date | null | undefined} ModificationDate - The date when the document was last modified.
*/
/**
 * Applies the provided metadata fields to a PDF and returns the re-serialized document.
 *
 * Properties that are `undefined` are skipped. For the text properties a value of
 * `null` is treated like `""`, and for `Keywords` a value of `null` is treated like
 * an empty array, so that "set to null to clear" does not hand `null` to setters
 * that expect a string / array.
 *
 * NOTE(review): `CreationDate` / `ModificationDate` are forwarded unchanged. pdf-lib's
 * date setters appear to require a `Date`, so `null` most likely cannot clear them
 * through this function — confirm and decide on the desired behavior.
 *
 * @param {Uint8Array | ArrayBuffer | string} snapshot - PDF data handed to `PDFDocument.load`.
 * @param {Metadata} metadata - Set property to null or "" to clear, undefined properties will be skipped.
 * @param {import('pdf-lib')} PDFLib
 * @returns {Promise<Uint8Array>} Whatever `pdfDoc.save()` resolves to (the serialized document).
 */
export async function editMetadata(snapshot, metadata, PDFLib) {
    // Load the original PDF file
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot, {
        parseSpeed: PDFLib.ParseSpeeds.Fastest,
    });

    // null means "clear": substitute the empty value of the matching type.
    const orEmpty = (value, emptyValue) => (value === null ? emptyValue : value);

    if (metadata.Title !== undefined) {
        pdfDoc.setTitle(orEmpty(metadata.Title, ""));
    }
    if (metadata.Author !== undefined) {
        pdfDoc.setAuthor(orEmpty(metadata.Author, ""));
    }
    if (metadata.Subject !== undefined) {
        pdfDoc.setSubject(orEmpty(metadata.Subject, ""));
    }
    if (metadata.Keywords !== undefined) {
        pdfDoc.setKeywords(orEmpty(metadata.Keywords, []));
    }
    if (metadata.Producer !== undefined) {
        pdfDoc.setProducer(orEmpty(metadata.Producer, ""));
    }
    if (metadata.Creator !== undefined) {
        pdfDoc.setCreator(orEmpty(metadata.Creator, ""));
    }
    if (metadata.CreationDate !== undefined) {
        pdfDoc.setCreationDate(metadata.CreationDate);
    }
    if (metadata.ModificationDate !== undefined) {
        pdfDoc.setModificationDate(metadata.ModificationDate);
    }

    // Serialize the modified document
    return pdfDoc.save();
}

View File

@@ -0,0 +1,8 @@
import { createSubDocument } from "./shared/createSubDocument.js";
/**
 * Loads the PDF and delegates to `createSubDocument` to build a document that
 * contains only the requested pages.
 *
 * @param snapshot - PDF data handed to `PDFDocument.load`.
 * @param {number[]} pagesToExtractArray - Page indices forwarded unchanged to `createSubDocument`.
 * @param PDFLib - The pdf-lib module.
 * @returns The value returned by `createSubDocument`.
 */
export async function extractPages(snapshot, pagesToExtractArray, PDFLib) {
    const sourceDocument = await PDFLib.PDFDocument.load(snapshot);

    // TODO: invent a better format for pagesToExtractArray and convert it.
    const extracted = createSubDocument(sourceDocument, pagesToExtractArray, PDFLib);
    return extracted;
}

View File

@@ -0,0 +1,12 @@
/**
 * Runs a pdfcpu "nup" command through the given wrapper's `oneToOne` method.
 *
 * @param snapshot - Input document, passed as the second argument of `oneToOne`.
 * @param nup - Number placed on the command line (converted with `String`).
 * @param format - Appended to the "f:" description argument.
 * @param pdfcpuWraopper - Object exposing `oneToOne(args, snapshot)`.
 * @returns The awaited result of `oneToOne`.
 */
export async function impose(snapshot, nup, format, pdfcpuWraopper) {
    const description = `f:${format}`;
    const commandLine = [
        "pdfcpu.wasm",
        "nup",
        "-c",
        "disable",
        description,
        "/output.pdf",
        String(nup),
        "input.pdf",
    ];
    const output = await pdfcpuWraopper.oneToOne(commandLine, snapshot);
    return output;
}

View File

@@ -0,0 +1,13 @@
/**
 * Concatenates several PDFs into one new document, in the order given.
 *
 * @param {Array} snapshots - PDF inputs, each handed to `PDFDocument.load`.
 * @param PDFLib - The pdf-lib module.
 * @returns The result of `save()` on the merged document.
 */
export const mergePDFs = async (snapshots, PDFLib) => {
    const combined = await PDFLib.PDFDocument.create();

    // Documents are processed one after another so the page order follows `snapshots`.
    for (const snapshot of snapshots) {
        const source = await PDFLib.PDFDocument.load(snapshot);
        const pages = await combined.copyPages(source, source.getPageIndices());
        for (const page of pages) {
            combined.addPage(page);
        }
    }

    return combined.save();
};

View File

@@ -0,0 +1,115 @@
/**
* @typedef {"CUSTOM_PAGE_ORDER"|"REVERSE_ORDER"|"DUPLEX_SORT"|"BOOKLET_SORT"|"ODD_EVEN_SPLIT"|"REMOVE_FIRST"|"REMOVE_LAST"|"REMOVE_FIRST_AND_LAST"} OrderOperation
*/
/**
 * Reorders or removes pages of a PDF according to a preset operation.
 *
 * - "REMOVE_FIRST" / "REMOVE_LAST" / "REMOVE_FIRST_AND_LAST" remove pages from the
 *   loaded document and save it.
 * - All other operations compute a list of zero-based page indices, copy exactly
 *   those pages into a new document in that order, and save the new document.
 *
 * Orderings produced (shown as 1-based page numbers for a 5-page input):
 * - "REVERSE_ORDER":  5, 4, 3, 2, 1
 * - "DUPLEX_SORT":    1, 5, 2, 4, 3
 * - "BOOKLET_SORT":   1, 5, 2, 4, 3 (the middle page of an odd count is emitted once)
 * - "ODD_EVEN_SPLIT": 1, 3, 5, 2, 4 (odd page numbers first, then even page numbers)
 *
 * @param {Uint8Array | ArrayBuffer | string} snapshot - PDF data handed to `PDFDocument.load`.
 * @param {OrderOperation} operation
 * @param {string} customOrderString - Only read for "CUSTOM_PAGE_ORDER"; parsed by `parseCustomPageOrder`.
 * @param {import('pdf-lib')} PDFLib
 * @returns {Promise<Uint8Array>} Whatever `save()` resolves to for the resulting document.
 * @throws {Error} "Operation not supported" for an unknown operation, or when the
 *   computed order references a page that does not exist.
 */
export async function organizePages(snapshot, operation, customOrderString, PDFLib) {
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
    const pageCount = pdfDoc.getPages().length;

    // Removal presets edit the loaded document directly; nothing has to be copied.
    switch (operation) {
        case "REMOVE_FIRST":
            pdfDoc.removePage(0);
            return pdfDoc.save();
        case "REMOVE_LAST":
            pdfDoc.removePage(pageCount - 1);
            return pdfDoc.save();
        case "REMOVE_FIRST_AND_LAST":
            pdfDoc.removePage(0);
            // One page is already gone, so the last page now sits at pageCount - 2.
            pdfDoc.removePage(pageCount - 2);
            return pdfDoc.save();
        default:
            break;
    }

    const pageOrder = buildPageOrder();
    for (const pageIndex of pageOrder) {
        if (!Number.isInteger(pageIndex) || pageIndex < 0 || pageIndex >= pageCount) {
            throw new Error(`Invalid page in page order: ${pageIndex + 1} (the document has ${pageCount} pages)`);
        }
    }

    const subDocument = await PDFLib.PDFDocument.create();
    const copiedPages = await subDocument.copyPages(pdfDoc, pageOrder);
    copiedPages.forEach((page) => subDocument.addPage(page));
    return subDocument.save();

    // Computes the zero-based output order for the reordering operations.
    function buildPageOrder() {
        const order = [];
        switch (operation) {
            case "CUSTOM_PAGE_ORDER":
                return parseCustomPageOrder(customOrderString, pageCount);
            case "REVERSE_ORDER":
                for (let i = pageCount - 1; i >= 0; i--) {
                    order.push(i);
                }
                return order;
            case "DUPLEX_SORT": {
                // TODO: Needs to be checked by someone who knows more about duplex printing.
                // `half` must be an integer; a fractional value dropped a page for even counts.
                const half = Math.floor((pageCount + 1) / 2);
                for (let i = 1; i <= half; i++) {
                    order.push(i - 1);
                    if (i <= pageCount - half) {
                        order.push(pageCount - i);
                    }
                }
                return order;
            }
            case "BOOKLET_SORT": {
                const pairCount = Math.floor(pageCount / 2);
                for (let i = 0; i < pairCount; i++) {
                    order.push(i, pageCount - i - 1);
                }
                // An odd page count leaves one unpaired middle page; emit it exactly once.
                if (pageCount % 2 === 1) {
                    order.push(pairCount);
                }
                return order;
            }
            case "ODD_EVEN_SPLIT":
                // Index 0 is page 1, so even indices are the odd-numbered pages.
                for (let i = 0; i < pageCount; i += 2) {
                    order.push(i);
                }
                for (let i = 1; i < pageCount; i += 2) {
                    order.push(i);
                }
                return order;
            default:
                throw new Error("Operation not supported");
        }
    }
}
/**
 * Expands a comma-separated page selection into a list of zero-based page indices.
 *
 * Each comma-separated entry is handled as follows:
 * - contains "-": treated as "start-end"; every page from start to end (inclusive,
 *   ascending only) is added as `page - 1`.
 * - otherwise contains "n": the numbers before and after the "n" are used as
 *   multipliers for even and odd page numbers respectively, for every page from
 *   1 to `pageCount`.
 * - otherwise: the entry is converted with `Number` and added as `value - 1`.
 *
 * No range checking is done here; results may be negative, NaN or beyond the last page.
 *
 * NOTE(review): the "n" handling does not look like a conventional "an+b" page
 * function, and entries such as "2n-1" are caught by the "-" branch first —
 * confirm the intended syntax.
 *
 * @param {string} customOrder - Comma-separated selection with 1-based page numbers.
 * @param {number} pageCount - Number of pages; only used by the "n" form.
 * @returns {number[]} Zero-based page indices in the order they were listed.
 */
function parseCustomPageOrder(customOrder, pageCount) {
    const indices = [];

    for (const entry of customOrder.split(',')) {
        if (entry.includes('-')) {
            const bounds = entry.split('-').map(Number);
            const first = bounds[0];
            const last = bounds[1];
            for (let page = first; page <= last; page++) {
                indices.push(page - 1);
            }
            continue;
        }

        if (entry.includes('n')) {
            const factors = entry.split('n').map(Number);
            const evenFactor = factors[0];
            const oddFactor = factors[1];
            for (let page = 1; page <= pageCount; page++) {
                const factor = page % 2 === 0 ? evenFactor : oddFactor;
                indices.push((page * factor) - 1);
            }
            continue;
        }

        indices.push(Number(entry) - 1);
    }

    return indices;
}

View File

@@ -0,0 +1,17 @@
import { detectEmptyPages } from "./shared/detectEmptyPages.js";
/**
 * Removes every page reported by `detectEmptyPages` from the PDF.
 *
 * @param snapshot - PDF data, used both for detection and for `PDFDocument.load`.
 * @param whiteThreashold - Forwarded unchanged to `detectEmptyPages`.
 * @param PDFJS - Forwarded to `detectEmptyPages`.
 * @param OpenCV - Forwarded to `detectEmptyPages`.
 * @param PDFLib - The pdf-lib module used to load and save the document.
 * @returns The result of `save()` on the modified document.
 */
export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) {
    const blankPageIndices = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
    console.log("Empty Pages: ", blankPageIndices);

    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);

    // Walk the list from its end so earlier removals do not shift the indices that
    // are still pending (e.g. deleting page 5 would turn page 7 into page 6).
    for (let position = blankPageIndices.length - 1; position >= 0; position--) {
        pdfDoc.removePage(blankPageIndices[position]);
    }

    return pdfDoc.save();
}

View File

@@ -0,0 +1,16 @@
/**
 * Calls `setRotation` with `PDFLib.degrees(rotation)` on every page of the PDF.
 *
 * NOTE(review): pdf-lib's `setRotation` presumably sets an absolute angle rather
 * than adding to the page's current rotation — confirm against the pdf-lib docs.
 *
 * @param snapshot - PDF data handed to `PDFDocument.load`.
 * @param rotation - Angle passed to `PDFLib.degrees`.
 * @param PDFLib - The pdf-lib module.
 * @returns The result of `save()` on the modified document.
 */
export async function rotatePages(snapshot, rotation, PDFLib) {
    const loadOptions = { parseSpeed: PDFLib.ParseSpeeds.Fastest };
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot, loadOptions);

    for (const page of pdfDoc.getPages()) {
        // Apply the requested rotation to this page
        page.setRotation(PDFLib.degrees(rotation));
    }

    // Serialize the modified document
    return pdfDoc.save();
}

View File

@@ -0,0 +1,27 @@
/**
 * Scales the content of every page by `scaleFactor` on both axes and then shifts
 * it by half of the (rounded) size difference so it is centered on the page.
 *
 * @param snapshot - PDF data handed to `PDFDocument.load`.
 * @param {number} scaleFactor - Factor passed to `page.scaleContent` for x and y.
 * @param PDFLib - The pdf-lib module.
 * @returns The result of `save()` on the modified document.
 */
export async function scaleContent(snapshot, scaleFactor, PDFLib) {
    const loadOptions = { parseSpeed: PDFLib.ParseSpeeds.Fastest };
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot, loadOptions);

    for (const page of pdfDoc.getPages()) {
        // Read the page size before touching the content
        const pageWidth = page.getWidth();
        const pageHeight = page.getHeight();

        page.scaleContent(scaleFactor, scaleFactor);

        // Space freed (or overflowed) by scaling, rounded to whole units
        const freeWidth = Math.round(pageWidth - scaleFactor * pageWidth);
        const freeHeight = Math.round(pageHeight - scaleFactor * pageHeight);

        // Half of that space on each side centers the content
        const offsetX = Math.round(freeWidth / 2);
        const offsetY = Math.round(freeHeight / 2);
        page.translateContent(offsetX, offsetY);
    }

    // Serialize the modified document
    return pdfDoc.save();
}

View File

@@ -0,0 +1,29 @@
/**
 * Sets every page of the PDF to the given size via `page.setSize`.
 *
 * @param snapshot - PDF data handed to `PDFDocument.load`.
 * @param {{width: number, height: number}} pageSize - Target size applied to all pages.
 * @param PDFLib - The pdf-lib module.
 * @returns The result of `save()` on the modified document.
 */
export async function scalePage(snapshot, pageSize, PDFLib) {
    const loadOptions = { parseSpeed: PDFLib.ParseSpeeds.Fastest };
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot, loadOptions);

    for (const page of pdfDoc.getPages()) {
        // Apply the requested page size
        page.setSize(pageSize.width, pageSize.height);
    }

    // Serialize the modified document
    return pdfDoc.save();
}
/**
 * Named page sizes in the `{ width, height }` shape that `scalePage` reads.
 *
 * The letter entry equals 8.5 x 11 inches at 72 units per inch, so the unit is
 * presumably PDF points.
 * NOTE(review): 210 mm x 297 mm converts to roughly 595.28 x 841.89 points; the
 * a4 values below differ slightly from that — confirm whether this is intended.
 */
export const PageSize = {
    a4: { width: 594.96, height: 841.92 },
    letter: { width: 612, height: 792 },
};

View File

@@ -0,0 +1,16 @@
/**
 * Builds a new PDF that contains the listed pages of `pdfDoc`, in the listed order.
 *
 * All indices are validated before anything is copied, so an invalid request fails
 * with a descriptive error instead of an error from inside pdf-lib.
 *
 * @param pdfDoc - Loaded pdf-lib document to copy pages from.
 * @param {number[]} pagesToExtractArray - Zero-based page indices; may be empty.
 * @param {import('pdf-lib')} PDFLib
 * @returns {Promise<Uint8Array>} Whatever `save()` resolves to for the new document.
 * @throws {Error} If an index is negative, not an integer, or not smaller than the page count.
 */
export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
    const pageCount = pdfDoc.getPageCount();

    // Single pass: reject malformed indices and remember the highest one.
    // (A loop avoids spreading a potentially large array into Math.max.)
    let highestIndex = -Infinity;
    for (const rawIndex of pagesToExtractArray) {
        const pageIndex = Number(rawIndex);
        if (!Number.isInteger(pageIndex) || pageIndex < 0) {
            throw new Error(`Invalid page index ${rawIndex}: page indices must be non-negative integers`);
        }
        if (pageIndex > highestIndex) {
            highestIndex = pageIndex;
        }
    }

    // Check that array max number is not larger pdf pages number
    if (highestIndex >= pageCount) {
        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${highestIndex}`);
    }

    const subDocument = await PDFLib.PDFDocument.create();
    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
    for (const page of copiedPages) {
        subDocument.addPage(page);
    }

    return subDocument.save();
}

View File

@@ -0,0 +1,61 @@
import { getImagesOnPage } from "./getImagesOnPage.js";
/**
 * Finds the pages of a PDF that are considered empty.
 *
 * A page is reported as empty when its text content has no items and every image
 * returned by `getImagesOnPage` is judged blank by `isImageBlank`.
 *
 * @param snapshot - PDF data handed to `PDFJS.getDocument`.
 * @param {number} whiteThreashold - Threshold the mean gray value of each image is compared against.
 * @param PDFJS - The pdf.js module.
 * @param OpenCV - Object exposing the OpenCV bindings as `OpenCV.cv`.
 * @returns {Promise<number[]>} Zero-based indices of the empty pages, in ascending order.
 */
export async function detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV) {
    const pdfDoc = await PDFJS.getDocument(snapshot).promise;
    const emptyPages = [];

    // pdf.js page numbers start at 1; the returned indices start at 0.
    for (let pageNumber = 1; pageNumber <= pdfDoc.numPages; pageNumber++) {
        const page = await pdfDoc.getPage(pageNumber);
        console.log("Checking page " + pageNumber);

        if (await hasText(page)) {
            console.log(`Found text on Page ${pageNumber}, page is not empty`);
            continue;
        }

        if (!await areImagesBlank(page, whiteThreashold)) {
            console.log(`Found non white image on Page ${pageNumber}, page is not empty`);
            continue;
        }

        console.log(`Page ${pageNumber} is empty.`);
        emptyPages.push(pageNumber - 1);
    }

    return emptyPages;

    // True when the page's text content contains at least one item.
    async function hasText(page) {
        const textContent = await page.getTextContent();
        return textContent.items.length > 0;
    }

    // True when every image on the page is blank (also true for a page without images).
    async function areImagesBlank(page, threshold) {
        const images = await getImagesOnPage(page, PDFJS);
        for (const image of images) {
            if (!isImageBlank(image, threshold)) {
                return false;
            }
        }
        return true;
    }

    // Compares the mean gray value of one image against the threshold.
    function isImageBlank(image, threshold) {
        const cv = OpenCV.cv;

        // Image data may be RGB (3 bytes per pixel) instead of RGBA; pick the matching layout.
        const isRgb = image.data.length === image.width * image.height * 3;

        // cv.Mat takes (rows, cols, type): rows is the image height, cols its width.
        const src = new cv.Mat(image.height, image.width, isRgb ? cv.CV_8UC3 : cv.CV_8UC4);
        const gray = new cv.Mat();
        try {
            src.data.set(image.data);

            // Convert the image to grayscale
            cv.cvtColor(src, gray, isRgb ? cv.COLOR_RGB2GRAY : cv.COLOR_RGBA2GRAY);

            // Calculate the mean value of the grayscale image
            const meanValue = cv.mean(gray);

            // NOTE(review): the image counts as blank when the mean is AT OR BELOW the
            // threshold. For a threshold expressed in terms of white one would expect
            // the opposite comparison — confirm the intended meaning of the threshold.
            return meanValue[0] <= threshold;
        } finally {
            // Free the Mats even if OpenCV throws.
            src.delete();
            gray.delete();
        }
    }
}

View File

@@ -0,0 +1,11 @@
/**
 * Collects the image objects painted on a pdf.js page.
 *
 * Walks the page's operator list and, for every `paintJpegXObject` or
 * `paintImageXObject` operation, looks up the object named by the operation's
 * first argument in `page.objs`.
 *
 * @param page - pdf.js page; must provide `getOperatorList()` and `objs.get(name)`.
 * @param PDFJS - The pdf.js module; its `OPS` table supplies the operator codes.
 * @returns {Promise<Array>} The image objects in the order they are painted.
 */
export async function getImagesOnPage(page, PDFJS) {
    const operatorList = await page.getOperatorList();
    const { paintJpegXObject, paintImageXObject } = PDFJS.OPS;
    const images = [];

    for (let index = 0; index < operatorList.fnArray.length; index++) {
        const operator = operatorList.fnArray[index];
        if (operator === paintJpegXObject || operator === paintImageXObject) {
            // The first argument of a paint-image operation is the object's name.
            images.push(page.objs.get(operatorList.argsArray[index][0]));
        }
    }

    return images;
}

View File

@@ -0,0 +1,120 @@
import { detectEmptyPages } from "./shared/detectEmptyPages.js";
import { getImagesOnPage } from "./shared/getImagesOnPage.js";
import { createSubDocument } from "./shared/createSubDocument.js";
/**
* @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType
*/
/**
 * Splits a PDF into several documents at separator pages.
 *
 * Separator pages are detected according to `type`, are themselves left out of the
 * output, and every run of pages between two separators becomes one document.
 * Runs without any pages produce no document.
 *
 * @param {Uint8Array | ArrayBuffer | string} snapshot - PDF data, handed to pdf.js and pdf-lib.
 * @param {SplitType} type - "QR_CODE" and "BLANK_PAGE" are implemented; "BAR_CODE" throws.
 * @param {number} whiteThreashold - Forwarded to `detectEmptyPages` for "BLANK_PAGE".
 * @param {} PDFJS
 * @param {import('opencv-wasm')} OpenCV
 * @param {} PDFLib
 * @param {Function} jsQR - Called as `jsQR(data, width, height)`; its result's `data` is the decoded text.
 * @returns {Promise<Array>} One `createSubDocument` result per run of pages.
 * @throws {Error} For "BAR_CODE" (not implemented) and for unknown split types.
 */
export async function splitOn(snapshot, type, whiteThreashold, PDFJS, OpenCV, PDFLib, jsQR) {
    // Text a QR code must contain for its page to count as a separator.
    const SEPARATOR_QR_TEXT = "https://github.com/Frooodle/Stirling-PDF";

    let splitAtPages = [];
    switch (type) {
        case "BAR_CODE":
            // TODO: Implement
            throw new Error("This split-type has not been implemented yet");
        case "QR_CODE":
            splitAtPages = await getPagesWithQRCode(snapshot);
            break;
        case "BLANK_PAGE":
            splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
            break;
        default:
            throw new Error("An invalid split-type was provided.");
    }
    console.log("Split At Pages: ", splitAtPages);

    // A set makes the split independent of duplicates and ordering in the detected list.
    const separatorPages = new Set(splitAtPages);

    // Remove detected Pages & Split
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
    const numberOfPages = pdfDoc.getPages().length;
    const subDocuments = [];
    let pagesArray = [];

    for (let i = 0; i < numberOfPages; i++) {
        if (!separatorPages.has(i)) {
            pagesArray.push(i);
            continue;
        }
        // Separator page: close the current run (if any) and skip the page itself.
        if (pagesArray.length > 0) {
            subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
            pagesArray = [];
        }
    }
    if (pagesArray.length > 0) {
        subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
    }

    return subDocuments;

    // Returns the zero-based indices of all pages showing the separator QR code.
    async function getPagesWithQRCode(snapshot) {
        const pdfDoc = await PDFJS.getDocument(snapshot).promise;
        const pagesWithQR = [];

        for (let i = 0; i < pdfDoc.numPages; i++) {
            const page = await pdfDoc.getPage(i + 1);
            const images = await getImagesOnPage(page, PDFJS);
            for (const image of images) {
                if (checkForQROnImage(image) === SEPARATOR_QR_TEXT) {
                    pagesWithQR.push(i);
                    // One hit is enough; listing a page twice must not happen.
                    break;
                }
            }
        }

        if (pagesWithQR.length === 0) {
            console.warn("Could not find any QR Codes in the provided PDF.")
        }
        return pagesWithQR;
    }

    // Returns the text of the QR code found on the image, or null if there is none.
    function checkForQROnImage(image) {
        // TODO: There is an issue with the jsQR package (The package expects rgba but sometimes we have rgb), and the package seems to be stale, we could create a fork and fix the issue. In the meanwhile we just force rgba:
        let pixels = image.data;

        // Check for rgb and convert to rgba (into a new buffer; the image object itself stays untouched)
        if (pixels.length === image.width * image.height * 3) {
            const rgba = new Uint8ClampedArray(image.width * image.height * 4);
            for (let i = 0, j = 0; i < pixels.length; i += 3, j += 4) {
                rgba[j] = pixels[i]; // Red channel
                rgba[j + 1] = pixels[i + 1]; // Green channel
                rgba[j + 2] = pixels[i + 2]; // Blue channel
                rgba[j + 3] = 255; // Alpha channel (fully opaque)
            }
            pixels = rgba;
        }

        const code = jsQR(pixels, image.width, image.height);
        return code ? code.data : null;
    }
}

View File

@@ -0,0 +1,24 @@
import { createSubDocument } from "./shared/createSubDocument.js";
/**
 * Splits a PDF into consecutive documents after the given pages.
 *
 * Pages are collected in order; once the current page index is greater than the
 * next pending split point, the pages collected so far become one document. The
 * remaining pages always form the final document.
 *
 * @param snapshot - PDF data handed to `PDFDocument.load`.
 * @param {number[]} splitAfterPageArray - Zero-based page indices after which to split,
 *   expected in ascending order. The array is not modified.
 * @param PDFLib - The pdf-lib module.
 * @returns {Promise<Array>} One `createSubDocument` result per part, in page order.
 */
export async function splitPDF(snapshot, splitAfterPageArray, PDFLib) {
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
    const numberOfPages = pdfDoc.getPages().length;

    // Consume a private copy so the caller's array is left untouched.
    const pendingSplits = Array.from(splitAfterPageArray);
    let splitAfter = pendingSplits.shift();

    const subDocuments = [];
    let pagesArray = [];

    for (let i = 0; i < numberOfPages; i++) {
        // Once all split points are used up `splitAfter` is undefined and this is never true.
        if (i > splitAfter && pagesArray.length > 0) {
            subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
            splitAfter = pendingSplits.shift();
            pagesArray = [];
        }
        pagesArray.push(i);
    }

    // Whatever is left after the last split point forms the final document.
    subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));

    return subDocuments;
}
};