Merge branch 'stirling-pdf-rewrite' into version-2
This commit is contained in:
19
shared-operations/functions/createSubDocument.js
Normal file
19
shared-operations/functions/createSubDocument.js
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
|
||||
/**
 * Builds a new PDF document that contains only the requested pages of `pdfDoc`.
 *
 * @param {object} pdfDoc - Source document; must provide `getPageCount()` and be
 *     accepted by `PDFDocument#copyPages` (a pdf-lib `PDFDocument`).
 * @param {number[]} pagesToExtractArray - Zero-based page indices, in the order
 *     the pages should appear in the new document.
 * @returns {Promise<*>} Resolves to whatever `subDocument.save()` resolves to
 *     (the serialized document).
 * @throws {Error} If an index is negative, is not an integer, or is not smaller
 *     than the page count of `pdfDoc`.
 */
export async function createSubDocument(pdfDoc, pagesToExtractArray) {
    const pageCount = pdfDoc.getPageCount();

    // Validate everything up front, before a new document is allocated, so that
    // bad input fails with a clear message instead of an error deep inside copyPages.
    let highestIndex = -Infinity;
    for (const pageIndex of pagesToExtractArray) {
        const numericIndex = Number(pageIndex);
        if (!Number.isInteger(numericIndex) || numericIndex < 0) {
            throw new Error(`Invalid page index ${pageIndex}: page indices must be non-negative integers`);
        }
        if (numericIndex > highestIndex) {
            highestIndex = numericIndex;
        }
    }

    // Indices are zero-based, so the largest valid index is pageCount - 1.
    if (highestIndex >= pageCount) {
        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${highestIndex}`);
    }

    const subDocument = await PDFDocument.create();
    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);

    for (const copiedPage of copiedPages) {
        subDocument.addPage(copiedPage);
    }

    return subDocument.save();
}
|
||||
62
shared-operations/functions/detectEmptyPages.js
Normal file
62
shared-operations/functions/detectEmptyPages.js
Normal file
@@ -0,0 +1,62 @@
|
||||
import { getImagesOnPage } from "./getImagesOnPage.js";
|
||||
import PDFJS from 'pdfjs-dist';
|
||||
|
||||
/**
 * Finds the pages of a PDF that are considered empty: pages without any text
 * items whose images (if any) are all classified as blank.
 *
 * @param {*} snapshot - PDF data, passed unchanged to `PDFJS.getDocument`.
 * @param {number} whiteThreashold - Threshold that the mean gray value of an
 *     image is compared against (see `isImageBlank`).
 * @param {object} OpenCV - Object exposing the OpenCV.js API under `.cv`.
 * @returns {Promise<number[]>} Zero-based indices of the empty pages, in
 *     ascending order.
 */
export async function detectEmptyPages(snapshot, whiteThreashold, OpenCV) {
    const pdfDoc = await PDFJS.getDocument(snapshot).promise;

    const emptyPages = [];
    // pdf.js page numbers start at 1; the indices we return start at 0.
    for (let pageNumber = 1; pageNumber <= pdfDoc.numPages; pageNumber++) {
        const page = await pdfDoc.getPage(pageNumber);
        console.log("Checking page " + pageNumber);

        if (await hasText(page)) {
            console.log(`Found text on Page ${pageNumber}, page is not empty`);
            continue;
        }

        if (!await areImagesBlank(page, whiteThreashold)) {
            console.log(`Found non white image on Page ${pageNumber}, page is not empty`);
            continue;
        }

        console.log(`Page ${pageNumber} is empty.`);
        emptyPages.push(pageNumber - 1);
    }
    return emptyPages;

    /** Resolves to true when the page has at least one text item. */
    async function hasText(page) {
        const textContent = await page.getTextContent();
        return textContent.items.length > 0;
    }

    /** Resolves to true when every image on the page is blank (or there are none). */
    async function areImagesBlank(page, threshold) {
        const images = await getImagesOnPage(page);
        for (const image of images) {
            if (!isImageBlank(image, threshold)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Classifies a single image by the mean of its grayscale conversion.
     * Returns true when that mean is less than or equal to `threshold`.
     * NOTE(review): the parameter is called "whiteThreashold" by the caller, which
     * hints that a blank (white) image should have a HIGH mean; the comparison
     * direction is kept as it was - confirm the intended semantics.
     * NOTE(review): the pixel buffer is interpreted as 4-channel RGBA; images that
     * carry 3-channel data are not converted here - confirm whether that can occur.
     */
    function isImageBlank(image, threshold) {
        // cv.Mat takes (rows, cols, type): rows is the image height, cols its width.
        const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
        const gray = new OpenCV.cv.Mat();
        try {
            src.data.set(image.data);
            OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);

            // First channel of the mean is the average gray value.
            const meanValue = OpenCV.cv.mean(gray);
            return meanValue[0] <= threshold;
        } finally {
            // Mats must be released explicitly, also when a call above throws.
            src.delete();
            gray.delete();
        }
    }
}
|
||||
@@ -1,5 +1,6 @@
|
||||
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
import { createSubDocument } from './createSubDocument';
|
||||
|
||||
export async function extractPages(snapshot, pagesToExtractArray) {
|
||||
const pdfDoc = await PDFDocument.load(snapshot)
|
||||
@@ -7,20 +8,3 @@ export async function extractPages(snapshot, pagesToExtractArray) {
|
||||
// TODO: invent a better format for pagesToExtractArray and convert it.
|
||||
return createSubDocument(pdfDoc, pagesToExtractArray);
|
||||
};
|
||||
|
||||
/**
 * Builds a new PDF document that contains only the requested pages of `pdfDoc`.
 *
 * @param {object} pdfDoc - Source document; must provide `getPageCount()` and be
 *     accepted by `PDFDocument#copyPages` (a pdf-lib `PDFDocument`).
 * @param {number[]} pagesToExtractArray - Zero-based page indices, in the order
 *     the pages should appear in the new document.
 * @returns {Promise<*>} Resolves to whatever `subDocument.save()` resolves to
 *     (the serialized document).
 * @throws {Error} If an index is negative, is not an integer, or is not smaller
 *     than the page count of `pdfDoc`.
 */
export async function createSubDocument(pdfDoc, pagesToExtractArray) {
    const pageCount = pdfDoc.getPageCount();

    // Validate everything up front, before a new document is allocated, so that
    // bad input fails with a clear message instead of an error deep inside copyPages.
    let highestIndex = -Infinity;
    for (const pageIndex of pagesToExtractArray) {
        const numericIndex = Number(pageIndex);
        if (!Number.isInteger(numericIndex) || numericIndex < 0) {
            throw new Error(`Invalid page index ${pageIndex}: page indices must be non-negative integers`);
        }
        if (numericIndex > highestIndex) {
            highestIndex = numericIndex;
        }
    }

    // Indices are zero-based, so the largest valid index is pageCount - 1.
    if (highestIndex >= pageCount) {
        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${highestIndex}`);
    }

    const subDocument = await PDFDocument.create();
    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);

    for (const copiedPage of copiedPages) {
        subDocument.addPage(copiedPage);
    }

    return subDocument.save();
}
|
||||
14
shared-operations/functions/getImagesOnPage.js
Normal file
14
shared-operations/functions/getImagesOnPage.js
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
import PDFJS from 'pdfjs-dist';
|
||||
|
||||
/**
 * Collects the image objects painted on a pdf.js page.
 *
 * Walks the page's operator list and, for every `paintJpegXObject` or
 * `paintImageXObject` operation, looks up the object named by the operation's
 * first argument in `page.objs`.
 *
 * @param {object} page - pdf.js page; must provide `getOperatorList()` and `objs.get()`.
 * @returns {Promise<Array>} The looked-up image objects, in operator order.
 */
export async function getImagesOnPage(page) {
    const { fnArray, argsArray } = await page.getOperatorList();
    const { paintJpegXObject, paintImageXObject } = PDFJS.OPS;

    const images = [];
    for (let opIndex = 0; opIndex < fnArray.length; opIndex++) {
        const operation = fnArray[opIndex];
        if (operation === paintJpegXObject || operation === paintImageXObject) {
            // The first argument of a paint-image operation is the object's name.
            const objectName = argsArray[opIndex][0];
            images.push(page.objs.get(objectName));
        }
    }
    return images;
}
|
||||
12
shared-operations/functions/impose.js
Normal file
12
shared-operations/functions/impose.js
Normal file
@@ -0,0 +1,12 @@
|
||||
/**
 * Runs the pdfcpu "nup" command on a PDF through the supplied wrapper.
 *
 * @param {*} snapshot - PDF data, handed to the wrapper unchanged.
 * @param {number|string} nup - Value for the n-up argument; converted with `String()`.
 * @param {string} format - Appended to the "f:" description argument.
 * @param {object} pdfcpuWraopper - Wrapper exposing `oneToOne(args, snapshot)`.
 * @returns {Promise<*>} Whatever the wrapper's `oneToOne` call resolves to.
 */
export async function impose(snapshot, nup, format, pdfcpuWraopper) {
    const formatDescription = 'f:' + format;

    // Argument vector for the pdfcpu binary; order matters.
    const commandLine = [
        "pdfcpu.wasm",
        "nup",
        "-c",
        "disable",
        formatDescription,
        "/output.pdf",
        String(nup),
        "input.pdf",
    ];

    const processed = await pdfcpuWraopper.oneToOne(commandLine, snapshot);
    return processed;
}
|
||||
18
shared-operations/functions/removeBlankPages.js
Normal file
18
shared-operations/functions/removeBlankPages.js
Normal file
@@ -0,0 +1,18 @@
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
import { detectEmptyPages } from "./detectEmptyPages.js";
|
||||
|
||||
/**
 * Removes every page that `detectEmptyPages` reports as empty.
 *
 * @param {*} snapshot - PDF data; given to both `detectEmptyPages` and `PDFDocument.load`.
 * @param {number} whiteThreashold - Forwarded to `detectEmptyPages`.
 * @param {object} OpenCV - Forwarded to `detectEmptyPages`.
 * @returns {Promise<*>} Whatever `pdfDoc.save()` resolves to for the reduced document.
 */
export async function removeBlankPages(snapshot, whiteThreashold, OpenCV) {
    const blankPageIndices = await detectEmptyPages(snapshot, whiteThreashold, OpenCV);
    console.log("Empty Pages: ", blankPageIndices);

    const document = await PDFDocument.load(snapshot);

    // Walk the indices back to front: removing a page shifts every later page
    // down by one, so deleting from the end keeps the remaining indices valid.
    for (let position = blankPageIndices.length - 1; position >= 0; position--) {
        document.removePage(blankPageIndices[position]);
    }

    return document.save();
}
|
||||
121
shared-operations/functions/splitOn.js
Normal file
121
shared-operations/functions/splitOn.js
Normal file
@@ -0,0 +1,121 @@
|
||||
import { detectEmptyPages } from "./shared/detectEmptyPages.js";
|
||||
import { getImagesOnPage } from "./shared/getImagesOnPage.js";
|
||||
import { createSubDocument } from "./shared/createSubDocument.js";
|
||||
import PDFJS from 'pdfjs-dist';
|
||||
|
||||
/**
|
||||
* @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType
|
||||
*/
|
||||
|
||||
/**
 * Splits a PDF into several documents, using detected marker pages as
 * separators. The marker pages themselves are not part of any output
 * document, and empty groups (e.g. between two adjacent markers) are skipped.
 *
 * @param {*} snapshot - PDF data, passed unchanged to `PDFJS.getDocument` and
 *     `PDFLib.PDFDocument.load` (originally documented as Uint16Array - TODO confirm).
 * @param {SplitType} type - How separator pages are detected.
 * @param {number} whiteThreashold - Forwarded to `detectEmptyPages`; only used
 *     for "BLANK_PAGE".
 * @param {import('opencv-wasm')} OpenCV - Forwarded to `detectEmptyPages`; only
 *     used for "BLANK_PAGE".
 * @param {object} PDFLib - Must expose `PDFDocument.load`.
 * @param {Function} jsQR - QR decoder called as `jsQR(rgbaData, width, height)`;
 *     only used for "QR_CODE".
 * @returns {Promise<Array>} One `createSubDocument` result per group of pages
 *     between separators, in page order.
 * @throws {Error} For "BAR_CODE" (not implemented) and for unknown split types.
 */
export async function splitOn(snapshot, type, whiteThreashold, OpenCV, PDFLib, jsQR) {
    // QR payload that marks a page as a separator.
    const SEPARATOR_QR_CONTENT = "https://github.com/Frooodle/Stirling-PDF";

    let splitAtPages;
    switch (type) {
        case "BAR_CODE":
            // TODO: Implement
            throw new Error("This split-type has not been implemented yet");

        case "QR_CODE":
            splitAtPages = await getPagesWithQRCode(snapshot);
            break;

        case "BLANK_PAGE":
            splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, OpenCV);
            break;

        default:
            throw new Error("An invalid split-type was provided.");
    }

    console.log("Split At Pages: ", splitAtPages);

    // Remove detected pages & split
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
    const numberOfPages = pdfDoc.getPages().length;

    // A set lookup is independent of ordering and duplicates in the detector
    // output, and leaves the detector's array untouched.
    const separatorPages = new Set(splitAtPages);

    const subDocuments = [];
    let pagesArray = [];

    for (let i = 0; i < numberOfPages; i++) {
        if (!separatorPages.has(i)) {
            pagesArray.push(i);
            continue;
        }
        // Separator page: close the current group (if any) and drop the separator.
        if (pagesArray.length > 0) {
            // NOTE(review): the third argument is kept from the original call; the
            // two-parameter createSubDocument(pdfDoc, pagesToExtractArray) ignores it.
            subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
            pagesArray = [];
        }
    }
    if (pagesArray.length > 0) {
        subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
    }

    return subDocuments;

    /**
     * Resolves to the zero-based indices of all pages that contain at least one
     * image carrying the separator QR code. Each page is listed at most once.
     */
    async function getPagesWithQRCode(snapshot) {
        const pdfDoc = await PDFJS.getDocument(snapshot).promise;

        const pagesWithQR = [];
        for (let i = 0; i < pdfDoc.numPages; i++) {
            console.log("Page:", i, "/", pdfDoc.numPages);
            // pdf.js page numbers start at 1.
            const page = await pdfDoc.getPage(i + 1);

            const images = await getImagesOnPage(page);
            for (const image of images) {
                if (checkForQROnImage(image) === SEPARATOR_QR_CONTENT) {
                    pagesWithQR.push(i);
                    // One match is enough; recording the page twice would
                    // corrupt the split positions.
                    break;
                }
            }
        }
        if (pagesWithQR.length === 0) {
            console.warn("Could not find any QR Codes in the provided PDF.")
        }
        return pagesWithQR;
    }

    /**
     * Decodes a QR code from an image and returns its payload, or null when
     * jsQR finds none. The image object is not modified.
     */
    function checkForQROnImage(image) {
        // TODO: There is an issue with the jsQR package (it expects rgba but sometimes
        // we have rgb), and the package seems to be stale; we could create a fork and
        // fix the issue. In the meantime we force rgba.
        const pixelCount = image.width * image.height;
        let rgbaData = image.data;

        if (rgbaData.length === pixelCount * 3) {
            // Expand rgb to rgba by adding a fully opaque alpha channel.
            const expanded = new Uint8ClampedArray(pixelCount * 4);
            for (let src = 0, dst = 0; src < rgbaData.length; src += 3, dst += 4) {
                expanded[dst] = rgbaData[src];         // Red channel
                expanded[dst + 1] = rgbaData[src + 1]; // Green channel
                expanded[dst + 2] = rgbaData[src + 2]; // Blue channel
                expanded[dst + 3] = 255;               // Alpha channel (fully opaque)
            }
            rgbaData = expanded;
        }

        const code = jsQR(rgbaData, image.width, image.height);
        return code ? code.data : null;
    }
}
|
||||
@@ -1,7 +1,7 @@
|
||||
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
|
||||
import { createSubDocument } from "./extractPages.js";
|
||||
import { createSubDocument } from "./shared/extractPages.js";
|
||||
|
||||
export async function splitPDF(snapshot, splitAfterPageArray) {
|
||||
const pdfDoc = await PDFDocument.load(snapshot)
|
||||
@@ -18,7 +18,7 @@ export async function splitPDF(snapshot, splitAfterPageArray) {
|
||||
splitAfter = splitAfterPageArray.shift();
|
||||
pagesArray = [];
|
||||
}
|
||||
pagesArray.push(i);
|
||||
pagesArray.push(i);
|
||||
}
|
||||
subDocuments.push(await createSubDocument(pdfDoc, pagesArray));
|
||||
pagesArray = [];
|
||||
|
||||
Reference in New Issue
Block a user