Migrating shared-operations to TS (WIP)
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
|
||||
export async function createSubDocument(pdfDoc, pagesToExtractArray) {
|
||||
export async function createSubDocument(pdfDoc: PDFDocument, pagesToExtractArray: number[]): Promise<Uint8Array> {
|
||||
const subDocument = await PDFDocument.create();
|
||||
|
||||
// Check that array max number is not larger pdf pages number
|
||||
@@ -1,62 +0,0 @@
|
||||
import { getImagesOnPage } from "./getImagesOnPage.js";
|
||||
import PDFJS from 'pdfjs-dist';
|
||||
|
||||
/**
 * Scans every page of a PDF and reports the pages that contain no text
 * items and whose images are all considered blank.
 *
 * @param snapshot - PDF source handed directly to `PDFJS.getDocument`.
 * @param whiteThreashold - An image counts as blank when the mean of its
 *   greyscale conversion is less than or equal to this value.
 *   NOTE(review): the parameter name suggests "whiter than", while the
 *   comparison accepts values at or below the threshold; confirm the
 *   intended direction with the callers.
 * @param OpenCV - Module object exposing the OpenCV.js API under `.cv`.
 * @returns Zero-based indices of the pages judged empty, in ascending order.
 */
export async function detectEmptyPages(snapshot, whiteThreashold, OpenCV) {
    const pdfDoc = await PDFJS.getDocument(snapshot).promise;

    const emptyPages = [];
    // pdf.js numbers pages from 1; the result uses zero-based indices.
    for (let i = 1; i <= pdfDoc.numPages; i++) {
        const page = await pdfDoc.getPage(i);
        console.log("Checking page " + i);

        if (await hasText(page)) {
            console.log(`Found text on Page ${i}, page is not empty`);
            continue;
        }

        if (!await areImagesBlank(page, whiteThreashold)) {
            console.log(`Found non white image on Page ${i}, page is not empty`);
            continue;
        }

        console.log(`Page ${i} is empty.`);
        emptyPages.push(i - 1);
    }
    return emptyPages;

    // True when the page's text content has at least one item.
    async function hasText(page) {
        const textContent = await page.getTextContent();
        return textContent.items.length > 0;
    }

    // True when every image painted on the page is blank (vacuously true
    // for a page without images). Stops at the first non-blank image.
    async function areImagesBlank(page, threshold) {
        const images = await getImagesOnPage(page);
        for (const image of images) {
            if (!isImageBlank(image, threshold))
                return false;
        }
        return true;
    }

    // Compares the mean greyscale value of one RGBA image with the threshold.
    function isImageBlank(image, threshold) {
        // cv.Mat takes (rows, cols, type): rows is the height, cols the width.
        const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
        const gray = new OpenCV.cv.Mat();
        try {
            src.data.set(image.data);

            // Convert the image to grayscale
            OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);

            // Mean of the single grey channel is the first element.
            const meanValue = OpenCV.cv.mean(gray);
            return meanValue[0] <= threshold;
        } finally {
            // Mats are not garbage collected; release them even when an
            // OpenCV call throws.
            src.delete();
            gray.delete();
        }
    }
}
|
||||
50
shared-operations/functions/detectEmptyPages.ts
Normal file
50
shared-operations/functions/detectEmptyPages.ts
Normal file
@@ -0,0 +1,50 @@
|
||||
import { DocumentInitParameters, PDFPageProxy } from "pdfjs-dist/types/src/display/api.js";
|
||||
import PDFJS from 'pdfjs-dist';
|
||||
import { Image } from 'image-js';
|
||||
|
||||
import { getImagesOnPage } from "./getImagesOnPage.js";
|
||||
|
||||
/**
 * Scans every page of a PDF and reports the pages that contain no text
 * items and whose images are all considered blank.
 *
 * @param snapshot - PDF source handed directly to `PDFJS.getDocument`.
 * @param whiteThreashold - Threshold forwarded to `areImagesBlank` for
 *   deciding whether an image is blank.
 * @returns Zero-based indices of the pages judged empty, in ascending order.
 */
export async function detectEmptyPages(snapshot: string | URL | ArrayBuffer | DocumentInitParameters, whiteThreashold: number): Promise<number[]> {
    const pdfDoc = await PDFJS.getDocument(snapshot).promise;

    const emptyPages: number[] = [];
    // pdf.js numbers pages from 1; the result uses zero-based indices.
    for (let i = 1; i <= pdfDoc.numPages; i++) {
        const page = await pdfDoc.getPage(i);
        console.log("Checking page " + i);

        if (await hasText(page)) {
            console.log(`Found text on Page ${i}, page is not empty`);
            continue;
        }

        if (!await areImagesBlank(page, whiteThreashold)) {
            console.log(`Found non white image on Page ${i}, page is not empty`);
            continue;
        }

        console.log(`Page ${i} is empty.`);
        emptyPages.push(i - 1);
    }
    return emptyPages;
}

/**
 * Reports whether the page's text content has at least one item.
 *
 * Previously this helper returned `true` for pages WITHOUT text and the
 * caller negated the result; name and return value now agree.
 */
async function hasText(page: PDFPageProxy): Promise<boolean> {
    const textContent = await page.getTextContent();
    return textContent.items.length > 0;
}
|
||||
|
||||
/**
 * Decides whether all images painted on a page are blank.
 *
 * Images are checked one after another and checking stops at the first
 * image that is not blank. A page without images yields `true`.
 *
 * @param page - Page whose images are collected via `getImagesOnPage`.
 * @param threshold - Passed unchanged to `isImageBlank` for each image.
 */
async function areImagesBlank(page: PDFPageProxy, threshold: number): Promise<boolean> {
    const pageImages = await getImagesOnPage(page);

    let everyImageBlank = true;
    for (const candidate of pageImages) {
        const blank = await isImageBlank(candidate, threshold);
        if (!blank) {
            everyImageBlank = false;
            break;
        }
    }
    return everyImageBlank;
}
|
||||
|
||||
/**
 * Loads an image, converts it to greyscale and compares the mean of the
 * first channel against `threshold`.
 *
 * NOTE(review): callers pass whatever `getImagesOnPage` collected from
 * pdf.js; confirm those objects are a format `Image.load` accepts.
 *
 * @param image - Image source handed to `Image.load`.
 * @param threshold - Upper bound (inclusive) of the mean for a blank image.
 * @returns `true` when the greyscale mean is less than or equal to `threshold`.
 */
async function isImageBlank(image: string | Uint8Array | ArrayBuffer, threshold: number): Promise<boolean> {
    const loaded = await Image.load(image);
    const channelMeans = loaded.grey().getMean();
    return channelMeans[0] <= threshold;
}
|
||||
@@ -1,55 +0,0 @@
|
||||
|
||||
import { PDFDocument, ParseSpeeds } from 'pdf-lib';
|
||||
|
||||
|
||||
/**
|
||||
* @typedef {Object} Metadata
|
||||
* @property {string | null | undefined} Title - The title of the document.
|
||||
* @property {string | null | undefined} Author - The author of the document.
|
||||
* @property {string | null | undefined} Subject - The subject of the document.
|
||||
* @property {string[] | null | undefined} Keywords - An array of keywords associated with the document.
|
||||
* @property {string | null | undefined} Producer - The producer of the document.
|
||||
* @property {string | null | undefined} Creator - The creator of the document.
|
||||
* @property {Date | null | undefined} CreationDate - The date when the document was created.
|
||||
* @property {Date | null | undefined} ModificationDate - The date when the document was last modified.
|
||||
*/
|
||||
|
||||
/**
 * Overwrites selected document-information fields of a PDF and returns the
 * re-serialized file.
 *
 * For each text field and for `Keywords`: `undefined` leaves the stored
 * value untouched, while `null` or an empty value clears it. pdf-lib's
 * setters validate their argument type, so `null` is translated to `""`
 * (or `[]` for keywords) instead of being passed through.
 *
 * For the two date fields only a `Date` is applied; `null` and `undefined`
 * are both skipped because the date setters accept nothing but a `Date`.
 *
 * @param {string | Uint8Array | ArrayBuffer} snapshot - The PDF to edit.
 * @param {Metadata} metadata - Fields to change.
 * @returns {Promise<Uint8Array>} The bytes of the modified document.
 */
export async function editMetadata(snapshot, metadata) {
    // Load the original PDF file
    const pdfDoc = await PDFDocument.load(snapshot, {
        parseSpeed: ParseSpeeds.Fastest,
    });

    if (metadata.Title !== undefined)
        pdfDoc.setTitle(metadata.Title ?? "");

    if (metadata.Author !== undefined)
        pdfDoc.setAuthor(metadata.Author ?? "");

    if (metadata.Subject !== undefined)
        pdfDoc.setSubject(metadata.Subject ?? "");

    if (metadata.Keywords !== undefined)
        pdfDoc.setKeywords(metadata.Keywords ?? []);

    if (metadata.Producer !== undefined)
        pdfDoc.setProducer(metadata.Producer ?? "");

    if (metadata.Creator !== undefined)
        pdfDoc.setCreator(metadata.Creator ?? "");

    // `!= null` filters out both null and undefined.
    if (metadata.CreationDate != null)
        pdfDoc.setCreationDate(metadata.CreationDate);

    if (metadata.ModificationDate != null)
        pdfDoc.setModificationDate(metadata.ModificationDate);

    // Serialize the modified document
    return pdfDoc.save();
}
|
||||
53
shared-operations/functions/editMetadata.ts
Normal file
53
shared-operations/functions/editMetadata.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
|
||||
import { PDFDocument, ParseSpeeds } from 'pdf-lib';
|
||||
|
||||
|
||||
/**
 * Document-information fields that `editMetadata` can change.
 *
 * Every property is optional: a property that is omitted or `undefined`
 * is meant to be left untouched, so callers only list the fields they
 * want to modify.
 */
export type Metadata = {
    /** The title of the document. */
    Title?: string | null | undefined;
    /** The author of the document. */
    Author?: string | null | undefined;
    /** The subject of the document. */
    Subject?: string | null | undefined;
    /** An array of keywords associated with the document. */
    Keywords?: string[] | null | undefined;
    /** The producer of the document. */
    Producer?: string | null | undefined;
    /** The creator of the document. */
    Creator?: string | null | undefined;
    /** The date when the document was created. */
    CreationDate?: Date | null | undefined;
    /** The date when the document was last modified. */
    ModificationDate?: Date | null | undefined;
}
|
||||
/**
 * Overwrites selected document-information fields of a PDF and returns the
 * re-serialized file.
 *
 * For each text field and for `Keywords`: `undefined` leaves the stored
 * value untouched, while `null` or an empty value clears it. The checks
 * compare against `undefined` explicitly, because a truthiness test would
 * also skip `""` and `null` and make clearing impossible. `null` is
 * translated to `""` (or `[]` for keywords) since pdf-lib's setters accept
 * only a string (or an array).
 *
 * For the two date fields only a `Date` is applied; `null` and `undefined`
 * are both skipped because the date setters accept nothing but a `Date`.
 *
 * @param snapshot - The PDF to edit.
 * @param metadata - Fields to change.
 * @returns The bytes of the modified document.
 */
export async function editMetadata(snapshot: string | Uint8Array | ArrayBuffer, metadata: Metadata): Promise<Uint8Array> {
    // Load the original PDF file
    const pdfDoc = await PDFDocument.load(snapshot, {
        parseSpeed: ParseSpeeds.Fastest,
    });

    if (metadata.Title !== undefined)
        pdfDoc.setTitle(metadata.Title ?? "");

    if (metadata.Author !== undefined)
        pdfDoc.setAuthor(metadata.Author ?? "");

    if (metadata.Subject !== undefined)
        pdfDoc.setSubject(metadata.Subject ?? "");

    if (metadata.Keywords !== undefined)
        pdfDoc.setKeywords(metadata.Keywords ?? []);

    if (metadata.Producer !== undefined)
        pdfDoc.setProducer(metadata.Producer ?? "");

    if (metadata.Creator !== undefined)
        pdfDoc.setCreator(metadata.Creator ?? "");

    // `!= null` filters out both null and undefined.
    if (metadata.CreationDate != null)
        pdfDoc.setCreationDate(metadata.CreationDate);

    if (metadata.ModificationDate != null)
        pdfDoc.setModificationDate(metadata.ModificationDate);

    // Serialize the modified document
    return pdfDoc.save();
}
|
||||
@@ -2,7 +2,7 @@
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
import { createSubDocument } from './createSubDocument';
|
||||
|
||||
export async function extractPages(snapshot, pagesToExtractArray) {
|
||||
export async function extractPages(snapshot: string | Uint8Array | ArrayBuffer, pagesToExtractArray: number[]): Promise<Uint8Array>{
|
||||
const pdfDoc = await PDFDocument.load(snapshot)
|
||||
|
||||
// TODO: invent a better format for pagesToExtractArray and convert it.
|
||||
@@ -1,11 +1,12 @@
|
||||
|
||||
import { PDFPageProxy } from "pdfjs-dist/types/src/display/api.js";
|
||||
import PDFJS from 'pdfjs-dist';
|
||||
|
||||
export async function getImagesOnPage(page) {
|
||||
export async function getImagesOnPage(page: PDFPageProxy) {
|
||||
const ops = await page.getOperatorList();
|
||||
const images = [];
|
||||
const images: any = [];
|
||||
for (var j=0; j < ops.fnArray.length; j++) {
|
||||
if (ops.fnArray[j] == PDFJS.OPS.paintJpegXObject || ops.fnArray[j] == PDFJS.OPS.paintImageXObject) {
|
||||
if (ops.fnArray[j] == PDFJS.OPS.paintImageXObject) {
|
||||
const image = page.objs.get(ops.argsArray[j][0]);
|
||||
images.push(image);
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
|
||||
export const mergePDFs = async (snapshots) => {
|
||||
export async function mergePDFs(snapshots: (string | Uint8Array | ArrayBuffer)[]): Promise<Uint8Array> {
|
||||
|
||||
const mergedPdf = await PDFDocument.create();
|
||||
|
||||
@@ -1,19 +1,17 @@
|
||||
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
|
||||
/**
|
||||
* @typedef {"CUSTOM_PAGE_ORDER"|"REVERSE_ORDER"|"DUPLEX_SORT"|"BOOKLET_SORT"|"ODD_EVEN_SPLIT"|"REMOVE_FIRST"|"REMOVE_LAST"|"REMOVE_FIRST_AND_LAST"} OrderOperation
|
||||
*/
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {Uint16Array} snapshot
|
||||
* @param {OrderOperation} operation
|
||||
* @param {string} customOrderString
|
||||
* @param {import('pdf-lib')} PDFLib
|
||||
* @returns
|
||||
*/
|
||||
export async function organizePages(snapshot, operation, customOrderString) {
|
||||
export async function organizePages(
|
||||
snapshot: string | Uint8Array | ArrayBuffer,
|
||||
operation: "CUSTOM_PAGE_ORDER" |
|
||||
"REVERSE_ORDER" |
|
||||
"DUPLEX_SORT" |
|
||||
"BOOKLET_SORT" |
|
||||
"ODD_EVEN_SPLIT" |
|
||||
"REMOVE_FIRST" |
|
||||
"REMOVE_LAST" |
|
||||
"REMOVE_FIRST_AND_LAST",
|
||||
customOrderString: string): Promise<Uint8Array> {
|
||||
const pdfDoc = await PDFDocument.load(snapshot);
|
||||
let subDocument = await PDFDocument.create();
|
||||
const copiedPages = await subDocument.copyPages(pdfDoc, pdfDoc.getPageIndices());
|
||||
@@ -90,8 +88,8 @@ export async function organizePages(snapshot, operation, customOrderString) {
|
||||
return subDocument.save();
|
||||
};
|
||||
|
||||
function parseCustomPageOrder(customOrder, pageCount) {
|
||||
const pageOrderArray = [];
|
||||
function parseCustomPageOrder(customOrder: string, pageCount: number) {
|
||||
const pageOrderArray: number[] = [];
|
||||
const ranges = customOrder.split(',');
|
||||
|
||||
ranges.forEach((range) => {
|
||||
@@ -1,9 +1,9 @@
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
import { detectEmptyPages } from "./detectEmptyPages.js";
|
||||
|
||||
export async function removeBlankPages(snapshot, whiteThreashold, OpenCV) {
|
||||
export async function removeBlankPages(snapshot, whiteThreashold) {
|
||||
|
||||
const emptyPages = await detectEmptyPages(snapshot, whiteThreashold, OpenCV);
|
||||
const emptyPages = await detectEmptyPages(snapshot, whiteThreashold);
|
||||
|
||||
console.log("Empty Pages: ", emptyPages);
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
|
||||
import { PDFDocument, ParseSpeeds, degrees } from 'pdf-lib';
|
||||
|
||||
export async function rotatePages(snapshot, rotation) {
|
||||
export async function rotatePages(snapshot: string | Uint8Array | ArrayBuffer, rotation: number): Promise<Uint8Array> {
|
||||
// Load the original PDF file
|
||||
const pdfDoc = await PDFDocument.load(snapshot, {
|
||||
parseSpeed: ParseSpeeds.Fastest,
|
||||
@@ -1,7 +1,7 @@
|
||||
|
||||
import { PDFDocument, ParseSpeeds } from 'pdf-lib';
|
||||
|
||||
export async function scaleContent(snapshot, scaleFactor) {
|
||||
export async function scaleContent(snapshot: string | Uint8Array | ArrayBuffer, scaleFactor: number): Promise<Uint8Array> {
|
||||
// Load the original PDF file
|
||||
const pdfDoc = await PDFDocument.load(snapshot, {
|
||||
parseSpeed: ParseSpeeds.Fastest,
|
||||
@@ -1,7 +1,7 @@
|
||||
|
||||
import { PDFDocument, ParseSpeeds } from 'pdf-lib';
|
||||
|
||||
export async function scalePage(snapshot, pageSize) {
|
||||
export async function scalePage(snapshot: string | Uint8Array | ArrayBuffer, pageSize: {width:number,height:number}): Promise<Uint8Array> {
|
||||
// Load the original PDF file
|
||||
const pdfDoc = await PDFDocument.load(snapshot, {
|
||||
parseSpeed: ParseSpeeds.Fastest,
|
||||
@@ -1,60 +1,52 @@
|
||||
import { detectEmptyPages } from "./shared/detectEmptyPages.js";
|
||||
import { getImagesOnPage } from "./shared/getImagesOnPage.js";
|
||||
import { createSubDocument } from "./shared/createSubDocument.js";
|
||||
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
import PDFJS from 'pdfjs-dist';
|
||||
|
||||
/**
|
||||
* @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType
|
||||
*/
|
||||
import { detectEmptyPages } from "./detectEmptyPages.js";
|
||||
import { getImagesOnPage } from "./getImagesOnPage.js";
|
||||
import { createSubDocument } from "./createSubDocument.js";
|
||||
import { TypedArray, DocumentInitParameters } from 'pdfjs-dist/types/src/display/api.js';
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {Uint16Array} snapshot
|
||||
* @param {SplitType} type
|
||||
* @param {} PDFJS
|
||||
* @param {import('opencv-wasm')} OpenCV
|
||||
* @param {} PDFLib
|
||||
* @returns
|
||||
*/
|
||||
export async function splitOn(snapshot, type, whiteThreashold, OpenCV, PDFLib, jsQR) {
|
||||
|
||||
let splitAtPages = [];
|
||||
export async function splitOn(
|
||||
snapshot: string | ArrayBuffer | Uint8Array,
|
||||
type: "BAR_CODE"|"QR_CODE"|"BLANK_PAGE",
|
||||
whiteThreashold: number,
|
||||
jsQR: (arg0: any, arg1: number, arg2: number) => any) {
|
||||
let splitAtPages: number[] = [];
|
||||
|
||||
switch (type) {
|
||||
case "BAR_CODE":
|
||||
// TODO: Implement
|
||||
throw new Error("This split-type has not been implemented yet");
|
||||
break;
|
||||
|
||||
case "QR_CODE":
|
||||
splitAtPages = await getPagesWithQRCode(snapshot);
|
||||
break;
|
||||
|
||||
case "BLANK_PAGE":
|
||||
splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, OpenCV);
|
||||
splitAtPages = await detectEmptyPages(snapshot, whiteThreashold);
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new Error("An invalid split-type was provided.")
|
||||
break;
|
||||
throw new Error("An invalid split-type was provided.");
|
||||
}
|
||||
|
||||
console.log("Split At Pages: ", splitAtPages);
|
||||
|
||||
// Remove detected Pages & Split
|
||||
const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
|
||||
const pdfDoc = await PDFDocument.load(snapshot);
|
||||
|
||||
const numberOfPages = pdfDoc.getPages().length;
|
||||
|
||||
let pagesArray = [];
|
||||
let pagesArray: number[] = [];
|
||||
let splitAfter = splitAtPages.shift();
|
||||
const subDocuments = [];
|
||||
const subDocuments: Uint8Array[] = [];
|
||||
|
||||
for (let i = 0; i < numberOfPages; i++) {
|
||||
console.log(i);
|
||||
if(i == splitAfter) {
|
||||
if(pagesArray.length > 0) {
|
||||
subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
|
||||
subDocuments.push(await createSubDocument(pdfDoc, pagesArray));
|
||||
pagesArray = [];
|
||||
}
|
||||
splitAfter = splitAtPages.shift();
|
||||
@@ -65,16 +57,16 @@ export async function splitOn(snapshot, type, whiteThreashold, OpenCV, PDFLib, j
|
||||
}
|
||||
}
|
||||
if(pagesArray.length > 0) {
|
||||
subDocuments.push(await createSubDocument(pdfDoc, pagesArray, PDFLib));
|
||||
subDocuments.push(await createSubDocument(pdfDoc, pagesArray));
|
||||
}
|
||||
pagesArray = [];
|
||||
|
||||
return subDocuments;
|
||||
|
||||
async function getPagesWithQRCode(snapshot) {
|
||||
async function getPagesWithQRCode(snapshot: string | ArrayBuffer | URL | TypedArray | DocumentInitParameters) {
|
||||
const pdfDoc = await PDFJS.getDocument(snapshot).promise;
|
||||
|
||||
const pagesWithQR = [];
|
||||
const pagesWithQR: number[] = [];
|
||||
for (let i = 0; i < pdfDoc.numPages; i++) {
|
||||
console.log("Page:", i, "/", pdfDoc.numPages);
|
||||
const page = await pdfDoc.getPage(i + 1);
|
||||
@@ -1,9 +1,10 @@
|
||||
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
|
||||
import { createSubDocument } from "./shared/extractPages.js";
|
||||
import { createSubDocument } from "./createSubDocument.js";
|
||||
|
||||
export async function splitPDF(snapshot: string | Uint8Array | ArrayBuffer, splitAfterPageArray: number[]): Promise<Uint8Array[]> {
|
||||
|
||||
export async function splitPDF(snapshot, splitAfterPageArray) {
|
||||
const pdfDoc = await PDFDocument.load(snapshot)
|
||||
|
||||
const numberOfPages = pdfDoc.getPages().length;
|
||||
@@ -13,7 +14,7 @@ export async function splitPDF(snapshot, splitAfterPageArray) {
|
||||
const subDocuments = [];
|
||||
|
||||
for (let i = 0; i < numberOfPages; i++) {
|
||||
if(i > splitAfter && pagesArray.length > 0) {
|
||||
if(splitAfter && i > splitAfter && pagesArray.length > 0) {
|
||||
subDocuments.push(await createSubDocument(pdfDoc, pagesArray));
|
||||
splitAfter = splitAfterPageArray.shift();
|
||||
pagesArray = [];
|
||||
Reference in New Issue
Block a user