Removal of Ghostscript to use qpdf and tesseract directly (#2338)

* navbar fix multi tool and compress location

* release notes and ghostscript removal

* cleanups

* formatting

* update docs

* more

* more

* docs

* release bump

* Hardening suggestions for Stirling-PDF / ghostscript (#2339)

* Protect `readLine()` against DoS

* Sanitized user-provided file names in HTTP multipart uploads

---------

Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>

---------

Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
This commit is contained in:
Anthony Stirling
2024-11-26 20:50:35 +00:00
committed by GitHub
parent 654bc94d44
commit 833b3c45c6
69 changed files with 1106 additions and 665 deletions

View File

@@ -320,12 +320,20 @@ public class ApplicationProperties {
public static class SessionLimit {
private int libreOfficeSessionLimit;
private int pdfToHtmlSessionLimit;
private int ocrMyPdfSessionLimit;
private int pythonOpenCvSessionLimit;
private int ghostScriptSessionLimit;
private int weasyPrintSessionLimit;
private int installAppSessionLimit;
private int calibreSessionLimit;
private int qpdfSessionLimit;
private int tesseractSessionLimit;
public int getQpdfSessionLimit() {
return qpdfSessionLimit > 0 ? qpdfSessionLimit : 2;
}
public int getTesseractSessionLimit() {
return tesseractSessionLimit > 0 ? tesseractSessionLimit : 1;
}
public int getLibreOfficeSessionLimit() {
return libreOfficeSessionLimit > 0 ? libreOfficeSessionLimit : 1;
@@ -335,18 +343,10 @@ public class ApplicationProperties {
return pdfToHtmlSessionLimit > 0 ? pdfToHtmlSessionLimit : 1;
}
public int getOcrMyPdfSessionLimit() {
return ocrMyPdfSessionLimit > 0 ? ocrMyPdfSessionLimit : 2;
}
public int getPythonOpenCvSessionLimit() {
return pythonOpenCvSessionLimit > 0 ? pythonOpenCvSessionLimit : 8;
}
public int getGhostScriptSessionLimit() {
return ghostScriptSessionLimit > 0 ? ghostScriptSessionLimit : 16;
}
public int getWeasyPrintSessionLimit() {
return weasyPrintSessionLimit > 0 ? weasyPrintSessionLimit : 16;
}
@@ -364,12 +364,20 @@ public class ApplicationProperties {
public static class TimeoutMinutes {
private long libreOfficeTimeoutMinutes;
private long pdfToHtmlTimeoutMinutes;
private long ocrMyPdfTimeoutMinutes;
private long pythonOpenCvTimeoutMinutes;
private long ghostScriptTimeoutMinutes;
private long weasyPrintTimeoutMinutes;
private long installAppTimeoutMinutes;
private long calibreTimeoutMinutes;
private long tesseractTimeoutMinutes;
private long qpdfTimeoutMinutes;
public long getTesseractTimeoutMinutes() {
return tesseractTimeoutMinutes > 0 ? tesseractTimeoutMinutes : 30;
}
public long getQpdfTimeoutMinutes() {
return qpdfTimeoutMinutes > 0 ? qpdfTimeoutMinutes : 30;
}
public long getLibreOfficeTimeoutMinutes() {
return libreOfficeTimeoutMinutes > 0 ? libreOfficeTimeoutMinutes : 30;
@@ -379,18 +387,10 @@ public class ApplicationProperties {
return pdfToHtmlTimeoutMinutes > 0 ? pdfToHtmlTimeoutMinutes : 20;
}
public long getOcrMyPdfTimeoutMinutes() {
return ocrMyPdfTimeoutMinutes > 0 ? ocrMyPdfTimeoutMinutes : 30;
}
public long getPythonOpenCvTimeoutMinutes() {
return pythonOpenCvTimeoutMinutes > 0 ? pythonOpenCvTimeoutMinutes : 30;
}
public long getGhostScriptTimeoutMinutes() {
return ghostScriptTimeoutMinutes > 0 ? ghostScriptTimeoutMinutes : 30;
}
public long getWeasyPrintTimeoutMinutes() {
return weasyPrintTimeoutMinutes > 0 ? weasyPrintTimeoutMinutes : 30;
}

View File

@@ -18,4 +18,15 @@ public class OptimizePdfRequest extends PDFFile {
@Schema(description = "The expected output size, e.g. '100MB', '25KB', etc.")
private String expectedOutputSize;
@Schema(
description = "Whether to linearize the PDF for faster web viewing. Default is false.",
defaultValue = "false")
private Boolean linearize = false;
@Schema(
description =
"Whether to normalize the PDF content for better compatibility. Default is true.",
defaultValue = "true")
private Boolean normalize = true;
}

View File

@@ -15,18 +15,6 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
@Schema(description = "List of languages to use in OCR processing")
private List<String> languages;
@Schema(description = "Include OCR text in a sidecar text file if set to true")
private boolean sidecar;
@Schema(description = "Deskew the input file if set to true")
private boolean deskew;
@Schema(description = "Clean the input file if set to true")
private boolean clean;
@Schema(description = "Clean the final output if set to true")
private boolean cleanFinal;
@Schema(
description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'",
allowableValues = {"skip-text", "force-ocr", "Normal"})
@@ -37,7 +25,4 @@ public class ProcessPdfWithOcrRequest extends PDFFile {
allowableValues = {"hocr", "sandwich"},
defaultValue = "hocr")
private String ocrRenderType = "hocr";
@Schema(description = "Remove images from the output PDF if set to true")
private boolean removeImagesAfter;
}