print (WIP), fake scan (WIP) and text conversion for ultra-lite (#1098)

* Changes!

* lang

* fake scan init, print init and pdf to text for exe

* Hardening suggestions for Stirling-PDF / changes (#1099)

* Switch order of literals to prevent NullPointerException

* Introduced protections against predictable RNG abuse

---------

Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>

* Update README.md

* install custom fonts

* Formats etc

* version bump

* disable WIP work

* remove chinese font

---------

Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
Co-authored-by: systo <systo@host.docker.internal>
This commit is contained in:
Anthony Stirling
2024-04-21 23:06:44 +01:00
committed by GitHub
parent 6c052a7b25
commit 71e93e3cb5
19 changed files with 494 additions and 143 deletions

View File

@@ -62,6 +62,7 @@ public class SPdfApplication {
}
public static void main(String[] args) throws IOException, InterruptedException {
SpringApplication app = new SpringApplication(SPdfApplication.class);
app.addInitializers(new ConfigInitializer());
if (Files.exists(Paths.get("configs/settings.yml"))) {

View File

@@ -146,7 +146,6 @@ public class EndpointConfiguration {
addEndpointToGroup("CLI", "xlsx-to-pdf");
addEndpointToGroup("CLI", "pdf-to-word");
addEndpointToGroup("CLI", "pdf-to-presentation");
addEndpointToGroup("CLI", "pdf-to-text");
addEndpointToGroup("CLI", "pdf-to-html");
addEndpointToGroup("CLI", "pdf-to-xml");
addEndpointToGroup("CLI", "ocr-pdf");
@@ -154,6 +153,7 @@ public class EndpointConfiguration {
addEndpointToGroup("CLI", "url-to-pdf");
addEndpointToGroup("CLI", "book-to-pdf");
addEndpointToGroup("CLI", "pdf-to-book");
addEndpointToGroup("CLI", "pdf-to-rtf");
// Calibre
addEndpointToGroup("Calibre", "book-to-pdf");
@@ -175,7 +175,7 @@ public class EndpointConfiguration {
addEndpointToGroup("LibreOffice", "xlsx-to-pdf");
addEndpointToGroup("LibreOffice", "pdf-to-word");
addEndpointToGroup("LibreOffice", "pdf-to-presentation");
addEndpointToGroup("LibreOffice", "pdf-to-text");
addEndpointToGroup("LibreOffice", "pdf-to-rtf");
addEndpointToGroup("LibreOffice", "pdf-to-html");
addEndpointToGroup("LibreOffice", "pdf-to-xml");
@@ -218,6 +218,7 @@ public class EndpointConfiguration {
addEndpointToGroup("Java", "overlay-pdf");
addEndpointToGroup("Java", "split-pdf-by-sections");
addEndpointToGroup("Java", REMOVE_BLANKS);
addEndpointToGroup("Java", "pdf-to-text");
// Javascript
addEndpointToGroup("Javascript", "pdf-organizer");

View File

@@ -31,7 +31,8 @@ public class ConvertPDFToPDFA {
summary = "Convert a PDF to a PDF/A",
description =
"This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> pdfToPdfA(@ModelAttribute PdfToPdfARequest request) throws Exception {
public ResponseEntity<byte[]> pdfToPdfA(@ModelAttribute PdfToPdfARequest request)
throws Exception {
MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat();

View File

@@ -1,27 +1,25 @@
package stirling.software.SPDF.controller.api.misc;
import java.awt.AlphaComposite;
import java.awt.Color;
import java.awt.GradientPaint;
import java.awt.Graphics2D;
import java.awt.geom.AffineTransform;
import java.awt.image.AffineTransformOp;
import java.awt.image.BufferedImage;
import java.awt.image.BufferedImageOp;
import java.awt.image.ConvolveOp;
import java.awt.image.Kernel;
import java.awt.image.RescaleOp;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import javax.imageio.ImageIO;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
@@ -29,16 +27,17 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Hidden;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@@ -48,98 +47,38 @@ public class FakeScanControllerWIP {
private static final Logger logger = LoggerFactory.getLogger(FakeScanControllerWIP.class);
// TODO
@Hidden
// @PostMapping(consumes = "multipart/form-data", value = "/fakeScan")
@Operation(
summary = "Repair a PDF file",
description =
"This endpoint repairs a given PDF file by running Ghostscript command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response.")
public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile request) throws IOException {
//TODO
//@PostMapping(consumes = "multipart/form-data", value = "/fake-scan")
//@Operation(
// summary = "Repair a PDF file",
// description =
// "This endpoint repairs a given PDF file by running Ghostscript command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response.")
public ResponseEntity<byte[]> fakeScan(@ModelAttribute PDFFile request) throws IOException {
MultipartFile inputFile = request.getFileInput();
// Load the PDF document
PDDocument document = Loader.loadPDF(inputFile.getBytes());
PDFRenderer pdfRenderer = new PDFRenderer(document);
pdfRenderer.setSubsamplingAllowed(true);
for (int page = 0; page < document.getNumberOfPages(); ++page) {
BufferedImage image = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
ImageIO.write(image, "png", new File("scanned-" + (page + 1) + ".png"));
PDFRenderer renderer = new PDFRenderer(document);
List<BufferedImage> images = new ArrayList<>();
// Convert each page to an image
for (int i = 0; i < document.getNumberOfPages(); i++) {
BufferedImage image = renderer.renderImageWithDPI(i, 150, ImageType.GRAY);
images.add(processImage(image));
}
document.close();
// Constants
int scannedness = 90; // Value between 0 and 100
int dirtiness = 0; // Value between 0 and 100
// Load the source image
BufferedImage sourceImage = ImageIO.read(new File("scanned-1.png"));
// Create the destination image
BufferedImage destinationImage =
new BufferedImage(
sourceImage.getWidth(), sourceImage.getHeight(), sourceImage.getType());
// Apply a brightness and contrast effect based on the "scanned-ness"
float scaleFactor = 1.0f + (scannedness / 100.0f) * 0.5f; // Between 1.0 and 1.5
float offset = scannedness * 1.5f; // Between 0 and 150
BufferedImageOp op = new RescaleOp(scaleFactor, offset, null);
op.filter(sourceImage, destinationImage);
// Apply a rotation effect
double rotationRequired =
Math.toRadians(
(new SecureRandom().nextInt(3 - 1)
+ 1)); // Random angle between 1 and 3 degrees
double locationX = destinationImage.getWidth() / 2;
double locationY = destinationImage.getHeight() / 2;
AffineTransform tx =
AffineTransform.getRotateInstance(rotationRequired, locationX, locationY);
AffineTransformOp rotateOp = new AffineTransformOp(tx, AffineTransformOp.TYPE_BILINEAR);
destinationImage = rotateOp.filter(destinationImage, null);
// Apply a blur effect based on the "scanned-ness"
float blurIntensity = scannedness / 100.0f * 0.2f; // Between 0.0 and 0.2
float[] matrix = {
blurIntensity, blurIntensity, blurIntensity,
blurIntensity, blurIntensity, blurIntensity,
blurIntensity, blurIntensity, blurIntensity
};
BufferedImageOp blurOp =
new ConvolveOp(new Kernel(3, 3, matrix), ConvolveOp.EDGE_NO_OP, null);
destinationImage = blurOp.filter(destinationImage, null);
// Add noise to the image based on the "dirtiness"
Random random = new SecureRandom();
for (int y = 0; y < destinationImage.getHeight(); y++) {
for (int x = 0; x < destinationImage.getWidth(); x++) {
if (random.nextInt(100) < dirtiness) {
// Change the pixel color to black randomly based on the "dirtiness"
destinationImage.setRGB(x, y, Color.BLACK.getRGB());
}
}
}
// Save the image
ImageIO.write(destinationImage, "PNG", new File("scanned-1.png"));
PDDocument documentOut = new PDDocument();
for (int page = 1; page <= document.getNumberOfPages(); ++page) {
BufferedImage bim = ImageIO.read(new File("scanned-" + page + ".png"));
// Adjust the dimensions of the page
PDPage pdPage = new PDPage(new PDRectangle(bim.getWidth() - 1, bim.getHeight() - 1));
documentOut.addPage(pdPage);
PDImageXObject pdImage = LosslessFactory.createFromImage(documentOut, bim);
PDPageContentStream contentStream = new PDPageContentStream(documentOut, pdPage);
// Draw the image with a slight offset and enlarged dimensions
contentStream.drawImage(pdImage, -1, -1, bim.getWidth() + 2, bim.getHeight() + 2);
contentStream.close();
}
// Create a new PDF document with the processed images
ByteArrayOutputStream baos = new ByteArrayOutputStream();
documentOut.save(baos);
documentOut.close();
PDDocument newDocument = new PDDocument();
for (BufferedImage img : images) {
// PDPageContentStream contentStream = new PDPageContentStream(newDocument, new
// PDPage());
PDImageXObject pdImage = JPEGFactory.createFromImage(newDocument, img);
PdfUtils.addImageToDocument(newDocument, pdImage, "maintainAspectRatio", false);
}
newDocument.save(baos);
newDocument.close();
// Return the optimized PDF as a response
String outputFilename =
@@ -148,4 +87,164 @@ public class FakeScanControllerWIP {
+ "_scanned.pdf";
return WebResponseUtils.boasToWebResponse(baos, outputFilename);
}
public BufferedImage processImage(BufferedImage image) {
// Rotation
image = rotate(image);
// image = softenEdges(image, 5);
image = applyGaussianBlur(image, 0.5);
addGaussianNoise(image, 0.25);
image = linearStretch(image);
return image;
}
private BufferedImage rotate(BufferedImage image) {
double rotationRequired = Math.toRadians(1.0);
double locationX = image.getWidth() / 2;
double locationY = image.getHeight() / 2;
AffineTransform tx =
AffineTransform.getRotateInstance(rotationRequired, locationX, locationY);
AffineTransformOp op = new AffineTransformOp(tx, AffineTransformOp.TYPE_BICUBIC);
return op.filter(image, null);
}
private BufferedImage applyGaussianBlur(BufferedImage image, double sigma) {
int radius = 3; // Fixed radius size for simplicity
int size = 2 * radius + 1;
float[] data = new float[size * size];
double sum = 0.0;
for (int i = -radius; i <= radius; i++) {
for (int j = -radius; j <= radius; j++) {
double xDistance = i * i;
double yDistance = j * j;
double g = Math.exp(-(xDistance + yDistance) / (2 * sigma * sigma));
data[(i + radius) * size + j + radius] = (float) g;
sum += g;
}
}
// Normalize the kernel
for (int i = 0; i < data.length; i++) {
data[i] /= sum;
}
Kernel kernel = new Kernel(size, size, data);
BufferedImageOp op = new ConvolveOp(kernel, ConvolveOp.EDGE_NO_OP, null);
return op.filter(image, null);
}
public BufferedImage softenEdges(BufferedImage image, int featherRadius) {
int width = image.getWidth();
int height = image.getHeight();
BufferedImage output = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);
Graphics2D g2 = output.createGraphics();
g2.drawImage(image, 0, 0, null);
g2.setComposite(AlphaComposite.DstIn);
g2.setPaint(
new GradientPaint(
0, 0, new Color(0, 0, 0, 1f), 0, featherRadius, new Color(0, 0, 0, 0f)));
g2.fillRect(0, 0, width, featherRadius); // top edge
g2.setPaint(
new GradientPaint(
0,
height - featherRadius,
new Color(0, 0, 0, 0f),
0,
height,
new Color(0, 0, 0, 1f)));
g2.fillRect(0, height - featherRadius, width, featherRadius); // bottom edge
g2.setPaint(
new GradientPaint(
0, 0, new Color(0, 0, 0, 1f), featherRadius, 0, new Color(0, 0, 0, 0f)));
g2.fillRect(0, 0, featherRadius, height); // left edge
g2.setPaint(
new GradientPaint(
width - featherRadius,
0,
new Color(0, 0, 0, 0f),
width,
0,
new Color(0, 0, 0, 1f)));
g2.fillRect(width - featherRadius, 0, featherRadius, height); // right edge
g2.dispose();
return output;
}
private void addGaussianNoise(BufferedImage image, double strength) {
Random rand = new SecureRandom();
int width = image.getWidth();
int height = image.getHeight();
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
int rgba = image.getRGB(i, j);
int alpha = (rgba >> 24) & 0xff;
int red = (rgba >> 16) & 0xff;
int green = (rgba >> 8) & 0xff;
int blue = rgba & 0xff;
// Apply Gaussian noise
red = (int) (red + rand.nextGaussian() * strength);
green = (int) (green + rand.nextGaussian() * strength);
blue = (int) (blue + rand.nextGaussian() * strength);
// Clamping values to the 0-255 range
red = Math.min(Math.max(0, red), 255);
green = Math.min(Math.max(0, green), 255);
blue = Math.min(Math.max(0, blue), 255);
image.setRGB(i, j, (alpha << 24) | (red << 16) | (green << 8) | blue);
}
}
}
public BufferedImage linearStretch(BufferedImage image) {
int width = image.getWidth();
int height = image.getHeight();
int min = 255;
int max = 0;
// First pass: find the min and max grayscale values
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
int rgb = image.getRGB(x, y);
int gray =
(int)
(((rgb >> 16) & 0xff) * 0.299
+ ((rgb >> 8) & 0xff) * 0.587
+ (rgb & 0xff) * 0.114); // Convert to grayscale
if (gray < min) min = gray;
if (gray > max) max = gray;
}
}
// Second pass: stretch the histogram
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
int rgb = image.getRGB(x, y);
int alpha = (rgb >> 24) & 0xff;
int red = (rgb >> 16) & 0xff;
int green = (rgb >> 8) & 0xff;
int blue = rgb & 0xff;
// Apply linear stretch to each channel
red = (int) (((red - min) / (float) (max - min)) * 255);
green = (int) (((green - min) / (float) (max - min)) * 255);
blue = (int) (((blue - min) / (float) (max - min)) * 255);
// Set new RGB value maintaining the alpha channel
rgb = (alpha << 24) | (red << 16) | (green << 8) | blue;
image.setRGB(x, y, rgb);
}
}
return image;
}
}

View File

@@ -0,0 +1,106 @@
package stirling.software.SPDF.controller.api.misc;
import java.awt.Graphics;
import java.awt.Graphics2D;
import java.awt.image.BufferedImage;
import java.awt.print.PageFormat;
import java.awt.print.Printable;
import java.awt.print.PrinterException;
import java.awt.print.PrinterJob;
import java.io.IOException;
import java.util.Arrays;
import javax.imageio.ImageIO;
import javax.print.PrintService;
import javax.print.PrintServiceLookup;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.printing.PDFPageable;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.misc.PrintFileRequest;
@RestController
@RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs")
public class PrintFileController {
//TODO
//@PostMapping(value = "/print-file", consumes = "multipart/form-data")
//@Operation(
// summary = "Prints PDF/Image file to a set printer",
// description =
// "Input of PDF or Image along with a printer name/URL/IP to match against to send it to (Fire and forget) Input:Any Output:N/A Type:SISO")
public ResponseEntity<String> printFile(@ModelAttribute PrintFileRequest request)
throws IOException {
MultipartFile file = request.getFileInput();
String printerName = request.getPrinterName();
String contentType = file.getContentType();
try {
// Find matching printer
PrintService[] services = PrintServiceLookup.lookupPrintServices(null, null);
PrintService selectedService =
Arrays.stream(services)
.filter(
service ->
service.getName().toLowerCase().contains(printerName))
.findFirst()
.orElseThrow(
() ->
new IllegalArgumentException(
"No matching printer found"));
System.out.println("Selected Printer: " + selectedService.getName());
if ("application/pdf".equals(contentType)) {
PDDocument document = Loader.loadPDF(file.getBytes());
PrinterJob job = PrinterJob.getPrinterJob();
job.setPrintService(selectedService);
job.setPageable(new PDFPageable(document));
job.print();
document.close();
} else if (contentType.startsWith("image/")) {
BufferedImage image = ImageIO.read(file.getInputStream());
PrinterJob job = PrinterJob.getPrinterJob();
job.setPrintService(selectedService);
job.setPrintable(
new Printable() {
public int print(
Graphics graphics, PageFormat pageFormat, int pageIndex)
throws PrinterException {
if (pageIndex != 0) {
return NO_SUCH_PAGE;
}
Graphics2D g2d = (Graphics2D) graphics;
g2d.translate(
pageFormat.getImageableX(), pageFormat.getImageableY());
g2d.drawImage(
image,
0,
0,
(int) pageFormat.getImageableWidth(),
(int) pageFormat.getImageableHeight(),
null);
return PAGE_EXISTS;
}
});
job.print();
}
return new ResponseEntity<>(
"File printed successfully to " + selectedService.getName(), HttpStatus.OK);
} catch (Exception e) {
System.err.println("Failed to print: " + e.getMessage());
return new ResponseEntity<>(e.getMessage(), HttpStatus.BAD_REQUEST);
}
}
}

View File

@@ -54,6 +54,13 @@ public class OtherWebController {
return "misc/add-page-numbers";
}
@GetMapping("/fake-scan")
@Hidden
public String fakeScanForm(Model model) {
model.addAttribute("currentPage", "fake-scan");
return "misc/fake-scan";
}
@GetMapping("/extract-images")
@Hidden
public String extractImagesForm(Model model) {
@@ -82,6 +89,13 @@ public class OtherWebController {
return "misc/compare";
}
@GetMapping("/print-file")
@Hidden
public String printFileForm(Model model) {
model.addAttribute("currentPage", "print-file");
return "misc/print-file";
}
public List<String> getAvailableTesseractLanguages() {
String tessdataDir = "/usr/share/tessdata";
File[] files = new File(tessdataDir).listFiles();

View File

@@ -0,0 +1,15 @@
package stirling.software.SPDF.model.api.misc;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import lombok.EqualsAndHashCode;
import stirling.software.SPDF.model.api.PDFFile;
@Data
@EqualsAndHashCode(callSuper = true)
public class PrintFileRequest extends PDFFile {
@Schema(description = "Name of printer to match against", required = true)
private String printerName;
}

View File

@@ -336,7 +336,7 @@ public class PdfUtils {
}
}
private static void addImageToDocument(
public static void addImageToDocument(
PDDocument doc, PDImageXObject image, String fitOption, boolean autoRotate)
throws IOException {
boolean imageIsLandscape = image.getWidth() > image.getHeight();