misc beginings

This commit is contained in:
Anthony Stirling
2023-09-09 18:21:55 +01:00
parent db70d67180
commit 872f562aad
25 changed files with 221 additions and 73 deletions

View File

@@ -0,0 +1,129 @@
package stirling.software.SPDF.controller.api.misc;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.misc.ExtractHeaderRequest;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class AutoRenameController {
private static final Logger logger = LoggerFactory.getLogger(AutoRenameController.class);
private static final float TITLE_FONT_SIZE_THRESHOLD = 20.0f;
private static final int LINE_LIMIT = 11;
@PostMapping(consumes = "multipart/form-data", value = "/auto-rename")
@Operation(summary = "Extract header from PDF file", description = "This endpoint accepts a PDF file and attempts to extract its title or header based on heuristics. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> extractHeader(@ModelAttribute ExtractHeaderRequest request) throws Exception {
MultipartFile file = request.getFileInput();
Boolean useFirstTextAsFallback = request.getUseFirstTextAsFallback();
PDDocument document = PDDocument.load(file.getInputStream());
PDFTextStripper reader = new PDFTextStripper() {
class LineInfo {
String text;
float fontSize;
LineInfo(String text, float fontSize) {
this.text = text;
this.fontSize = fontSize;
}
}
List<LineInfo> lineInfos = new ArrayList<>();
StringBuilder lineBuilder = new StringBuilder();
float lastY = -1;
float maxFontSizeInLine = 0.0f;
int lineCount = 0;
@Override
protected void processTextPosition(TextPosition text) {
if (lastY != text.getY() && lineCount < LINE_LIMIT) {
processLine();
lineBuilder = new StringBuilder(text.getUnicode());
maxFontSizeInLine = text.getFontSizeInPt();
lastY = text.getY();
lineCount++;
} else if (lineCount < LINE_LIMIT) {
lineBuilder.append(text.getUnicode());
if (text.getFontSizeInPt() > maxFontSizeInLine) {
maxFontSizeInLine = text.getFontSizeInPt();
}
}
}
private void processLine() {
if (lineBuilder.length() > 0 && lineCount < LINE_LIMIT) {
lineInfos.add(new LineInfo(lineBuilder.toString(), maxFontSizeInLine));
}
}
@Override
public String getText(PDDocument doc) throws IOException {
this.lineInfos.clear();
this.lineBuilder = new StringBuilder();
this.lastY = -1;
this.maxFontSizeInLine = 0.0f;
this.lineCount = 0;
super.getText(doc);
processLine(); // Process the last line
// Merge lines with same font size
List<LineInfo> mergedLineInfos = new ArrayList<>();
for (int i = 0; i < lineInfos.size(); i++) {
String mergedText = lineInfos.get(i).text;
float fontSize = lineInfos.get(i).fontSize;
while (i + 1 < lineInfos.size() && lineInfos.get(i + 1).fontSize == fontSize) {
mergedText += " " + lineInfos.get(i + 1).text;
i++;
}
mergedLineInfos.add(new LineInfo(mergedText, fontSize));
}
// Sort lines by font size in descending order and get the first one
mergedLineInfos.sort(Comparator.comparing((LineInfo li) -> li.fontSize).reversed());
String title = mergedLineInfos.isEmpty() ? null : mergedLineInfos.get(0).text;
return title != null ? title : (useFirstTextAsFallback ? (mergedLineInfos.isEmpty() ? null : mergedLineInfos.get(mergedLineInfos.size() - 1).text) : null);
}
};
String header = reader.getText(document);
// Sanitize the header string by removing characters not allowed in a filename.
if (header != null && header.length() < 255) {
header = header.replaceAll("[/\\\\?%*:|\"<>]", "");
return WebResponseUtils.pdfDocToWebResponse(document, header + ".pdf");
} else {
logger.info("File has no good title to be found");
return WebResponseUtils.pdfDocToWebResponse(document, file.getOriginalFilename());
}
}
}

View File

@@ -0,0 +1,142 @@
package stirling.software.SPDF.controller.api.misc;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.awt.image.DataBufferInt;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import com.google.zxing.BinaryBitmap;
import com.google.zxing.LuminanceSource;
import com.google.zxing.MultiFormatReader;
import com.google.zxing.NotFoundException;
import com.google.zxing.PlanarYUVLuminanceSource;
import com.google.zxing.Result;
import com.google.zxing.common.HybridBinarizer;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import stirling.software.SPDF.model.api.misc.AutoSplitPdfRequest;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
public class AutoSplitPdfController {
private static final String QR_CONTENT = "https://github.com/Frooodle/Stirling-PDF";
@PostMapping(value = "/auto-split-pdf", consumes = "multipart/form-data")
@Operation(summary = "Auto split PDF pages into separate documents", description = "This endpoint accepts a PDF file, scans each page for a specific QR code, and splits the document at the QR code boundaries. The output is a zip file containing each separate PDF document. Input:PDF Output:ZIP Type:SISO")
public ResponseEntity<byte[]> autoSplitPdf(@ModelAttribute AutoSplitPdfRequest request) throws IOException {
MultipartFile file = request.getFileInput();
boolean duplexMode = request.isDuplexMode();
InputStream inputStream = file.getInputStream();
PDDocument document = PDDocument.load(inputStream);
PDFRenderer pdfRenderer = new PDFRenderer(document);
List<PDDocument> splitDocuments = new ArrayList<>();
List<ByteArrayOutputStream> splitDocumentsBoas = new ArrayList<>();
for (int page = 0; page < document.getNumberOfPages(); ++page) {
BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 150);
String result = decodeQRCode(bim);
if (QR_CONTENT.equals(result) && page != 0) {
splitDocuments.add(new PDDocument());
}
if (!splitDocuments.isEmpty() && !QR_CONTENT.equals(result)) {
splitDocuments.get(splitDocuments.size() - 1).addPage(document.getPage(page));
} else if (page == 0) {
PDDocument firstDocument = new PDDocument();
firstDocument.addPage(document.getPage(page));
splitDocuments.add(firstDocument);
}
// If duplexMode is true and current page is a divider, then skip next page
if (duplexMode && QR_CONTENT.equals(result)) {
page++;
}
}
// Remove split documents that have no pages
splitDocuments.removeIf(pdDocument -> pdDocument.getNumberOfPages() == 0);
for (PDDocument splitDocument : splitDocuments) {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
splitDocument.save(baos);
splitDocumentsBoas.add(baos);
splitDocument.close();
}
document.close();
Path zipFile = Files.createTempFile("split_documents", ".zip");
String filename = file.getOriginalFilename().replaceFirst("[.][^.]+$", "");
byte[] data;
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) {
for (int i = 0; i < splitDocumentsBoas.size(); i++) {
String fileName = filename + "_" + (i + 1) + ".pdf";
ByteArrayOutputStream baos = splitDocumentsBoas.get(i);
byte[] pdf = baos.toByteArray();
ZipEntry pdfEntry = new ZipEntry(fileName);
zipOut.putNextEntry(pdfEntry);
zipOut.write(pdf);
zipOut.closeEntry();
}
} catch (Exception e) {
e.printStackTrace();
} finally {
data = Files.readAllBytes(zipFile);
Files.delete(zipFile);
}
return WebResponseUtils.bytesToWebResponse(data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
}
private static String decodeQRCode(BufferedImage bufferedImage) {
LuminanceSource source;
if (bufferedImage.getRaster().getDataBuffer() instanceof DataBufferByte) {
byte[] pixels = ((DataBufferByte) bufferedImage.getRaster().getDataBuffer()).getData();
source = new PlanarYUVLuminanceSource(pixels, bufferedImage.getWidth(), bufferedImage.getHeight(), 0, 0, bufferedImage.getWidth(), bufferedImage.getHeight(), false);
} else if (bufferedImage.getRaster().getDataBuffer() instanceof DataBufferInt) {
int[] pixels = ((DataBufferInt) bufferedImage.getRaster().getDataBuffer()).getData();
byte[] newPixels = new byte[pixels.length];
for (int i = 0; i < pixels.length; i++) {
newPixels[i] = (byte) (pixels[i] & 0xff);
}
source = new PlanarYUVLuminanceSource(newPixels, bufferedImage.getWidth(), bufferedImage.getHeight(), 0, 0, bufferedImage.getWidth(), bufferedImage.getHeight(), false);
} else {
throw new IllegalArgumentException("BufferedImage must have 8-bit gray scale, 24-bit RGB, 32-bit ARGB (packed int), byte gray, or 3-byte/4-byte RGB image data");
}
BinaryBitmap bitmap = new BinaryBitmap(new HybridBinarizer(source));
try {
Result result = new MultiFormatReader().decode(bitmap);
return result.getText();
} catch (NotFoundException e) {
return null; // there is no QR code in the image
}
}
}

View File

@@ -0,0 +1,123 @@
package stirling.software.SPDF.controller.api.misc;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.misc.RemoveBlankPagesRequest;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class BlankPageController {
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
@Operation(
summary = "Remove blank pages from a PDF file",
description = "This endpoint removes blank pages from a given PDF file. Users can specify the threshold and white percentage to tune the detection of blank pages. Input:PDF Output:PDF Type:SISO"
)
public ResponseEntity<byte[]> removeBlankPages(@ModelAttribute RemoveBlankPagesRequest request) throws IOException, InterruptedException {
MultipartFile inputFile = request.getFileInput();
int threshold = request.getThreshold();
float whitePercent = request.getWhitePercent();
PDDocument document = null;
try {
document = PDDocument.load(inputFile.getInputStream());
PDPageTree pages = document.getDocumentCatalog().getPages();
PDFTextStripper textStripper = new PDFTextStripper();
List<Integer> pagesToKeepIndex = new ArrayList<>();
int pageIndex = 0;
PDFRenderer pdfRenderer = new PDFRenderer(document);
for (PDPage page : pages) {
System.out.println("checking page " + pageIndex);
textStripper.setStartPage(pageIndex + 1);
textStripper.setEndPage(pageIndex + 1);
String pageText = textStripper.getText(document);
boolean hasText = !pageText.trim().isEmpty();
if (hasText) {
pagesToKeepIndex.add(pageIndex);
System.out.println("page " + pageIndex + " has text");
} else {
boolean hasImages = PdfUtils.hasImagesOnPage(page);
if (hasImages) {
System.out.println("page " + pageIndex + " has image");
Path tempFile = Files.createTempFile("image_", ".png");
// Render image and save as temp file
BufferedImage image = pdfRenderer.renderImageWithDPI(pageIndex, 300);
ImageIO.write(image, "png", tempFile.toFile());
List<String> command = new ArrayList<>(Arrays.asList("python3", System.getProperty("user.dir") + "/scripts/detect-blank-pages.py", tempFile.toString() ,"--threshold", String.valueOf(threshold), "--white_percent", String.valueOf(whitePercent)));
// Run CLI command
ProcessExecutorResult returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
// does contain data
if (returnCode.getRc() == 0) {
System.out.println("page " + pageIndex + " has image which is not blank");
pagesToKeepIndex.add(pageIndex);
} else {
System.out.println("Skipping, Image was blank for page #" + pageIndex);
}
}
}
pageIndex++;
}
System.out.print("pagesToKeep=" + pagesToKeepIndex.size());
// Remove pages not present in pagesToKeepIndex
List<Integer> pageIndices = IntStream.range(0, pages.getCount()).boxed().collect(Collectors.toList());
Collections.reverse(pageIndices); // Reverse to prevent index shifting during removal
for (Integer i : pageIndices) {
if (!pagesToKeepIndex.contains(i)) {
pages.remove(i);
}
}
return WebResponseUtils.pdfDocToWebResponse(document, inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_blanksRemoved.pdf");
} catch (IOException e) {
e.printStackTrace();
return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
} finally {
if (document != null)
document.close();
}
}
}

View File

@@ -0,0 +1,244 @@
package stirling.software.SPDF.controller.api.misc;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.media.Schema;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.misc.OptimizePdfRequest;
import stirling.software.SPDF.utils.GeneralUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class CompressController {
private static final Logger logger = LoggerFactory.getLogger(CompressController.class);
@PostMapping(consumes = "multipart/form-data", value = "/compress-pdf")
@Operation(summary = "Optimize PDF file", description = "This endpoint accepts a PDF file and optimizes it based on the provided parameters. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> optimizePdf(@ModelAttribute OptimizePdfRequest request) throws Exception {
MultipartFile inputFile = request.getFileInput();
Integer optimizeLevel = request.getOptimizeLevel();
String expectedOutputSizeString = request.getExpectedOutputSizeString();
if(expectedOutputSizeString == null && optimizeLevel == null) {
throw new Exception("Both expected output size and optimize level are not specified");
}
Long expectedOutputSize = 0L;
boolean autoMode = false;
if (expectedOutputSizeString != null && expectedOutputSizeString.length() > 1 ) {
expectedOutputSize = GeneralUtils.convertSizeToBytes(expectedOutputSizeString);
autoMode = true;
}
// Save the uploaded file to a temporary location
Path tempInputFile = Files.createTempFile("input_", ".pdf");
inputFile.transferTo(tempInputFile.toFile());
long inputFileSize = Files.size(tempInputFile);
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
// Determine initial optimization level based on expected size reduction, only if in autoMode
if(autoMode) {
double sizeReductionRatio = expectedOutputSize / (double) inputFileSize;
if (sizeReductionRatio > 0.7) {
optimizeLevel = 1;
} else if (sizeReductionRatio > 0.5) {
optimizeLevel = 2;
} else if (sizeReductionRatio > 0.35) {
optimizeLevel = 3;
} else {
optimizeLevel = 3;
}
}
boolean sizeMet = false;
while (!sizeMet && optimizeLevel <= 4) {
// Prepare the Ghostscript command
List<String> command = new ArrayList<>();
command.add("gs");
command.add("-sDEVICE=pdfwrite");
command.add("-dCompatibilityLevel=1.4");
switch (optimizeLevel) {
case 1:
command.add("-dPDFSETTINGS=/prepress");
break;
case 2:
command.add("-dPDFSETTINGS=/printer");
break;
case 3:
command.add("-dPDFSETTINGS=/ebook");
break;
case 4:
command.add("-dPDFSETTINGS=/screen");
break;
default:
command.add("-dPDFSETTINGS=/default");
}
command.add("-dNOPAUSE");
command.add("-dQUIET");
command.add("-dBATCH");
command.add("-sOutputFile=" + tempOutputFile.toString());
command.add(tempInputFile.toString());
ProcessExecutorResult returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT).runCommandWithOutputHandling(command);
// Check if file size is within expected size or not auto mode so instantly finish
long outputFileSize = Files.size(tempOutputFile);
if (outputFileSize <= expectedOutputSize || !autoMode) {
sizeMet = true;
} else {
// Increase optimization level for next iteration
optimizeLevel++;
if(autoMode && optimizeLevel > 3) {
System.out.println("Skipping level 4 due to bad results in auto mode");
sizeMet = true;
} else if(optimizeLevel == 5) {
} else {
System.out.println("Increasing ghostscript optimisation level to " + optimizeLevel);
}
}
}
if (expectedOutputSize != null && autoMode) {
long outputFileSize = Files.size(tempOutputFile);
if (outputFileSize > expectedOutputSize) {
try (PDDocument doc = PDDocument.load(new File(tempOutputFile.toString()))) {
long previousFileSize = 0;
double scaleFactor = 1.0;
while (true) {
for (PDPage page : doc.getPages()) {
PDResources res = page.getResources();
for (COSName name : res.getXObjectNames()) {
PDXObject xobj = res.getXObject(name);
if (xobj instanceof PDImageXObject) {
PDImageXObject image = (PDImageXObject) xobj;
// Get the image in BufferedImage format
BufferedImage bufferedImage = image.getImage();
// Calculate the new dimensions
int newWidth = (int)(bufferedImage.getWidth() * scaleFactor);
int newHeight = (int)(bufferedImage.getHeight() * scaleFactor);
// If the new dimensions are zero, skip this iteration
if (newWidth == 0 || newHeight == 0) {
continue;
}
// Otherwise, proceed with the scaling
Image scaledImage = bufferedImage.getScaledInstance(newWidth, newHeight, Image.SCALE_SMOOTH);
// Convert the scaled image back to a BufferedImage
BufferedImage scaledBufferedImage = new BufferedImage(newWidth, newHeight, BufferedImage.TYPE_INT_RGB);
scaledBufferedImage.getGraphics().drawImage(scaledImage, 0, 0, null);
// Compress the scaled image
ByteArrayOutputStream compressedImageStream = new ByteArrayOutputStream();
ImageIO.write(scaledBufferedImage, "jpeg", compressedImageStream);
byte[] imageBytes = compressedImageStream.toByteArray();
compressedImageStream.close();
// Convert compressed image back to PDImageXObject
ByteArrayInputStream bais = new ByteArrayInputStream(imageBytes);
PDImageXObject compressedImage = PDImageXObject.createFromByteArray(doc, imageBytes, image.getCOSObject().toString());
// Replace the image in the resources with the compressed version
res.put(name, compressedImage);
}
}
}
// save the document to tempOutputFile again
doc.save(tempOutputFile.toString());
long currentSize = Files.size(tempOutputFile);
// Check if the overall PDF size is still larger than expectedOutputSize
if (currentSize > expectedOutputSize) {
// Log the current file size and scaleFactor
System.out.println("Current file size: " + FileUtils.byteCountToDisplaySize(currentSize));
System.out.println("Current scale factor: " + scaleFactor);
// The file is still too large, reduce scaleFactor and try again
scaleFactor *= 0.9; // reduce scaleFactor by 10%
// Avoid scaleFactor being too small, causing the image to shrink to 0
if(scaleFactor < 0.2 || previousFileSize == currentSize){
throw new RuntimeException("Could not reach the desired size without excessively degrading image quality, lowest size recommended is " + FileUtils.byteCountToDisplaySize(currentSize) + ", " + currentSize + " bytes");
}
previousFileSize = currentSize;
} else {
// The file is small enough, break the loop
break;
}
}
}
}
}
// Read the optimized PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
// Check if optimized file is larger than the original
if(pdfBytes.length > inputFileSize) {
// Log the occurrence
logger.warn("Optimized file is larger than the original. Returning the original file instead.");
// Read the original file again
pdfBytes = Files.readAllBytes(tempInputFile);
}
// Clean up the temporary files
Files.delete(tempInputFile);
Files.delete(tempOutputFile);
// Return the optimized PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_Optimized.pdf";
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
}

View File

@@ -0,0 +1,155 @@
package stirling.software.SPDF.controller.api.misc;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import javax.imageio.ImageIO;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.parameters.RequestBody;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.misc.ExtractImageScansRequest;
import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
import stirling.software.SPDF.utils.WebResponseUtils;
import io.swagger.v3.oas.annotations.media.Content;
import io.swagger.v3.oas.annotations.media.Schema;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class ExtractImageScansController {
private static final Logger logger = LoggerFactory.getLogger(ExtractImageScansController.class);
@PostMapping(consumes = "multipart/form-data", value = "/extract-image-scans")
@Operation(summary = "Extract image scans from an input file",
description = "This endpoint extracts image scans from a given file based on certain parameters. Users can specify angle threshold, tolerance, minimum area, minimum contour area, and border size. Input:PDF Output:IMAGE/ZIP Type:SIMO")
public ResponseEntity<byte[]> extractImageScans(
@RequestBody(
description = "Form data containing file and extraction parameters",
required = true,
content = @Content(
mediaType = "multipart/form-data",
schema = @Schema(implementation = ExtractImageScansRequest.class) // This should represent your form's structure
)
)
ExtractImageScansRequest form) throws IOException, InterruptedException {
String fileName = form.getFileInput().getOriginalFilename();
String extension = fileName.substring(fileName.lastIndexOf(".") + 1);
List<String> images = new ArrayList<>();
// Check if input file is a PDF
if (extension.equalsIgnoreCase("pdf")) {
// Load PDF document
try (PDDocument document = PDDocument.load(new ByteArrayInputStream(form.getFileInput().getBytes()))) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
int pageCount = document.getNumberOfPages();
images = new ArrayList<>();
// Create images of all pages
for (int i = 0; i < pageCount; i++) {
// Create temp file to save the image
Path tempFile = Files.createTempFile("image_", ".png");
// Render image and save as temp file
BufferedImage image = pdfRenderer.renderImageWithDPI(i, 300);
ImageIO.write(image, "png", tempFile.toFile());
// Add temp file path to images list
images.add(tempFile.toString());
}
}
} else {
Path tempInputFile = Files.createTempFile("input_", "." + extension);
Files.copy(form.getFileInput().getInputStream(), tempInputFile, StandardCopyOption.REPLACE_EXISTING);
// Add input file path to images list
images.add(tempInputFile.toString());
}
List<byte[]> processedImageBytes = new ArrayList<>();
// Process each image
for (int i = 0; i < images.size(); i++) {
Path tempDir = Files.createTempDirectory("openCV_output");
List<String> command = new ArrayList<>(Arrays.asList(
"python3",
"./scripts/split_photos.py",
images.get(i),
tempDir.toString(),
"--angle_threshold", String.valueOf(form.getAngleThreshold()),
"--tolerance", String.valueOf(form.getTolerance()),
"--min_area", String.valueOf(form.getMinArea()),
"--min_contour_area", String.valueOf(form.getMinContourArea()),
"--border_size", String.valueOf(form.getBorderSize())
));
// Run CLI command
ProcessExecutorResult returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
// Read the output photos in temp directory
List<Path> tempOutputFiles = Files.list(tempDir).sorted().collect(Collectors.toList());
for (Path tempOutputFile : tempOutputFiles) {
byte[] imageBytes = Files.readAllBytes(tempOutputFile);
processedImageBytes.add(imageBytes);
}
// Clean up the temporary directory
FileUtils.deleteDirectory(tempDir.toFile());
}
// Create zip file if multiple images
if (processedImageBytes.size() > 1) {
String outputZipFilename = fileName.replaceFirst("[.][^.]+$", "") + "_processed.zip";
Path tempZipFile = Files.createTempFile("output_", ".zip");
try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
// Add processed images to the zip
for (int i = 0; i < processedImageBytes.size(); i++) {
ZipEntry entry = new ZipEntry(fileName.replaceFirst("[.][^.]+$", "") + "_" + (i + 1) + ".png");
zipOut.putNextEntry(entry);
zipOut.write(processedImageBytes.get(i));
zipOut.closeEntry();
}
}
byte[] zipBytes = Files.readAllBytes(tempZipFile);
// Clean up the temporary zip file
Files.delete(tempZipFile);
return WebResponseUtils.bytesToWebResponse(zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
} else {
// Return the processed image as a response
byte[] imageBytes = processedImageBytes.get(0);
return WebResponseUtils.bytesToWebResponse(imageBytes, fileName.replaceFirst("[.][^.]+$", "") + ".png", MediaType.IMAGE_PNG);
}
}
}

View File

@@ -0,0 +1,114 @@
package stirling.software.SPDF.controller.api.misc;
import java.awt.Graphics2D;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.awt.image.RenderedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.zip.Deflater;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import javax.imageio.ImageIO;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.media.Schema;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class ExtractImagesController {
private static final Logger logger = LoggerFactory.getLogger(ExtractImagesController.class);
@PostMapping(consumes = "multipart/form-data", value = "/extract-images")
@Operation(summary = "Extract images from a PDF file",
description = "This endpoint extracts images from a given PDF file and returns them in a zip file. Users can specify the output image format. Input:PDF Output:IMAGE/ZIP Type:SIMO")
public ResponseEntity<byte[]> extractImages(
@RequestPart(required = true, value = "fileInput")
@Parameter(description = "The input PDF file containing images")
MultipartFile file,
@RequestParam("format")
@Parameter(description = "The output image format e.g., 'png', 'jpeg', or 'gif'", schema = @Schema(allowableValues = {"png", "jpeg", "gif"}))
String format) throws IOException {
System.out.println(System.currentTimeMillis() + "file=" + file.getName() + ", format=" + format);
PDDocument document = PDDocument.load(file.getBytes());
// Create ByteArrayOutputStream to write zip file to byte array
ByteArrayOutputStream baos = new ByteArrayOutputStream();
// Create ZipOutputStream to create zip file
ZipOutputStream zos = new ZipOutputStream(baos);
// Set compression level
zos.setLevel(Deflater.BEST_COMPRESSION);
int imageIndex = 1;
String filename = file.getOriginalFilename().replaceFirst("[.][^.]+$", "");
int pageNum = 1;
// Iterate over each page
for (PDPage page : document.getPages()) {
++pageNum;
// Extract images from page
for (COSName name : page.getResources().getXObjectNames()) {
if (page.getResources().isImageXObject(name)) {
PDImageXObject image = (PDImageXObject) page.getResources().getXObject(name);
// Convert image to desired format
RenderedImage renderedImage = image.getImage();
BufferedImage bufferedImage = null;
if (format.equalsIgnoreCase("png")) {
bufferedImage = new BufferedImage(renderedImage.getWidth(), renderedImage.getHeight(), BufferedImage.TYPE_INT_ARGB);
} else if (format.equalsIgnoreCase("jpeg") || format.equalsIgnoreCase("jpg")) {
bufferedImage = new BufferedImage(renderedImage.getWidth(), renderedImage.getHeight(), BufferedImage.TYPE_INT_RGB);
} else if (format.equalsIgnoreCase("gif")) {
bufferedImage = new BufferedImage(renderedImage.getWidth(), renderedImage.getHeight(), BufferedImage.TYPE_BYTE_INDEXED);
}
// Write image to zip file
String imageName = filename + "_" + imageIndex + " (Page " + pageNum + ")." + format;
ZipEntry zipEntry = new ZipEntry(imageName);
zos.putNextEntry(zipEntry);
Graphics2D g = bufferedImage.createGraphics();
g.drawImage((Image) renderedImage, 0, 0, null);
g.dispose();
// Write image bytes to zip file
ByteArrayOutputStream imageBaos = new ByteArrayOutputStream();
ImageIO.write(bufferedImage, format, imageBaos);
zos.write(imageBaos.toByteArray());
zos.closeEntry();
imageIndex++;
}
}
}
// Close ZipOutputStream and PDDocument
zos.close();
document.close();
// Create ByteArrayResource from byte array
byte[] zipContents = baos.toByteArray();
return WebResponseUtils.boasToWebResponse(baos, filename + "_extracted-images.zip", MediaType.APPLICATION_OCTET_STREAM);
}
}

View File

@@ -0,0 +1,150 @@
package stirling.software.SPDF.controller.api.misc;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.image.AffineTransformOp;
//Required for image manipulation
import java.awt.image.BufferedImage;
import java.awt.image.BufferedImageOp;
import java.awt.image.ConvolveOp;
import java.awt.image.Kernel;
import java.awt.image.RescaleOp;
import java.io.ByteArrayOutputStream;
//Required for file input/output
import java.io.File;
import java.io.IOException;
//Other required classes
import java.util.Random;
//Required for image input/output
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Hidden;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class FakeScanControllerWIP {
private static final Logger logger = LoggerFactory.getLogger(FakeScanControllerWIP.class);
//TODO
@Hidden
@PostMapping(consumes = "multipart/form-data", value = "/fakeScan")
@Operation(
summary = "Repair a PDF file",
description = "This endpoint repairs a given PDF file by running Ghostscript command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response."
)
public ResponseEntity<byte[]> repairPdf(
@RequestPart(required = true, value = "fileInput")
@Parameter(description = "The input PDF file to be repaired", required = true)
MultipartFile inputFile) throws IOException, InterruptedException {
PDDocument document = PDDocument.load(inputFile.getBytes());
PDFRenderer pdfRenderer = new PDFRenderer(document);
for (int page = 0; page < document.getNumberOfPages(); ++page)
{
BufferedImage image = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
ImageIO.write(image, "png", new File("scanned-" + (page+1) + ".png"));
}
document.close();
// Constants
int scannedness = 90; // Value between 0 and 100
int dirtiness = 0; // Value between 0 and 100
// Load the source image
BufferedImage sourceImage = ImageIO.read(new File("scanned-1.png"));
// Create the destination image
BufferedImage destinationImage = new BufferedImage(sourceImage.getWidth(), sourceImage.getHeight(), sourceImage.getType());
// Apply a brightness and contrast effect based on the "scanned-ness"
float scaleFactor = 1.0f + (scannedness / 100.0f) * 0.5f; // Between 1.0 and 1.5
float offset = scannedness * 1.5f; // Between 0 and 150
BufferedImageOp op = new RescaleOp(scaleFactor, offset, null);
op.filter(sourceImage, destinationImage);
// Apply a rotation effect
double rotationRequired = Math.toRadians((new Random().nextInt(3 - 1) + 1)); // Random angle between 1 and 3 degrees
double locationX = destinationImage.getWidth() / 2;
double locationY = destinationImage.getHeight() / 2;
AffineTransform tx = AffineTransform.getRotateInstance(rotationRequired, locationX, locationY);
AffineTransformOp rotateOp = new AffineTransformOp(tx, AffineTransformOp.TYPE_BILINEAR);
destinationImage = rotateOp.filter(destinationImage, null);
// Apply a blur effect based on the "scanned-ness"
float blurIntensity = scannedness / 100.0f * 0.2f; // Between 0.0 and 0.2
float[] matrix = {
blurIntensity, blurIntensity, blurIntensity,
blurIntensity, blurIntensity, blurIntensity,
blurIntensity, blurIntensity, blurIntensity
};
BufferedImageOp blurOp = new ConvolveOp(new Kernel(3, 3, matrix), ConvolveOp.EDGE_NO_OP, null);
destinationImage = blurOp.filter(destinationImage, null);
// Add noise to the image based on the "dirtiness"
Random random = new Random();
for (int y = 0; y < destinationImage.getHeight(); y++) {
for (int x = 0; x < destinationImage.getWidth(); x++) {
if (random.nextInt(100) < dirtiness) {
// Change the pixel color to black randomly based on the "dirtiness"
destinationImage.setRGB(x, y, Color.BLACK.getRGB());
}
}
}
// Save the image
ImageIO.write(destinationImage, "PNG", new File("scanned-1.png"));
PDDocument documentOut = new PDDocument();
for (int page = 1; page <= document.getNumberOfPages(); ++page)
{
BufferedImage bim = ImageIO.read(new File("scanned-" + page + ".png"));
// Adjust the dimensions of the page
PDPage pdPage = new PDPage(new PDRectangle(bim.getWidth() - 1, bim.getHeight() - 1));
documentOut.addPage(pdPage);
PDImageXObject pdImage = LosslessFactory.createFromImage(documentOut, bim);
PDPageContentStream contentStream = new PDPageContentStream(documentOut, pdPage);
// Draw the image with a slight offset and enlarged dimensions
contentStream.drawImage(pdImage, -1, -1, bim.getWidth() + 2, bim.getHeight() + 2);
contentStream.close();
}
ByteArrayOutputStream baos = new ByteArrayOutputStream();
documentOut.save(baos);
documentOut.close();
// Return the optimized PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_scanned.pdf";
return WebResponseUtils.boasToWebResponse(baos, outputFilename);
}
}

View File

@@ -0,0 +1,168 @@
package stirling.software.SPDF.controller.api.misc;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class MetadataController {
private String checkUndefined(String entry) {
// Check if the string is "undefined"
if ("undefined".equals(entry)) {
// Return null if it is
return null;
}
// Return the original string if it's not "undefined"
return entry;
}
@PostMapping(consumes = "multipart/form-data", value = "/update-metadata")
@Operation(summary = "Update metadata of a PDF file",
description = "This endpoint allows you to update the metadata of a given PDF file. You can add, modify, or delete standard and custom metadata fields. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> metadata(
@RequestPart(required = true, value = "fileInput")
@Parameter(description = "The input PDF file to update metadata")
MultipartFile pdfFile,
@RequestParam(value = "deleteAll", required = false, defaultValue = "false")
@Parameter(description = "Delete all metadata if set to true")
Boolean deleteAll,
@RequestParam(value = "author", required = false)
@Parameter(description = "The author of the document")
String author,
@RequestParam(value = "creationDate", required = false)
@Parameter(description = "The creation date of the document (format: yyyy/MM/dd HH:mm:ss)")
String creationDate,
@RequestParam(value = "creator", required = false)
@Parameter(description = "The creator of the document")
String creator,
@RequestParam(value = "keywords", required = false)
@Parameter(description = "The keywords for the document")
String keywords,
@RequestParam(value = "modificationDate", required = false)
@Parameter(description = "The modification date of the document (format: yyyy/MM/dd HH:mm:ss)")
String modificationDate,
@RequestParam(value = "producer", required = false)
@Parameter(description = "The producer of the document")
String producer,
@RequestParam(value = "subject", required = false)
@Parameter(description = "The subject of the document")
String subject,
@RequestParam(value = "title", required = false)
@Parameter(description = "The title of the document")
String title,
@RequestParam(value = "trapped", required = false)
@Parameter(description = "The trapped status of the document")
String trapped,
@Parameter(description = "Map list of key and value of custom parameters, note these must start with customKey and customValue if they are non standard")
@RequestParam Map<String, String> allRequestParams)
throws IOException {
// Load the PDF file into a PDDocument
PDDocument document = PDDocument.load(pdfFile.getBytes());
// Get the document information from the PDF
PDDocumentInformation info = document.getDocumentInformation();
// Check if each metadata value is "undefined" and set it to null if it is
author = checkUndefined(author);
creationDate = checkUndefined(creationDate);
creator = checkUndefined(creator);
keywords = checkUndefined(keywords);
modificationDate = checkUndefined(modificationDate);
producer = checkUndefined(producer);
subject = checkUndefined(subject);
title = checkUndefined(title);
trapped = checkUndefined(trapped);
// If the "deleteAll" flag is set, remove all metadata from the document
// information
if (deleteAll) {
for (String key : info.getMetadataKeys()) {
info.setCustomMetadataValue(key, null);
}
// Remove metadata from the PDF history
document.getDocumentCatalog().getCOSObject().removeItem(COSName.getPDFName("Metadata"));
document.getDocumentCatalog().getCOSObject().removeItem(COSName.getPDFName("PieceInfo"));
author = null;
creationDate = null;
creator = null;
keywords = null;
modificationDate = null;
producer = null;
subject = null;
title = null;
trapped = null;
} else {
// Iterate through the request parameters and set the metadata values
for (Entry<String, String> entry : allRequestParams.entrySet()) {
String key = entry.getKey();
// Check if the key is a standard metadata key
if (!key.equalsIgnoreCase("Author") && !key.equalsIgnoreCase("CreationDate") && !key.equalsIgnoreCase("Creator") && !key.equalsIgnoreCase("Keywords")
&& !key.equalsIgnoreCase("modificationDate") && !key.equalsIgnoreCase("Producer") && !key.equalsIgnoreCase("Subject") && !key.equalsIgnoreCase("Title")
&& !key.equalsIgnoreCase("Trapped") && !key.contains("customKey") && !key.contains("customValue")) {
info.setCustomMetadataValue(key, entry.getValue());
} else if (key.contains("customKey")) {
int number = Integer.parseInt(key.replaceAll("\\D", ""));
String customKey = entry.getValue();
String customValue = allRequestParams.get("customValue" + number);
info.setCustomMetadataValue(customKey, customValue);
}
}
}
if (creationDate != null && creationDate.length() > 0) {
Calendar creationDateCal = Calendar.getInstance();
try {
creationDateCal.setTime(new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").parse(creationDate));
} catch (ParseException e) {
e.printStackTrace();
}
info.setCreationDate(creationDateCal);
} else {
info.setCreationDate(null);
}
if (modificationDate != null && modificationDate.length() > 0) {
Calendar modificationDateCal = Calendar.getInstance();
try {
modificationDateCal.setTime(new SimpleDateFormat("yyyy/MM/dd HH:mm:ss").parse(modificationDate));
} catch (ParseException e) {
e.printStackTrace();
}
info.setModificationDate(modificationDateCal);
} else {
info.setModificationDate(null);
}
info.setCreator(creator);
info.setKeywords(keywords);
info.setAuthor(author);
info.setProducer(producer);
info.setSubject(subject);
info.setTitle(title);
info.setTrapped(trapped);
document.setDocumentInformation(info);
return WebResponseUtils.pdfDocToWebResponse(document, pdfFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_metadata.pdf");
}
}

View File

@@ -0,0 +1,208 @@
package stirling.software.SPDF.controller.api.misc;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.media.Schema;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class OCRController {
private static final Logger logger = LoggerFactory.getLogger(OCRController.class);
public List<String> getAvailableTesseractLanguages() {
String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata";
File[] files = new File(tessdataDir).listFiles();
if (files == null) {
return Collections.emptyList();
}
return Arrays.stream(files).filter(file -> file.getName().endsWith(".traineddata")).map(file -> file.getName().replace(".traineddata", ""))
.filter(lang -> !lang.equalsIgnoreCase("osd")).collect(Collectors.toList());
}
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
@Operation(summary = "Process a PDF file with OCR",
description = "This endpoint processes a PDF file using OCR (Optical Character Recognition). Users can specify languages, sidecar, deskew, clean, cleanFinal, ocrType, ocrRenderType, and removeImagesAfter options. Input:PDF Output:PDF Type:SI-Conditional")
public ResponseEntity<byte[]> processPdfWithOCR(
@RequestPart(required = true, value = "fileInput")
@Parameter(description = "The input PDF file to be processed with OCR")
MultipartFile inputFile,
@RequestParam("languages")
@Parameter(description = "List of languages to use in OCR processing")
List<String> selectedLanguages,
@RequestParam(name = "sidecar", required = false)
@Parameter(description = "Include OCR text in a sidecar text file if set to true")
Boolean sidecar,
@RequestParam(name = "deskew", required = false)
@Parameter(description = "Deskew the input file if set to true")
Boolean deskew,
@RequestParam(name = "clean", required = false)
@Parameter(description = "Clean the input file if set to true")
Boolean clean,
@RequestParam(name = "clean-final", required = false)
@Parameter(description = "Clean the final output if set to true")
Boolean cleanFinal,
@RequestParam(name = "ocrType", required = false)
@Parameter(description = "Specify the OCR type, e.g., 'skip-text', 'force-ocr', or 'Normal'", schema = @Schema(allowableValues = {"skip-text", "force-ocr", "Normal"}))
String ocrType,
@RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr")
@Parameter(description = "Specify the OCR render type, either 'hocr' or 'sandwich'", schema = @Schema(allowableValues = {"hocr", "sandwich"}))
String ocrRenderType,
@RequestParam(name = "removeImagesAfter", required = false)
@Parameter(description = "Remove images from the output PDF if set to true")
Boolean removeImagesAfter) throws IOException, InterruptedException {
// --output-type pdfa
if (selectedLanguages == null || selectedLanguages.isEmpty()) {
throw new IOException("Please select at least one language.");
}
if(!ocrRenderType.equals("hocr") && !ocrRenderType.equals("sandwich")) {
throw new IOException("ocrRenderType wrong");
}
// Get available Tesseract languages
List<String> availableLanguages = getAvailableTesseractLanguages();
// Validate selected languages
selectedLanguages = selectedLanguages.stream().filter(availableLanguages::contains).toList();
if (selectedLanguages.isEmpty()) {
throw new IOException("None of the selected languages are valid.");
}
// Save the uploaded file to a temporary location
Path tempInputFile = Files.createTempFile("input_", ".pdf");
Files.copy(inputFile.getInputStream(), tempInputFile, StandardCopyOption.REPLACE_EXISTING);
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
// Prepare the output file path
Path sidecarTextPath = null;
// Run OCR Command
String languageOption = String.join("+", selectedLanguages);
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf", "--verbose", "2", "--output-type", "pdf", "--pdf-renderer" , ocrRenderType));
if (sidecar != null && sidecar) {
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
command.add("--sidecar");
command.add(sidecarTextPath.toString());
}
if (deskew != null && deskew) {
command.add("--deskew");
}
if (clean != null && clean) {
command.add("--clean");
}
if (cleanFinal != null && cleanFinal) {
command.add("--clean-final");
}
if (ocrType != null && !ocrType.equals("")) {
if ("skip-text".equals(ocrType)) {
command.add("--skip-text");
} else if ("force-ocr".equals(ocrType)) {
command.add("--force-ocr");
} else if ("Normal".equals(ocrType)) {
}
}
command.addAll(Arrays.asList("--language", languageOption, tempInputFile.toString(), tempOutputFile.toString()));
// Run CLI command
ProcessExecutorResult result = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
if(result.getRc() != 0 && result.getMessages().contains("multiprocessing/synchronize.py") && result.getMessages().contains("OSError: [Errno 38] Function not implemented")) {
command.add("--jobs");
command.add("1");
result = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
}
// Remove images from the OCR processed PDF if the flag is set to true
if (removeImagesAfter != null && removeImagesAfter) {
Path tempPdfWithoutImages = Files.createTempFile("output_", "_no_images.pdf");
List<String> gsCommand = Arrays.asList("gs", "-sDEVICE=pdfwrite", "-dFILTERIMAGE", "-o", tempPdfWithoutImages.toString(), tempOutputFile.toString());
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT).runCommandWithOutputHandling(gsCommand);
tempOutputFile = tempPdfWithoutImages;
}
// Read the OCR processed PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
// Clean up the temporary files
Files.delete(tempInputFile);
// Return the OCR processed PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
if (sidecar != null && sidecar) {
// Create a zip file containing both the PDF and the text file
String outputZipFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.zip";
Path tempZipFile = Files.createTempFile("output_", ".zip");
try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
// Add PDF file to the zip
ZipEntry pdfEntry = new ZipEntry(outputFilename);
zipOut.putNextEntry(pdfEntry);
Files.copy(tempOutputFile, zipOut);
zipOut.closeEntry();
// Add text file to the zip
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
zipOut.putNextEntry(txtEntry);
Files.copy(sidecarTextPath, zipOut);
zipOut.closeEntry();
}
byte[] zipBytes = Files.readAllBytes(tempZipFile);
// Clean up the temporary zip file
Files.delete(tempZipFile);
Files.delete(tempOutputFile);
Files.delete(sidecarTextPath);
// Return the zip file containing both the PDF and the text file
return WebResponseUtils.bytesToWebResponse(zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
} else {
// Return the OCR processed PDF as a response
Files.delete(tempOutputFile);
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
}
}

View File

@@ -0,0 +1,59 @@
package stirling.software.SPDF.controller.api.misc;
import java.io.IOException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class OverlayImageController {
private static final Logger logger = LoggerFactory.getLogger(OverlayImageController.class);
@PostMapping(consumes = "multipart/form-data", value = "/add-image")
@Operation(
summary = "Overlay image onto a PDF file",
description = "This endpoint overlays an image onto a PDF file at the specified coordinates. The image can be overlaid on every page of the PDF if specified. Input:PDF/IMAGE Output:PDF Type:MF-SISO"
)
public ResponseEntity<byte[]> overlayImage(
@RequestPart(required = true, value = "fileInput")
@Parameter(description = "The input PDF file to overlay the image onto.", required = true)
MultipartFile pdfFile,
@RequestParam("fileInput2")
@Parameter(description = "The image file to be overlaid onto the PDF.", required = true)
MultipartFile imageFile,
@RequestParam("x")
@Parameter(description = "The x-coordinate at which to place the top-left corner of the image.", example = "0")
float x,
@RequestParam("y")
@Parameter(description = "The y-coordinate at which to place the top-left corner of the image.", example = "0")
float y,
@RequestParam("everyPage")
@Parameter(description = "Whether to overlay the image onto every page of the PDF.", example = "false")
boolean everyPage) {
try {
byte[] pdfBytes = pdfFile.getBytes();
byte[] imageBytes = imageFile.getBytes();
byte[] result = PdfUtils.overlayImage(pdfBytes, imageBytes, x, y, everyPage);
return WebResponseUtils.bytesToWebResponse(result, pdfFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_overlayed.pdf");
} catch (IOException e) {
logger.error("Failed to add image to PDF", e);
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
}
}
}

View File

@@ -0,0 +1,135 @@
package stirling.software.SPDF.controller.api.misc;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.media.Schema;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.utils.GeneralUtils;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class PageNumbersController {
private static final Logger logger = LoggerFactory.getLogger(PageNumbersController.class);
@PostMapping(value = "/add-page-numbers", consumes = "multipart/form-data")
@Operation(summary = "Add page numbers to a PDF document", description = "This operation takes an input PDF file and adds page numbers to it. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> addPageNumbers(
@Parameter(description = "The input PDF file", required = true) @RequestParam("fileInput") MultipartFile file,
@Parameter(description = "Custom margin: small/medium/large", required = true, schema = @Schema(type = "string", allowableValues = {"small", "medium", "large"})) @RequestParam("customMargin") String customMargin,
@Parameter(description = "Position: 1 of 9 positions", required = true, schema = @Schema(type = "integer", minimum = "1", maximum = "9")) @RequestParam("position") int position,
@Parameter(description = "Starting number", required = true, schema = @Schema(type = "integer", minimum = "1")) @RequestParam("startingNumber") int startingNumber,
@Parameter(description = "Which pages to number, default all", required = false, schema = @Schema(type = "string")) @RequestParam(value = "pagesToNumber", required = false) String pagesToNumber,
@Parameter(description = "Custom text: defaults to just number but can have things like \"Page {n} of {p}\"", required = false, schema = @Schema(type = "string")) @RequestParam(value = "customText", required = false) String customText)
throws IOException {
int pageNumber = startingNumber;
byte[] fileBytes = file.getBytes();
PDDocument document = PDDocument.load(fileBytes);
float marginFactor;
switch (customMargin.toLowerCase()) {
case "small":
marginFactor = 0.02f;
break;
case "medium":
marginFactor = 0.035f;
break;
case "large":
marginFactor = 0.05f;
break;
case "x-large":
marginFactor = 0.075f;
break;
default:
marginFactor = 0.035f;
break;
}
float fontSize = 12.0f;
PDType1Font font = PDType1Font.HELVETICA;
if(pagesToNumber == null || pagesToNumber.length() == 0) {
pagesToNumber = "all";
}
if(customText == null || customText.length() == 0) {
customText = "{n}";
}
List<Integer> pagesToNumberList = GeneralUtils.parsePageList(pagesToNumber.split(","), document.getNumberOfPages());
for (int i : pagesToNumberList) {
PDPage page = document.getPage(i);
PDRectangle pageSize = page.getMediaBox();
String text = customText != null ? customText.replace("{n}", String.valueOf(pageNumber)).replace("{total}", String.valueOf(document.getNumberOfPages())).replace("{filename}", file.getOriginalFilename().replaceFirst("[.][^.]+$", "")) : String.valueOf(pageNumber);
float x, y;
int xGroup = (position - 1) % 3;
int yGroup = 2 - (position - 1) / 3;
switch (xGroup) {
case 0: // left
x = pageSize.getLowerLeftX() + marginFactor * pageSize.getWidth();
break;
case 1: // center
x = pageSize.getLowerLeftX() + (pageSize.getWidth() / 2);
break;
default: // right
x = pageSize.getUpperRightX() - marginFactor * pageSize.getWidth();
break;
}
switch (yGroup) {
case 0: // bottom
y = pageSize.getLowerLeftY() + marginFactor * pageSize.getHeight();
break;
case 1: // middle
y = pageSize.getLowerLeftY() + (pageSize.getHeight() / 2);
break;
default: // top
y = pageSize.getUpperRightY() - marginFactor * pageSize.getHeight();
break;
}
PDPageContentStream contentStream = new PDPageContentStream(document, page, PDPageContentStream.AppendMode.APPEND, true);
contentStream.beginText();
contentStream.setFont(font, fontSize);
contentStream.newLineAtOffset(x, y);
contentStream.showText(text);
contentStream.endText();
contentStream.close();
pageNumber++;
}
ByteArrayOutputStream baos = new ByteArrayOutputStream();
document.save(baos);
document.close();
return WebResponseUtils.bytesToWebResponse(baos.toByteArray(), file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_numbersAdded.pdf", MediaType.APPLICATION_PDF);
}
}

View File

@@ -0,0 +1,69 @@
package stirling.software.SPDF.controller.api.misc;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class RepairController {
private static final Logger logger = LoggerFactory.getLogger(RepairController.class);
@PostMapping(consumes = "multipart/form-data", value = "/repair")
@Operation(
summary = "Repair a PDF file",
description = "This endpoint repairs a given PDF file by running Ghostscript command. The PDF is first saved to a temporary location, repaired, read back, and then returned as a response. Input:PDF Output:PDF Type:SISO"
)
public ResponseEntity<byte[]> repairPdf(
@RequestPart(required = true, value = "fileInput")
@Parameter(description = "The input PDF file to be repaired", required = true)
MultipartFile inputFile) throws IOException, InterruptedException {
// Save the uploaded file to a temporary location
Path tempInputFile = Files.createTempFile("input_", ".pdf");
inputFile.transferTo(tempInputFile.toFile());
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
List<String> command = new ArrayList<>();
command.add("gs");
command.add("-o");
command.add(tempOutputFile.toString());
command.add("-sDEVICE=pdfwrite");
command.add(tempInputFile.toString());
ProcessExecutorResult returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT).runCommandWithOutputHandling(command);
// Read the optimized PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
// Clean up the temporary files
Files.delete(tempInputFile);
Files.delete(tempOutputFile);
// Return the optimized PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_repaired.pdf";
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
}

View File

@@ -0,0 +1,59 @@
package stirling.software.SPDF.controller.api.misc;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Other", description = "Other APIs")
public class ShowJavascript {
private static final Logger logger = LoggerFactory.getLogger(ShowJavascript.class);
@PostMapping(consumes = "multipart/form-data", value = "/show-javascript")
public ResponseEntity<byte[]> extractHeader(
@RequestPart(value = "fileInput") MultipartFile inputFile) throws Exception {
String script = "";
try (PDDocument document = PDDocument.load(inputFile.getInputStream())) {
if(document.getDocumentCatalog() != null && document.getDocumentCatalog().getNames() != null) {
PDNameTreeNode<PDActionJavaScript> jsTree = document.getDocumentCatalog().getNames().getJavaScript();
if (jsTree != null) {
Map<String, PDActionJavaScript> jsEntries = jsTree.getNames();
for (Map.Entry<String, PDActionJavaScript> entry : jsEntries.entrySet()) {
String name = entry.getKey();
PDActionJavaScript jsAction = entry.getValue();
String jsCodeStr = jsAction.getAction();
script += "// File: " + inputFile.getOriginalFilename() + ", Script: " + name + "\n" + jsCodeStr + "\n";
}
}
}
if (script.isEmpty()) {
script = "PDF '" + inputFile.getOriginalFilename() + "' does not contain Javascript";
}
return WebResponseUtils.bytesToWebResponse(script.getBytes(StandardCharsets.UTF_8), inputFile.getOriginalFilename() + ".js");
}
}
}