Files
Stirling-PDF/src/main/java/stirling/software/SPDF/utils/FileToPdf.java

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

253 lines
10 KiB
Java
Raw Normal View History

2023-08-01 00:03:13 +01:00
package stirling.software.SPDF.utils;
import java.io.*;
2024-10-14 22:34:41 +01:00
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
2023-08-01 00:03:13 +01:00
import java.nio.file.Files;
import java.nio.file.Path;
2024-10-14 22:34:41 +01:00
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
2023-08-01 00:03:13 +01:00
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
2024-10-14 22:34:41 +01:00
import java.util.zip.ZipOutputStream;
2023-08-01 00:03:13 +01:00
2024-02-06 00:00:49 +00:00
import io.github.pixee.security.ZipSecurity;
2024-01-28 17:36:17 +00:00
import stirling.software.SPDF.model.api.converters.HTMLToPdfRequest;
2023-08-01 00:03:13 +01:00
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
public class FileToPdf {
2024-01-09 22:39:21 +00:00
public static byte[] convertHtmlToPdf(
2024-01-28 17:36:17 +00:00
HTMLToPdfRequest request,
byte[] fileBytes,
String fileName,
added option for disabling HTML Sanitize (#2831) # Description of Changes Please provide a summary of the changes, including: - added disableSanitize: false # set to 'true' to disable Sanitize HTML, set to false to enable Sanitize HTML; (can lead to injections in HTML) - Some users uses this on local boxes, and uses Google Fonts, and base64 image src. ### General - [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [x] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [x] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [x] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: blaz.carli <blaz.carli@arctur.si> Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
2025-02-01 00:36:50 +01:00
boolean htmlFormatsInstalled,
Update sonarqube.yml and removal of gradle keys (#2866) # Description of Changes Please provide a summary of the changes, including: - What was changed - Why the change was made - Any challenges encountered Closes #(issue_number) --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
2025-02-04 10:18:02 +00:00
boolean disableSanitize)
2023-08-01 00:03:13 +01:00
throws IOException, InterruptedException {
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
Path tempInputFile = null;
byte[] pdfBytes;
try {
if (fileName.endsWith(".html")) {
tempInputFile = Files.createTempFile("input_", ".html");
Update sonarqube.yml and removal of gradle keys (#2866) # Description of Changes Please provide a summary of the changes, including: - What was changed - Why the change was made - Any challenges encountered Closes #(issue_number) --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
2025-02-04 10:18:02 +00:00
String sanitizedHtml =
sanitizeHtmlContent(
new String(fileBytes, StandardCharsets.UTF_8), disableSanitize);
2024-10-14 22:34:41 +01:00
Files.write(tempInputFile, sanitizedHtml.getBytes(StandardCharsets.UTF_8));
} else if (fileName.endsWith(".zip")) {
2024-02-09 23:24:25 +00:00
tempInputFile = Files.createTempFile("input_", ".zip");
Files.write(tempInputFile, fileBytes);
added option for disabling HTML Sanitize (#2831) # Description of Changes Please provide a summary of the changes, including: - added disableSanitize: false # set to 'true' to disable Sanitize HTML, set to false to enable Sanitize HTML; (can lead to injections in HTML) - Some users uses this on local boxes, and uses Google Fonts, and base64 image src. ### General - [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [x] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [x] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [x] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: blaz.carli <blaz.carli@arctur.si> Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
2025-02-01 00:36:50 +01:00
sanitizeHtmlFilesInZip(tempInputFile, disableSanitize);
2024-10-14 22:34:41 +01:00
} else {
throw new IllegalArgumentException("Unsupported file format: " + fileName);
2023-08-01 00:03:13 +01:00
}
List<String> command = new ArrayList<>();
2024-01-09 22:39:21 +00:00
if (!htmlFormatsInstalled) {
command.add("weasyprint");
2024-10-14 22:34:41 +01:00
command.add("-e");
command.add("utf-8");
command.add("-v");
2024-02-09 23:24:25 +00:00
command.add(tempInputFile.toString());
command.add(tempOutputFile.toString());
2024-01-09 22:39:21 +00:00
} else {
2024-02-09 23:24:25 +00:00
command.add("ebook-convert");
command.add(tempInputFile.toString());
command.add(tempOutputFile.toString());
command.add("--paper-size");
command.add("a4");
2024-01-28 17:36:17 +00:00
2024-06-05 21:16:22 +01:00
if (request != null && request.getZoom() != 1.0) {
File tempCssFile = Files.createTempFile("customStyle", ".css").toFile();
2024-02-09 23:24:25 +00:00
try (FileWriter writer = new FileWriter(tempCssFile)) {
writer.write("body { zoom: " + request.getZoom() + "; }");
2024-01-28 17:36:17 +00:00
}
2024-02-09 23:24:25 +00:00
command.add("--extra-css");
command.add(tempCssFile.getAbsolutePath());
2024-01-28 17:36:17 +00:00
}
2024-01-09 22:39:21 +00:00
}
2024-01-10 00:33:07 +00:00
2024-10-14 22:34:41 +01:00
ProcessExecutorResult returnCode =
2024-02-09 23:24:25 +00:00
ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
.runCommandWithOutputHandling(command);
2023-08-01 00:03:13 +01:00
pdfBytes = Files.readAllBytes(tempOutputFile);
2024-01-18 23:28:39 +00:00
} catch (IOException e) {
pdfBytes = Files.readAllBytes(tempOutputFile);
if (pdfBytes.length < 1) {
throw e;
}
2023-08-01 00:03:13 +01:00
} finally {
2024-05-27 17:53:18 +01:00
Files.deleteIfExists(tempOutputFile);
Files.deleteIfExists(tempInputFile);
2023-12-30 19:11:27 +00:00
}
2023-08-01 00:03:13 +01:00
return pdfBytes;
2023-12-30 19:11:27 +00:00
}
added option for disabling HTML Sanitize (#2831) # Description of Changes Please provide a summary of the changes, including: - added disableSanitize: false # set to 'true' to disable Sanitize HTML, set to false to enable Sanitize HTML; (can lead to injections in HTML) - Some users uses this on local boxes, and uses Google Fonts, and base64 image src. ### General - [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [x] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [x] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [x] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: blaz.carli <blaz.carli@arctur.si> Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
2025-02-01 00:36:50 +01:00
private static String sanitizeHtmlContent(String htmlContent, boolean disableSanitize) {
return (!disableSanitize) ? CustomHtmlSanitizer.sanitize(htmlContent) : htmlContent;
2024-10-14 22:34:41 +01:00
}
Update sonarqube.yml and removal of gradle keys (#2866) # Description of Changes Please provide a summary of the changes, including: - What was changed - Why the change was made - Any challenges encountered Closes #(issue_number) --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
2025-02-04 10:18:02 +00:00
private static void sanitizeHtmlFilesInZip(Path zipFilePath, boolean disableSanitize)
throws IOException {
2024-10-14 22:34:41 +01:00
Path tempUnzippedDir = Files.createTempDirectory("unzipped_");
try (ZipInputStream zipIn =
ZipSecurity.createHardenedInputStream(
new ByteArrayInputStream(Files.readAllBytes(zipFilePath)))) {
ZipEntry entry = zipIn.getNextEntry();
while (entry != null) {
Path filePath = tempUnzippedDir.resolve(sanitizeZipFilename(entry.getName()));
2024-10-14 22:34:41 +01:00
if (!entry.isDirectory()) {
Files.createDirectories(filePath.getParent());
if (entry.getName().toLowerCase().endsWith(".html")
|| entry.getName().toLowerCase().endsWith(".htm")) {
String content = new String(zipIn.readAllBytes(), StandardCharsets.UTF_8);
added option for disabling HTML Sanitize (#2831) # Description of Changes Please provide a summary of the changes, including: - added disableSanitize: false # set to 'true' to disable Sanitize HTML, set to false to enable Sanitize HTML; (can lead to injections in HTML) - Some users uses this on local boxes, and uses Google Fonts, and base64 image src. ### General - [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [x] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [x] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [x] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: blaz.carli <blaz.carli@arctur.si> Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
2025-02-01 00:36:50 +01:00
String sanitizedContent = sanitizeHtmlContent(content, disableSanitize);
2024-10-14 22:34:41 +01:00
Files.write(filePath, sanitizedContent.getBytes(StandardCharsets.UTF_8));
} else {
Files.copy(zipIn, filePath);
}
}
zipIn.closeEntry();
entry = zipIn.getNextEntry();
}
}
// Repack the sanitized files
zipDirectory(tempUnzippedDir, zipFilePath);
// Clean up
deleteDirectory(tempUnzippedDir);
}
private static void zipDirectory(Path sourceDir, Path zipFilePath) throws IOException {
try (ZipOutputStream zos =
new ZipOutputStream(new FileOutputStream(zipFilePath.toFile()))) {
Files.walk(sourceDir)
.filter(path -> !Files.isDirectory(path))
.forEach(
path -> {
ZipEntry zipEntry =
new ZipEntry(sourceDir.relativize(path).toString());
try {
zos.putNextEntry(zipEntry);
Files.copy(path, zos);
zos.closeEntry();
} catch (IOException e) {
throw new UncheckedIOException(e);
}
});
}
}
private static void deleteDirectory(Path dir) throws IOException {
Files.walkFileTree(
dir,
new SimpleFileVisitor<Path>() {
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
throws IOException {
Files.delete(file);
return FileVisitResult.CONTINUE;
}
@Override
public FileVisitResult postVisitDirectory(Path dir, IOException exc)
throws IOException {
Files.delete(dir);
return FileVisitResult.CONTINUE;
}
});
}
2023-08-01 00:03:13 +01:00
private static Path unzipAndGetMainHtml(byte[] fileBytes) throws IOException {
Path tempDirectory = Files.createTempDirectory("unzipped_");
2024-02-06 00:00:49 +00:00
try (ZipInputStream zipIn =
ZipSecurity.createHardenedInputStream(new ByteArrayInputStream(fileBytes))) {
2023-08-01 00:03:13 +01:00
ZipEntry entry = zipIn.getNextEntry();
while (entry != null) {
Path filePath = tempDirectory.resolve(sanitizeZipFilename(entry.getName()));
2023-08-01 00:03:13 +01:00
if (entry.isDirectory()) {
Files.createDirectories(filePath); // Explicitly create the directory structure
2023-12-30 19:11:27 +00:00
} else {
2023-08-01 00:03:13 +01:00
Files.createDirectories(
filePath.getParent()); // Create parent directories if they don't exist
Files.copy(zipIn, filePath);
2023-12-30 19:11:27 +00:00
}
2023-08-01 00:03:13 +01:00
zipIn.closeEntry();
entry = zipIn.getNextEntry();
2023-12-30 19:11:27 +00:00
}
}
2023-08-01 00:03:13 +01:00
// search for the main HTML file.
try (Stream<Path> walk = Files.walk(tempDirectory)) {
List<Path> htmlFiles =
walk.filter(file -> file.toString().endsWith(".html"))
.collect(Collectors.toList());
2023-12-30 19:11:27 +00:00
2023-08-01 00:03:13 +01:00
if (htmlFiles.isEmpty()) {
throw new IOException("No HTML files found in the unzipped directory.");
2023-12-30 19:11:27 +00:00
}
2023-08-01 00:03:13 +01:00
// Prioritize 'index.html' if it exists, otherwise use the first .html file
for (Path htmlFile : htmlFiles) {
if ("index.html".equals(htmlFile.getFileName().toString())) {
2023-08-01 00:03:13 +01:00
return htmlFile;
2023-12-30 19:11:27 +00:00
}
}
2023-08-01 00:03:13 +01:00
return htmlFiles.get(0);
}
2023-12-30 19:11:27 +00:00
}
2024-01-09 22:39:21 +00:00
public static byte[] convertBookTypeToPdf(byte[] bytes, String originalFilename)
throws IOException, InterruptedException {
if (originalFilename == null || originalFilename.lastIndexOf('.') == -1) {
throw new IllegalArgumentException("Invalid original filename.");
}
String fileExtension = originalFilename.substring(originalFilename.lastIndexOf('.'));
List<String> command = new ArrayList<>();
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
Path tempInputFile = null;
try {
// Create temp file with appropriate extension
tempInputFile = Files.createTempFile("input_", fileExtension);
Files.write(tempInputFile, bytes);
command.add("ebook-convert");
command.add(tempInputFile.toString());
command.add(tempOutputFile.toString());
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.CALIBRE)
.runCommandWithOutputHandling(command);
return Files.readAllBytes(tempOutputFile);
} finally {
// Clean up temporary files
if (tempInputFile != null) {
Files.deleteIfExists(tempInputFile);
}
Files.deleteIfExists(tempOutputFile);
}
}
static String sanitizeZipFilename(String entryName) {
if (entryName == null || entryName.trim().isEmpty()) {
return entryName;
}
while (entryName.contains("../") || entryName.contains("..\\")) {
entryName = entryName.replace("../", "").replace("..\\", "");
}
return entryName;
}
2023-08-01 00:03:13 +01:00
}