Pdf to markdown (#2730)

# Description

Please provide a summary of the changes, including relevant motivation
and context.

Closes #(issue_number)

## Checklist

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have performed a self-review of my own code
- [ ] I have attached images of the change if it is UI based
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] If my code has heavily changed functionality I have updated
relevant docs on [Stirling-PDFs doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
- [ ] My changes generate no new warnings
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

---------

Co-authored-by: a <a>
This commit is contained in:
Anthony Stirling
2025-01-17 22:18:55 +00:00
committed by GitHub
parent a64acb3126
commit a46a570c8a
15 changed files with 1990 additions and 1723 deletions

View File

@@ -126,6 +126,7 @@ public class EndpointConfiguration {
addEndpointToGroup("Convert", "url-to-pdf");
addEndpointToGroup("Convert", "markdown-to-pdf");
addEndpointToGroup("Convert", "pdf-to-csv");
addEndpointToGroup("Convert", "pdf-to-markdown");
// Adding endpoints to "Security" group
addEndpointToGroup("Security", "add-password");
@@ -243,6 +244,7 @@ public class EndpointConfiguration {
addEndpointToGroup("Java", REMOVE_BLANKS);
addEndpointToGroup("Java", "pdf-to-text");
addEndpointToGroup("Java", "remove-image-pdf");
addEndpointToGroup("Java", "pdf-to-markdown");
// Javascript
addEndpointToGroup("Javascript", "pdf-organizer");
@@ -258,9 +260,11 @@ public class EndpointConfiguration {
// Weasyprint dependent endpoints
addEndpointToGroup("Weasyprint", "html-to-pdf");
addEndpointToGroup("Weasyprint", "url-to-pdf");
addEndpointToGroup("Weasyprint", "markdown-to-pdf");
// Pdftohtml dependent endpoints
addEndpointToGroup("Pdftohtml", "pdf-to-html");
addEndpointToGroup("Pdftohtml", "pdf-to-markdown");
// disabled for now while we resolve issues
disableEndpoint("pdf-to-pdfa");

View File

@@ -44,6 +44,13 @@ public class ConverterWebController {
return "convert/markdown-to-pdf";
}
@GetMapping("/pdf-to-markdown")
@Hidden
public String convertPdfToMarkdownForm(Model model) {
model.addAttribute("currentPage", "pdf-to-markdown");
return "convert/pdf-to-markdown";
}
@GetMapping("/url-to-pdf")
@Hidden
public String convertURLToPdfForm(Model model) {

View File

@@ -0,0 +1,32 @@
package stirling.software.SPDF.model.api.converters;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.PDFFile;
import stirling.software.SPDF.utils.PDFToFile;
@RestController
@Tag(name = "Convert", description = "Convert APIs")
@RequestMapping("/api/v1/convert")
public class ConvertPDFToMarkdown {
@PostMapping(consumes = "multipart/form-data", value = "/pdf/markdown")
@Operation(
summary = "Convert PDF to Markdown",
description =
"This endpoint converts a PDF file to Markdown format. Input:PDF Output:Markdown Type:SISO")
public ResponseEntity<byte[]> processPdfToMarkdown(@ModelAttribute PDFFile request)
throws Exception {
MultipartFile inputFile = request.getFileInput();
PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToMarkdown(inputFile);
}
}

View File

@@ -20,6 +20,9 @@ import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.multipart.MultipartFile;
import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
import com.vladsch.flexmark.util.data.MutableDataSet;
import io.github.pixee.security.Filenames;
import lombok.extern.slf4j.Slf4j;
@@ -28,6 +31,123 @@ import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
@Slf4j
public class PDFToFile {
public ResponseEntity<byte[]> processPdfToMarkdown(MultipartFile inputFile)
throws IOException, InterruptedException {
if (!"application/pdf".equals(inputFile.getContentType())) {
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
}
MutableDataSet options =
new MutableDataSet()
.set(
FlexmarkHtmlConverter.MAX_BLANK_LINES,
2) // Control max consecutive blank lines
.set(
FlexmarkHtmlConverter.MAX_TRAILING_BLANK_LINES,
1) // Control trailing blank lines
.set(
FlexmarkHtmlConverter.SETEXT_HEADINGS,
true) // Use Setext headings for h1 and h2
.set(
FlexmarkHtmlConverter.OUTPUT_UNKNOWN_TAGS,
false) // Don't output HTML for unknown tags
.set(
FlexmarkHtmlConverter.TYPOGRAPHIC_QUOTES,
true) // Convert quotation marks
.set(
FlexmarkHtmlConverter.BR_AS_PARA_BREAKS,
true) // Convert <br> to paragraph breaks
.set(FlexmarkHtmlConverter.CODE_INDENT, " "); // Indent for code blocks
FlexmarkHtmlConverter htmlToMarkdownConverter =
FlexmarkHtmlConverter.builder(options).build();
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
String pdfBaseName = originalPdfFileName;
if (originalPdfFileName.contains(".")) {
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
}
Path tempInputFile = null;
Path tempOutputDir = null;
byte[] fileBytes;
String fileName = "temp.file";
try {
tempInputFile = Files.createTempFile("input_", ".pdf");
inputFile.transferTo(tempInputFile);
tempOutputDir = Files.createTempDirectory("output_");
List<String> command =
new ArrayList<>(
Arrays.asList(
"pdftohtml",
"-s",
"-noframes",
"-c",
tempInputFile.toString(),
pdfBaseName));
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
// Process HTML files to Markdown
File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
List<File> markdownFiles = new ArrayList<>();
// Convert HTML files to Markdown
for (File outputFile : outputFiles) {
if (outputFile.getName().endsWith(".html")) {
String html = Files.readString(outputFile.toPath());
String markdown = htmlToMarkdownConverter.convert(html);
String mdFileName = outputFile.getName().replace(".html", ".md");
File mdFile = new File(tempOutputDir.toFile(), mdFileName);
Files.writeString(mdFile.toPath(), markdown);
markdownFiles.add(mdFile);
}
}
// If there's only one markdown file, return it directly
if (markdownFiles.size() == 1) {
fileName = pdfBaseName + ".md";
fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());
} else {
// Multiple files - create a zip
fileName = pdfBaseName + "ToMarkdown.zip";
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
// Add markdown files
for (File mdFile : markdownFiles) {
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
zipOutputStream.putNextEntry(mdEntry);
Files.copy(mdFile.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
// Add images and other assets
for (File file : outputFiles) {
if (!file.getName().endsWith(".html") && !file.getName().endsWith(".md")) {
ZipEntry assetEntry = new ZipEntry(file.getName());
zipOutputStream.putNextEntry(assetEntry);
Files.copy(file.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
}
}
fileBytes = byteArrayOutputStream.toByteArray();
}
} finally {
if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
}
return WebResponseUtils.bytesToWebResponse(
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
}
public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
throws IOException, InterruptedException {
if (!"application/pdf".equals(inputFile.getContentType())) {

View File

@@ -450,8 +450,11 @@ HTMLToPDF.tags=markup,web-content,transformation,convert
home.MarkdownToPDF.title=Markdown to PDF
home.MarkdownToPDF.desc=Converts any Markdown file to PDF
MarkdownToPDF.tags=markup,web-content,transformation,convert
MarkdownToPDF.tags=markup,web-content,transformation,convert,md
home.PDFToMarkdown.title=PDF to Markdown
home.PDFToMarkdown.desc=Converts any PDF to Markdown
PDFToMarkdown.tags=markup,web-content,transformation,convert,md
home.getPdfInfo.title=Get ALL Info on PDF
home.getPdfInfo.desc=Grabs any and all information possible on PDFs
@@ -646,6 +649,11 @@ MarkdownToPDF.help=Work in progress
MarkdownToPDF.credit=Uses WeasyPrint
#pdf-to-markdown
PDFToMarkdown.title=PDF To Markdown
PDFToMarkdown.header=PDF To Markdown
PDFToMarkdown.submit=Convert
#url-to-pdf
URLToPDF.title=URL To PDF

View File

@@ -450,8 +450,11 @@ HTMLToPDF.tags=markup,web-content,transformation,convert
home.MarkdownToPDF.title=Markdown to PDF
home.MarkdownToPDF.desc=Converts any Markdown file to PDF
MarkdownToPDF.tags=markup,web-content,transformation,convert
MarkdownToPDF.tags=markup,web-content,transformation,convert,md
home.PDFToMarkdown.title=PDF to Markdown
home.PDFToMarkdown.desc=Converts any PDF to Markdown
PDFToMarkdown.tags=markup,web-content,transformation,convert,md
home.getPdfInfo.title=Get ALL Info on PDF
home.getPdfInfo.desc=Grabs any and all information possible on PDFs
@@ -646,6 +649,11 @@ MarkdownToPDF.help=Work in progress
MarkdownToPDF.credit=Uses WeasyPrint
#pdf-to-markdown
PDFToMarkdown.title=PDF To Markdown
PDFToMarkdown.header=PDF To Markdown
PDFToMarkdown.submit=Convert
#url-to-pdf
URLToPDF.title=URL To PDF

View File

@@ -0,0 +1,31 @@
<!DOCTYPE html>
<html th:lang="${#locale.language}" th:dir="#{language.direction}" th:data-language="${#locale.toString()}" xmlns:th="https://www.thymeleaf.org">
<head>
<th:block th:insert="~{fragments/common :: head(title=#{PDFToMarkdown.title}, header=#{PDFToMarkdown.header})}"></th:block>
</head>
<body>
<th:block th:insert="~{fragments/common :: game}"></th:block>
<div id="page-container">
<div id="content-wrap">
<th:block th:insert="~{fragments/navbar.html :: navbar}"></th:block>
<br><br>
<div class="container">
<div class="row justify-content-center">
<div class="col-md-6 bg-card">
<div class="tool-header">
<span class="material-symbols-rounded tool-header-icon convert">markdown_copy</span>
<span class="tool-header-text" th:text="#{PDFToMarkdown.header}"></span>
</div>
<form method="post" enctype="multipart/form-data" th:action="@{'/api/v1/convert/pdf/markdown'}">
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multipleInputsForSingleRequest=false, accept='.pdf')}"></div>
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{PDFToMarkdown.submit}"></button>
</form>
</div>
</div>
</div>
</div>
<th:block th:insert="~{fragments/footer.html :: footer}"></th:block>
</div>
</body>
</html>

View File

@@ -136,6 +136,9 @@
<div
th:replace="~{fragments/navbarEntry :: navbarEntry ('pdf-to-book', 'book', 'home.PDFToBook.title', 'home.PDFToBook.desc', 'PDFToBook.tags', 'convert')}">
</div>
<div
th:replace="~{fragments/navbarEntry :: navbarEntry ('pdf-to-markdown', 'markdown_copy', 'home.PDFToMarkdown.title', 'home.PDFToMarkdown.desc', 'PDFToMarkdown.tags', 'convert')}">
</div>
</div>
</div>
<!-- Security menu items -->

View File

@@ -192,6 +192,9 @@
<div
th:replace="~{fragments/card :: card(id='pdf-to-book', cardTitle=#{home.PDFToBook.title}, cardText=#{home.PDFToBook.desc}, cardLink='pdf-to-book', toolIcon='book', tags=#{PDFToBook.tags}, toolGroup='convert')}">
</div>
<div
th:replace="~{fragments/card :: card(id='pdf-to-markdown', cardTitle=#{home.PDFToMarkdown.title}, cardText=#{home.PDFToMarkdown.desc}, cardLink='pdf-to-markdown', toolIcon='markdown_copy', tags=#{PDFToMarkdown.tags}, toolGroup='convert')}">
</div>
</div>
</div>