Pdf to markdown (#2730)

# Description Please provide a summary of the changes, including relevant motivation and context. Closes #(issue_number) ## Checklist - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have performed a self-review of my own code - [ ] I have attached images of the change if it is UI based - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] If my code has heavily changed functionality I have updated relevant docs on [Stirling-PDFs doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) - [ ] My changes generate no new warnings - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) --------- Co-authored-by: a <a>
2025-01-17 22:18:55 +00:00
parent a64acb3126
commit a46a570c8a
15 changed files with 1990 additions and 1723 deletions
@@ -126,6 +126,7 @@ public class EndpointConfiguration {
        addEndpointToGroup("Convert", "url-to-pdf");
        addEndpointToGroup("Convert", "markdown-to-pdf");
        addEndpointToGroup("Convert", "pdf-to-csv");
+        addEndpointToGroup("Convert", "pdf-to-markdown");

        // Adding endpoints to "Security" group
        addEndpointToGroup("Security", "add-password");
@@ -243,6 +244,7 @@ public class EndpointConfiguration {
        addEndpointToGroup("Java", REMOVE_BLANKS);
        addEndpointToGroup("Java", "pdf-to-text");
        addEndpointToGroup("Java", "remove-image-pdf");
+        addEndpointToGroup("Java", "pdf-to-markdown");

        // Javascript
        addEndpointToGroup("Javascript", "pdf-organizer");
@@ -258,9 +260,11 @@ public class EndpointConfiguration {
        // Weasyprint dependent endpoints
        addEndpointToGroup("Weasyprint", "html-to-pdf");
        addEndpointToGroup("Weasyprint", "url-to-pdf");
+        addEndpointToGroup("Weasyprint", "markdown-to-pdf");

        // Pdftohtml dependent endpoints
        addEndpointToGroup("Pdftohtml", "pdf-to-html");
+        addEndpointToGroup("Pdftohtml", "pdf-to-markdown");

        // disabled for now while we resolve issues
        disableEndpoint("pdf-to-pdfa");
@@ -44,6 +44,13 @@ public class ConverterWebController {
        return "convert/markdown-to-pdf";
    }

+    /**
+     * Serves the PDF-to-Markdown form page: stores the page id in the {@code currentPage} model
+     * attribute and returns the {@code convert/pdf-to-markdown} view name.
+     */
+    @GetMapping("/pdf-to-markdown")
+    @Hidden
+    public String convertPdfToMarkdownForm(Model model) {
+        final String page = "pdf-to-markdown";
+        model.addAttribute("currentPage", page);
+        return "convert/" + page;
+    }
+
    @GetMapping("/url-to-pdf")
    @Hidden
    public String convertURLToPdfForm(Model model) {
@@ -0,0 +1,32 @@
+package stirling.software.SPDF.model.api.converters;
+
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.ModelAttribute;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.multipart.MultipartFile;
+
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.tags.Tag;
+
+import stirling.software.SPDF.model.api.PDFFile;
+import stirling.software.SPDF.utils.PDFToFile;
+
+/** REST controller exposing the PDF-to-Markdown conversion under {@code /api/v1/convert}. */
+@RestController
+@Tag(name = "Convert", description = "Convert APIs")
+@RequestMapping("/api/v1/convert")
+public class ConvertPDFToMarkdown {
+
+    /**
+     * Handles a multipart POST to {@code /pdf/markdown} by handing the uploaded file to {@link
+     * PDFToFile#processPdfToMarkdown}.
+     *
+     * @param request form model carrying the uploaded file
+     * @return the response produced by {@link PDFToFile#processPdfToMarkdown}
+     * @throws Exception propagated from the conversion
+     */
+    @PostMapping(value = "/pdf/markdown", consumes = "multipart/form-data")
+    @Operation(
+            summary = "Convert PDF to Markdown",
+            description =
+                    "This endpoint converts a PDF file to Markdown format. Input:PDF Output:Markdown Type:SISO")
+    public ResponseEntity<byte[]> processPdfToMarkdown(@ModelAttribute PDFFile request)
+            throws Exception {
+        MultipartFile uploadedPdf = request.getFileInput();
+        return new PDFToFile().processPdfToMarkdown(uploadedPdf);
+    }
+}
@@ -20,6 +20,9 @@ import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.multipart.MultipartFile;

+import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
+import com.vladsch.flexmark.util.data.MutableDataSet;
+
 import io.github.pixee.security.Filenames;

 import lombok.extern.slf4j.Slf4j;
@@ -28,6 +31,123 @@ import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
@Slf4j
 public class PDFToFile {

+    /**
+     * Converts an uploaded PDF to Markdown.
+     *
+     * <p>The PDF is written to a temporary file and rendered to HTML by the external {@code
+     * pdftohtml} command inside a temporary output directory. Every generated {@code .html} file is
+     * then converted to Markdown with Flexmark. If exactly one Markdown file results, its bytes are
+     * returned directly (other generated files are not included in that case); otherwise the
+     * Markdown files and all remaining generated files are bundled into a zip archive. Temporary
+     * files are removed before returning.
+     *
+     * @param inputFile the uploaded file; its content type must be {@code application/pdf}
+     * @return the Markdown file or zip archive as a download response, or an empty {@code 400 Bad
+     *     Request} response when the content type is not {@code application/pdf}
+     * @throws IOException if temporary files cannot be read or written, or if the conversion
+     *     produced no HTML output to turn into Markdown
+     * @throws InterruptedException if the thread is interrupted, presumably while waiting for the
+     *     external process
+     */
+    public ResponseEntity<byte[]> processPdfToMarkdown(MultipartFile inputFile)
+            throws IOException, InterruptedException {
+        if (!"application/pdf".equals(inputFile.getContentType())) {
+            return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
+        }
+
+        MutableDataSet options =
+                new MutableDataSet()
+                        .set(
+                                FlexmarkHtmlConverter.MAX_BLANK_LINES,
+                                2) // Control max consecutive blank lines
+                        .set(
+                                FlexmarkHtmlConverter.MAX_TRAILING_BLANK_LINES,
+                                1) // Control trailing blank lines
+                        .set(
+                                FlexmarkHtmlConverter.SETEXT_HEADINGS,
+                                true) // Use Setext headings for h1 and h2
+                        .set(
+                                FlexmarkHtmlConverter.OUTPUT_UNKNOWN_TAGS,
+                                false) // Don't output HTML for unknown tags
+                        .set(
+                                FlexmarkHtmlConverter.TYPOGRAPHIC_QUOTES,
+                                true) // Convert quotation marks
+                        .set(
+                                FlexmarkHtmlConverter.BR_AS_PARA_BREAKS,
+                                true) // Convert <br> to paragraph breaks
+                        .set(FlexmarkHtmlConverter.CODE_INDENT, "    "); // Indent for code blocks
+
+        FlexmarkHtmlConverter htmlToMarkdownConverter =
+                FlexmarkHtmlConverter.builder(options).build();
+
+        // Strip the last extension (if any) to get the base name used for all output files
+        String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
+        int extensionIndex = originalPdfFileName.lastIndexOf('.');
+        String pdfBaseName =
+                extensionIndex >= 0
+                        ? originalPdfFileName.substring(0, extensionIndex)
+                        : originalPdfFileName;
+
+        final String htmlSuffix = ".html";
+        final String markdownSuffix = ".md";
+
+        Path tempInputFile = null;
+        Path tempOutputDir = null;
+        byte[] fileBytes;
+        String fileName;
+
+        try {
+            tempInputFile = Files.createTempFile("input_", ".pdf");
+            inputFile.transferTo(tempInputFile);
+            tempOutputDir = Files.createTempDirectory("output_");
+
+            List<String> command =
+                    new ArrayList<>(
+                            Arrays.asList(
+                                    "pdftohtml",
+                                    "-s",
+                                    "-noframes",
+                                    "-c",
+                                    tempInputFile.toString(),
+                                    pdfBaseName));
+
+            // The returned result is not inspected; success is judged by the HTML files produced
+            ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
+                    .runCommandWithOutputHandling(command, tempOutputDir.toFile());
+
+            // Snapshot of the tool's output, taken before any Markdown file is written
+            File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
+            List<File> markdownFiles = new ArrayList<>();
+
+            // Convert HTML files to Markdown
+            for (File outputFile : outputFiles) {
+                String htmlName = outputFile.getName();
+                if (!htmlName.endsWith(htmlSuffix)) {
+                    continue;
+                }
+                String html = Files.readString(outputFile.toPath());
+                String markdown = htmlToMarkdownConverter.convert(html);
+
+                // Swap only the trailing extension; String.replace would also rewrite any
+                // ".html" that happens to occur earlier in the file name
+                String mdFileName =
+                        htmlName.substring(0, htmlName.length() - htmlSuffix.length())
+                                + markdownSuffix;
+                File mdFile = new File(tempOutputDir.toFile(), mdFileName);
+                Files.writeString(mdFile.toPath(), markdown);
+                markdownFiles.add(mdFile);
+            }
+
+            // Fail loudly instead of returning an archive that contains no Markdown at all
+            if (markdownFiles.isEmpty()) {
+                throw new IOException(
+                        "pdftohtml produced no HTML output; cannot convert PDF to Markdown");
+            }
+
+            // If there's only one markdown file, return it directly
+            if (markdownFiles.size() == 1) {
+                fileName = pdfBaseName + markdownSuffix;
+                fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());
+            } else {
+                // Multiple files - create a zip
+                fileName = pdfBaseName + "ToMarkdown.zip";
+                ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+
+                try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
+                    // Add markdown files
+                    for (File mdFile : markdownFiles) {
+                        zipOutputStream.putNextEntry(new ZipEntry(mdFile.getName()));
+                        Files.copy(mdFile.toPath(), zipOutputStream);
+                        zipOutputStream.closeEntry();
+                    }
+
+                    // Add images and other assets (everything that is neither HTML nor Markdown)
+                    for (File file : outputFiles) {
+                        String assetName = file.getName();
+                        if (!assetName.endsWith(htmlSuffix)
+                                && !assetName.endsWith(markdownSuffix)) {
+                            zipOutputStream.putNextEntry(new ZipEntry(assetName));
+                            Files.copy(file.toPath(), zipOutputStream);
+                            zipOutputStream.closeEntry();
+                        }
+                    }
+                }
+
+                fileBytes = byteArrayOutputStream.toByteArray();
+            }
+
+        } finally {
+            if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
+            if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
+        }
+        return WebResponseUtils.bytesToWebResponse(
+                fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
+    }
+
    public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
            throws IOException, InterruptedException {
        if (!"application/pdf".equals(inputFile.getContentType())) {
@@ -450,8 +450,11 @@ HTMLToPDF.tags=markup,web-content,transformation,convert

 home.MarkdownToPDF.title=Markdown to PDF
 home.MarkdownToPDF.desc=Converts any Markdown file to PDF
-MarkdownToPDF.tags=markup,web-content,transformation,convert
+MarkdownToPDF.tags=markup,web-content,transformation,convert,md

+home.PDFToMarkdown.title=PDF to Markdown
+home.PDFToMarkdown.desc=Converts any PDF to Markdown
+PDFToMarkdown.tags=markup,web-content,transformation,convert,md

 home.getPdfInfo.title=Get ALL Info on PDF
 home.getPdfInfo.desc=Grabs any and all information possible on PDFs
@@ -646,6 +649,11 @@ MarkdownToPDF.help=Work in progress
 MarkdownToPDF.credit=Uses WeasyPrint


+#pdf-to-markdown
+PDFToMarkdown.title=PDF To Markdown
+PDFToMarkdown.header=PDF To Markdown
+PDFToMarkdown.submit=Convert
+

 #url-to-pdf
 URLToPDF.title=URL To PDF
@@ -450,8 +450,11 @@ HTMLToPDF.tags=markup,web-content,transformation,convert

 home.MarkdownToPDF.title=Markdown to PDF
 home.MarkdownToPDF.desc=Converts any Markdown file to PDF
-MarkdownToPDF.tags=markup,web-content,transformation,convert
+MarkdownToPDF.tags=markup,web-content,transformation,convert,md

+home.PDFToMarkdown.title=PDF to Markdown
+home.PDFToMarkdown.desc=Converts any PDF to Markdown
+PDFToMarkdown.tags=markup,web-content,transformation,convert,md

 home.getPdfInfo.title=Get ALL Info on PDF
 home.getPdfInfo.desc=Grabs any and all information possible on PDFs
@@ -646,6 +649,11 @@ MarkdownToPDF.help=Work in progress
 MarkdownToPDF.credit=Uses WeasyPrint


+#pdf-to-markdown
+PDFToMarkdown.title=PDF To Markdown
+PDFToMarkdown.header=PDF To Markdown
+PDFToMarkdown.submit=Convert
+

 #url-to-pdf
 URLToPDF.title=URL To PDF
@@ -0,0 +1,31 @@
+<!DOCTYPE html>
+<html th:lang="${#locale.language}" th:dir="#{language.direction}" th:data-language="${#locale.toString()}" xmlns:th="https://www.thymeleaf.org">
+<head>
+    <th:block th:insert="~{fragments/common :: head(title=#{PDFToMarkdown.title}, header=#{PDFToMarkdown.header})}"></th:block>
+</head>
+
+<body>
+    <th:block th:insert="~{fragments/common :: game}"></th:block>
+    <div id="page-container">
+        <div id="content-wrap">
+            <th:block th:insert="~{fragments/navbar.html :: navbar}"></th:block>
+            <br><br>
+            <div class="container">
+                <div class="row justify-content-center">
+                    <div class="col-md-6 bg-card">
+                        <div class="tool-header">
+                            <span class="material-symbols-rounded tool-header-icon convert">markdown_copy</span>
+                            <span class="tool-header-text" th:text="#{PDFToMarkdown.header}"></span>
+                        </div>
+                        <form method="post" enctype="multipart/form-data" th:action="@{'/api/v1/convert/pdf/markdown'}">
+                            <div th:replace="~{fragments/common :: fileSelector(name='fileInput', multipleInputsForSingleRequest=false, accept='.pdf')}"></div>
+                            <button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{PDFToMarkdown.submit}"></button>
+                        </form>
+                    </div>
+                </div>
+            </div>
+        </div>
+        <th:block th:insert="~{fragments/footer.html :: footer}"></th:block>
+    </div>
+</body>
+</html>
@@ -136,6 +136,9 @@
                          <div
                            th:replace="~{fragments/navbarEntry :: navbarEntry ('pdf-to-book', 'book', 'home.PDFToBook.title', 'home.PDFToBook.desc', 'PDFToBook.tags', 'convert')}">
                          </div>
+                          <div
+                            th:replace="~{fragments/navbarEntry :: navbarEntry ('pdf-to-markdown', 'markdown_copy', 'home.PDFToMarkdown.title', 'home.PDFToMarkdown.desc', 'PDFToMarkdown.tags', 'convert')}">
+                          </div>
                        </div>
                      </div>
                      <!-- Security menu items -->
@@ -192,6 +192,9 @@
              <div
                th:replace="~{fragments/card :: card(id='pdf-to-book', cardTitle=#{home.PDFToBook.title}, cardText=#{home.PDFToBook.desc}, cardLink='pdf-to-book', toolIcon='book', tags=#{PDFToBook.tags}, toolGroup='convert')}">
              </div>
+              <div
+                th:replace="~{fragments/card :: card(id='pdf-to-markdown', cardTitle=#{home.PDFToMarkdown.title}, cardText=#{home.PDFToMarkdown.desc}, cardLink='pdf-to-markdown', toolIcon='markdown_copy', tags=#{PDFToMarkdown.tags}, toolGroup='convert')}">
+              </div>
            </div>
          </div>