Pdf to markdown (#2730)
# Description Please provide a summary of the changes, including relevant motivation and context. Closes #(issue_number) ## Checklist - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have performed a self-review of my own code - [ ] I have attached images of the change if it is UI based - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] If my code has heavily changed functionality I have updated relevant docs on [Stirling-PDFs doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) - [ ] My changes generate no new warnings - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) --------- Co-authored-by: a <a>
This commit is contained in:
@@ -126,6 +126,7 @@ public class EndpointConfiguration {
|
||||
addEndpointToGroup("Convert", "url-to-pdf");
|
||||
addEndpointToGroup("Convert", "markdown-to-pdf");
|
||||
addEndpointToGroup("Convert", "pdf-to-csv");
|
||||
addEndpointToGroup("Convert", "pdf-to-markdown");
|
||||
|
||||
// Adding endpoints to "Security" group
|
||||
addEndpointToGroup("Security", "add-password");
|
||||
@@ -243,6 +244,7 @@ public class EndpointConfiguration {
|
||||
addEndpointToGroup("Java", REMOVE_BLANKS);
|
||||
addEndpointToGroup("Java", "pdf-to-text");
|
||||
addEndpointToGroup("Java", "remove-image-pdf");
|
||||
addEndpointToGroup("Java", "pdf-to-markdown");
|
||||
|
||||
// Javascript
|
||||
addEndpointToGroup("Javascript", "pdf-organizer");
|
||||
@@ -258,9 +260,11 @@ public class EndpointConfiguration {
|
||||
// Weasyprint dependent endpoints
|
||||
addEndpointToGroup("Weasyprint", "html-to-pdf");
|
||||
addEndpointToGroup("Weasyprint", "url-to-pdf");
|
||||
addEndpointToGroup("Weasyprint", "markdown-to-pdf");
|
||||
|
||||
// Pdftohtml dependent endpoints
|
||||
addEndpointToGroup("Pdftohtml", "pdf-to-html");
|
||||
addEndpointToGroup("Pdftohtml", "pdf-to-markdown");
|
||||
|
||||
// disabled for now while we resolve issues
|
||||
disableEndpoint("pdf-to-pdfa");
|
||||
|
||||
@@ -44,6 +44,13 @@ public class ConverterWebController {
|
||||
return "convert/markdown-to-pdf";
|
||||
}
|
||||
|
||||
@GetMapping("/pdf-to-markdown")
|
||||
@Hidden
|
||||
public String convertPdfToMarkdownForm(Model model) {
|
||||
model.addAttribute("currentPage", "pdf-to-markdown");
|
||||
return "convert/pdf-to-markdown";
|
||||
}
|
||||
|
||||
@GetMapping("/url-to-pdf")
|
||||
@Hidden
|
||||
public String convertURLToPdfForm(Model model) {
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
package stirling.software.SPDF.model.api.converters;
|
||||
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import stirling.software.SPDF.model.api.PDFFile;
|
||||
import stirling.software.SPDF.utils.PDFToFile;
|
||||
|
||||
@RestController
|
||||
@Tag(name = "Convert", description = "Convert APIs")
|
||||
@RequestMapping("/api/v1/convert")
|
||||
public class ConvertPDFToMarkdown {
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/pdf/markdown")
|
||||
@Operation(
|
||||
summary = "Convert PDF to Markdown",
|
||||
description =
|
||||
"This endpoint converts a PDF file to Markdown format. Input:PDF Output:Markdown Type:SISO")
|
||||
public ResponseEntity<byte[]> processPdfToMarkdown(@ModelAttribute PDFFile request)
|
||||
throws Exception {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
PDFToFile pdfToFile = new PDFToFile();
|
||||
return pdfToFile.processPdfToMarkdown(inputFile);
|
||||
}
|
||||
}
|
||||
@@ -20,6 +20,9 @@ import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
|
||||
import com.vladsch.flexmark.util.data.MutableDataSet;
|
||||
|
||||
import io.github.pixee.security.Filenames;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@@ -28,6 +31,123 @@ import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
|
||||
@Slf4j
|
||||
public class PDFToFile {
|
||||
|
||||
public ResponseEntity<byte[]> processPdfToMarkdown(MultipartFile inputFile)
|
||||
throws IOException, InterruptedException {
|
||||
if (!"application/pdf".equals(inputFile.getContentType())) {
|
||||
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
||||
}
|
||||
|
||||
MutableDataSet options =
|
||||
new MutableDataSet()
|
||||
.set(
|
||||
FlexmarkHtmlConverter.MAX_BLANK_LINES,
|
||||
2) // Control max consecutive blank lines
|
||||
.set(
|
||||
FlexmarkHtmlConverter.MAX_TRAILING_BLANK_LINES,
|
||||
1) // Control trailing blank lines
|
||||
.set(
|
||||
FlexmarkHtmlConverter.SETEXT_HEADINGS,
|
||||
true) // Use Setext headings for h1 and h2
|
||||
.set(
|
||||
FlexmarkHtmlConverter.OUTPUT_UNKNOWN_TAGS,
|
||||
false) // Don't output HTML for unknown tags
|
||||
.set(
|
||||
FlexmarkHtmlConverter.TYPOGRAPHIC_QUOTES,
|
||||
true) // Convert quotation marks
|
||||
.set(
|
||||
FlexmarkHtmlConverter.BR_AS_PARA_BREAKS,
|
||||
true) // Convert <br> to paragraph breaks
|
||||
.set(FlexmarkHtmlConverter.CODE_INDENT, " "); // Indent for code blocks
|
||||
|
||||
FlexmarkHtmlConverter htmlToMarkdownConverter =
|
||||
FlexmarkHtmlConverter.builder(options).build();
|
||||
|
||||
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
|
||||
String pdfBaseName = originalPdfFileName;
|
||||
if (originalPdfFileName.contains(".")) {
|
||||
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
|
||||
}
|
||||
|
||||
Path tempInputFile = null;
|
||||
Path tempOutputDir = null;
|
||||
byte[] fileBytes;
|
||||
String fileName = "temp.file";
|
||||
|
||||
try {
|
||||
tempInputFile = Files.createTempFile("input_", ".pdf");
|
||||
inputFile.transferTo(tempInputFile);
|
||||
tempOutputDir = Files.createTempDirectory("output_");
|
||||
|
||||
List<String> command =
|
||||
new ArrayList<>(
|
||||
Arrays.asList(
|
||||
"pdftohtml",
|
||||
"-s",
|
||||
"-noframes",
|
||||
"-c",
|
||||
tempInputFile.toString(),
|
||||
pdfBaseName));
|
||||
|
||||
ProcessExecutorResult returnCode =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
|
||||
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
|
||||
// Process HTML files to Markdown
|
||||
File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
|
||||
List<File> markdownFiles = new ArrayList<>();
|
||||
|
||||
// Convert HTML files to Markdown
|
||||
for (File outputFile : outputFiles) {
|
||||
if (outputFile.getName().endsWith(".html")) {
|
||||
String html = Files.readString(outputFile.toPath());
|
||||
String markdown = htmlToMarkdownConverter.convert(html);
|
||||
|
||||
String mdFileName = outputFile.getName().replace(".html", ".md");
|
||||
File mdFile = new File(tempOutputDir.toFile(), mdFileName);
|
||||
Files.writeString(mdFile.toPath(), markdown);
|
||||
markdownFiles.add(mdFile);
|
||||
}
|
||||
}
|
||||
|
||||
// If there's only one markdown file, return it directly
|
||||
if (markdownFiles.size() == 1) {
|
||||
fileName = pdfBaseName + ".md";
|
||||
fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());
|
||||
} else {
|
||||
// Multiple files - create a zip
|
||||
fileName = pdfBaseName + "ToMarkdown.zip";
|
||||
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
|
||||
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
|
||||
// Add markdown files
|
||||
for (File mdFile : markdownFiles) {
|
||||
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
|
||||
zipOutputStream.putNextEntry(mdEntry);
|
||||
Files.copy(mdFile.toPath(), zipOutputStream);
|
||||
zipOutputStream.closeEntry();
|
||||
}
|
||||
|
||||
// Add images and other assets
|
||||
for (File file : outputFiles) {
|
||||
if (!file.getName().endsWith(".html") && !file.getName().endsWith(".md")) {
|
||||
ZipEntry assetEntry = new ZipEntry(file.getName());
|
||||
zipOutputStream.putNextEntry(assetEntry);
|
||||
Files.copy(file.toPath(), zipOutputStream);
|
||||
zipOutputStream.closeEntry();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fileBytes = byteArrayOutputStream.toByteArray();
|
||||
}
|
||||
|
||||
} finally {
|
||||
if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
|
||||
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
|
||||
}
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
|
||||
}
|
||||
|
||||
public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
|
||||
throws IOException, InterruptedException {
|
||||
if (!"application/pdf".equals(inputFile.getContentType())) {
|
||||
|
||||
@@ -450,8 +450,11 @@ HTMLToPDF.tags=markup,web-content,transformation,convert
|
||||
|
||||
home.MarkdownToPDF.title=Markdown to PDF
|
||||
home.MarkdownToPDF.desc=Converts any Markdown file to PDF
|
||||
MarkdownToPDF.tags=markup,web-content,transformation,convert
|
||||
MarkdownToPDF.tags=markup,web-content,transformation,convert,md
|
||||
|
||||
home.PDFToMarkdown.title=PDF to Markdown
|
||||
home.PDFToMarkdown.desc=Converts any PDF to Markdown
|
||||
PDFToMarkdown.tags=markup,web-content,transformation,convert,md
|
||||
|
||||
home.getPdfInfo.title=Get ALL Info on PDF
|
||||
home.getPdfInfo.desc=Grabs any and all information possible on PDFs
|
||||
@@ -646,6 +649,11 @@ MarkdownToPDF.help=Work in progress
|
||||
MarkdownToPDF.credit=Uses WeasyPrint
|
||||
|
||||
|
||||
#pdf-to-markdown
|
||||
PDFToMarkdown.title=PDF To Markdown
|
||||
PDFToMarkdown.header=PDF To Markdown
|
||||
PDFToMarkdown.submit=Convert
|
||||
|
||||
|
||||
#url-to-pdf
|
||||
URLToPDF.title=URL To PDF
|
||||
|
||||
@@ -450,8 +450,11 @@ HTMLToPDF.tags=markup,web-content,transformation,convert
|
||||
|
||||
home.MarkdownToPDF.title=Markdown to PDF
|
||||
home.MarkdownToPDF.desc=Converts any Markdown file to PDF
|
||||
MarkdownToPDF.tags=markup,web-content,transformation,convert
|
||||
MarkdownToPDF.tags=markup,web-content,transformation,convert,md
|
||||
|
||||
home.PDFToMarkdown.title=PDF to Markdown
|
||||
home.PDFToMarkdown.desc=Converts any PDF to Markdown
|
||||
PDFToMarkdown.tags=markup,web-content,transformation,convert,md
|
||||
|
||||
home.getPdfInfo.title=Get ALL Info on PDF
|
||||
home.getPdfInfo.desc=Grabs any and all information possible on PDFs
|
||||
@@ -646,6 +649,11 @@ MarkdownToPDF.help=Work in progress
|
||||
MarkdownToPDF.credit=Uses WeasyPrint
|
||||
|
||||
|
||||
#pdf-to-markdown
|
||||
PDFToMarkdown.title=PDF To Markdown
|
||||
PDFToMarkdown.header=PDF To Markdown
|
||||
PDFToMarkdown.submit=Convert
|
||||
|
||||
|
||||
#url-to-pdf
|
||||
URLToPDF.title=URL To PDF
|
||||
|
||||
31
src/main/resources/templates/convert/pdf-to-markdown.html
Normal file
31
src/main/resources/templates/convert/pdf-to-markdown.html
Normal file
@@ -0,0 +1,31 @@
|
||||
<!DOCTYPE html>
|
||||
<html th:lang="${#locale.language}" th:dir="#{language.direction}" th:data-language="${#locale.toString()}" xmlns:th="https://www.thymeleaf.org">
|
||||
<head>
|
||||
<th:block th:insert="~{fragments/common :: head(title=#{PDFToMarkdown.title}, header=#{PDFToMarkdown.header})}"></th:block>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<th:block th:insert="~{fragments/common :: game}"></th:block>
|
||||
<div id="page-container">
|
||||
<div id="content-wrap">
|
||||
<th:block th:insert="~{fragments/navbar.html :: navbar}"></th:block>
|
||||
<br><br>
|
||||
<div class="container">
|
||||
<div class="row justify-content-center">
|
||||
<div class="col-md-6 bg-card">
|
||||
<div class="tool-header">
|
||||
<span class="material-symbols-rounded tool-header-icon convert">markdown_copy</span>
|
||||
<span class="tool-header-text" th:text="#{PDFToMarkdown.header}"></span>
|
||||
</div>
|
||||
<form method="post" enctype="multipart/form-data" th:action="@{'/api/v1/convert/pdf/markdown'}">
|
||||
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multipleInputsForSingleRequest=false, accept='.pdf')}"></div>
|
||||
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{PDFToMarkdown.submit}"></button>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<th:block th:insert="~{fragments/footer.html :: footer}"></th:block>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -136,6 +136,9 @@
|
||||
<div
|
||||
th:replace="~{fragments/navbarEntry :: navbarEntry ('pdf-to-book', 'book', 'home.PDFToBook.title', 'home.PDFToBook.desc', 'PDFToBook.tags', 'convert')}">
|
||||
</div>
|
||||
<div
|
||||
th:replace="~{fragments/navbarEntry :: navbarEntry ('pdf-to-markdown', 'markdown_copy', 'home.PDFToMarkdown.title', 'home.PDFToMarkdown.desc', 'PDFToMarkdown.tags', 'convert')}">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<!-- Security menu items -->
|
||||
|
||||
@@ -192,6 +192,9 @@
|
||||
<div
|
||||
th:replace="~{fragments/card :: card(id='pdf-to-book', cardTitle=#{home.PDFToBook.title}, cardText=#{home.PDFToBook.desc}, cardLink='pdf-to-book', toolIcon='book', tags=#{PDFToBook.tags}, toolGroup='convert')}">
|
||||
</div>
|
||||
<div
|
||||
th:replace="~{fragments/card :: card(id='pdf-to-markdown', cardTitle=#{home.PDFToMarkdown.title}, cardText=#{home.PDFToMarkdown.desc}, cardLink='pdf-to-markdown', toolIcon='markdown_copy', tags=#{PDFToMarkdown.tags}, toolGroup='convert')}">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
Reference in New Issue
Block a user