Extract tables from PDF to CSV using Tabula (#2312)
* Add Tabula dependency and exclude slf4j-simple - Add tabula-java dependency to extract tables into CSV. - Exclude slf4j-simple due to Logback * Add a flexible CSVWriter - Add FlexibleCSVWriter which extends CSVWriter to pass a custom CSVFormat, as CSVWriter's parameterized constructor (that allows changing CSVFormat) is protected. * Use Tabula in extracting tables from PDF - Use Tabula in extracting tables from PDF instead of the existing implementation * Delete PDFTableStripper as It is unneeded - Delete PDFTableStripper as It is unneeded as Tabula-Java is used instead. * Use correct class in ExtractCSVController logger * Exclude gson and bcprov-jdk15on dependencies from tabula - Exclude gson and bcprov-jdk15on from tabula-java due to detected security vulnerabilities.
This commit is contained in:
committed by
GitHub
parent
faa8a9752c
commit
afad06bed4
@@ -1,12 +1,12 @@
|
||||
package stirling.software.SPDF.controller.api.converters;
|
||||
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.csv.CSVFormat;
|
||||
import org.apache.commons.csv.QuoteMode;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.http.ContentDisposition;
|
||||
@@ -18,79 +18,36 @@ import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import com.opencsv.CSVWriter;
|
||||
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import stirling.software.SPDF.controller.api.CropController;
|
||||
import stirling.software.SPDF.controller.api.strippers.PDFTableStripper;
|
||||
import stirling.software.SPDF.model.api.extract.PDFFilePage;
|
||||
import stirling.software.SPDF.pdf.FlexibleCSVWriter;
|
||||
import technology.tabula.ObjectExtractor;
|
||||
import technology.tabula.Page;
|
||||
import technology.tabula.Table;
|
||||
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
||||
import technology.tabula.writers.Writer;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/convert")
|
||||
@Tag(name = "Convert", description = "Convert APIs")
|
||||
public class ExtractCSVController {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(CropController.class);
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExtractCSVController.class);
|
||||
|
||||
@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
|
||||
@Operation(
|
||||
summary = "Extracts a CSV document from a PDF",
|
||||
description =
|
||||
"This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
|
||||
@Operation(summary = "Extracts a CSV document from a PDF", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
|
||||
public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception {
|
||||
|
||||
ArrayList<String> tableData = new ArrayList<>();
|
||||
int columnsCount = 0;
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
|
||||
final double res = 72; // PDF units are at 72 DPI
|
||||
PDFTableStripper stripper = new PDFTableStripper();
|
||||
PDPage pdPage = document.getPage(form.getPageId() - 1);
|
||||
stripper.extractTable(pdPage);
|
||||
columnsCount = stripper.getColumns();
|
||||
for (int c = 0; c < columnsCount; ++c) {
|
||||
for (int r = 0; r < stripper.getRows(); ++r) {
|
||||
tableData.add(stripper.getText(r, c));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ArrayList<String> notEmptyColumns = new ArrayList<>();
|
||||
|
||||
for (String item : tableData) {
|
||||
if (!item.trim().isEmpty()) {
|
||||
notEmptyColumns.add(item);
|
||||
} else {
|
||||
columnsCount--;
|
||||
}
|
||||
}
|
||||
|
||||
List<String> fullTable =
|
||||
notEmptyColumns.stream()
|
||||
.map(
|
||||
(entity) ->
|
||||
entity.replace('\n', ' ')
|
||||
.replace('\r', ' ')
|
||||
.trim()
|
||||
.replaceAll("\\s{2,}", "|"))
|
||||
.toList();
|
||||
|
||||
int rowsCount = fullTable.get(0).split("\\|").length;
|
||||
|
||||
ArrayList<String> headersList = getTableHeaders(columnsCount, fullTable);
|
||||
ArrayList<String> recordList = getRecordsList(rowsCount, fullTable);
|
||||
|
||||
if (headersList.size() == 0 && recordList.size() == 0) {
|
||||
throw new Exception("No table detected, no headers or records found");
|
||||
}
|
||||
|
||||
StringWriter writer = new StringWriter();
|
||||
try (CSVWriter csvWriter = new CSVWriter(writer)) {
|
||||
csvWriter.writeNext(headersList.toArray(new String[0]));
|
||||
for (String record : recordList) {
|
||||
csvWriter.writeNext(record.split("\\|"));
|
||||
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
|
||||
CSVFormat format = CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
|
||||
Writer csvWriter = new FlexibleCSVWriter(format);
|
||||
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
|
||||
try (ObjectExtractor extractor = new ObjectExtractor(document)) {
|
||||
Page page = extractor.extract(form.getPageId());
|
||||
List<Table> tables = sea.extract(page);
|
||||
csvWriter.write(writer, tables);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,41 +56,12 @@ public class ExtractCSVController {
|
||||
ContentDisposition.builder("attachment")
|
||||
.filename(
|
||||
form.getFileInput()
|
||||
.getOriginalFilename()
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
.getOriginalFilename()
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_extracted.csv")
|
||||
.build());
|
||||
headers.setContentType(MediaType.parseMediaType("text/csv"));
|
||||
|
||||
return ResponseEntity.ok().headers(headers).body(writer.toString());
|
||||
}
|
||||
|
||||
private ArrayList<String> getRecordsList(int rowsCounts, List<String> items) {
|
||||
ArrayList<String> recordsList = new ArrayList<>();
|
||||
|
||||
for (int b = 1; b < rowsCounts; b++) {
|
||||
StringBuilder strbldr = new StringBuilder();
|
||||
|
||||
for (int i = 0; i < items.size(); i++) {
|
||||
String[] parts = items.get(i).split("\\|");
|
||||
strbldr.append(parts[b]);
|
||||
if (i != items.size() - 1) {
|
||||
strbldr.append("|");
|
||||
}
|
||||
}
|
||||
recordsList.add(strbldr.toString());
|
||||
}
|
||||
|
||||
return recordsList;
|
||||
}
|
||||
|
||||
private ArrayList<String> getTableHeaders(int columnsCount, List<String> items) {
|
||||
ArrayList<String> resultList = new ArrayList<>();
|
||||
for (int i = 0; i < columnsCount; i++) {
|
||||
String[] parts = items.get(i).split("\\|");
|
||||
resultList.add(parts[0]);
|
||||
}
|
||||
|
||||
return resultList;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user