Extract tables from PDF to CSV using Tabula (#2312)

* Add Tabula dependency and exclude slf4j-simple - Add the tabula-java dependency to extract tables into CSV. - Exclude slf4j-simple because the project already uses Logback. * Add a flexible CSVWriter - Add FlexibleCSVWriter, which extends CSVWriter so that a custom CSVFormat can be passed, since CSVWriter's parameterized constructor (the one that allows changing the CSVFormat) is protected. * Use Tabula to extract tables from PDF - Use Tabula to extract tables from PDF instead of the existing implementation. * Delete PDFTableStripper as it is unneeded - Delete PDFTableStripper, which is no longer needed now that tabula-java is used instead. * Use correct class in ExtractCSVController logger * Exclude gson and bcprov-jdk15on dependencies from tabula - Exclude gson and bcprov-jdk15on from tabula-java due to detected security vulnerabilities.
2024-11-24 01:28:44 +02:00
parent faa8a9752c
commit afad06bed4
4 changed files with 43 additions and 419 deletions
--- a/build.gradle
+++ b/build.gradle
@@ -203,6 +203,13 @@ dependencies {
        exclude group: "commons-logging", module: "commons-logging"
    }

+    // https://mvnrepository.com/artifact/technology.tabula/tabula
+    implementation ('technology.tabula:tabula:1.0.5')  {
+        exclude group: "org.slf4j", module: "slf4j-simple"
+        exclude group: "org.bouncycastle", module: "bcprov-jdk15on"
+        exclude group: "com.google.code.gson", module: "gson"
+    }
+
    implementation 'org.apache.pdfbox:jbig2-imageio:3.0.4'

    implementation "org.bouncycastle:bcprov-jdk18on:$bouncycastleVersion"