From 1f29033f179a049d8f3b98e5954564247bfb9eb1 Mon Sep 17 00:00:00 2001
From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
Date: Sun, 10 Dec 2023 23:06:35 +0000
Subject: [PATCH 5/5] docker changes
---
Dockerfile | 2 +-
DockerfileBase | 52 ++++++++++----------
scripts/init.sh | 4 ++
src/main/resources/messages_ar_AR.properties | 3 --
src/main/resources/messages_bg_BG.properties | 3 --
src/main/resources/messages_ca_CA.properties | 3 --
src/main/resources/messages_de_DE.properties | 3 --
src/main/resources/messages_el_GR.properties | 3 --
src/main/resources/messages_en_US.properties | 3 --
src/main/resources/messages_es_ES.properties | 3 --
src/main/resources/messages_eu_ES.properties | 3 --
src/main/resources/messages_fr_FR.properties | 3 --
src/main/resources/messages_it_IT.properties | 3 --
src/main/resources/messages_ja_JP.properties | 3 --
src/main/resources/messages_ko_KR.properties | 3 --
src/main/resources/messages_nl_NL.properties | 3 --
src/main/resources/messages_pl_PL.properties | 3 --
src/main/resources/messages_ru_RU.properties | 3 --
src/main/resources/messages_sv_SE.properties | 3 --
src/main/resources/messages_tr_TR.properties | 3 --
src/main/resources/messages_zh_CN.properties | 3 --
21 files changed, 32 insertions(+), 80 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index f66390f4..ef2d6d90 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
# Use the base image
-FROM frooodle/stirling-pdf-base:testDontUseMe
+FROM frooodle/stirling-pdf-base:version6
ARG VERSION_TAG
diff --git a/DockerfileBase b/DockerfileBase
index 8027ac8b..7662e229 100644
--- a/DockerfileBase
+++ b/DockerfileBase
@@ -1,48 +1,50 @@
# Main stage
FROM ubuntu:latest AS base
-RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common gnupg2
-RUN add-apt-repository ppa:alex-p/tesseract-ocr5 && apt install -y --no-install-recommends tesseract-ocr
+# Java runtime (JRE) for the app
RUN apt-get update && \
apt-get install -y --no-install-recommends \
- openjdk-17-jre \
- libreoffice-core-nogui \
+ openjdk-17-jre
+
+
+# Doc conversion
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ libreoffice-core-nogui \
libreoffice-common \
libreoffice-writer-nogui \
libreoffice-calc-nogui \
libreoffice-impress-nogui \
- python3-uno \
+ python3-uno \
+ unoconv
+
+
+# OCR MY PDF (unpaper for deskew and other advanced features)
+RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common gnupg2 && \
+add-apt-repository ppa:alex-p/tesseract-ocr5 && apt install -y --no-install-recommends tesseract-ocr && \
+apt-get update && \
+ apt-get install -y --no-install-recommends \
ghostscript \
python3-pip \
ocrmypdf \
- unoconv && \
+ unpaper && \
pip install --upgrade pip && \
- pip install --no-cache-dir --user --upgrade ocrmypdf && \
- pip install --no-cache-dir --upgrade pillow==10.0.1 reportlab==3.6.13 wheel==0.38.1 setuptools==65.5.1 pyjwt==2.4.0 cryptography==39.0.1 \
pip install --no-cache-dir --upgrade ocrmypdf && \
- pip install --no-cache-dir \
- opencv-python-headless && \
- rm -rf /var/lib/apt/lists/* && \
+ pip install --no-cache-dir --upgrade pillow==10.0.1 reportlab==3.6.13 wheel==0.38.1 setuptools==65.5.1 pyjwt==2.4.0 cryptography==39.0.1
+
+
+# OpenCV (headless build)
+RUN pip install --no-cache-dir opencv-python-headless
+
+
+# Clean up apt lists and move the tesseract data to tesseract-ocr-original
+RUN rm -rf /var/lib/apt/lists/* && \
mkdir /usr/share/tesseract-ocr-original && \
cp -r /usr/share/tesseract-ocr/* /usr/share/tesseract-ocr-original && \
rm -rf /usr/share/tesseract-ocr
-# Python packages stage
-FROM base AS python-packages
-# Install build tools and Python libraries
-RUN apt-get update && \
- apt-get install -y --no-install-recommends \
- build-essential \
- libffi-dev \
- libssl-dev \
- zlib1g-dev \
- libjpeg-dev
-
-# Final stage: Copy necessary files from the previous stage
-FROM base
-COPY --from=python-packages /usr/local /usr/local
\ No newline at end of file
diff --git a/scripts/init.sh b/scripts/init.sh
index e65914c4..80a13785 100644
--- a/scripts/init.sh
+++ b/scripts/init.sh
@@ -5,6 +5,10 @@ echo "Copying original files without overwriting existing files"
mkdir -p /usr/share/tesseract-ocr
cp -rn /usr/share/tesseract-ocr-original/* /usr/share/tesseract-ocr
+if [ -d /usr/share/tesseract-ocr/4.00/tessdata ]; then
+ cp -r /usr/share/tesseract-ocr/4.00/tessdata/* /usr/share/tesseract-ocr/5/tessdata/ || true;
+fi
+
# Check if TESSERACT_LANGS environment variable is set and is not empty
if [[ -n "$TESSERACT_LANGS" ]]; then
# Convert comma-separated values to a space-separated list
diff --git a/src/main/resources/messages_ar_AR.properties b/src/main/resources/messages_ar_AR.properties
index 9e22527c..a3d2a451 100644
--- a/src/main/resources/messages_ar_AR.properties
+++ b/src/main/resources/messages_ar_AR.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=تحويل
#PDFToCSV
PDFToCSV.title=PDF ??? CSV
PDFToCSV.header=PDF ??? CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=??????
diff --git a/src/main/resources/messages_bg_BG.properties b/src/main/resources/messages_bg_BG.properties
index 7114bb8f..c0c327ed 100644
--- a/src/main/resources/messages_bg_BG.properties
+++ b/src/main/resources/messages_bg_BG.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Преобразуване
#PDFToCSV
PDFToCSV.title=PDF ??? CSV
PDFToCSV.header=PDF ??? CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=????????
diff --git a/src/main/resources/messages_ca_CA.properties b/src/main/resources/messages_ca_CA.properties
index fd2f5adf..4a650f98 100644
--- a/src/main/resources/messages_ca_CA.properties
+++ b/src/main/resources/messages_ca_CA.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Converteix
#PDFToCSV
PDFToCSV.title=PDF a CSV
PDFToCSV.header=PDF a CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extracte
diff --git a/src/main/resources/messages_de_DE.properties b/src/main/resources/messages_de_DE.properties
index c5722ce2..1777bfdf 100644
--- a/src/main/resources/messages_de_DE.properties
+++ b/src/main/resources/messages_de_DE.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Konvertieren
#PDFToCSV
PDFToCSV.title=PDF zu CSV
PDFToCSV.header=PDF zu CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extrakt
diff --git a/src/main/resources/messages_el_GR.properties b/src/main/resources/messages_el_GR.properties
index f4970b65..119ec1fc 100644
--- a/src/main/resources/messages_el_GR.properties
+++ b/src/main/resources/messages_el_GR.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=\u039C\u03B5\u03C4\u03B1\u03C4\u03C1\u03BF\u03C0\u03AE
#PDFToCSV
PDFToCSV.title=PDF ?? CSV
PDFToCSV.header=PDF ?? CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=?????????
diff --git a/src/main/resources/messages_en_US.properties b/src/main/resources/messages_en_US.properties
index 3963d55c..db116c98 100644
--- a/src/main/resources/messages_en_US.properties
+++ b/src/main/resources/messages_en_US.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Convert
#PDFToCSV
PDFToCSV.title=PDF to CSV
PDFToCSV.header=PDF to CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extract
diff --git a/src/main/resources/messages_es_ES.properties b/src/main/resources/messages_es_ES.properties
index b60e2907..7828b717 100644
--- a/src/main/resources/messages_es_ES.properties
+++ b/src/main/resources/messages_es_ES.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Convertir
#PDFToCSV
PDFToCSV.title=PDF a CSV
PDFToCSV.header=PDF a CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extracto
diff --git a/src/main/resources/messages_eu_ES.properties b/src/main/resources/messages_eu_ES.properties
index 4a26b24a..528f22f3 100644
--- a/src/main/resources/messages_eu_ES.properties
+++ b/src/main/resources/messages_eu_ES.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Bihurtu
#PDFToCSV
PDFToCSV.title=PDF a CSV
PDFToCSV.header=PDF a CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extracto
diff --git a/src/main/resources/messages_fr_FR.properties b/src/main/resources/messages_fr_FR.properties
index d4e1fb7b..c6b59d0c 100644
--- a/src/main/resources/messages_fr_FR.properties
+++ b/src/main/resources/messages_fr_FR.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Convertir
#PDFToCSV
PDFToCSV.title=PDF en CSV
PDFToCSV.header=PDF en CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extrait
diff --git a/src/main/resources/messages_it_IT.properties b/src/main/resources/messages_it_IT.properties
index ee13c595..101664bb 100644
--- a/src/main/resources/messages_it_IT.properties
+++ b/src/main/resources/messages_it_IT.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Converti
#PDFToCSV
PDFToCSV.title=Da PDF a CSV
PDFToCSV.header=Da PDF a CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Estratto
diff --git a/src/main/resources/messages_ja_JP.properties b/src/main/resources/messages_ja_JP.properties
index a37d25a1..68084f14 100644
--- a/src/main/resources/messages_ja_JP.properties
+++ b/src/main/resources/messages_ja_JP.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=変換
#PDFToCSV
PDFToCSV.title=PDF??CSV?
PDFToCSV.header=PDF??CSV?
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=????
diff --git a/src/main/resources/messages_ko_KR.properties b/src/main/resources/messages_ko_KR.properties
index a1d38f09..340b0839 100644
--- a/src/main/resources/messages_ko_KR.properties
+++ b/src/main/resources/messages_ko_KR.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=변환
#PDFToCSV
PDFToCSV.title=PDF? CSV?
PDFToCSV.header=PDF? CSV?
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=??
diff --git a/src/main/resources/messages_nl_NL.properties b/src/main/resources/messages_nl_NL.properties
index c08e3844..3ae32e8b 100644
--- a/src/main/resources/messages_nl_NL.properties
+++ b/src/main/resources/messages_nl_NL.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Converteren
#PDFToCSV
PDFToCSV.title=PDF naar CSV
PDFToCSV.header=PDF naar CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extract
diff --git a/src/main/resources/messages_pl_PL.properties b/src/main/resources/messages_pl_PL.properties
index 89758125..2e76194a 100644
--- a/src/main/resources/messages_pl_PL.properties
+++ b/src/main/resources/messages_pl_PL.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Konwertuj
#PDFToCSV
PDFToCSV.title=PDF na CSV
PDFToCSV.header=PDF na CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Wyci?g
diff --git a/src/main/resources/messages_ru_RU.properties b/src/main/resources/messages_ru_RU.properties
index b3ae01db..a4d566ad 100644
--- a/src/main/resources/messages_ru_RU.properties
+++ b/src/main/resources/messages_ru_RU.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Конвертировать
#PDFToCSV
PDFToCSV.title=PDF ? CSV
PDFToCSV.header=PDF ? CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=???????
diff --git a/src/main/resources/messages_sv_SE.properties b/src/main/resources/messages_sv_SE.properties
index 48ba7943..569e8d8a 100644
--- a/src/main/resources/messages_sv_SE.properties
+++ b/src/main/resources/messages_sv_SE.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Konvertera
#PDFToCSV
PDFToCSV.title=PDF till CSV
PDFToCSV.header=PDF till CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Navvit
diff --git a/src/main/resources/messages_tr_TR.properties b/src/main/resources/messages_tr_TR.properties
index 395f6abb..d02ae139 100644
--- a/src/main/resources/messages_tr_TR.properties
+++ b/src/main/resources/messages_tr_TR.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=Dönüştür
#PDFToCSV
PDFToCSV.title=PDF to CSV
PDFToCSV.header=PDF to CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=Extract
diff --git a/src/main/resources/messages_zh_CN.properties b/src/main/resources/messages_zh_CN.properties
index 6b18d1d9..398530fd 100644
--- a/src/main/resources/messages_zh_CN.properties
+++ b/src/main/resources/messages_zh_CN.properties
@@ -831,8 +831,5 @@ PDFToXML.submit=转换
#PDFToCSV
PDFToCSV.title=PDF ? CSV
PDFToCSV.header=PDF ? CSV
-##########################
-### TODO: Translate ###
-##########################
PDFToCSV.prompt=Choose page to extract table
PDFToCSV.submit=??