From b8a6e99f079abfd1f5ba0604fcad14c6cf43b95c Mon Sep 17 00:00:00 2001
From: Stiftung Development
Date: Sun, 21 Sep 2025 21:09:58 +0200
Subject: [PATCH] Add German OCR support to Paperless-ngx

- Created custom Dockerfile extending official paperless-ngx image
- Added tesseract-ocr-deu package for German language OCR
- Set PAPERLESS_OCR_LANGUAGE=deu+eng environment variable
- Updated CI/CD pipeline to build and push custom paperless image
- Modified deployment script to pull paperless image from GHCR
- Tested locally: German (deu) language pack now available alongside English
---
NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
Line breaks and the YAML/shell indentation of context lines are a best guess
and must be checked against the target files before running `git am`.
The paperless/Dockerfile hunk was revised in review: the base image tag is a
build argument (default "latest"), apt skips recommended packages, the
language-pack check now fails the build, and the file ends with a newline.
The diffstat below was recomputed accordingly; the blob id on the Dockerfile
"index" line predates these changes.

 .github/workflows/ci-cd.yml | 19 ++++++++++++++-----
 compose.dev.yml             |  4 +++-
 compose.yml                 |  2 +-
 paperless/Dockerfile        | 27 +++++++++++++++++++++++++++
 4 files changed, 45 insertions(+), 7 deletions(-)
 create mode 100644 paperless/Dockerfile

diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml
index 91fe891..1efc66a 100644
--- a/.github/workflows/ci-cd.yml
+++ b/.github/workflows/ci-cd.yml
@@ -165,7 +165,7 @@ jobs:
             type=ref,event=pr
             type=sha,prefix={{branch}}-
 
-      - name: Build and push Docker image
+      - name: Build and push Docker images
         uses: docker/build-push-action@v5
         with:
           context: ./app
@@ -173,6 +173,14 @@ jobs:
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
 
+      - name: Build and push Paperless image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./paperless
+          push: true
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-paperless:latest
+          labels: ${{ steps.meta.outputs.labels }}
+
   deploy:
     needs: build
     runs-on: ubuntu-latest
@@ -221,8 +229,8 @@ jobs:
             echo "Attempting to pull images from GitHub Container Registry..."
             if echo $DEPLOY_TOKEN | docker login ghcr.io -u remmerinio --password-stdin; then
               echo "✅ Successfully logged into GHCR"
-              if docker-compose -f compose.yml pull web worker beat; then
-                echo "✅ Successfully pulled web images from GHCR"
+              if docker-compose -f compose.yml pull web worker beat paperless; then
+                echo "✅ Successfully pulled web and paperless images from GHCR"
                 USE_REMOTE_IMAGES=true
               else
                 echo "⚠️ Failed to pull images from GHCR, will build locally"
@@ -233,14 +241,15 @@ jobs:
               USE_REMOTE_IMAGES=false
             fi
 
-            # Pull other standard images (paperless, redis, postgres, grampsweb)
+            # Pull other standard images (redis, postgres, grampsweb)
             echo "Pulling standard Docker images..."
-            docker-compose -f compose.yml pull db redis paperless grampsweb || echo "Some standard images failed to pull, will use cached versions"
+            docker-compose -f compose.yml pull db redis grampsweb || echo "Some standard images failed to pull, will use cached versions"
 
             # If we couldn't pull from GHCR, build locally
             if [ "$USE_REMOTE_IMAGES" = "false" ]; then
               echo "🔨 Building images locally from source code..."
               docker build -t ghcr.io/remmerinio/stiftung-management-system:latest ./app
+              docker build -t ghcr.io/remmerinio/stiftung-management-system-paperless:latest ./paperless
             fi
 
             # Stop containers and clean up
diff --git a/compose.dev.yml b/compose.dev.yml
index 3ee1f3c..bebced6 100644
--- a/compose.dev.yml
+++ b/compose.dev.yml
@@ -54,7 +54,9 @@ services:
     command: ["python", "manage.py", "runserver", "0.0.0.0:8000"]
 
   paperless:
-    image: ghcr.io/paperless-ngx/paperless-ngx:latest
+    build:
+      context: ./paperless
+      dockerfile: Dockerfile
     ports:
       - "8082:8000"
     environment:
diff --git a/compose.yml b/compose.yml
index 4b7a461..81df2a6 100644
--- a/compose.yml
+++ b/compose.yml
@@ -110,7 +110,7 @@ services:
       - redis
 
   paperless:
-    image: ghcr.io/paperless-ngx/paperless-ngx:latest
+    image: ghcr.io/remmerinio/stiftung-management-system-paperless:latest
     ports:
       - "8080:8000"
     environment:
diff --git a/paperless/Dockerfile b/paperless/Dockerfile
new file mode 100644
index 0000000..b9c40f9
--- /dev/null
+++ b/paperless/Dockerfile
@@ -0,0 +1,27 @@
+# Custom Paperless-ngx with German OCR support
+
+# Base image tag; defaults to "latest". Pin a release for reproducible builds:
+#   docker build --build-arg PAPERLESS_VERSION=<tag> ./paperless
+ARG PAPERLESS_VERSION=latest
+FROM ghcr.io/paperless-ngx/paperless-ngx:${PAPERLESS_VERSION}
+
+# Switch to root to install packages
+USER root
+
+# Update package list and install German OCR language data
+# The correct package name is tesseract-ocr-deu (not tesseract-data-deu)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        tesseract-ocr-deu \
+    && apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Verify German language pack is installed; fail the build if it is missing
+RUN tesseract --list-langs | grep deu || \
+    { echo "German language pack not found" >&2; exit 1; }
+
+# Switch back to paperless user
+USER paperless
+
+# Set the OCR language to include German and English
+ENV PAPERLESS_OCR_LANGUAGE=deu+eng