Add German OCR support to Paperless-ngx

- Created custom Dockerfile extending official paperless-ngx image
- Added tesseract-ocr-deu package for German language OCR
- Set PAPERLESS_OCR_LANGUAGE=deu+eng environment variable
- Updated CI/CD pipeline to build and push custom paperless image
- Modified deployment script to pull paperless image from GHCR
- Tested locally: German (deu) language pack now available alongside English
This commit is contained in:
Stiftung Development
2025-09-21 21:09:58 +02:00
parent 34b30be0a6
commit b8a6e99f07
4 changed files with 40 additions and 7 deletions

View File

@@ -165,7 +165,7 @@ jobs:
type=ref,event=pr type=ref,event=pr
type=sha,prefix={{branch}}- type=sha,prefix={{branch}}-
- name: Build and push Docker image - name: Build and push Docker images
uses: docker/build-push-action@v5 uses: docker/build-push-action@v5
with: with:
context: ./app context: ./app
@@ -173,6 +173,14 @@ jobs:
tags: ${{ steps.meta.outputs.tags }} tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }} labels: ${{ steps.meta.outputs.labels }}
- name: Build and push Paperless image
uses: docker/build-push-action@v5
with:
context: ./paperless
push: true
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-paperless:latest
labels: ${{ steps.meta.outputs.labels }}
deploy: deploy:
needs: build needs: build
runs-on: ubuntu-latest runs-on: ubuntu-latest
@@ -221,8 +229,8 @@ jobs:
echo "Attempting to pull images from GitHub Container Registry..." echo "Attempting to pull images from GitHub Container Registry..."
if echo $DEPLOY_TOKEN | docker login ghcr.io -u remmerinio --password-stdin; then if echo $DEPLOY_TOKEN | docker login ghcr.io -u remmerinio --password-stdin; then
echo "✅ Successfully logged into GHCR" echo "✅ Successfully logged into GHCR"
if docker-compose -f compose.yml pull web worker beat; then if docker-compose -f compose.yml pull web worker beat paperless; then
echo "✅ Successfully pulled web images from GHCR" echo "✅ Successfully pulled web and paperless images from GHCR"
USE_REMOTE_IMAGES=true USE_REMOTE_IMAGES=true
else else
echo "⚠️ Failed to pull images from GHCR, will build locally" echo "⚠️ Failed to pull images from GHCR, will build locally"
@@ -233,14 +241,15 @@ jobs:
USE_REMOTE_IMAGES=false USE_REMOTE_IMAGES=false
fi fi
# Pull other standard images (paperless, redis, postgres, grampsweb) # Pull other standard images (redis, postgres, grampsweb)
echo "Pulling standard Docker images..." echo "Pulling standard Docker images..."
docker-compose -f compose.yml pull db redis paperless grampsweb || echo "Some standard images failed to pull, will use cached versions" docker-compose -f compose.yml pull db redis grampsweb || echo "Some standard images failed to pull, will use cached versions"
# If we couldn't pull from GHCR, build locally # If we couldn't pull from GHCR, build locally
if [ "$USE_REMOTE_IMAGES" = "false" ]; then if [ "$USE_REMOTE_IMAGES" = "false" ]; then
echo "🔨 Building images locally from source code..." echo "🔨 Building images locally from source code..."
docker build -t ghcr.io/remmerinio/stiftung-management-system:latest ./app docker build -t ghcr.io/remmerinio/stiftung-management-system:latest ./app
docker build -t ghcr.io/remmerinio/stiftung-management-system-paperless:latest ./paperless
fi fi
# Stop containers and clean up # Stop containers and clean up

View File

@@ -54,7 +54,9 @@ services:
command: ["python", "manage.py", "runserver", "0.0.0.0:8000"] command: ["python", "manage.py", "runserver", "0.0.0.0:8000"]
paperless: paperless:
image: ghcr.io/paperless-ngx/paperless-ngx:latest build:
context: ./paperless
dockerfile: Dockerfile
ports: ports:
- "8082:8000" - "8082:8000"
environment: environment:

View File

@@ -110,7 +110,7 @@ services:
- redis - redis
paperless: paperless:
image: ghcr.io/paperless-ngx/paperless-ngx:latest image: ghcr.io/remmerinio/stiftung-management-system-paperless:latest
ports: ports:
- "8080:8000" - "8080:8000"
environment: environment:

22
paperless/Dockerfile Normal file
View File

@@ -0,0 +1,22 @@
# Custom Paperless-ngx image with German OCR support.
#
# Extends the official paperless-ngx image by installing the German Tesseract
# language data and making German + English the default OCR languages.

# Tag of the upstream image to extend. Defaults to "latest" so existing build
# commands keep working unchanged; pass --build-arg PAPERLESS_VERSION=<tag>
# to pin a specific release for reproducible builds.
ARG PAPERLESS_VERSION=latest
FROM ghcr.io/paperless-ngx/paperless-ngx:${PAPERLESS_VERSION}

# Switch to root to install packages
USER root

# Update the package list and install the German OCR language data in a single
# layer, then remove the apt lists in that same layer so they do not end up in
# the image. --no-install-recommends keeps optional packages out.
# The correct package name is tesseract-ocr-deu (not tesseract-data-deu)
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        tesseract-ocr-deu \
    && apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Verify the German language pack is installed. grep is the last command of
# the pipeline, so the build fails when no line equal to "deu" is listed
# (including the case where tesseract itself prints nothing).
RUN tesseract --list-langs | grep -x deu

# Switch back to paperless user
# NOTE(review): confirm the base image's entrypoint still works when the
# container starts as this user rather than as root.
USER paperless

# Set the OCR language to include German and English
ENV PAPERLESS_OCR_LANGUAGE=deu+eng