From 90142396d8be5f2e74341d927a91a6f481893530 Mon Sep 17 00:00:00 2001 From: mAi Date: Sat, 16 May 2026 17:57:26 +0200 Subject: [PATCH] mAi: #2 - mdms-mover: strip blank pages from duplex scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes: 1. Migrate mover from m/otto (commit 9974937, otto#438) into this repo at infra/mdms-mover/. mover.sh, mdms-mover.service, mdms-mover.timer, README.md. Matches the live deployment on mDock byte-for-byte (modulo the strip step below). 2. Add blank-page stripping before the inbox → toprocess promotion. A page is dropped iff its embedded text is empty AND its rendered thumbnail is >= MDMS_BLANK_THRESHOLD near-white pixels (default 0.97 per issue #2). Detects the empty backside of patch-T separator sheets in duplex scans (mDMS#2). strip_blank_pages.py uses PyMuPDF as the only Python dep — single self-contained wheel, no `poppler-utils` apt-install on mdock. Mirrors the uv-inline-deps single-file pattern of infra/paperless/generate_separator.py. Edge cases: - 1-page input: strip skipped entirely. - All pages would drop: script exits 2, mover keeps file in inbox and logs WARNING (no empty doc reaches Paperless). - Strip script errors: mover falls back to plain mv, no scan blocked. - MDMS_STRIP_BLANK=false: bypass strip entirely (emergency disable). Deploy: rsync uv binary to mdock ~/.local/bin/uv (single static binary, user-space, no apt), scp script + units, systemctl --user daemon-reload. Verified live with synthetic 4-page (2 real + 1 blank + 1 real → 3 pages), 1-page (unchanged), all-blank (kept in inbox + warning) test PDFs. Timer fires every ~70s as before. 
--- .gitignore | 1 + infra/mdms-mover/README.md | 180 ++++++++++++++++++++++++++ infra/mdms-mover/mdms-mover.service | 11 ++ infra/mdms-mover/mdms-mover.timer | 12 ++ infra/mdms-mover/mover.sh | 93 +++++++++++++ infra/mdms-mover/strip_blank_pages.py | 122 +++++++++++++++++ 6 files changed, 419 insertions(+) create mode 100644 .gitignore create mode 100644 infra/mdms-mover/README.md create mode 100644 infra/mdms-mover/mdms-mover.service create mode 100644 infra/mdms-mover/mdms-mover.timer create mode 100755 infra/mdms-mover/mover.sh create mode 100755 infra/mdms-mover/strip_blank_pages.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8ebfac7 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.m/ diff --git a/infra/mdms-mover/README.md b/infra/mdms-mover/README.md new file mode 100644 index 0000000..7688e38 --- /dev/null +++ b/infra/mdms-mover/README.md @@ -0,0 +1,180 @@ +# mdms-mover — age-gated inbox → toprocess promoter + blank-page stripper + +Two jobs in one user-systemd timer: + +1. **Stability gate** (otto#438): solves the chunk-write race between the + Canon MB5100 (SMB scans land in `/mnt/mdms/inbox/` in pieces) and + Paperless (polls `/mnt/mdms/toprocess/` every 60s and consumes + anything it sees). A file is only promoted when **both**: + - `mtime > 3 minutes` ago, and + - file size is unchanged since the previous run. +2. **Blank-page strip** (mDMS#2): duplex scans through patch-T separators + leave a blank backside (the unprinted reverse of the separator sheet) + at the front of every subsequent document. PDF files are passed + through `strip_blank_pages.py` before promotion. Pages with no + embedded text AND >97% near-white pixels are dropped. 
+ +## Layout on mDock + +``` +/home/m/mdms-mover/mover.sh # script, deployed copy +/home/m/mdms-mover/strip_blank_pages.py # blank-page detector +/home/m/.config/systemd/user/mdms-mover.service # oneshot service +/home/m/.config/systemd/user/mdms-mover.timer # OnUnitActiveSec=1min +/home/m/.local/state/mdms-mover/state.tsv # last-seen size per file +/home/m/.local/bin/uv # uv runner for the strip script +``` + +Runs as user `m` under user-systemd. mDock has `Linger=yes` for user +`m`, so the timer keeps firing across reboots and logout sessions. + +## Why systemd, not cron + +The original spec (otto#438) called for `/etc/cron.d/mdms-mover`. mDock +runs Ubuntu 24.04 server which ships with systemd-timers and no `cron` +package. Installing cron only to honour the spec wording would add a +package we don't otherwise need; a user-systemd timer is the canonical +Ubuntu 24.04 approach and gives better observability +(`systemctl --user status mdms-mover.timer`, `journalctl --user -u mdms-mover`). + +User-mode (not system-mode) keeps the entire install in `m`'s home — no +sudo at deploy or maintenance time, no `/var/lib/...` directories to +chown, the service can read/write the NFS mount because `m` owns it. 
+ +## Configuration + +``` +| var | default | meaning | +|------------------------|-----------------------------------------------|----------------------------------------------------| +| MDMS_INBOX | /mnt/mdms/inbox | source — scanner SMB target | +| MDMS_TOPROCESS | /mnt/mdms/toprocess | destination — Paperless consume | +| MDMS_STATE | $HOME/.local/state/mdms-mover/state.tsv | per-file size memory | +| MDMS_MIN_AGE_MIN | 3 | minimum mtime age in minutes | +| MDMS_STRIP_BLANK | true | run blank-page strip on PDFs (set to "false" to disable) | +| MDMS_STRIP_SCRIPT | /strip_blank_pages.py | path override for the strip script | +| MDMS_BLANK_THRESHOLD | 0.97 | near-white pixel ratio to call a page blank (read by strip script) | +| MDMS_BLANK_NEAR_WHITE | 240 | grayscale cutoff (0-255) for "near white" pixels (read by strip script) | +| MDMS_BLANK_DPI | 50 | thumbnail render DPI (read by strip script) | +``` + +To override at runtime, drop into +`~/.config/systemd/user/mdms-mover.service.d/override.conf`: + +```ini +[Service] +Environment=MDMS_MIN_AGE_MIN=5 +Environment=MDMS_BLANK_THRESHOLD=0.99 +``` + +then `systemctl --user daemon-reload && systemctl --user restart mdms-mover.timer`. + +## Blank-page detection — what gets dropped + +A page is dropped iff BOTH: + +1. embedded text is empty / whitespace-only (image-only scans always + pass this — they have no embedded text), AND +2. the rendered thumbnail is ≥ `MDMS_BLANK_THRESHOLD` near-white pixels + (0.97 by default → at least 97% of pixels at or above grayscale 240). + +The threshold is conservative on purpose: a false-negative (keeping a +blank page we should have dropped) is recoverable via Paperless's UI; a +false-positive (dropping a real page) silently loses data. If real +pages get dropped in practice, **raise** `MDMS_BLANK_THRESHOLD` toward +0.99 — that makes the strip step pickier and keeps more pages. 
+ +Edge cases handled inside `strip_blank_pages.py`: + +- **1-page input:** strip is skipped entirely (single-page docs never + have separator-backside artefacts). +- **All pages would drop:** the script exits with code `2` and writes no + output. The mover keeps the file in the inbox and logs + `WARNING: <name> appears all-blank, kept in inbox`. m can inspect via + `journalctl --user -u mdms-mover`. +- **strip_blank_pages.py errors out:** mover falls back to a plain `mv` + (unstripped) so a transient problem in the detector never blocks a + scan from reaching Paperless. + +The script is a uv-inline-deps single file (PyMuPDF for rendering +and text extraction, Pillow for the pixel histogram — wheels only, no +`poppler-utils` apt install on mdock). Mirrors the pattern from `infra/paperless/generate_separator.py`. + +## Deploy / sync + +The live files on mDock must match this directory byte-for-byte (md5, +same convention as `infra/samba-canon/`). + +```bash +ssh mdock 'mkdir -p ~/mdms-mover ~/.config/systemd/user ~/.local/state/mdms-mover ~/.local/bin' + +# uv binary (single static binary, user-space — no apt, no sudo) +rsync -av ~/.local/bin/uv mdock:/home/m/.local/bin/uv + +# mover + strip script +scp infra/mdms-mover/mover.sh mdock:/home/m/mdms-mover/mover.sh +scp infra/mdms-mover/strip_blank_pages.py mdock:/home/m/mdms-mover/strip_blank_pages.py +scp infra/mdms-mover/mdms-mover.service mdock:/home/m/.config/systemd/user/ +scp infra/mdms-mover/mdms-mover.timer mdock:/home/m/.config/systemd/user/ + +ssh mdock 'chmod +x ~/mdms-mover/mover.sh ~/mdms-mover/strip_blank_pages.py && \ + systemctl --user daemon-reload && \ + systemctl --user enable --now mdms-mover.timer' +``` + +The first time the strip script runs, `uv` downloads python + PyMuPDF + Pillow +into `~/.cache/uv/` (~30 MB). Subsequent runs reuse the cache. 
+ +## Verify + +```bash +ssh mdock 'systemctl --user list-timers mdms-mover.timer' +ssh mdock 'journalctl --user -u mdms-mover -n 20 --no-pager' +ssh mdock 'cat ~/.local/state/mdms-mover/state.tsv' +ssh mdock 'journalctl -t mdms-mover -n 20 --no-pager' +``` + +## Emergency disable + +Stop the timer entirely: + +```bash +ssh mdock 'systemctl --user stop mdms-mover.timer && \ + systemctl --user disable mdms-mover.timer' +``` + +Or just disable the strip step while keeping the stability gate: + +```bash +mkdir -p ~/.config/systemd/user/mdms-mover.service.d +cat > ~/.config/systemd/user/mdms-mover.service.d/override.conf </dev/null; then + logger -t mdms-mover "moved $name ($size bytes)" + fi + return + fi + + # Stage stripped output inside toprocess (same filesystem → atomic rename). + # Dotfile prefix so Paperless's consumer ignores the partial during write. + local tmpout="$TOPROCESS/.mdms-tmp.$$.$name" + local rc=0 + "$STRIP_SCRIPT" "$src" "$tmpout" || rc=$? + + case "$rc" in + 0) + mv -f "$tmpout" "$dest" && rm -f "$src" + logger -t mdms-mover "moved $name ($size bytes, strip ok)" + ;; + 2) + rm -f "$tmpout" + logger -t mdms-mover "WARNING: $name appears all-blank, kept in inbox" + ;; + *) + rm -f "$tmpout" + logger -t mdms-mover "strip failed for $name (rc=$rc), passing through unchanged" + if mv -n "$src" "$dest" 2>/dev/null; then + logger -t mdms-mover "moved $name ($size bytes, unstripped)" + fi + ;; + esac +} + +# Iterate top-level regular files older than MIN_AGE_MIN. +# Skip dotfiles (probe files, scanner temp markers like ._foo, our .mdms-tmp.*). +while IFS= read -r f; do + name=$(basename "$f") + case "$name" in + .*) continue ;; + esac + + if ! 
size=$(stat -c %s "$f" 2>/dev/null); then + continue + fi + + prev=$(awk -v n="$name" '$1==n {print $2; exit}' "$STATE") + printf '%s\t%s\n' "$name" "$size" >> "$NEW_STATE" + + if [[ -n "$prev" && "$size" == "$prev" ]]; then + promote "$f" "$name" "$size" + fi +done < <(find "$INBOX" -maxdepth 1 -type f -mmin "+$MIN_AGE_MIN") + +mv "$NEW_STATE" "$STATE" +trap - EXIT diff --git a/infra/mdms-mover/strip_blank_pages.py b/infra/mdms-mover/strip_blank_pages.py new file mode 100755 index 0000000..a0b1317 --- /dev/null +++ b/infra/mdms-mover/strip_blank_pages.py @@ -0,0 +1,122 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "pymupdf>=1.24", +# "Pillow>=10.0", +# ] +# /// +"""Strip blank pages from a PDF — used by mdms-mover before promoting to toprocess. + +Usage: + strip_blank_pages.py + +Exit codes: + 0 output.pdf written (either stripped or copied unchanged) + 2 all pages would be dropped — output NOT written, caller should keep + the original file in the inbox and log a warning + 1 error (input unreadable, write failed, etc.) + +A page counts as "blank" iff BOTH of: + * embedded text is empty / whitespace-only, AND + * rendered thumbnail is >= MDMS_BLANK_THRESHOLD near-white pixels. + +False-negatives are preferred over false-positives — borderline pages stay. + +Env: + MDMS_BLANK_THRESHOLD near-white pixel ratio (0.0-1.0, default 0.97) + MDMS_BLANK_NEAR_WHITE near-white cutoff in 0-255 grayscale (default 240) + MDMS_BLANK_DPI thumbnail render DPI (default 50) + +PyMuPDF is used instead of pdf2image+pikepdf+pypdf so the whole pipeline is +one self-contained wheel — no poppler-utils apt-install on mdock, no +multiple text-extraction libraries to keep in sync. 
from __future__ import annotations

import io
import os
import shutil
import sys
from pathlib import Path

import fitz  # PyMuPDF
from PIL import Image


def near_white_ratio(image: Image.Image, near_white: int) -> float:
    """Return the share of pixels in *image* whose gray level is >= *near_white*.

    The image is converted to 8-bit grayscale (mode "L") unless it already is
    in that mode. An image without any pixels is reported as 1.0.
    """
    gray = image if image.mode == "L" else image.convert("L")
    hist = gray.histogram()
    total = sum(hist)
    if total == 0:
        return 1.0
    return sum(hist[near_white:]) / total


def page_is_blank(page: "fitz.Page", threshold: float, near_white: int, dpi: int) -> bool:
    """Return True when *page* has no embedded text and renders near-white.

    Args:
        page: the PDF page to inspect.
        threshold: minimum near-white pixel ratio for the page to count as blank.
        near_white: grayscale cutoff at or above which a pixel is "near white".
        dpi: resolution used to render the thumbnail that is measured.
    """
    # Any embedded text keeps the page; the render below is then skipped.
    if (page.get_text("text") or "").strip():
        return False
    pix = page.get_pixmap(dpi=dpi, colorspace=fitz.csGRAY)
    # NOTE(review): assumes the grayscale pixmap is one byte per pixel with no
    # alpha channel, so `samples` maps directly onto a mode-"L" image — confirm.
    image = Image.frombytes("L", (pix.width, pix.height), pix.samples)
    return near_white_ratio(image, near_white) >= threshold


def _env_number(name: str, default: str, cast: type) -> float | int:
    """Parse environment variable *name* with *cast*, naming it on failure."""
    raw = os.environ.get(name, default)
    try:
        return cast(raw)
    except ValueError:
        raise ValueError(f"{name}={raw!r} is not a valid {cast.__name__}") from None


def _read_settings() -> tuple[float, int, int]:
    """Return (threshold, near_white, dpi) from the MDMS_BLANK_* variables.

    Raises:
        ValueError: a value is not numeric or lies outside its documented
            range (threshold 0.0-1.0, near-white 0-255, DPI positive).
    """
    threshold = _env_number("MDMS_BLANK_THRESHOLD", "0.97", float)
    near_white = _env_number("MDMS_BLANK_NEAR_WHITE", "240", int)
    dpi = _env_number("MDMS_BLANK_DPI", "50", int)

    # The chained comparison also rejects NaN.
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"MDMS_BLANK_THRESHOLD must be within 0.0-1.0, got {threshold}")
    # A negative cutoff would otherwise act as a tail slice of the histogram.
    if not 0 <= near_white <= 255:
        raise ValueError(f"MDMS_BLANK_NEAR_WHITE must be within 0-255, got {near_white}")
    if dpi <= 0:
        raise ValueError(f"MDMS_BLANK_DPI must be positive, got {dpi}")
    return threshold, near_white, dpi


def main() -> int:
    """Strip blank pages from argv[1] into argv[2] and return the exit code.

    Returns:
        0 when the output was written (stripped, or copied unchanged because
        the input has at most one page or no page is blank), 2 when every page
        is blank and nothing was written, 1 on a usage or configuration error
        or when the input cannot be opened. Other failures propagate as
        exceptions.
    """
    if len(sys.argv) != 3:
        print(f"usage: {sys.argv[0]} <input.pdf> <output.pdf>", file=sys.stderr)
        return 1

    src = Path(sys.argv[1])
    dst = Path(sys.argv[2])

    try:
        threshold, near_white, dpi = _read_settings()
    except ValueError as exc:
        print(f"invalid MDMS_BLANK_* setting: {exc}", file=sys.stderr)
        return 1

    try:
        doc = fitz.open(src)
    except Exception as exc:
        print(f"failed to open {src}: {exc}", file=sys.stderr)
        return 1

    try:
        page_count = doc.page_count

        # Single-page (or empty) documents are passed through untouched.
        if page_count <= 1:
            shutil.copyfile(src, dst)
            return 0

        keep = [
            i
            for i in range(page_count)
            if not page_is_blank(doc[i], threshold, near_white, dpi)
        ]

        if not keep:
            print(f"all pages blank in {src.name}", file=sys.stderr)
            return 2

        # Nothing to drop: copy the original bytes instead of re-saving.
        if len(keep) == page_count:
            shutil.copyfile(src, dst)
            return 0

        out = fitz.open()
        try:
            last = keep[-1]
            for i in keep:
                # final=0 on all but the last call keeps the copied-object map
                # alive, so resources shared between pages are not duplicated.
                out.insert_pdf(doc, from_page=i, to_page=i, final=int(i == last))
            out.save(dst)
        finally:
            out.close()

        dropped = page_count - len(keep)
        print(
            f"{src.name}: dropped {dropped}/{page_count} blank page(s)",
            file=sys.stderr,
        )
        return 0
    finally:
        doc.close()


if __name__ == "__main__":
    sys.exit(main())