Two changes: 1. Migrate mover from m/otto (commit 9974937, otto#438) into this repo at infra/mdms-mover/. mover.sh, mdms-mover.service, mdms-mover.timer, README.md. Matches the live deployment on mDock byte-for-byte (modulo the strip step below). 2. Add blank-page stripping before the inbox → toprocess promotion. A page is dropped iff its embedded text is empty AND its rendered thumbnail is >= MDMS_BLANK_THRESHOLD near-white pixels (default 0.97 per issue #2). Detects the empty backside of patch-T separator sheets in duplex scans (mDMS#2). strip_blank_pages.py uses PyMuPDF as the only Python dep — single self-contained wheel, no `poppler-utils` apt-install on mdock. Mirrors the uv-inline-deps single-file pattern of infra/paperless/generate_separator.py. Edge cases: - 1-page input: strip skipped entirely. - All pages would drop: script exits 2, mover keeps file in inbox and logs WARNING (no empty doc reaches Paperless). - Strip script errors: mover falls back to plain mv, no scan blocked. - MDMS_STRIP_BLANK=false: bypass strip entirely (emergency disable). Deploy: rsync uv binary to mdock ~/.local/bin/uv (single static binary, user-space, no apt), scp script + units, systemctl --user daemon-reload. Verified live with synthetic 4-page (2 real + 1 blank + 1 real → 3 pages), 1-page (unchanged), all-blank (kept in inbox + warning) test PDFs. Timer fires every ~70s as before.
123 lines
3.4 KiB
Python
Executable File
123 lines
3.4 KiB
Python
Executable File
#!/usr/bin/env -S uv run --script
|
|
# /// script
|
|
# requires-python = ">=3.11"
|
|
# dependencies = [
|
|
# "pymupdf>=1.24",
|
|
# "Pillow>=10.0",
|
|
# ]
|
|
# ///
|
|
"""Strip blank pages from a PDF — used by mdms-mover before promoting to toprocess.
|
|
|
|
Usage:
|
|
strip_blank_pages.py <input.pdf> <output.pdf>
|
|
|
|
Exit codes:
|
|
0 output.pdf written (either stripped or copied unchanged)
|
|
2 all pages would be dropped — output NOT written, caller should keep
|
|
the original file in the inbox and log a warning
|
|
1 error (input unreadable, write failed, etc.)
|
|
|
|
A page counts as "blank" iff BOTH of:
|
|
* embedded text is empty / whitespace-only, AND
|
|
* rendered thumbnail is >= MDMS_BLANK_THRESHOLD near-white pixels.
|
|
|
|
False-negatives are preferred over false-positives — borderline pages stay.
|
|
|
|
Env:
|
|
MDMS_BLANK_THRESHOLD near-white pixel ratio (0.0-1.0, default 0.97)
|
|
MDMS_BLANK_NEAR_WHITE near-white cutoff in 0-255 grayscale (default 240)
|
|
MDMS_BLANK_DPI thumbnail render DPI (default 50)
|
|
|
|
PyMuPDF is used instead of pdf2image+pikepdf+pypdf so PDF parsing, text
extraction and rendering come from one self-contained wheel (Pillow is the
only other dependency, used for the pixel histogram) — no poppler-utils
apt-install on mdock, no multiple text-extraction libraries to keep in sync.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import os
|
|
import shutil
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import fitz # PyMuPDF
|
|
from PIL import Image
|
|
|
|
|
|
def near_white_ratio(image: Image.Image, near_white: int) -> float:
    """Return the fraction of pixels at or above the *near_white* gray level.

    Args:
        image: Image to inspect; converted to mode "L" (grayscale) unless it
            already is in that mode.
        near_white: Inclusive grayscale cutoff. Values below 0 are treated
            as 0 and values above the histogram length as "no bin matches",
            so an out-of-range cutoff can never wrap around as a negative
            slice index.

    Returns:
        A ratio in [0.0, 1.0]. An image without any pixels yields 1.0.
    """
    gray = image if image.mode == "L" else image.convert("L")
    hist = gray.histogram()
    total = sum(hist)
    if total == 0:
        # Nothing was rendered, so nothing on the page is darker than white.
        return 1.0
    # Clamp before slicing: hist[-1:] would silently count only the last bin.
    cutoff = min(max(near_white, 0), len(hist))
    return sum(hist[cutoff:]) / total
|
|
|
|
|
|
def page_is_blank(page: "fitz.Page", threshold: float, near_white: int, dpi: int) -> bool:
    """Decide whether *page* should be treated as blank.

    A page carrying any non-whitespace embedded text is never blank.
    Otherwise the page is rendered as a grayscale thumbnail at *dpi* and is
    blank when its near-white pixel ratio (cutoff *near_white*) is at least
    *threshold*.
    """
    extracted = page.get_text("text")
    if extracted and extracted.strip():
        # Embedded text present: keep the page without rendering it.
        return False

    pixmap = page.get_pixmap(dpi=dpi, colorspace=fitz.csGRAY)
    thumbnail = Image.frombytes("L", (pixmap.width, pixmap.height), pixmap.samples)
    ratio = near_white_ratio(thumbnail, near_white)
    return ratio >= threshold
|
|
|
|
|
|
def _read_settings() -> tuple[float, int, int]:
    """Parse and validate the MDMS_BLANK_* environment variables.

    Returns:
        ``(threshold, near_white, dpi)``.

    Raises:
        ValueError: if a variable does not parse as a number or lies outside
            its accepted range (threshold 0.0-1.0, near-white 0-255, dpi >= 1).
    """
    raw_threshold = os.environ.get("MDMS_BLANK_THRESHOLD", "0.97")
    raw_near_white = os.environ.get("MDMS_BLANK_NEAR_WHITE", "240")
    raw_dpi = os.environ.get("MDMS_BLANK_DPI", "50")

    try:
        threshold = float(raw_threshold)
    except ValueError:
        raise ValueError(f"MDMS_BLANK_THRESHOLD={raw_threshold!r} is not a number") from None
    try:
        near_white = int(raw_near_white)
    except ValueError:
        raise ValueError(f"MDMS_BLANK_NEAR_WHITE={raw_near_white!r} is not an integer") from None
    try:
        dpi = int(raw_dpi)
    except ValueError:
        raise ValueError(f"MDMS_BLANK_DPI={raw_dpi!r} is not an integer") from None

    # The chained comparison is also False for NaN, which float() accepts.
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"MDMS_BLANK_THRESHOLD={raw_threshold!r} is outside 0.0-1.0")
    if not 0 <= near_white <= 255:
        raise ValueError(f"MDMS_BLANK_NEAR_WHITE={raw_near_white!r} is outside 0-255")
    if dpi < 1:
        raise ValueError(f"MDMS_BLANK_DPI={raw_dpi!r} must be at least 1")
    return threshold, near_white, dpi


def main() -> int:
    """Command-line entry point: copy a PDF to the output path minus blank pages.

    Reads the input path from ``sys.argv[1]`` and the output path from
    ``sys.argv[2]``; tuning values come from the MDMS_BLANK_* environment
    variables.

    Returns:
        0 when the output was written (stripped, or copied unchanged),
        2 when every page is blank (no output is written),
        1 on a usage error, invalid configuration, or any failure while
        opening, analysing or writing the PDF.
    """
    if len(sys.argv) != 3:
        print(f"usage: {sys.argv[0]} <input.pdf> <output.pdf>", file=sys.stderr)
        return 1

    src = Path(sys.argv[1])
    dst = Path(sys.argv[2])

    try:
        threshold, near_white, dpi = _read_settings()
    except ValueError as exc:
        print(f"invalid configuration: {exc}", file=sys.stderr)
        return 1

    try:
        doc = fitz.open(src)
    except Exception as exc:
        print(f"failed to open {src}: {exc}", file=sys.stderr)
        return 1

    try:
        page_count = doc.page_count

        # Documents with at most one page are passed through untouched.
        if page_count <= 1:
            shutil.copyfile(src, dst)
            return 0

        keep = [
            i
            for i in range(page_count)
            if not page_is_blank(doc[i], threshold, near_white, dpi)
        ]

        if not keep:
            print(f"all pages blank in {src.name}", file=sys.stderr)
            return 2

        # Nothing to drop: copy the original bytes instead of re-saving.
        if len(keep) == page_count:
            shutil.copyfile(src, dst)
            return 0

        out = fitz.open()
        try:
            last = keep[-1]
            for i in keep:
                # final=0 keeps PyMuPDF's map of already-copied objects alive
                # between insertions from the same source, so objects shared
                # by several pages are copied once; only the last call may
                # drop that map.
                out.insert_pdf(doc, from_page=i, to_page=i, final=int(i == last))
            out.save(dst)
        finally:
            out.close()

        dropped = page_count - len(keep)
        print(
            f"{src.name}: dropped {dropped}/{page_count} blank page(s)",
            file=sys.stderr,
        )
        return 0
    except Exception as exc:
        # Top-level boundary: report render/copy/save failures like an open
        # failure instead of dumping a traceback; the exit status stays 1.
        print(f"failed to process {src}: {type(exc).__name__}: {exc}", file=sys.stderr)
        return 1
    finally:
        doc.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Run as a script: main()'s return value becomes the process exit status.
    raise SystemExit(main())
|