# mDMS/infra/mdms-mover/strip_blank_pages.py

#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "pymupdf>=1.24",
#   "Pillow>=10.0",
# ]
# ///
"""Strip blank pages from a PDF — used by mdms-mover before promoting to toprocess.

Usage:
    strip_blank_pages.py <input.pdf> <output.pdf>

Exit codes:
    0   output.pdf written (either stripped or copied unchanged)
    2   all pages would be dropped — output NOT written, caller should keep
        the original file in the inbox and log a warning
    1   error (input unreadable, write failed, etc.)

A page counts as "blank" iff BOTH of:
  * embedded text is empty / whitespace-only, AND
  * the fraction of near-white pixels in its rendered thumbnail is
    >= MDMS_BLANK_THRESHOLD.

False-negatives are preferred over false-positives — borderline pages stay.
Documents with at most one page are always copied unchanged.

Env:
  MDMS_BLANK_THRESHOLD   near-white pixel ratio (0.0-1.0, default 0.97)
  MDMS_BLANK_NEAR_WHITE  near-white cutoff in 0-255 grayscale (default 240)
  MDMS_BLANK_DPI         thumbnail render DPI (default 50)

PyMuPDF is used instead of pdf2image+pikepdf+pypdf so the whole pipeline is
one self-contained wheel — no poppler-utils apt-install on mdock, no
multiple text-extraction libraries to keep in sync.
"""
from __future__ import annotations

import io
import os
import shutil
import sys
from pathlib import Path

import fitz  # PyMuPDF
from PIL import Image


def near_white_ratio(image: Image.Image, near_white: int) -> float:
    """Return the fraction of pixels whose gray level is >= ``near_white``.

    The image is converted to mode "L" first unless it already is in that
    mode. An image whose histogram sums to zero (no pixels) yields 1.0.
    """
    grayscale = image if image.mode == "L" else image.convert("L")
    counts = grayscale.histogram()
    pixel_total = sum(counts)
    if not pixel_total:
        # Nothing to measure: report the image as fully near-white.
        return 1.0
    bright = sum(counts[near_white:])
    return bright / pixel_total


def page_is_blank(page: "fitz.Page", threshold: float, near_white: int, dpi: int) -> bool:
    """Decide whether ``page`` is blank.

    A page with any non-whitespace extracted text is never blank. Otherwise
    the page is rendered in grayscale at ``dpi`` and is blank when the
    near-white pixel ratio (cutoff ``near_white``) is at least ``threshold``.
    """
    extracted = page.get_text("text") or ""
    if extracted.strip():
        # Text present: skip the render entirely.
        return False

    pixmap = page.get_pixmap(dpi=dpi, colorspace=fitz.csGRAY)
    size = (pixmap.width, pixmap.height)
    thumbnail = Image.frombytes("L", size, pixmap.samples)
    ratio = near_white_ratio(thumbnail, near_white)
    return ratio >= threshold


def _read_setting(
    name: str,
    default: str,
    cast: type[int] | type[float],
    minimum: float,
    maximum: float | None = None,
) -> float:
    """Parse one numeric environment setting and check it against its range.

    Raises ValueError with a human-readable message when the value cannot be
    parsed by ``cast`` or falls outside ``[minimum, maximum]`` (no upper bound
    when ``maximum`` is None).
    """
    raw = os.environ.get(name, default)
    try:
        value = cast(raw)
    except ValueError:
        raise ValueError(f"{name}={raw!r} is not a valid {cast.__name__}") from None
    # Written as a positive range test so that NaN (for which every
    # comparison is False) is rejected rather than slipping through.
    in_range = value >= minimum and (maximum is None or value <= maximum)
    if not in_range:
        upper = "" if maximum is None else f" and <= {maximum}"
        raise ValueError(f"{name}={raw!r} must be >= {minimum}{upper}")
    return value


def main() -> int:
    """Strip blank pages from ``argv[1]`` and write the result to ``argv[2]``.

    Returns the process exit code:
        0   output written (stripped, or copied unchanged when the document
            has at most one page or no page is blank)
        2   every page is blank; output is not written
        1   usage error, invalid MDMS_BLANK_* setting, unreadable input, or
            any failure while analysing or writing
    """
    if len(sys.argv) != 3:
        print(f"usage: {sys.argv[0]} <input.pdf> <output.pdf>", file=sys.stderr)
        return 1

    src = Path(sys.argv[1])
    dst = Path(sys.argv[2])

    # Validate configuration before touching any file, so a typo in the
    # environment yields a clear message instead of a traceback or a
    # silently wrong blank-page decision.
    try:
        threshold = _read_setting("MDMS_BLANK_THRESHOLD", "0.97", float, 0.0, 1.0)
        near_white = _read_setting("MDMS_BLANK_NEAR_WHITE", "240", int, 0, 255)
        dpi = _read_setting("MDMS_BLANK_DPI", "50", int, 1)
    except ValueError as exc:
        print(f"invalid configuration: {exc}", file=sys.stderr)
        return 1

    try:
        doc = fitz.open(src)
    except Exception as exc:
        print(f"failed to open {src}: {exc}", file=sys.stderr)
        return 1

    try:
        page_count = doc.page_count

        # Empty and single-page documents are passed through untouched.
        if page_count <= 1:
            shutil.copyfile(src, dst)
            return 0

        keep = [
            i
            for i in range(page_count)
            if not page_is_blank(doc[i], threshold, near_white, dpi)
        ]

        if not keep:
            print(f"all pages blank in {src.name}", file=sys.stderr)
            return 2

        # Nothing to drop: copy the original bytes rather than re-saving.
        if len(keep) == page_count:
            shutil.copyfile(src, dst)
            return 0

        out = fitz.open()
        try:
            last = keep[-1]
            for i in keep:
                # final=0 keeps the copied-object map between calls so that
                # resources shared across pages are not duplicated in the
                # output; only the last insertion releases it.
                out.insert_pdf(doc, from_page=i, to_page=i, final=int(i == last))
            out.save(dst)
        finally:
            out.close()

        dropped = page_count - len(keep)
        print(
            f"{src.name}: dropped {dropped}/{page_count} blank page(s)",
            file=sys.stderr,
        )
        return 0
    except Exception as exc:
        # Top-level boundary: report the failure and map it to exit code 1.
        print(f"failed to process {src}: {exc}", file=sys.stderr)
        return 1
    finally:
        doc.close()


if __name__ == "__main__":
    sys.exit(main())