fix: add DOMMatrix/Path2D/ImageData polyfills for pdfjs-dist in Node.js (AIIA-74)
All checks were successful
Deploy to VPS / deploy (push) Successful in 48s

pdfjs-dist v5 requires DOMMatrix even in the legacy build. Add minimal
DOMMatrix, Path2D, and ImageData polyfills so that PDF text extraction
works in the Node.js Docker container without @napi-rs/canvas.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
CTO
2026-04-10 21:41:20 +00:00
parent af219c38d8
commit 8dc71448d7

View File

@@ -1,11 +1,53 @@
// PDF text extraction using the pdfjs-dist legacy build (Node.js compatible, no canvas package required)
// Polyfill DOMMatrix/Path2D/ImageData for Node.js where they are unavailable
if (typeof globalThis.DOMMatrix === 'undefined') {
  // Identity matrix laid out as 16 column-major values (m11, m12, ..., m44).
  const IDENTITY_4X4: readonly number[] = [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1];
  // Column-major slots that the six 2D components [a, b, c, d, e, f] occupy.
  const SLOTS_2D: readonly number[] = [0, 1, 4, 5, 12, 13];

  /**
   * Minimal DOMMatrix polyfill: a plain data holder with no transform
   * methods (multiply, translate, invert, ... are NOT implemented).
   *
   * Accepted constructor input:
   *  - nothing or a string  -> identity matrix (transform strings are not parsed);
   *  - 16 numbers           -> full 4x4 matrix in column-major order, `is2D` is false;
   *  - any other sequence   -> read as 2D `[a, b, c, d, e, f]`, missing entries
   *                            fall back to the identity values.
   * The sequence may be a plain array, a typed array, or any iterable.
   * Unlike the browser implementation, a sequence of unexpected length does
   * not throw.
   */
  globalThis.DOMMatrix = class DOMMatrix {
    a: number; b: number; c: number; d: number; e: number; f: number;
    m11: number; m12: number; m13: number; m14: number;
    m21: number; m22: number; m23: number; m24: number;
    m31: number; m32: number; m33: number; m34: number;
    m41: number; m42: number; m43: number; m44: number;
    is2D: boolean; isIdentity: boolean;

    constructor(init?: ArrayLike<number> | Iterable<number> | string) {
      // Array.from (rather than an Array.isArray guard) so that typed arrays
      // and other iterables are honoured instead of silently becoming identity.
      const values: number[] =
        init == null || typeof init === 'string' ? [] : Array.from(init);
      const is3D = values.length === 16;

      const m = [...IDENTITY_4X4];
      if (is3D) {
        values.forEach((value, slot) => { m[slot] = value ?? IDENTITY_4X4[slot]; });
      } else {
        SLOTS_2D.forEach((slot, i) => { m[slot] = values[i] ?? IDENTITY_4X4[slot]; });
      }

      this.m11 = m[0]; this.m12 = m[1]; this.m13 = m[2]; this.m14 = m[3];
      this.m21 = m[4]; this.m22 = m[5]; this.m23 = m[6]; this.m24 = m[7];
      this.m31 = m[8]; this.m32 = m[9]; this.m33 = m[10]; this.m34 = m[11];
      this.m41 = m[12]; this.m42 = m[13]; this.m43 = m[14]; this.m44 = m[15];

      // The 2D aliases mirror their 4x4 counterparts.
      this.a = this.m11; this.b = this.m12;
      this.c = this.m21; this.d = this.m22;
      this.e = this.m41; this.f = this.m42;

      this.is2D = !is3D;
      this.isIdentity = m.every((value, slot) => value === IDENTITY_4X4[slot]);
    }

    /** Copies another matrix-like object, keeping its 3D components when it reports `is2D === false`. */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- accepts any matrix-shaped object
    static fromMatrix(other: any) {
      if (other.is2D === false) {
        return new DOMMatrix([
          other.m11, other.m12, other.m13, other.m14,
          other.m21, other.m22, other.m23, other.m24,
          other.m31, other.m32, other.m33, other.m34,
          other.m41, other.m42, other.m43, other.m44,
        ]);
      }
      return new DOMMatrix([other.a, other.b, other.c, other.d, other.e, other.f]);
    }

    /** Builds a matrix from 6 or 16 values held in a Float64Array. */
    static fromFloat64Array(arr: ArrayLike<number>) { return new DOMMatrix(Array.from(arr)); }

    /** Builds a matrix from 6 or 16 values held in a Float32Array. */
    static fromFloat32Array(arr: ArrayLike<number>) { return new DOMMatrix(Array.from(arr)); }
  } as unknown as typeof DOMMatrix;
}
if (globalThis.Path2D === undefined) {
  // Empty stand-in class: it only makes the global name exist and be
  // constructible. No path-building methods are provided.
  globalThis.Path2D = class Path2D {} as unknown as typeof Path2D;
}
if (typeof globalThis.ImageData === 'undefined') {
  /**
   * Minimal ImageData polyfill: a plain holder for RGBA pixel data.
   *
   * Supports both standard constructor shapes:
   *  - `new ImageData(width, height)` allocates a zero-filled buffer of
   *    `width * height * 4` bytes;
   *  - `new ImageData(data, width[, height])` wraps the given pixel array
   *    without copying; a missing height is derived from the data length.
   * No validation of the data length is performed, and any trailing
   * settings argument is ignored.
   */
  globalThis.ImageData = class ImageData {
    width: number;
    height: number;
    data: Uint8ClampedArray;

    constructor(dataOrWidth: Uint8ClampedArray | number, widthOrHeight: number, height?: number) {
      if (typeof dataOrWidth === 'number') {
        this.width = dataOrWidth;
        this.height = widthOrHeight;
        this.data = new Uint8ClampedArray(dataOrWidth * widthOrHeight * 4);
      } else {
        this.data = dataOrWidth;
        this.width = widthOrHeight;
        // 4 bytes (RGBA) per pixel, `width` pixels per row.
        this.height = height ?? dataOrWidth.length / 4 / widthOrHeight;
      }
    }
  } as unknown as typeof ImageData;
}
// Force Next.js file tracer to include the worker file in standalone builds
import 'pdfjs-dist/legacy/build/pdf.worker.mjs';
/**
* Extract all text from a PDF buffer.
* Uses the pdfjs-dist legacy build together with the DOMMatrix/Path2D/ImageData
* polyfills installed by this module when those globals are missing.
*/
export async function extractTextFromPdf(buffer: Buffer): Promise<string> {
const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');