fix: add DOMMatrix/Path2D/ImageData polyfills for pdfjs-dist in Node.js (AIIA-74)
All checks were successful
Deploy to VPS / deploy (push) Successful in 48s
All checks were successful
Deploy to VPS / deploy (push) Successful in 48s
pdfjs-dist v5 requires DOMMatrix even in the legacy build. Add minimal polyfills so PDF text extraction works in the Node.js Docker container without @napi-rs/canvas.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -1,11 +1,53 @@
|
||||
// PDF text extraction using pdfjs-dist legacy build (Node.js compatible, no canvas/DOMMatrix)
|
||||
// PDF text extraction using pdfjs-dist legacy build
|
||||
// Polyfill DOMMatrix/Path2D/ImageData for Node.js where they are unavailable
|
||||
|
||||
if (typeof globalThis.DOMMatrix === 'undefined') {
  /** 2D subset of the DOMMatrixInit dictionary accepted by `fromMatrix`. */
  type Matrix2DInit = Partial<
    Record<'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'm11' | 'm12' | 'm21' | 'm22' | 'm41' | 'm42', number>
  >;

  /**
   * Minimal DOMMatrix polyfill — sufficient for pdfjs text extraction (no rendering).
   *
   * Only the 2D affine components are tracked. No transformation methods
   * (multiply, translate, invert, ...) are provided, and `is2D` is always true.
   */
  globalThis.DOMMatrix = class DOMMatrix {
    a: number; b: number; c: number; d: number; e: number; f: number;
    m11: number; m12: number; m13: number; m14: number;
    m21: number; m22: number; m23: number; m24: number;
    m31: number; m32: number; m33: number; m34: number;
    m41: number; m42: number; m43: number; m44: number;
    is2D: boolean; isIdentity: boolean;

    /**
     * @param init Either 6 values `[a, b, c, d, e, f]` or 16 column-major values
     *   (only the 2D components of the latter are kept). Anything that is not an
     *   array — including CSS transform strings — yields the identity matrix.
     */
    constructor(init?: number[] | string) {
      const values: readonly (number | undefined)[] = Array.isArray(init) ? init : [];
      // In a 16-element column-major matrix the 2D components live at 0, 1, 4, 5, 12, 13.
      const [a, b, c, d, e, f] =
        values.length === 16
          ? [values[0], values[1], values[4], values[5], values[12], values[13]]
          : values;

      // Missing entries fall back to the identity matrix so every field is a number.
      this.a = a ?? 1; this.b = b ?? 0;
      this.c = c ?? 0; this.d = d ?? 1;
      this.e = e ?? 0; this.f = f ?? 0;

      this.m11 = this.a; this.m12 = this.b; this.m13 = 0; this.m14 = 0;
      this.m21 = this.c; this.m22 = this.d; this.m23 = 0; this.m24 = 0;
      this.m31 = 0; this.m32 = 0; this.m33 = 1; this.m34 = 0;
      this.m41 = this.e; this.m42 = this.f; this.m43 = 0; this.m44 = 1;

      this.is2D = true;
      this.isIdentity =
        this.a === 1 && this.b === 0 && this.c === 0 &&
        this.d === 1 && this.e === 0 && this.f === 0;
    }

    /**
     * Builds a matrix from a matrix-like object. Each component is read from its
     * short name (`a`..`f`) first, then from the equivalent `m11`..`m42` alias,
     * and finally defaults to the identity value.
     */
    static fromMatrix(other: Matrix2DInit = {}) {
      return new DOMMatrix([
        other.a ?? other.m11 ?? 1,
        other.b ?? other.m12 ?? 0,
        other.c ?? other.m21 ?? 0,
        other.d ?? other.m22 ?? 1,
        other.e ?? other.m41 ?? 0,
        other.f ?? other.m42 ?? 0,
      ]);
    }

    /** Builds a matrix from 6 or 16 values held in a Float64Array (or any array-like). */
    static fromFloat64Array(arr: ArrayLike<number>) {
      return new DOMMatrix(Array.from(arr));
    }

    /** Builds a matrix from 6 or 16 values held in a Float32Array (or any array-like). */
    static fromFloat32Array(arr: ArrayLike<number>) {
      return new DOMMatrix(Array.from(arr));
    }
  } as unknown as typeof DOMMatrix;
}
|
||||
|
||||
if (typeof globalThis.Path2D === 'undefined') {
  /**
   * Placeholder Path2D so code that references the global constructor can load
   * in Node.js. It records nothing and exposes no path-building methods, which
   * is enough for text extraction but not for rendering.
   */
  globalThis.Path2D = class Path2D {} as unknown as typeof Path2D;
}
|
||||
|
||||
if (typeof globalThis.ImageData === 'undefined') {
  /**
   * Minimal ImageData polyfill for Node.js: an RGBA pixel buffer plus its
   * dimensions. Supports both standard constructor forms:
   *   - `new ImageData(width, height)` allocates a zero-filled buffer;
   *   - `new ImageData(data, width, height?)` wraps an existing buffer without copying.
   * No argument validation is performed.
   */
  globalThis.ImageData = class ImageData {
    width: number; height: number; data: Uint8ClampedArray;

    /**
     * @param dataOrWidth Existing RGBA buffer, or the width in pixels.
     * @param widthOrHeight Width when a buffer is given, otherwise the height.
     * @param height Optional height for the buffer form; derived from the
     *   buffer length (4 bytes per pixel) when omitted.
     */
    constructor(dataOrWidth: Uint8ClampedArray | number, widthOrHeight: number, height?: number) {
      if (typeof dataOrWidth === 'number') {
        this.width = dataOrWidth;
        this.height = widthOrHeight;
        this.data = new Uint8ClampedArray(dataOrWidth * widthOrHeight * 4);
      } else {
        this.data = dataOrWidth;
        this.width = widthOrHeight;
        this.height = height ?? dataOrWidth.length / (4 * widthOrHeight);
      }
    }
  } as unknown as typeof ImageData;
}
|
||||
|
||||
// Force Next.js file tracer to include the worker file in standalone builds
|
||||
import 'pdfjs-dist/legacy/build/pdf.worker.mjs';
|
||||
|
||||
/**
|
||||
* Extract all text from a PDF buffer.
|
||||
* Uses pdfjs-dist legacy build which works in Node.js without canvas or DOM APIs.
|
||||
* Uses pdfjs-dist legacy build with Node.js DOM polyfills for text extraction.
|
||||
*/
|
||||
export async function extractTextFromPdf(buffer: Buffer): Promise<string> {
|
||||
const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');
|
||||
|
||||
Reference in New Issue
Block a user