From 1e431145dddcb09ecf0db30bc7843666d37a0a61 Mon Sep 17 00:00:00 2001 From: CTO Date: Fri, 10 Apr 2026 20:36:26 +0000 Subject: [PATCH] fix: migrate pdf-parse from v1 to v2 API to resolve DOMMatrix error The old v1 API (`pdfParse(buffer)`) triggered a DOMMatrix dependency via the pdfjs-dist canvas rendering path. The v2 API (`new PDFParse({ data })` + `getText()`) uses a text-only code path that works in Node.js without DOM/canvas polyfills. Updated all three call sites and the type declarations: - src/lib/documents/index.ts (generic document extraction) - src/app/api/norms/parse/route.ts (norm PDF parsing) - src/lib/contracts/index.ts (contract text extraction) - src/types/pdf-parse.d.ts (updated type declarations for v2) Co-Authored-By: Paperclip --- src/app/api/norms/parse/route.ts | 6 ++++-- src/lib/contracts/index.ts | 7 ++++--- src/lib/documents/index.ts | 6 ++++-- src/types/pdf-parse.d.ts | 21 +++++++++++++-------- 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/app/api/norms/parse/route.ts b/src/app/api/norms/parse/route.ts index 48ffe3f..f53a539 100644 --- a/src/app/api/norms/parse/route.ts +++ b/src/app/api/norms/parse/route.ts @@ -40,8 +40,10 @@ Antworte NUR mit einem JSON-Array. 
Kein erklaerener Text, kein Markdown, nur das ]`; async function extractTextFromPdf(buffer: Buffer): Promise { - const pdfParse = (await import('pdf-parse')).default; - const data = await pdfParse(buffer); + const { PDFParse } = await import('pdf-parse'); + const parser = new PDFParse({ data: buffer }); + const data = await parser.getText(); + await parser.destroy(); return data.text; } diff --git a/src/lib/contracts/index.ts b/src/lib/contracts/index.ts index b17f339..2b075eb 100644 --- a/src/lib/contracts/index.ts +++ b/src/lib/contracts/index.ts @@ -137,9 +137,10 @@ export async function extractDocumentText(tenantId: string, documentId: string): let text: string; if (doc.mimeType === 'application/pdf') { - // Dynamic import for pdf-parse (optional dependency) - const pdfParse = (await import('pdf-parse')).default; - const pdfData = await pdfParse(fileBuffer); + const { PDFParse } = await import('pdf-parse'); + const parser = new PDFParse({ data: fileBuffer }); + const pdfData = await parser.getText(); + await parser.destroy(); text = pdfData.text; } else { // DOCX — use mammoth for extraction diff --git a/src/lib/documents/index.ts b/src/lib/documents/index.ts index 878e560..96bc0e3 100644 --- a/src/lib/documents/index.ts +++ b/src/lib/documents/index.ts @@ -133,10 +133,12 @@ export async function extractDocumentText(tenantId: string, documentId: string): let text: string; if (doc.mimeType === 'application/pdf') { - const pdfParse = (await import('pdf-parse')).default; + const { PDFParse } = await import('pdf-parse'); let pdfData; try { - pdfData = await pdfParse(fileBuffer); + const parser = new PDFParse({ data: fileBuffer }); + pdfData = await parser.getText(); + await parser.destroy(); } catch (pdfErr) { const pdfMessage = pdfErr instanceof Error ? 
pdfErr.message : String(pdfErr); if (pdfMessage.includes('encrypted') || pdfMessage.includes('password')) { diff --git a/src/types/pdf-parse.d.ts b/src/types/pdf-parse.d.ts index 12e5dc7..12553c1 100644 --- a/src/types/pdf-parse.d.ts +++ b/src/types/pdf-parse.d.ts @@ -1,14 +1,19 @@ declare module 'pdf-parse' { - interface PdfData { - numpages: number; - numrender: number; - info: Record; - metadata: Record; + interface TextResult { text: string; - version: string; + total: number; + pages: Array<{ page: number; text: string }>; } - function pdfParse(dataBuffer: Buffer, options?: Record): Promise; + interface PDFParseOptions { + data?: Buffer | ArrayBuffer | Uint8Array; + url?: string; + } - export default pdfParse; + export class PDFParse { + constructor(options: PDFParseOptions); + getText(options?: { partial?: number[] }): Promise; + getInfo(options?: { parsePageInfo?: boolean }): Promise>; + destroy(): Promise; + } }