fix: migrate pdf-parse from v1 to v2 API to resolve DOMMatrix error
All checks were successful
Deploy to VPS / deploy (push) Successful in 37s

The old v1 API (`pdfParse(buffer)`) triggered a DOMMatrix dependency via
the pdfjs-dist canvas rendering path. The v2 API (`new PDFParse({ data })` +
`getText()`) uses a text-only code path that works in Node.js without
DOM/canvas polyfills.

Updated all three call sites, plus the type declarations:
- src/lib/documents/index.ts (generic document extraction)
- src/app/api/norms/parse/route.ts (norm PDF parsing)
- src/lib/contracts/index.ts (contract text extraction)
- src/types/pdf-parse.d.ts (updated type declarations for v2)

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
CTO
2026-04-10 20:36:26 +00:00
parent a89bf8380d
commit 1e431145dd
4 changed files with 25 additions and 15 deletions

View File

@@ -40,8 +40,10 @@ Antworte NUR mit einem JSON-Array. Kein erklaerener Text, kein Markdown, nur das
]`;
/**
 * Extracts the text content of a PDF held in memory.
 *
 * Uses the pdf-parse v2 API (`new PDFParse({ data })` + `getText()`); the
 * v1 call `pdfParse(buffer)` is no longer used here.
 *
 * @param buffer - Raw bytes of the PDF document.
 * @returns The `text` field of the result returned by `getText()`.
 * @throws Whatever `import('pdf-parse')`, `getText()` or `destroy()` rejects with.
 */
async function extractTextFromPdf(buffer: Buffer): Promise<string> {
  // Loaded lazily so the module is only resolved when a PDF is actually parsed.
  const { PDFParse } = await import('pdf-parse');
  const parser = new PDFParse({ data: buffer });
  try {
    const result = await parser.getText();
    return result.text;
  } finally {
    // Always release the parser, including when getText() rejects.
    await parser.destroy();
  }
}

View File

@@ -137,9 +137,10 @@ export async function extractDocumentText(tenantId: string, documentId: string):
let text: string;
if (doc.mimeType === 'application/pdf') {
// Dynamic import for pdf-parse (optional dependency)
const pdfParse = (await import('pdf-parse')).default;
const pdfData = await pdfParse(fileBuffer);
const { PDFParse } = await import('pdf-parse');
const parser = new PDFParse({ data: fileBuffer });
const pdfData = await parser.getText();
await parser.destroy();
text = pdfData.text;
} else {
// DOCX — use mammoth for extraction

View File

@@ -133,10 +133,12 @@ export async function extractDocumentText(tenantId: string, documentId: string):
let text: string;
if (doc.mimeType === 'application/pdf') {
const pdfParse = (await import('pdf-parse')).default;
const { PDFParse } = await import('pdf-parse');
let pdfData;
try {
pdfData = await pdfParse(fileBuffer);
const parser = new PDFParse({ data: fileBuffer });
pdfData = await parser.getText();
await parser.destroy();
} catch (pdfErr) {
const pdfMessage = pdfErr instanceof Error ? pdfErr.message : String(pdfErr);
if (pdfMessage.includes('encrypted') || pdfMessage.includes('password')) {

View File

@@ -1,14 +1,19 @@
/**
 * Ambient type declarations for the subset of the pdf-parse v2 API used by
 * this project (the `PDFParse` class). The v1 function-style default export
 * (`pdfParse(buffer)`) and its `PdfData` result type are intentionally no
 * longer declared.
 */
declare module 'pdf-parse' {
  /** Result resolved by {@link PDFParse.getText}. */
  interface TextResult {
    /** Extracted text; this is the field the call sites read. */
    text: string;
    /** NOTE(review): presumably the total page count — confirm against the library docs. */
    total: number;
    /** Per-page entries, each pairing a page number with that page's text. */
    pages: Array<{ page: number; text: string }>;
  }

  /** Options accepted by the {@link PDFParse} constructor. */
  interface PDFParseOptions {
    /** In-memory PDF bytes; the call sites pass a Node.js `Buffer` here. */
    data?: Buffer | ArrayBuffer | Uint8Array;
    /** NOTE(review): presumably a location to load the PDF from instead of `data` — confirm. */
    url?: string;
  }

  /** Parser instance bound to a single PDF source. */
  export class PDFParse {
    constructor(options: PDFParseOptions);
    /** Resolves with the extracted text of the document. */
    getText(options?: { partial?: number[] }): Promise<TextResult>;
    /** Resolves with document information as an untyped record. */
    getInfo(options?: { parsePageInfo?: boolean }): Promise<Record<string, unknown>>;
    /** Releases the parser; the call sites await this once they are done with it. */
    destroy(): Promise<void>;
  }
}