fix: migrate pdf-parse from v1 to v2 API to resolve DOMMatrix error
All checks were successful
Deploy to VPS / deploy (push) Successful in 37s
All checks were successful
Deploy to VPS / deploy (push) Successful in 37s
The old v1 API (`pdfParse(buffer)`) triggered a DOMMatrix dependency via the
pdfjs-dist canvas rendering path. The v2 API (`new PDFParse({ data })` +
`getText()`) uses a text-only code path that works in Node.js without
DOM/canvas polyfills.
Updated all three call sites and the type declarations:
- src/lib/documents/index.ts (generic document extraction)
- src/app/api/norms/parse/route.ts (norm PDF parsing)
- src/lib/contracts/index.ts (contract text extraction)
- src/types/pdf-parse.d.ts (updated type declarations for v2)
Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -40,8 +40,10 @@ Antworte NUR mit einem JSON-Array. Kein erklaerener Text, kein Markdown, nur das
|
||||
]`;
|
||||
|
||||
async function extractTextFromPdf(buffer: Buffer): Promise<string> {
|
||||
const pdfParse = (await import('pdf-parse')).default;
|
||||
const data = await pdfParse(buffer);
|
||||
const { PDFParse } = await import('pdf-parse');
|
||||
const parser = new PDFParse({ data: buffer });
|
||||
const data = await parser.getText();
|
||||
await parser.destroy();
|
||||
return data.text;
|
||||
}
|
||||
|
||||
|
||||
@@ -137,9 +137,10 @@ export async function extractDocumentText(tenantId: string, documentId: string):
|
||||
let text: string;
|
||||
|
||||
if (doc.mimeType === 'application/pdf') {
|
||||
// Dynamic import for pdf-parse (optional dependency)
|
||||
const pdfParse = (await import('pdf-parse')).default;
|
||||
const pdfData = await pdfParse(fileBuffer);
|
||||
const { PDFParse } = await import('pdf-parse');
|
||||
const parser = new PDFParse({ data: fileBuffer });
|
||||
const pdfData = await parser.getText();
|
||||
await parser.destroy();
|
||||
text = pdfData.text;
|
||||
} else {
|
||||
// DOCX — use mammoth for extraction
|
||||
|
||||
@@ -133,10 +133,12 @@ export async function extractDocumentText(tenantId: string, documentId: string):
|
||||
let text: string;
|
||||
|
||||
if (doc.mimeType === 'application/pdf') {
|
||||
const pdfParse = (await import('pdf-parse')).default;
|
||||
const { PDFParse } = await import('pdf-parse');
|
||||
let pdfData;
|
||||
try {
|
||||
pdfData = await pdfParse(fileBuffer);
|
||||
const parser = new PDFParse({ data: fileBuffer });
|
||||
pdfData = await parser.getText();
|
||||
await parser.destroy();
|
||||
} catch (pdfErr) {
|
||||
const pdfMessage = pdfErr instanceof Error ? pdfErr.message : String(pdfErr);
|
||||
if (pdfMessage.includes('encrypted') || pdfMessage.includes('password')) {
|
||||
|
||||
21
src/types/pdf-parse.d.ts
vendored
21
src/types/pdf-parse.d.ts
vendored
@@ -1,14 +1,19 @@
|
||||
declare module 'pdf-parse' {
|
||||
interface PdfData {
|
||||
numpages: number;
|
||||
numrender: number;
|
||||
info: Record<string, unknown>;
|
||||
metadata: Record<string, unknown>;
|
||||
interface TextResult {
|
||||
text: string;
|
||||
version: string;
|
||||
total: number;
|
||||
pages: Array<{ page: number; text: string }>;
|
||||
}
|
||||
|
||||
function pdfParse(dataBuffer: Buffer, options?: Record<string, unknown>): Promise<PdfData>;
|
||||
interface PDFParseOptions {
|
||||
data?: Buffer | ArrayBuffer | Uint8Array;
|
||||
url?: string;
|
||||
}
|
||||
|
||||
export default pdfParse;
|
||||
export class PDFParse {
|
||||
constructor(options: PDFParseOptions);
|
||||
getText(options?: { partial?: number[] }): Promise<TextResult>;
|
||||
getInfo(options?: { parsePageInfo?: boolean }): Promise<Record<string, unknown>>;
|
||||
destroy(): Promise<void>;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user