From 9b86bbd2b8d805922dac8e4afec527541e17b446 Mon Sep 17 00:00:00 2001
From: Sean Hatfield
Date: Tue, 16 Jul 2024 13:09:43 -0700
Subject: [PATCH] [FIX] PDFLoader module bug fix (#1879)

use pdf.js by importing it from pdf-parse and fix custom PDFLoader module
---
 .../convert/asPDF/PDFLoader/index.js | 115 ++++++++++++------
 1 file changed, 75 insertions(+), 40 deletions(-)

diff --git a/collector/processSingleFile/convert/asPDF/PDFLoader/index.js b/collector/processSingleFile/convert/asPDF/PDFLoader/index.js
index 69876906..26bcf2b1 100644
--- a/collector/processSingleFile/convert/asPDF/PDFLoader/index.js
+++ b/collector/processSingleFile/convert/asPDF/PDFLoader/index.js
@@ -1,5 +1,4 @@
 const fs = require("fs").promises;
-const pdf = require("pdf-parse");
 
 class PDFLoader {
   constructor(filePath, { splitPages = true } = {}) {
@@ -9,54 +8,90 @@ class PDFLoader {
 
   async load() {
     const buffer = await fs.readFile(this.filePath);
+    const { getDocument, version } = await this.getPdfJS();
 
-    const options = {
-      pagerender: this.splitPages ? this.renderPage : null,
-    };
+    const pdf = await getDocument({
+      data: new Uint8Array(buffer),
+      useWorkerFetch: false,
+      isEvalSupported: false,
+      useSystemFonts: true,
+    }).promise;
 
-    const { text, numpages, info, metadata, version } = await pdf(
-      buffer,
-      options
-    );
+    const meta = await pdf.getMetadata().catch(() => null);
+    const documents = [];
 
-    if (!this.splitPages) {
-      return [
-        {
-          pageContent: text.trim(),
-          metadata: {
-            source: this.filePath,
-            pdf: { version, info, metadata, totalPages: numpages },
+    for (let i = 1; i <= pdf.numPages; i += 1) {
+      const page = await pdf.getPage(i);
+      const content = await page.getTextContent();
+
+      if (content.items.length === 0) {
+        continue;
+      }
+
+      let lastY;
+      const textItems = [];
+      for (const item of content.items) {
+        if ("str" in item) {
+          if (lastY === item.transform[5] || !lastY) {
+            textItems.push(item.str);
+          } else {
+            textItems.push(`\n${item.str}`);
+          }
+          lastY = item.transform[5];
+        }
+      }
+
+      const text = textItems.join("");
+      documents.push({
+        pageContent: text.trim(),
+        metadata: {
+          source: this.filePath,
+          pdf: {
+            version,
+            info: meta?.info,
+            metadata: meta?.metadata,
+            totalPages: pdf.numPages,
+          },
+          loc: { pageNumber: i },
+        },
+      });
+    }
+
+    if (this.splitPages) {
+      return documents;
+    }
+
+    if (documents.length === 0) {
+      return [];
+    }
+
+    return [
+      {
+        pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
+        metadata: {
+          source: this.filePath,
+          pdf: {
+            version,
+            info: meta?.info,
+            metadata: meta?.metadata,
+            totalPages: pdf.numPages,
           },
         },
-      ];
-    }
-
-    return this.pages.map((pageContent, index) => ({
-      pageContent: pageContent.trim(),
-      metadata: {
-        source: this.filePath,
-        pdf: { version, info, metadata, totalPages: numpages },
-        loc: { pageNumber: index + 1 },
       },
-    }));
+    ];
   }
 
-  pages = [];
-
-  renderPage = async (pageData) => {
-    const textContent = await pageData.getTextContent();
-    let lastY,
-      text = "";
-    for (const item of textContent.items) {
-      if (lastY !== item.transform[5] && lastY !== undefined) {
-        text += "\n";
-      }
-      text += item.str;
-      lastY = item.transform[5];
+  async getPdfJS() {
+    try {
+      const pdfjs = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js");
+      return { getDocument: pdfjs.getDocument, version: pdfjs.version };
+    } catch (e) {
+      console.error(e);
+      throw new Error(
+        "Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`."
+      );
     }
-    this.pages.push(text);
-    return text;
-  };
+  }
 }
 
 module.exports = PDFLoader;