anything-llm/collector/processSingleFile/convert/asPDF/PDFLoader/index.js
Sean Hatfield 9b86bbd2b8
[FIX] PDFLoader module bug fix (#1879)
use pdf.js by importing it from pdf-parse and fix custom PDFLoader module
2024-07-16 13:09:43 -07:00

98 lines
2.3 KiB
JavaScript

const fs = require("fs").promises;
class PDFLoader {
constructor(filePath, { splitPages = true } = {}) {
this.filePath = filePath;
this.splitPages = splitPages;
}
async load() {
const buffer = await fs.readFile(this.filePath);
const { getDocument, version } = await this.getPdfJS();
const pdf = await getDocument({
data: new Uint8Array(buffer),
useWorkerFetch: false,
isEvalSupported: false,
useSystemFonts: true,
}).promise;
const meta = await pdf.getMetadata().catch(() => null);
const documents = [];
for (let i = 1; i <= pdf.numPages; i += 1) {
const page = await pdf.getPage(i);
const content = await page.getTextContent();
if (content.items.length === 0) {
continue;
}
let lastY;
const textItems = [];
for (const item of content.items) {
if ("str" in item) {
if (lastY === item.transform[5] || !lastY) {
textItems.push(item.str);
} else {
textItems.push(`\n${item.str}`);
}
lastY = item.transform[5];
}
}
const text = textItems.join("");
documents.push({
pageContent: text.trim(),
metadata: {
source: this.filePath,
pdf: {
version,
info: meta?.info,
metadata: meta?.metadata,
totalPages: pdf.numPages,
},
loc: { pageNumber: i },
},
});
}
if (this.splitPages) {
return documents;
}
if (documents.length === 0) {
return [];
}
return [
{
pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
metadata: {
source: this.filePath,
pdf: {
version,
info: meta?.info,
metadata: meta?.metadata,
totalPages: pdf.numPages,
},
},
},
];
}
async getPdfJS() {
try {
const pdfjs = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js");
return { getDocument: pdfjs.getDocument, version: pdfjs.version };
} catch (e) {
console.error(e);
throw new Error(
"Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`."
);
}
}
}
module.exports = PDFLoader;