// PDF document loader: reads a PDF from disk and extracts its text via the pdf.js build bundled with pdf-parse.
const fs = require("fs").promises;
/**
 * Joins the text fragments of a single page into one string.
 *
 * Fragments are concatenated as-is; a newline is inserted before a fragment
 * whose vertical position (`transform[5]`) differs from the previous
 * fragment's. Entries without a `str` property are ignored.
 *
 * @param {Iterable<object>} items - `items` of a pdf.js text-content result.
 * @returns {string} The page text (untrimmed).
 */
function joinTextItems(items) {
  const pieces = [];
  let lastY;
  for (const item of items) {
    if (!("str" in item)) {
      continue;
    }
    const y = item.transform[5];
    // Compare against `undefined` explicitly: a baseline of 0 is a valid
    // position and must not be mistaken for "no previous fragment".
    const sameLine = lastY === undefined || lastY === y;
    pieces.push(sameLine ? item.str : `\n${item.str}`);
    lastY = y;
  }
  return pieces.join("");
}

/**
 * Loads a PDF file from disk and extracts its text content.
 */
class PDFLoader {
  /**
   * @param {string} filePath - Path of the PDF file to read.
   * @param {object} [options]
   * @param {boolean} [options.splitPages=true] - When true, `load()` returns
   *   one document per page; otherwise a single merged document.
   */
  constructor(filePath, { splitPages = true } = {}) {
    this.filePath = filePath;
    this.splitPages = splitPages;
  }

  /**
   * Reads the file and extracts the text of every page.
   *
   * Pages whose text content has no items are skipped. Each document has the
   * shape `{ pageContent, metadata: { source, pdf, loc? } }`; `loc.pageNumber`
   * (1-based) is only present on per-page documents.
   *
   * @returns {Promise<Array<{pageContent: string, metadata: object}>>}
   *   One document per non-empty page when `splitPages` is true; otherwise a
   *   single document with the page texts joined by a blank line, or an empty
   *   array when no page produced a document.
   * @throws {Error} If the file cannot be read, pdf.js cannot be loaded, or
   *   the document cannot be parsed.
   */
  async load() {
    const buffer = await fs.readFile(this.filePath);
    const { getDocument, version } = await this.getPdfJS();

    const pdf = await getDocument({
      data: new Uint8Array(buffer),
      useWorkerFetch: false,
      isEvalSupported: false,
      useSystemFonts: true,
    }).promise;

    try {
      // Metadata is best-effort: a failure here must not abort text extraction.
      const meta = await pdf.getMetadata().catch(() => null);

      // Builds a fresh object per document so results never share state.
      const describePdf = () => ({
        version,
        info: meta?.info,
        metadata: meta?.metadata,
        totalPages: pdf.numPages,
      });

      const documents = [];
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber += 1) {
        const page = await pdf.getPage(pageNumber);
        const content = await page.getTextContent();
        if (content.items.length === 0) {
          continue;
        }

        documents.push({
          pageContent: joinTextItems(content.items).trim(),
          metadata: {
            source: this.filePath,
            pdf: describePdf(),
            loc: { pageNumber },
          },
        });
      }

      if (this.splitPages) {
        return documents;
      }
      if (documents.length === 0) {
        return [];
      }

      return [
        {
          pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
          metadata: {
            source: this.filePath,
            pdf: describePdf(),
          },
        },
      ];
    } finally {
      // Release the parsed document whether extraction succeeded or failed.
      await pdf.destroy();
    }
  }

  /**
   * Lazily imports the pdf.js build that ships inside the `pdf-parse` package.
   *
   * @returns {Promise<{getDocument: Function, version: string}>}
   * @throws {Error} If the module cannot be imported; the underlying failure
   *   is logged and attached as `cause`.
   */
  async getPdfJS() {
    try {
      const { getDocument, version } = await import(
        "pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js"
      );
      return { getDocument, version };
    } catch (e) {
      console.error(e);
      throw new Error(
        "Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`.",
        { cause: e }
      );
    }
  }
}
module.exports = PDFLoader;