mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-15 02:50:10 +01:00
[FIX] PDFLoader module bug fix (#1879)
use pdf.js by importing it from pdf-parse and fix custom PDFLoader module
This commit is contained in:
parent
86a66ba569
commit
9b86bbd2b8
@ -1,5 +1,4 @@
|
||||
const fs = require("fs").promises;
|
||||
const pdf = require("pdf-parse");
|
||||
|
||||
class PDFLoader {
|
||||
constructor(filePath, { splitPages = true } = {}) {
|
||||
@ -9,54 +8,90 @@ class PDFLoader {
|
||||
|
||||
async load() {
|
||||
const buffer = await fs.readFile(this.filePath);
|
||||
const { getDocument, version } = await this.getPdfJS();
|
||||
|
||||
const options = {
|
||||
pagerender: this.splitPages ? this.renderPage : null,
|
||||
};
|
||||
const pdf = await getDocument({
|
||||
data: new Uint8Array(buffer),
|
||||
useWorkerFetch: false,
|
||||
isEvalSupported: false,
|
||||
useSystemFonts: true,
|
||||
}).promise;
|
||||
|
||||
const { text, numpages, info, metadata, version } = await pdf(
|
||||
buffer,
|
||||
options
|
||||
);
|
||||
const meta = await pdf.getMetadata().catch(() => null);
|
||||
const documents = [];
|
||||
|
||||
if (!this.splitPages) {
|
||||
return [
|
||||
{
|
||||
for (let i = 1; i <= pdf.numPages; i += 1) {
|
||||
const page = await pdf.getPage(i);
|
||||
const content = await page.getTextContent();
|
||||
|
||||
if (content.items.length === 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let lastY;
|
||||
const textItems = [];
|
||||
for (const item of content.items) {
|
||||
if ("str" in item) {
|
||||
if (lastY === item.transform[5] || !lastY) {
|
||||
textItems.push(item.str);
|
||||
} else {
|
||||
textItems.push(`\n${item.str}`);
|
||||
}
|
||||
lastY = item.transform[5];
|
||||
}
|
||||
}
|
||||
|
||||
const text = textItems.join("");
|
||||
documents.push({
|
||||
pageContent: text.trim(),
|
||||
metadata: {
|
||||
source: this.filePath,
|
||||
pdf: { version, info, metadata, totalPages: numpages },
|
||||
pdf: {
|
||||
version,
|
||||
info: meta?.info,
|
||||
metadata: meta?.metadata,
|
||||
totalPages: pdf.numPages,
|
||||
},
|
||||
loc: { pageNumber: i },
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
if (this.splitPages) {
|
||||
return documents;
|
||||
}
|
||||
|
||||
if (documents.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return [
|
||||
{
|
||||
pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
|
||||
metadata: {
|
||||
source: this.filePath,
|
||||
pdf: {
|
||||
version,
|
||||
info: meta?.info,
|
||||
metadata: meta?.metadata,
|
||||
totalPages: pdf.numPages,
|
||||
},
|
||||
},
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
return this.pages.map((pageContent, index) => ({
|
||||
pageContent: pageContent.trim(),
|
||||
metadata: {
|
||||
source: this.filePath,
|
||||
pdf: { version, info, metadata, totalPages: numpages },
|
||||
loc: { pageNumber: index + 1 },
|
||||
},
|
||||
}));
|
||||
async getPdfJS() {
|
||||
try {
|
||||
const pdfjs = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js");
|
||||
return { getDocument: pdfjs.getDocument, version: pdfjs.version };
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
throw new Error(
|
||||
"Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`."
|
||||
);
|
||||
}
|
||||
|
||||
pages = [];
|
||||
|
||||
renderPage = async (pageData) => {
|
||||
const textContent = await pageData.getTextContent();
|
||||
let lastY,
|
||||
text = "";
|
||||
for (const item of textContent.items) {
|
||||
if (lastY !== item.transform[5] && lastY !== undefined) {
|
||||
text += "\n";
|
||||
}
|
||||
text += item.str;
|
||||
lastY = item.transform[5];
|
||||
}
|
||||
this.pages.push(text);
|
||||
return text;
|
||||
};
|
||||
}
|
||||
|
||||
module.exports = PDFLoader;
|
||||
|
Loading…
Reference in New Issue
Block a user