mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-15 19:00:33 +01:00
63 lines
1.4 KiB
JavaScript
63 lines
1.4 KiB
JavaScript
|
const fs = require("fs").promises;
|
||
|
const pdf = require("pdf-parse");
|
||
|
|
||
|
/**
 * Reads a PDF from disk and converts it into an array of document objects
 * of the shape `{ pageContent, metadata }`.
 *
 * With `splitPages` enabled (the default) one document is produced per page;
 * otherwise a single document holding the whole extracted text is produced.
 */
class PDFLoader {
  /**
   * Text of each page collected by `renderPage` during the most recent
   * split-page `load()` call, in the order the pages were rendered.
   * @type {string[]}
   */
  pages = [];

  /**
   * @param {string} filePath - Path of the PDF file to read.
   * @param {{ splitPages?: boolean }} [options]
   * @param {boolean} [options.splitPages=true] - Emit one document per page
   *   when true, a single document for the whole file when false.
   */
  constructor(filePath, { splitPages = true } = {}) {
    this.filePath = filePath;
    this.splitPages = splitPages;
  }

  /**
   * Parses the PDF at `this.filePath`.
   *
   * @returns {Promise<Array<{ pageContent: string, metadata: object }>>}
   *   Documents whose `metadata` carries `source` (the file path) and `pdf`
   *   (`version`, `info`, `metadata`, `totalPages` as reported by the
   *   parser). Per-page documents additionally carry `loc.pageNumber`
   *   (1-based index into the rendered pages).
   * @throws Propagates any error raised while reading or parsing the file.
   */
  async load() {
    // `renderPage` appends to `this.pages`; clear it first so that calling
    // load() again on the same instance does not return earlier pages twice.
    this.pages = [];

    const buffer = await fs.readFile(this.filePath);

    const options = {
      // NOTE(review): a null `pagerender` presumably makes the parser fall
      // back to its built-in page renderer - confirm against pdf-parse.
      pagerender: this.splitPages ? this.renderPage : null,
    };

    const { text, numpages, info, metadata, version } = await pdf(
      buffer,
      options
    );

    const pdfDetails = { version, info, metadata, totalPages: numpages };

    if (!this.splitPages) {
      return [
        {
          pageContent: text.trim(),
          metadata: {
            source: this.filePath,
            pdf: pdfDetails,
          },
        },
      ];
    }

    return this.pages.map((pageContent, index) => ({
      pageContent: pageContent.trim(),
      metadata: {
        source: this.filePath,
        // Fresh copy per document so callers mutating one page's metadata
        // do not affect the others (matches the original per-page literal).
        pdf: { ...pdfDetails },
        loc: { pageNumber: index + 1 },
      },
    }));
  }

  /**
   * Page render callback handed to the parser. Concatenates the page's text
   * items, inserting a newline whenever the item's `transform[5]` value
   * differs from the previous item's, records the result in `this.pages`
   * and returns it.
   *
   * Declared as an arrow-function field so `this` stays bound when the
   * function is passed around as a bare callback.
   *
   * @param {{ getTextContent: () => Promise<{ items: Array<{ str: string, transform: number[] }> }> }} pageData
   * @returns {Promise<string>} The text extracted from the page.
   */
  renderPage = async (pageData) => {
    const textContent = await pageData.getTextContent();
    let lastY;
    let text = "";
    for (const item of textContent.items) {
      // presumably transform[5] is the vertical position of the item, so a
      // change means a new visual line - verify against the pdf.js text API.
      if (lastY !== item.transform[5] && lastY !== undefined) {
        text += "\n";
      }
      text += item.str;
      lastY = item.transform[5];
    }
    this.pages.push(text);
    return text;
  };
}
|
||
|
|
||
|
// Expose the PDFLoader class as this CommonJS module's sole export.
module.exports = PDFLoader;
|