mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-15 19:00:33 +01:00
79656718b2
* implement custom PDFLoader to remove LC dep * remove unneeded comment * remove pdfjs as dep and fix page splitting using pdf-parse * linting + export rename for desktop compat --------- Co-authored-by: timothycarambat <rambat1010@gmail.com>
63 lines
1.4 KiB
JavaScript
63 lines
1.4 KiB
JavaScript
const fs = require("fs").promises;
|
|
const pdf = require("pdf-parse");
|
|
|
|
/**
 * Loads a PDF file from disk with `pdf-parse` and returns its text as an
 * array of document objects shaped as `{ pageContent, metadata }`.
 *
 * With `splitPages` enabled (the default) one document is produced per
 * rendered page; otherwise a single document holding the whole text is
 * produced.
 */
class PDFLoader {
  /**
   * Text of every page rendered during the most recent `load()` call, in the
   * order the pages were rendered.
   * @type {string[]}
   */
  pages = [];

  /**
   * @param {string} filePath - Path of the PDF file to read.
   * @param {{ splitPages?: boolean }} [options]
   * @param {boolean} [options.splitPages=true] - Emit one document per page
   *   instead of one document for the whole file.
   */
  constructor(filePath, { splitPages = true } = {}) {
    this.filePath = filePath;
    this.splitPages = splitPages;
  }

  /**
   * Reads and parses the PDF.
   *
   * @returns {Promise<Array<{ pageContent: string, metadata: object }>>}
   *   One entry per page (with `metadata.loc.pageNumber`, 1-based) when
   *   `splitPages` is true, otherwise a single entry for the whole file.
   *   Rejects if the file cannot be read or the parser rejects.
   */
  async load() {
    // renderPage() appends to this.pages, so start from an empty list on
    // every call; otherwise a second load() on the same instance would return
    // the pages of the previous run as well.
    this.pages = [];

    const buffer = await fs.readFile(this.filePath);

    // Only install the custom page renderer when per-page output is wanted.
    const options = {
      pagerender: this.splitPages ? this.renderPage : null,
    };

    const { text, numpages, info, metadata, version } = await pdf(
      buffer,
      options
    );

    // Build a fresh details object per document so that documents never share
    // a mutable `pdf` metadata object.
    const buildPdfDetails = () => ({
      version,
      info,
      metadata,
      totalPages: numpages,
    });

    if (!this.splitPages) {
      return [
        {
          pageContent: text.trim(),
          metadata: {
            source: this.filePath,
            pdf: buildPdfDetails(),
          },
        },
      ];
    }

    return this.pages.map((pageContent, index) => ({
      pageContent: pageContent.trim(),
      metadata: {
        source: this.filePath,
        pdf: buildPdfDetails(),
        loc: { pageNumber: index + 1 },
      },
    }));
  }

  /**
   * Page render callback handed to the parser. Concatenates the `str` of all
   * text items on the page, inserting a newline whenever `transform[5]`
   * (used here as the item's vertical position) differs from the previous
   * item's. The resulting text is recorded in `this.pages` and returned.
   *
   * Declared as an arrow-function field so `this` stays bound to the loader
   * when the function is passed to the parser as a bare callback.
   *
   * @param {{ getTextContent: () => Promise<{ items: Array<{ str: string, transform: number[] }> }> }} pageData
   * @returns {Promise<string>} The text extracted from the page.
   */
  renderPage = async (pageData) => {
    const textContent = await pageData.getTextContent();

    let lastY;
    let text = "";
    for (const item of textContent.items) {
      // A change in vertical position marks the start of a new line.
      if (lastY !== undefined && lastY !== item.transform[5]) {
        text += "\n";
      }
      text += item.str;
      lastY = item.transform[5];
    }

    this.pages.push(text);
    return text;
  };
}
|
|
|
|
// Expose the PDFLoader class itself as this module's export.
module.exports = PDFLoader;
|