// collector/processSingleFile/convert/asEPub.js
const { v4 } = require("uuid");
const { EPubLoader } = require("langchain/document_loaders/fs/epub");
const { tokenizeString } = require("../../utils/tokenizer");
const {
  createdDate,
  trashFile,
  writeToServerDocuments,
} = require("../../utils/files");
const { default: slugify } = require("slugify");

/**
 * Loads an EPUB through EPubLoader (chapters not split) and concatenates the
 * `pageContent` of every returned document, with no separator between them.
 *
 * Loader failures are logged and deliberately not rethrown: whatever text was
 * accumulated before the failure (possibly none) is returned so the caller
 * can decide how to treat an empty result.
 *
 * @param {string} fullFilePath - Path of the EPUB file handed to EPubLoader.
 * @returns {Promise<string>} The concatenated text, or "" when nothing was read.
 */
async function readEpubText(fullFilePath) {
  let text = "";
  try {
    const epubLoader = new EPubLoader(fullFilePath, { splitChapters: false });
    const sections = await epubLoader.load();
    for (const section of sections) {
      text += section.pageContent;
    }
  } catch (err) {
    console.error("Could not read epub file!", err);
  }
  return text;
}

/**
 * Converts an uploaded EPUB file into a single document record.
 *
 * On both the failure and the success path the source file is handed to
 * `trashFile` once processing is finished.
 *
 * @param {Object} params
 * @param {string} [params.fullFilePath=""] - Path of the EPUB file to convert.
 * @param {string} [params.filename=""] - Name used as the document title, in
 *   log messages, and (slugified) in the stored document's name.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success: false` with a reason and no documents when no text could be
 *   extracted; otherwise `success: true` with the single value returned by
 *   `writeToServerDocuments`.
 */
async function asEPub({ fullFilePath = "", filename = "" }) {
  const text = await readEpubText(fullFilePath);

  // Nothing extracted (unreadable file or genuinely empty book): report failure.
  if (text.length === 0) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
    return {
      success: false,
      reason: `No text content found in ${filename}.`,
      documents: [],
    };
  }

  console.log(`-- Working ${filename} --`);

  const documentId = v4();
  const payload = {
    id: documentId,
    url: `file://${fullFilePath}`,
    title: filename,
    docAuthor: "Unknown", // TODO: Find a better author
    description: "Unknown", // TODO: Find a better description
    docSource: "a epub file uploaded by the user.",
    chunkSource: "",
    published: createdDate(fullFilePath),
    // Rough count: number of pieces obtained by splitting on single spaces.
    wordCount: text.split(" ").length,
    pageContent: text,
    token_count_estimate: tokenizeString(text).length,
  };

  // The stored name combines the slugified filename with the generated id.
  const storedName = `${slugify(filename)}-${documentId}`;
  const savedDocument = writeToServerDocuments(payload, storedName);

  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);

  return { success: true, reason: null, documents: [savedDocument] };
}

module.exports = asEPub;