anything-llm/server/utils/files/index.js

151 lines
4.9 KiB
JavaScript
Raw Normal View History

2023-06-08 06:31:35 +02:00
const fs = require("fs");
const path = require("path");
const { v5: uuidv5 } = require("uuid");
2023-06-04 04:28:07 +02:00
// Loads and parses every `.json` file found directly inside one sub-folder of
// the documents directory.
//
// folderName: name of the sub-folder, relative to the documents directory.
// Returns: an array with the parsed contents of each `.json` file, in the order
//          reported by the directory listing.
// Throws:  Error when no folderName is given, when folderName points outside
//          of the documents directory, or when the folder does not exist.
//          Errors from reading or parsing an individual file propagate as-is.
async function collectDocumentData(folderName = null) {
  if (!folderName) throw new Error("No docPath provided in request");

  // Development builds keep documents next to the server code; otherwise they
  // live under STORAGE_DIR.
  const documentsRoot =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../documents`)
      : path.resolve(process.env.STORAGE_DIR, `documents`);
  const folder = path.resolve(documentsRoot, `./${folderName}`);

  // folderName originates from a request, so refuse anything (e.g. "../..")
  // that resolves to a location outside of the documents directory.
  const relative = path.relative(documentsRoot, folder);
  const escapesRoot =
    relative === ".." ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);
  if (escapesRoot)
    throw new Error(
      `Invalid docPath ${folderName} - path must stay inside the documents folder`
    );

  if (!fs.existsSync(folder))
    throw new Error(
      `No documents folder for ${folderName} - did you run collector/main.py for this element?`
    );

  const documents = [];
  for (const file of fs.readdirSync(folder)) {
    if (path.extname(file) !== ".json") continue;

    const data = fs.readFileSync(path.join(folder, file), "utf8");
    console.log(`Parsing document: ${file}`);
    documents.push(JSON.parse(data));
  }

  return documents;
}
// Reads and parses a single document file stored below the documents directory.
//
// filePath: path of the file relative to the documents directory, including its
//           sub-folder, eg: youtube-subject/video-123.json
// Returns: the parsed JSON contents, or null when the file does not exist or
//          when filePath resolves to a location outside of the documents
//          directory.
// Throws:  Error when no filePath is given. Read and JSON parse errors
//          propagate as-is.
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  // Development builds keep documents next to the server code; otherwise they
  // live under STORAGE_DIR.
  const documentsRoot =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../documents`)
      : path.resolve(process.env.STORAGE_DIR, `documents`);
  const fullPath = path.resolve(documentsRoot, `./${filePath}`);

  // filePath originates from a request: anything that climbs out of the
  // documents directory (e.g. "../../secrets.json") is reported as not found
  // instead of being read.
  const relative = path.relative(documentsRoot, fullPath);
  const escapesRoot =
    relative === ".." ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);
  if (escapesRoot) return null;

  if (!fs.existsSync(fullPath)) return null;

  const data = fs.readFileSync(fullPath, "utf8");
  return JSON.parse(data);
}
// Builds a two-level listing of the documents directory: every sub-folder of
// the documents directory, and inside each one every `.json` document it holds.
// The documents directory is created when it does not exist yet.
//
// Returns: { name: "documents", type: "folder", items: [...] } where each item
//          is a folder entry { name, type: "folder", items } whose items are
//          file entries made of the document's JSON fields (minus pageContent)
//          plus a `cached` flag from cachedVectorInformation.
async function viewLocalFiles() {
  const folder =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../documents`)
      : path.resolve(process.env.STORAGE_DIR, `documents`);

  // recursive: the parent storage directory may not exist yet either, in which
  // case a plain mkdir would fail with ENOENT.
  if (!fs.existsSync(folder)) fs.mkdirSync(folder, { recursive: true });

  const directory = {
    name: "documents",
    type: "folder",
    items: [],
  };

  for (const file of fs.readdirSync(folder)) {
    if (path.extname(file) === ".md") continue;

    // Only directories are listed; loose files at the top level are ignored.
    const folderPath = path.join(folder, file);
    if (!fs.lstatSync(folderPath).isDirectory()) continue;

    const subdocs = {
      name: file,
      type: "folder",
      items: [],
    };

    for (const subfile of fs.readdirSync(folderPath)) {
      if (path.extname(subfile) !== ".json") continue;

      const rawData = fs.readFileSync(path.join(folderPath, subfile), "utf8");
      const cachefilename = `${file}/${subfile}`;
      // pageContent is pulled out only so it is left out of the listing entry.
      const { pageContent, ...metadata } = JSON.parse(rawData);

      subdocs.items.push({
        name: subfile,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(cachefilename, true),
      });
    }

    directory.items.push(subdocs);
  }

  return directory;
}
// Searches the vector-cache folder for existing information so we don't have to
// re-embed a document and can instead push directly to the vector db.
//
// filename:  document identifier the cache entry is keyed on; the cache file
//            name is the UUIDv5 digest of this string.
// checkOnly: when true only report whether a cache file exists (boolean)
//            without reading it.
// Returns:   a boolean when checkOnly is true, otherwise { exists, chunks }
//            where chunks is the parsed cache file ([] on a miss).
// Caching is disabled entirely unless the CACHE_VECTORS env variable is set.
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!process.env.CACHE_VECTORS || !filename)
    return checkOnly ? false : { exists: false, chunks: [] };

  const digest = uuidv5(filename, uuidv5.URL);
  const file =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../vector-cache/${digest}.json`)
      : path.resolve(process.env.STORAGE_DIR, `vector-cache/${digest}.json`);

  const exists = fs.existsSync(file);
  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  let chunks;
  try {
    chunks = JSON.parse(fs.readFileSync(file, "utf8"));
  } catch (error) {
    // An unreadable or truncated cache file must not block embedding forever:
    // report a miss so the caller re-embeds the document.
    console.error(
      `Cached vectorized results of ${filename} could not be loaded (${error.message}). Ignoring cache.`
    );
    return { exists: false, chunks: [] };
  }

  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  return { exists: true, chunks };
}
// Writes the vectorized chunks of a document to the vector-cache folder so the
// document does not have to be embedded again.
//
// vectorData: pre-chunked vectorized data for a given file that includes the
//             proper metadata and chunk-size limit so it can be iterated and
//             dumped into Pinecone, etc.
// filename:   the fullpath to the doc so we can compare by filename to find
//             cached matches; the cache file name is its UUIDv5 digest.
// Does nothing when the CACHE_VECTORS env variable is unset or no filename is
// given.
async function storeVectorResult(vectorData = [], filename = null) {
  if (!process.env.CACHE_VECTORS) return;
  if (!filename) return;

  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );

  const folder =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../vector-cache`)
      : path.resolve(process.env.STORAGE_DIR, `vector-cache`);

  // recursive: the parent storage directory may not exist yet either, in which
  // case a plain mkdir would fail with ENOENT.
  if (!fs.existsSync(folder)) fs.mkdirSync(folder, { recursive: true });

  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(folder, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
}
module.exports = {
cachedVectorInformation,
collectDocumentData,
viewLocalFiles,
storeVectorResult,
2023-06-08 06:31:35 +02:00
fileData,
};