2023-06-07 21:31:35 -07:00

129 lines
4.2 KiB

const fs = require("fs");
const path = require("path");
const { v5: uuidv5 } = require("uuid");
/**
 * Loads every `.json` document stored directly inside one folder of the
 * documents directory (`../../documents`, resolved from this module).
 *
 * @param {string|null} folderName - Folder path relative to the documents directory.
 * @returns {Promise<Array<*>>} Parsed contents of each `.json` file in the folder.
 * @throws {Error} When no folder name is given, or when the folder does not
 *   exist inside the documents directory. Errors raised by `JSON.parse` for a
 *   malformed file are not caught here.
 */
async function collectDocumentData(folderName = null) {
  if (!folderName) throw new Error("No docPath provided in request");

  const documentsRoot = path.resolve(__dirname, "../../documents");
  const folder = path.resolve(__dirname, `../../documents/${folderName}`);

  // The error text above indicates folderName comes from a request, so a
  // name such as "../../somewhere" must not be allowed to leave documents/.
  const relative = path.relative(documentsRoot, folder);
  const escapesRoot =
    relative === ".." ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);

  if (escapesRoot || !fs.existsSync(folder)) {
    throw new Error(
      `No documents folder for ${folderName} - did you run collector/ for this element?`
    );
  }

  const documents = [];
  for (const file of fs.readdirSync(folder)) {
    if (path.extname(file) !== ".json") continue;

    const raw = fs.readFileSync(path.join(folder, file), "utf8");
    console.log(`Parsing document: ${file}`);
    // Without this push the function would always resolve to an empty list.
    documents.push(JSON.parse(raw));
  }

  return documents;
}
/**
 * Reads and parses a single stored document.
 *
 * Should take in a path that sits in a subfolder of documents,
 * eg: youtube-subject/video-123.json
 *
 * @param {string|null} filePath - Path relative to the documents directory
 *   (`../../documents`, resolved from this module).
 * @returns {Promise<*|null>} The parsed JSON content, or `null` when the file
 *   does not exist inside the documents directory.
 * @throws {Error} When no path is given. Errors raised by `JSON.parse` for a
 *   malformed file are not caught here.
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  const documentsRoot = path.resolve(__dirname, "../../documents");
  const fullPath = path.resolve(__dirname, `../../documents/${filePath}`);

  // Enforce the "subfolder of documents" contract: a path that climbs out of
  // the documents directory is treated exactly like a file that is not there.
  const relative = path.relative(documentsRoot, fullPath);
  const escapesRoot =
    relative === ".." ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);

  if (escapesRoot || !fs.existsSync(fullPath)) return null;

  return JSON.parse(fs.readFileSync(fullPath, "utf8"));
}
/**
 * Builds a two-level tree describing the documents directory
 * (`../../documents`, resolved from this module).
 *
 * Only sub-folders of the documents directory are listed, and within each
 * sub-folder only `.json` files. Every file entry carries the fields stored in
 * its JSON (minus `pageContent`) plus a `cached` flag obtained from
 * `cachedVectorInformation(<folder>/<file>, true)`.
 *
 * @returns {Promise<Object>} `{}` when the documents directory does not exist,
 *   otherwise `{ name: "documents", type: "folder", items: [...] }`.
 */
async function viewLocalFiles() {
  const folder = path.resolve(__dirname, `../../documents`);
  if (!fs.existsSync(folder)) return {};

  const directory = {
    name: "documents",
    type: "folder",
    items: [],
  };

  for (const file of fs.readdirSync(folder)) {
    if (path.extname(file) === ".md") continue;

    const folderPath = path.join(folder, file);
    if (!fs.lstatSync(folderPath).isDirectory()) continue;

    const subdocs = {
      name: file,
      type: "folder",
      items: [],
    };

    for (const subfile of fs.readdirSync(folderPath)) {
      if (path.extname(subfile) !== ".json") continue;

      const rawData = fs.readFileSync(path.join(folderPath, subfile), "utf8");
      const cachefilename = `${file}/${subfile}`;
      // pageContent is pulled out only so it is left out of the listing.
      const { pageContent, ...metadata } = JSON.parse(rawData);

      // NOTE(review): the statement that consumed `metadata` was not visible
      // in the source this was restored from. It is spread first here so the
      // structural fields below cannot be overridden by document content -
      // confirm the intended order.
      subdocs.items.push({
        ...metadata,
        name: subfile,
        type: "file",
        cached: await cachedVectorInformation(cachefilename, true),
      });
    }

    // Attach the folder to the tree; otherwise the returned listing stays empty.
    directory.items.push(subdocs);
  }

  return directory;
}
/**
 * Searches the vector-cache folder for existing information so we don't have
 * to re-embed a document and can instead push directly to the vector db.
 *
 * The cache file is `../../vector-cache/<uuidv5(filename, uuidv5.URL)>.json`,
 * resolved from this module. Caching is considered off whenever
 * `process.env.CACHE_VECTORS` is not set to a non-empty value.
 *
 * @param {string|null} filename - Identifier the cache entry was stored under.
 * @param {boolean} checkOnly - When true, resolve to a plain boolean instead of
 *   loading the cached chunks.
 * @returns {Promise<boolean|{exists: boolean, chunks: Array}>} With `checkOnly`,
 *   whether a cache file exists; otherwise `{ exists, chunks }` where `chunks`
 *   is the parsed cache file (or `[]` when nothing is cached).
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  const cacheDisabled = !process.env.CACHE_VECTORS;
  if (cacheDisabled || !filename) {
    return checkOnly ? false : { exists: false, chunks: [] };
  }

  const digest = uuidv5(filename, uuidv5.URL);
  const file = path.resolve(__dirname, `../../vector-cache/${digest}.json`);
  const exists = fs.existsSync(file);

  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  const rawData = fs.readFileSync(file, "utf8");
  return { exists: true, chunks: JSON.parse(rawData) };
}
/**
 * Writes vectorized data for a document into the vector-cache folder
 * (`../../vector-cache`, resolved from this module) so it does not have to be
 * embedded again. Does nothing when `process.env.CACHE_VECTORS` is unset or
 * when no filename is given.
 *
 * @param {Array} vectorData - Pre-chunked vectorized data for a given file that
 *   includes the proper metadata and chunk-size limit so it can be iterated and
 *   dumped into Pinecone, etc. Stored as JSON.
 * @param {string|null} filename - The full path to the doc, so cached matches
 *   can be found by filename. The cache file is named after
 *   `uuidv5(filename, uuidv5.URL)`.
 * @returns {Promise<void>}
 */
async function storeVectorResult(vectorData = [], filename = null) {
  if (!process.env.CACHE_VECTORS) return;
  if (!filename) return;

  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );

  const folder = path.resolve(__dirname, `../../vector-cache`);
  // recursive: true is a no-op when the folder already exists, which removes
  // the check-then-create race between concurrent writers.
  fs.mkdirSync(folder, { recursive: true });

  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(folder, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
}
module.exports = {