anything-llm/server/utils/files/index.js
timothycarambat 27c58541bd inital commit
2023-06-03 19:28:07 -07:00

120 lines
4.2 KiB
JavaScript

const fs = require("fs")
const path = require('path');
const { v5: uuidv5 } = require('uuid');
// Collects every parsed .json document inside one subfolder of server/documents.
// folderName: name of the subfolder (produced by the collector) to read.
// Returns an array of the parsed JSON document objects.
// Throws when folderName is missing or the folder does not exist on disk.
async function collectDocumentData(folderName = null) {
  if (!folderName) throw new Error('No docPath provided in request');

  const folder = path.resolve(__dirname, `../../documents/${folderName}`);
  if (!fs.existsSync(folder)) {
    throw new Error(`No documents folder for ${folderName} - did you run collector/main.py for this element?`);
  }

  const documents = [];
  for (const entry of fs.readdirSync(folder)) {
    if (path.extname(entry) !== '.json') continue;
    const raw = fs.readFileSync(path.join(folder, entry), 'utf8');
    console.log(`Parsing document: ${entry}`);
    documents.push(JSON.parse(raw));
  }
  return documents;
}
// Should take in a folder that is a subfolder of documents
// eg: youtube-subject/video-123.json
// Loads and parses one cached document JSON by its documents-relative path.
// Returns the parsed object, or null when the file does not exist.
// Throws when no path is supplied.
async function fileData(filePath = null) {
  if (!filePath) throw new Error('No docPath provided in request');

  const fullPath = path.resolve(__dirname, `../../documents/${filePath}`);
  if (!fs.existsSync(fullPath)) return null;

  const raw = fs.readFileSync(fullPath, 'utf8');
  return JSON.parse(raw);
}
// Builds a tree view of server/documents for the UI:
// a root "documents" folder node whose items are one folder node per
// subdirectory; each subdirectory node lists its .json documents with
// their metadata (pageContent stripped) and whether a cached
// vectorization already exists for them.
// Returns {} when the documents folder is missing entirely.
async function viewLocalFiles() {
  const documentsPath = path.resolve(__dirname, `../../documents`);
  if (!fs.existsSync(documentsPath)) return {};

  const directory = {
    name: "documents",
    type: "folder",
    items: [],
  };

  for (const entry of fs.readdirSync(documentsPath)) {
    if (path.extname(entry) === '.md') continue; // skip readme-style files
    const entryPath = path.resolve(__dirname, `../../documents/${entry}`);
    if (!fs.lstatSync(entryPath).isDirectory()) continue;

    const subdocs = {
      name: entry,
      type: "folder",
      items: [],
    };

    for (const subfile of fs.readdirSync(entryPath)) {
      if (path.extname(subfile) !== '.json') continue;
      const rawData = fs.readFileSync(path.join(entryPath, subfile), 'utf8');
      // Drop the (potentially large) page text; the UI only needs metadata.
      const { pageContent, ...metadata } = JSON.parse(rawData);
      subdocs.items.push({
        name: subfile,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(`${entry}/${subfile}`, true),
      });
    }

    directory.items.push(subdocs);
  }

  return directory;
}
// Searches the vector-cache folder for existing information so we dont have to re-embed a
// document and can instead push directly to vector db.
// filename: cache key (eg: subfolder/doc.json) hashed into a stable uuidv5 filename on disk.
// checkOnly: when true, return a plain boolean for existence;
//            otherwise return { exists, chunks } where chunks is the parsed cached vector data.
// Caching is disabled entirely unless the CACHE_VECTORS env flag is set.
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!process.env.CACHE_VECTORS) return checkOnly ? false : { exists: false, chunks: [] };
  if (!filename) return checkOnly ? false : { exists: false, chunks: [] };

  const digest = uuidv5(filename, uuidv5.URL);
  const file = path.resolve(__dirname, `../../vector-cache/${digest}.json`);
  const exists = fs.existsSync(file);

  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  // Bug fix: the message previously contained the literal "$(unknown)" instead
  // of interpolating the cache key that was actually found.
  console.log(`Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`);
  const rawData = fs.readFileSync(file, 'utf8');
  return { exists: true, chunks: JSON.parse(rawData) };
}
// vectorData: pre-chunked vectorized data for a given file that includes the proper metadata and chunk-size limit so it can be iterated and dumped into Pinecone, etc
// filename is the fullpath to the doc so we can compare by filename to find cached matches.
// Writes the serialized chunks to vector-cache/<uuidv5(filename)>.json, creating the
// cache folder on first use. No-ops when CACHE_VECTORS is unset or filename is missing.
async function storeVectorResult(vectorData = [], filename = null) {
  if (!process.env.CACHE_VECTORS) return;
  if (!filename) return;

  // Bug fix: the message previously contained the literal "$(unknown)" instead
  // of interpolating the filename being cached.
  console.log(`Caching vectorized results of ${filename} to prevent duplicated embedding.`);

  const folder = path.resolve(__dirname, `../../vector-cache`);
  if (!fs.existsSync(folder)) fs.mkdirSync(folder);

  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(folder, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), 'utf8');
  return;
}
module.exports = {
cachedVectorInformation,
collectDocumentData,
viewLocalFiles,
storeVectorResult,
fileData
}