anything-llm/server/utils/files/index.js
Timothy Carambat dc4ad6b5a9
[BETA] Live document sync (#1719)
* wip bg workers for live document sync

* Add ability to re-embed specific documents across many workspaces via background queue
bgworkers are gated behind an experimental system setting flag that needs to be explicitly enabled
UI for watching/unwatching documents that are embedded.
TODO: UI to easily manage all bg tasks and see run results
TODO: UI to enable this feature and background endpoints to manage it

* create frontend views and paths
Move elements to correct experimental scope

* update migration to delete runs on removal of watched document

* Add watch support to YouTube transcripts (#1716)

* Add watch support to YouTube transcripts
refactor how sync is done for supported types

* Watch specific files in Confluence space (#1718)

Add failure-prune check for runs

* create tmp workflow modifications for beta image

* create tmp workflow modifications for beta image

* create tmp workflow modifications for beta image

* dual build
update copy of alert modals

* update job interval

* Add support for live-sync of Github files

* update copy for document sync feature

* hide Experimental features from UI

* update docs links

* [FEAT] Implement new settings menu for experimental features (#1735)

* implement new settings menu for experimental features

* remove unused context save bar

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>

* dont run job on boot

* unset workflow changes

* Add persistent encryption service
Relay key to collector so persistent encryption can be used
Encrypt any private data in chunkSources used for replay during resync jobs

* update jsDOC

* Linting and organization

* update modal copy for feature

---------

Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
2024-06-21 13:38:50 -07:00

234 lines
7.7 KiB
JavaScript

const fs = require("fs");
const path = require("path");
const { v5: uuidv5 } = require("uuid");
const { Document } = require("../../models/documents");
const { DocumentSyncQueue } = require("../../models/documentSyncQueue");
// Absolute path of the `documents` folder. Outside of development it is
// resolved under STORAGE_DIR; in development it is resolved relative to this
// file's directory.
const documentsPath =
  process.env.NODE_ENV !== "development"
    ? path.resolve(process.env.STORAGE_DIR, `documents`)
    : path.resolve(__dirname, `../../storage/documents`);

// Absolute path of the `vector-cache` folder, resolved with the same
// development / STORAGE_DIR rule as `documentsPath`.
const vectorCachePath =
  process.env.NODE_ENV !== "development"
    ? path.resolve(process.env.STORAGE_DIR, `vector-cache`)
    : path.resolve(__dirname, `../../storage/vector-cache`);
/**
 * Reads and parses a stored document from the documents folder.
 *
 * @param {string|null} filePath - Path relative to the documents folder,
 *   eg: youtube-subject/video-123.json
 * @returns {Promise<object|null>} The parsed JSON content, or null when the
 *   resolved path is outside the documents folder, does not exist, or is not a
 *   regular file.
 * @throws {Error} When no path is provided, when `normalizePath` rejects the
 *   path, or when the file content is not valid JSON.
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  const fullFilePath = path.resolve(documentsPath, normalizePath(filePath));

  // Check containment first so paths outside the documents folder never
  // reach the file system.
  if (!isWithin(documentsPath, fullFilePath)) return null;

  // A directory (eg: a bare folder name) is not a document. Report it as not
  // found instead of letting readFileSync throw EISDIR.
  if (!fs.existsSync(fullFilePath) || !fs.statSync(fullFilePath).isFile())
    return null;

  const data = fs.readFileSync(fullFilePath, "utf8");
  return JSON.parse(data);
}
/**
 * Builds the folder/file tree of everything stored in the documents folder.
 *
 * Only first-level directories are listed, and inside them only `.json` files.
 * Each file entry carries its stored metadata (everything except
 * `pageContent`) plus vector-cache, pinned and watch status.
 *
 * @returns {Promise<{name: string, type: string, items: object[]}>} The root
 *   "documents" folder, with `custom-documents` first when present.
 */
async function viewLocalFiles() {
  // `recursive` also creates missing parent folders, so a storage root that
  // does not exist yet no longer fails with ENOENT.
  if (!fs.existsSync(documentsPath))
    fs.mkdirSync(documentsPath, { recursive: true });

  const liveSyncAvailable = await DocumentSyncQueue.enabled();
  const directory = {
    name: "documents",
    type: "folder",
    items: [],
  };

  for (const file of fs.readdirSync(documentsPath)) {
    if (path.extname(file) === ".md") continue;

    const folderPath = path.resolve(documentsPath, file);
    const isFolder = fs.lstatSync(folderPath).isDirectory();
    if (!isFolder) continue;

    const subdocs = {
      name: file,
      type: "folder",
      items: [],
    };

    for (const subfile of fs.readdirSync(folderPath)) {
      if (path.extname(subfile) !== ".json") continue;

      const filePath = path.join(folderPath, subfile);
      const rawData = fs.readFileSync(filePath, "utf8");
      // `folder/file` key used for both DB lookups and the vector-cache check.
      const cachefilename = `${file}/${subfile}`;
      // pageContent is dropped so the listing only carries metadata.
      const { pageContent, ...metadata } = JSON.parse(rawData);

      const pinnedInWorkspaces = await Document.getOnlyWorkspaceIds({
        docpath: cachefilename,
        pinned: true,
      });
      const watchedInWorkspaces = liveSyncAvailable
        ? await Document.getOnlyWorkspaceIds({
            docpath: cachefilename,
            watched: true,
          })
        : [];

      subdocs.items.push({
        name: subfile,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(cachefilename, true),
        pinnedWorkspaces: pinnedInWorkspaces,
        canWatch: liveSyncAvailable
          ? DocumentSyncQueue.canWatch(metadata)
          : false,
        // Is file watched in any workspace since sync updates all workspaces where file is referenced
        watched: watchedInWorkspaces.length !== 0,
      });
    }

    directory.items.push(subdocs);
  }

  // Make sure custom-documents is always the first folder in picker
  directory.items = [
    directory.items.find((folder) => folder.name === "custom-documents"),
    ...directory.items.filter((folder) => folder.name !== "custom-documents"),
  ].filter((i) => !!i);

  return directory;
}
/**
 * Looks in the vector-cache folder for previously embedded results of a
 * document so it can be pushed to the vector db without re-embedding.
 *
 * @param {string|null} filename - Document path used as the cache key.
 * @param {boolean} checkOnly - When true, only report whether a cache file exists.
 * @returns {Promise<boolean|{exists: boolean, chunks: any[]}>} A boolean when
 *   `checkOnly` is true, otherwise the existence flag with the cached chunks.
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!filename) {
    if (checkOnly) return false;
    return { exists: false, chunks: [] };
  }

  // The cache file name is a v5 UUID derived from the document path.
  const cacheFile = path.resolve(
    vectorCachePath,
    `${uuidv5(filename, uuidv5.URL)}.json`
  );
  const exists = fs.existsSync(cacheFile);

  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  const chunks = JSON.parse(fs.readFileSync(cacheFile, "utf8"));
  return { exists: true, chunks };
}
/**
 * Writes the vectorized chunks of a document to the vector-cache folder.
 *
 * @param {any[]} vectorData - Pre-chunked vectorized data for a given file that
 *   includes the proper metadata and chunk-size limit so it can be iterated and
 *   dumped into Pinecone, etc.
 * @param {string|null} filename - Full path to the doc; cached matches are
 *   found by comparing this filename.
 * @returns {Promise<void>} Nothing; does no work when `filename` is empty.
 */
async function storeVectorResult(vectorData = [], filename = null) {
  if (!filename) return;
  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );

  // `recursive` also creates missing parent folders, so a storage root that
  // does not exist yet no longer fails with ENOENT.
  if (!fs.existsSync(vectorCachePath))
    fs.mkdirSync(vectorCachePath, { recursive: true });

  // The cache file name is a v5 UUID derived from the document path.
  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
}
/**
 * Deletes a single source document from the documents/ folder.
 * Does nothing unless the target exists, is inside the documents folder and
 * is a regular file.
 *
 * @param {string|null} filename - Path relative to the documents folder.
 * @returns {Promise<void>}
 */
async function purgeSourceDocument(filename = null) {
  if (!filename) return;

  const target = path.resolve(documentsPath, normalizePath(filename));
  const isPurgeable =
    fs.existsSync(target) &&
    isWithin(documentsPath, target) &&
    fs.lstatSync(target).isFile();
  if (!isPurgeable) return;

  console.log(`Purging source document of ${filename}.`);
  fs.rmSync(target);
}
/**
 * Deletes the cached vector file of a document from the vector-cache/ folder.
 * Does nothing when the cache file is missing or is not a regular file.
 *
 * @param {string|null} filename - Document path used as the cache key.
 * @returns {Promise<void>}
 */
async function purgeVectorCache(filename = null) {
  if (!filename) return;

  // The cache file name is a v5 UUID derived from the document path.
  const cacheFile = path.resolve(
    vectorCachePath,
    `${uuidv5(filename, uuidv5.URL)}.json`
  );
  const isCacheFile =
    fs.existsSync(cacheFile) && fs.lstatSync(cacheFile).isFile();
  if (!isCacheFile) return;

  console.log(`Purging vector-cache of ${filename}.`);
  fs.rmSync(cacheFile);
}
/**
 * Searches for a specific document by its unique name across every folder of
 * the `documents` directory and returns the first match.
 *
 * @param {string|null} documentName - File name of the document to look for.
 * @returns {Promise<object|null>} The document metadata (without
 *   `pageContent`) plus its vector-cache status, or null when not found.
 * @throws {Error} When `normalizePath` rejects the name or the matched file is
 *   not valid JSON.
 */
async function findDocumentInDocuments(documentName = null) {
  if (!documentName) return null;

  // The target name does not depend on the folder, so it is validated once
  // up front instead of on every iteration.
  const targetFilename = normalizePath(documentName);

  for (const folder of fs.readdirSync(documentsPath)) {
    const folderPath = path.join(documentsPath, folder);
    if (!fs.lstatSync(folderPath).isDirectory()) continue;

    const targetFileLocation = path.join(folderPath, targetFilename);
    // Require a regular file: a name that normalizes to "" or to a nested
    // directory would otherwise make readFileSync throw EISDIR.
    if (
      !isWithin(documentsPath, targetFileLocation) ||
      !fs.existsSync(targetFileLocation) ||
      !fs.statSync(targetFileLocation).isFile()
    )
      continue;

    const rawData = fs.readFileSync(targetFileLocation, "utf8");
    const cachefilename = `${folder}/${targetFilename}`;
    // pageContent is dropped so only metadata is returned.
    const { pageContent, ...metadata } = JSON.parse(rawData);
    return {
      name: targetFilename,
      type: "file",
      ...metadata,
      cached: await cachedVectorInformation(cachefilename, true),
    };
  }

  return null;
}
/**
 * Checks if a given path is strictly within another path.
 *
 * The check uses the platform path separator, so it also rejects `..\` style
 * relative results on Windows, and it rejects absolute relative results (eg:
 * a path on a different Windows drive).
 *
 * @param {string} outer - The outer path (should be resolved).
 * @param {string} inner - The inner path (should be resolved).
 * @returns {boolean} - Returns true if the inner path is within the outer path,
 *   false otherwise (including when both refer to the same location).
 */
function isWithin(outer, inner) {
  if (outer === inner) return false;
  const rel = path.relative(outer, inner);
  // "" means both paths are the same location (eg: trailing-slash variants).
  // An absolute result means there is no relative route from outer to inner.
  if (rel === "" || path.isAbsolute(rel)) return false;
  return rel !== ".." && !rel.startsWith(`..${path.sep}`);
}
/**
 * Normalizes a user-supplied relative path and strips any leading `../`
 * (or `..\`) segments from it.
 *
 * @param {string} filepath - The path to clean up.
 * @returns {string} The normalized path without leading parent-directory segments.
 * @throws {Error} "Invalid path." when the result is "..", "." or "/".
 */
function normalizePath(filepath = "") {
  const normalized = path.normalize(filepath.trim());
  const result = normalized.replace(/^(\.\.(\/|\\|$))+/, "").trim();

  if (result === ".." || result === "." || result === "/") {
    throw new Error("Invalid path.");
  }
  return result;
}
/**
 * Checks whether the vector-cache folder holds any `.json` cache file.
 * Useful if the user is changing embedders, as this will break the previous
 * cache.
 *
 * @returns {boolean} True when at least one `.json` file is present; false
 *   when there is none or the folder cannot be read.
 */
function hasVectorCachedFiles() {
  try {
    const cacheEntries = fs.readdirSync(vectorCachePath);
    return cacheEntries.some((name) => name.endsWith(".json"));
  } catch {
    // Best effort: any failure to read the folder is reported as "no cache".
    return false;
  }
}
// Public surface of this module: the document/vector-cache helpers, the path
// guards (`normalizePath`, `isWithin`) and the resolved `documentsPath`.
module.exports = {
findDocumentInDocuments,
cachedVectorInformation,
viewLocalFiles,
purgeSourceDocument,
purgeVectorCache,
storeVectorResult,
fileData,
normalizePath,
isWithin,
documentsPath,
hasVectorCachedFiles,
};