Employ strict validations on document pathing (#627)

* Employ strict validations on document pathing

* add comment

* update validSubfolder var
This commit is contained in:
Timothy Carambat 2024-01-19 12:56:00 -08:00 committed by GitHub
parent 0db6c3b2aa
commit 8a7324d0e7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 96 additions and 88 deletions

View File

@ -1,37 +1,29 @@
const fs = require("fs");
const path = require("path");
const { v5: uuidv5 } = require("uuid");
// Root folder holding parsed document JSON files. Resolved relative to this
// file in development, otherwise under the configured STORAGE_DIR.
const documentsPath =
  process.env.NODE_ENV === "development"
    ? path.resolve(__dirname, `../../storage/documents`)
    : path.resolve(process.env.STORAGE_DIR, `documents`);
// Root folder holding cached vectorized results (see cachedVectorInformation),
// resolved the same way as documentsPath.
const vectorCachePath =
  process.env.NODE_ENV === "development"
    ? path.resolve(__dirname, `../../storage/vector-cache`)
    : path.resolve(process.env.STORAGE_DIR, `vector-cache`);
// Should take in a folder that is a subfolder of documents
// eg: youtube-subject/video-123.json
/**
 * Reads and parses a stored document JSON file.
 * @param {string|null} filePath - Path relative to the documents folder,
 * e.g. "youtube-subject/video-123.json".
 * @returns {Promise<object|null>} Parsed file content, or null when the file
 * does not exist or resolves outside of the documents folder.
 * @throws {Error} When no filePath is provided.
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");
  const fullFilePath = path.resolve(documentsPath, normalizePath(filePath));
  // Path-traversal guard: only read files strictly inside documentsPath.
  if (!fs.existsSync(fullFilePath) || !isWithin(documentsPath, fullFilePath))
    return null;
  const data = fs.readFileSync(fullFilePath, "utf8");
  return JSON.parse(data);
}
async function viewLocalFiles() {
const folder =
process.env.NODE_ENV === "development"
? path.resolve(__dirname, `../../storage/documents`)
: path.resolve(process.env.STORAGE_DIR, `documents`);
const dirExists = fs.existsSync(folder);
if (!dirExists) fs.mkdirSync(folder);
if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
const directory = {
name: "documents",
@ -39,14 +31,9 @@ async function viewLocalFiles() {
items: [],
};
for (const file of fs.readdirSync(folder)) {
for (const file of fs.readdirSync(documentsPath)) {
if (path.extname(file) === ".md") continue;
const folderPath =
process.env.NODE_ENV === "development"
? path.resolve(__dirname, `../../storage/documents/${file}`)
: path.resolve(process.env.STORAGE_DIR, `documents/${file}`);
const folderPath = path.resolve(documentsPath, file);
const isFolder = fs.lstatSync(folderPath).isDirectory();
if (isFolder) {
const subdocs = {
@ -83,10 +70,7 @@ async function cachedVectorInformation(filename = null, checkOnly = false) {
if (!filename) return checkOnly ? false : { exists: false, chunks: [] };
const digest = uuidv5(filename, uuidv5.URL);
const file =
process.env.NODE_ENV === "development"
? path.resolve(__dirname, `../../storage/vector-cache/${digest}.json`)
: path.resolve(process.env.STORAGE_DIR, `vector-cache/${digest}.json`);
const file = path.resolve(vectorCachePath, `${digest}.json`);
const exists = fs.existsSync(file);
if (checkOnly) return exists;
@ -106,15 +90,10 @@ async function storeVectorResult(vectorData = [], filename = null) {
console.log(
`Caching vectorized results of ${filename} to prevent duplicated embedding.`
);
const folder =
process.env.NODE_ENV === "development"
? path.resolve(__dirname, `../../storage/vector-cache`)
: path.resolve(process.env.STORAGE_DIR, `vector-cache`);
if (!fs.existsSync(folder)) fs.mkdirSync(folder);
if (!fs.existsSync(vectorCachePath)) fs.mkdirSync(vectorCachePath);
const digest = uuidv5(filename, uuidv5.URL);
const writeTo = path.resolve(folder, `${digest}.json`);
const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
return;
}
@ -122,21 +101,16 @@ async function storeVectorResult(vectorData = [], filename = null) {
// Purges a file from the documents/ folder.
/**
 * Deletes a single source-document file from the documents folder.
 * No-ops when the target is missing, is not a regular file, or resolves
 * outside of the documents folder (path-traversal guard).
 * @param {string|null} filename - Path relative to the documents folder.
 * @returns {Promise<void>}
 */
async function purgeSourceDocument(filename = null) {
  if (!filename) return;
  const filePath = path.resolve(documentsPath, normalizePath(filename));
  if (
    !fs.existsSync(filePath) ||
    !isWithin(documentsPath, filePath) ||
    !fs.lstatSync(filePath).isFile()
  )
    return;
  console.log(`Purging source document of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
@ -144,15 +118,11 @@ async function purgeSourceDocument(filename = null) {
// Purges a vector-cache file from the vector-cache/ folder.
/**
 * Deletes the cached vector results for a document, if present.
 * @param {string|null} filename - Source document identifier; hashed with
 * uuidv5 to locate the cache file inside vectorCachePath.
 * @returns {Promise<void>}
 */
async function purgeVectorCache(filename = null) {
  if (!filename) return;
  const digest = uuidv5(filename, uuidv5.URL);
  const filePath = path.resolve(vectorCachePath, `${digest}.json`);
  // The digest-derived name cannot traverse, but still only delete real files.
  if (!fs.existsSync(filePath) || !fs.lstatSync(filePath).isFile()) return;
  console.log(`Purging vector-cache of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
@ -161,24 +131,20 @@ async function purgeVectorCache(filename = null) {
// folder via iteration of all folders and checking if the expected file exists.
async function findDocumentInDocuments(documentName = null) {
if (!documentName) return null;
const documentsFolder =
process.env.NODE_ENV === "development"
? path.resolve(__dirname, `../../storage/documents`)
: path.resolve(process.env.STORAGE_DIR, `documents`);
for (const folder of fs.readdirSync(documentsFolder)) {
for (const folder of fs.readdirSync(documentsPath)) {
const isFolder = fs
.lstatSync(path.join(documentsFolder, folder))
.lstatSync(path.join(documentsPath, folder))
.isDirectory();
if (!isFolder) continue;
const targetFilename = normalizePath(documentName);
const targetFileLocation = path.join(
documentsFolder,
folder,
targetFilename
);
if (!fs.existsSync(targetFileLocation)) continue;
const targetFileLocation = path.join(documentsPath, folder, targetFilename);
if (
!fs.existsSync(targetFileLocation) ||
!isWithin(documentsPath, targetFileLocation)
)
continue;
const fileData = fs.readFileSync(targetFileLocation, "utf8");
const cachefilename = `${folder}/${targetFilename}`;
@ -194,8 +160,25 @@ async function findDocumentInDocuments(documentName = null) {
return null;
}
/**
 * Checks if a given path is strictly contained within another path.
 * @param {string} outer - The outer path (should be resolved).
 * @param {string} inner - The inner path (should be resolved).
 * @returns {boolean} - True only when inner is inside outer; equal paths
 * return false.
 */
function isWithin(outer, inner) {
  if (outer === inner) return false;
  const rel = path.relative(outer, inner);
  // The inner path escapes `outer` when the relative path climbs upward
  // ("..", "../x" — or "..\x" on Windows, hence path.sep) or when it is
  // absolute (e.g. a different drive letter on Windows).
  return (
    rel !== ".." && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel)
  );
}
/**
 * Normalizes a user-supplied relative path, stripping any leading "../"
 * (or "..\") traversal segments.
 * @param {string} filepath - Raw path value from a request.
 * @returns {string} The sanitized relative path.
 * @throws {Error} When the result is "..", "." or "/" — never a valid
 * document target.
 */
function normalizePath(filepath = "") {
  const result = path
    .normalize(filepath.trim())
    .replace(/^(\.\.(\/|\\|$))+/, "")
    .trim();
  if (["..", ".", "/"].includes(result)) throw new Error("Invalid path.");
  return result;
}
module.exports = {
@ -207,4 +190,6 @@ module.exports = {
storeVectorResult,
fileData,
normalizePath,
isWithin,
documentsPath,
};

View File

@ -1,30 +1,53 @@
const fs = require("fs");
const path = require("path");
const { purgeVectorCache, purgeSourceDocument, normalizePath } = require(".");
const {
purgeVectorCache,
purgeSourceDocument,
normalizePath,
isWithin,
documentsPath,
} = require(".");
const { Document } = require("../../models/documents");
const { Workspace } = require("../../models/workspace");
/**
 * Fully removes a document: detaches it from every workspace, then deletes
 * its vector cache and its source file on disk.
 * @param {string|null} filename - Path relative to the documents folder.
 * @returns {Promise<void>}
 */
async function purgeDocument(filename = null) {
  // normalizePath throws/returns falsy for invalid traversal-only inputs.
  if (!filename || !normalizePath(filename)) return;
  const workspaces = await Workspace.where();
  for (const workspace of workspaces) {
    await Document.removeDocuments(workspace, [filename]);
  }
  await purgeVectorCache(filename);
  await purgeSourceDocument(filename);
  return;
}
async function purgeFolder(folderName) {
if (folderName === "custom-documents") return;
const documentsFolder =
process.env.NODE_ENV === "development"
? path.resolve(__dirname, `../../storage/documents`)
: path.resolve(process.env.STORAGE_DIR, `documents`);
async function purgeFolder(folderName = null) {
if (!folderName) return;
const subFolder = normalizePath(folderName);
const subFolderPath = path.resolve(documentsPath, subFolder);
const validRemovableSubFolders = fs
.readdirSync(documentsPath)
.map((folder) => {
// Filter out any results which are not folders or
// are the protected custom-documents folder.
if (folder === "custom-documents") return null;
const subfolderPath = path.resolve(documentsPath, folder);
if (!fs.lstatSync(subfolderPath).isDirectory()) return null;
return folder;
})
.filter((subFolder) => !!subFolder);
if (
!validRemovableSubFolders.includes(subFolder) ||
!fs.existsSync(subFolderPath) ||
!isWithin(documentsPath, subFolderPath)
)
return;
const folderPath = path.resolve(documentsFolder, normalizePath(folderName));
const filenames = fs
.readdirSync(folderPath)
.map((file) => path.join(folderPath, file));
.readdirSync(subFolderPath)
.map((file) => path.join(subFolderPath, file));
const workspaces = await Workspace.where();
const purgePromises = [];
@ -47,7 +70,7 @@ async function purgeFolder(folderName) {
}
await Promise.all(purgePromises.flat().map((f) => f()));
fs.rmSync(folderPath, { recursive: true }); // Delete root document and source files.
fs.rmSync(subFolderPath, { recursive: true }); // Delete target document-folder and source files.
return;
}