2023-06-08 06:31:35 +02:00
const fs = require ( "fs" ) ;
const path = require ( "path" ) ;
const { v5 : uuidv5 } = require ( "uuid" ) ;
2024-01-19 21:56:00 +01:00
// Absolute locations of the source-document folder and the vector-cache folder.
// In development both live under `storage/` two directories above this file;
// in every other environment they live directly under process.env.STORAGE_DIR.
const documentsPath = path.resolve(
  ...(process.env.NODE_ENV === "development"
    ? [__dirname, "../../storage/documents"]
    : [process.env.STORAGE_DIR, "documents"])
);
const vectorCachePath = path.resolve(
  ...(process.env.NODE_ENV === "development"
    ? [__dirname, "../../storage/vector-cache"]
    : [process.env.STORAGE_DIR, "vector-cache"])
);
2023-06-04 04:28:07 +02:00
/**
 * Reads a stored document from the documents folder and returns its parsed JSON.
 *
 * @param {string|null} filePath - Path relative to the documents folder,
 *   e.g. `youtube-subject/video-123.json`.
 * @returns {Promise<*>} The parsed file content, or null when the resolved
 *   path does not exist or is not inside the documents folder.
 * @throws {Error} When no path is supplied. Errors from normalizePath, the
 *   file read, and JSON.parse are not caught here.
 */
async function fileData(filePath = null) {
  if (!filePath) {
    throw new Error("No docPath provided in request");
  }

  const relativePath = normalizePath(filePath);
  const absolutePath = path.resolve(documentsPath, relativePath);

  // Only read files that exist and are contained in the documents folder.
  const isReadable =
    fs.existsSync(absolutePath) && isWithin(documentsPath, absolutePath);
  if (!isReadable) return null;

  return JSON.parse(fs.readFileSync(absolutePath, "utf8"));
}
/**
 * Builds a two-level listing of the documents folder.
 *
 * Every sub-folder of the documents folder becomes a `folder` entry whose
 * items are the `.json` files it contains. Each file entry carries the file's
 * parsed metadata (everything except `pageContent`) plus a `cached` flag
 * reporting whether a vector-cache entry exists for `<folder>/<file>`.
 * Top-level entries that are not directories, or whose name ends in `.md`,
 * are skipped.
 *
 * @returns {Promise<{name: string, type: string, items: Array<object>}>}
 *   The root `documents` folder entry.
 * @throws Errors from the filesystem calls and JSON.parse are not caught here.
 */
async function viewLocalFiles() {
  // `recursive` lets the first run succeed even when the parent storage
  // directory has not been created yet (a plain mkdir would throw ENOENT).
  if (!fs.existsSync(documentsPath)) {
    fs.mkdirSync(documentsPath, { recursive: true });
  }

  const directory = {
    name: "documents",
    type: "folder",
    items: [],
  };

  for (const file of fs.readdirSync(documentsPath)) {
    if (path.extname(file) === ".md") continue;

    const folderPath = path.resolve(documentsPath, file);
    const isFolder = fs.lstatSync(folderPath).isDirectory();
    if (!isFolder) continue;

    const subdocs = {
      name: file,
      type: "folder",
      items: [],
    };

    for (const subfile of fs.readdirSync(folderPath)) {
      if (path.extname(subfile) !== ".json") continue;

      const filePath = path.join(folderPath, subfile);
      const rawData = fs.readFileSync(filePath, "utf8");
      const cachefilename = `${file}/${subfile}`;
      // pageContent is destructured only so it is left out of the listing.
      const { pageContent, ...metadata } = JSON.parse(rawData);

      subdocs.items.push({
        name: subfile,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(cachefilename, true),
      });
    }

    directory.items.push(subdocs);
  }

  return directory;
}
/**
 * Looks in the vector-cache folder for previously stored vectors of a
 * document, so the document can be pushed to a vector db without being
 * embedded again.
 *
 * @param {string|null} filename - Document identifier used as the cache key.
 * @param {boolean} checkOnly - When true, only report whether a cache file
 *   exists instead of loading it.
 * @returns {Promise<boolean|{exists: boolean, chunks: Array}>} A boolean when
 *   `checkOnly` is true; otherwise `{ exists, chunks }`, where `chunks` is the
 *   parsed cache file or an empty array when nothing is cached.
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!filename) {
    if (checkOnly) return false;
    return { exists: false, chunks: [] };
  }

  // The cache file name is the UUIDv5 digest of the document identifier.
  const cacheId = uuidv5(filename, uuidv5.URL);
  const cacheFile = path.resolve(vectorCachePath, `${cacheId}.json`);
  const exists = fs.existsSync(cacheFile);

  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  const chunks = JSON.parse(fs.readFileSync(cacheFile, "utf8"));
  return { exists: true, chunks };
}
/**
 * Writes the vectorized chunks of a document to the vector-cache folder so
 * the document does not have to be embedded again.
 *
 * @param {Array} vectorData - Pre-chunked, vectorized data for the document,
 *   already carrying the metadata needed to push it to a vector db. It is
 *   serialized as JSON as-is.
 * @param {string|null} filename - Document identifier used as the cache key;
 *   the cache file is named after its UUIDv5 digest. Nothing is written when
 *   it is missing.
 * @returns {Promise<void>}
 */
async function storeVectorResult(vectorData = [], filename = null) {
  if (!filename) return;
  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );

  // `recursive` lets the first write succeed even when the parent storage
  // directory has not been created yet (a plain mkdir would throw ENOENT).
  if (!fs.existsSync(vectorCachePath)) {
    fs.mkdirSync(vectorCachePath, { recursive: true });
  }

  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
  return;
}
2023-06-27 02:20:09 +02:00
/**
 * Deletes a single source document from the documents folder.
 *
 * Nothing happens when no name is given, or when the resolved path does not
 * exist, is not inside the documents folder, or is not a regular file.
 *
 * @param {string|null} filename - Path of the document relative to the
 *   documents folder.
 * @returns {Promise<void>}
 */
async function purgeSourceDocument(filename = null) {
  if (!filename) return;

  const target = path.resolve(documentsPath, normalizePath(filename));
  const isRemovable =
    fs.existsSync(target) &&
    isWithin(documentsPath, target) &&
    fs.lstatSync(target).isFile();
  if (!isRemovable) return;

  console.log(`Purging source document of ${filename}.`);
  fs.rmSync(target);
}
/**
 * Deletes the vector-cache file that belongs to a document.
 *
 * The cache file is located by the UUIDv5 digest of `filename`. Nothing
 * happens when no name is given, or when the cache file is missing or is not
 * a regular file.
 *
 * @param {string|null} filename - Document identifier used as the cache key.
 * @returns {Promise<void>}
 */
async function purgeVectorCache(filename = null) {
  if (!filename) return;

  const cacheId = uuidv5(filename, uuidv5.URL);
  const cacheFile = path.resolve(vectorCachePath, `${cacheId}.json`);
  const isRemovable =
    fs.existsSync(cacheFile) && fs.lstatSync(cacheFile).isFile();
  if (!isRemovable) return;

  console.log(`Purging vector-cache of ${filename}.`);
  fs.rmSync(cacheFile);
}
2024-01-16 23:58:49 +01:00
/**
 * Finds a document by its file name anywhere in the documents folder.
 *
 * Each direct sub-folder of the documents folder is checked in turn for a
 * file with the given name; the first match is returned.
 *
 * @param {string|null} documentName - File name of the document to look for.
 * @returns {Promise<object|null>} A file entry holding the document's
 *   metadata (everything except `pageContent`) plus a `cached` flag, or null
 *   when no name is given or no folder contains the file.
 */
async function findDocumentInDocuments(documentName = null) {
  if (!documentName) return null;

  for (const folder of fs.readdirSync(documentsPath)) {
    const folderStats = fs.lstatSync(path.join(documentsPath, folder));
    if (!folderStats.isDirectory()) continue;

    const targetFilename = normalizePath(documentName);
    const candidate = path.join(documentsPath, folder, targetFilename);
    const isMatch =
      fs.existsSync(candidate) && isWithin(documentsPath, candidate);
    if (!isMatch) continue;

    // pageContent is destructured only so it is left out of the result.
    const { pageContent, ...metadata } = JSON.parse(
      fs.readFileSync(candidate, "utf8")
    );
    const cached = await cachedVectorInformation(
      `${folder}/${targetFilename}`,
      true
    );
    return {
      name: targetFilename,
      type: "file",
      ...metadata,
      cached,
    };
  }

  return null;
}
2024-01-19 21:56:00 +01:00
/**
 * Checks if a given path is strictly within another path.
 *
 * The decision is made from `path.relative(outer, inner)`: the inner path is
 * rejected when both paths point at the same location, when the relative
 * path climbs out of `outer` (a leading `..` segment), or when no relative
 * path exists and an absolute one is returned instead (e.g. a different
 * drive on Windows).
 *
 * @param {string} outer - The outer path (should be resolved).
 * @param {string} inner - The inner path (should be resolved).
 * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise.
 */
function isWithin(outer, inner) {
  if (outer === inner) return false;

  const rel = path.relative(outer, inner);

  // Same location written differently (e.g. a trailing separator).
  if (rel === "") return false;
  // No relative route from outer to inner exists.
  if (path.isAbsolute(rel)) return false;

  // Compare against the platform separator so `..\` is caught on Windows,
  // while names that merely start with two dots (`..hidden`) stay allowed.
  return rel !== ".." && !rel.startsWith(`..${path.sep}`);
}
2024-01-15 01:36:17 +01:00
/**
 * Normalizes a user-supplied relative path and strips any leading `..`
 * segments from it.
 *
 * @param {string} filepath - The path to clean up; surrounding whitespace is
 *   ignored.
 * @returns {string} The normalized path without leading parent-directory
 *   segments.
 * @throws {Error} "Invalid path." when the cleaned result is `..`, `.` or `/`.
 */
function normalizePath(filepath = "") {
  const normalized = path.normalize(filepath.trim());
  const stripped = normalized.replace(/^(\.\.(\/|\\|$))+/, "").trim();

  const isInvalid = stripped === ".." || stripped === "." || stripped === "/";
  if (isInvalid) throw new Error("Invalid path.");

  return stripped;
}
2023-06-04 04:28:07 +02:00
module . exports = {
2024-01-16 23:58:49 +01:00
findDocumentInDocuments ,
2023-06-04 04:28:07 +02:00
cachedVectorInformation ,
viewLocalFiles ,
2023-06-27 02:20:09 +02:00
purgeSourceDocument ,
purgeVectorCache ,
2023-06-04 04:28:07 +02:00
storeVectorResult ,
2023-06-08 06:31:35 +02:00
fileData ,
2024-01-15 01:36:17 +01:00
normalizePath ,
2024-01-19 21:56:00 +01:00
isWithin ,
documentsPath ,
2023-06-08 06:31:35 +02:00
} ;