2023-06-08 06:31:35 +02:00
const fs = require ( "fs" ) ;
const path = require ( "path" ) ;
const { v5 : uuidv5 } = require ( "uuid" ) ;
2023-06-04 04:28:07 +02:00
/**
 * Loads and parses one stored document from the `documents` storage folder.
 *
 * Should take in a path whose folder is a subfolder of documents,
 * eg: youtube-subject/video-123.json
 *
 * In development the documents folder is resolved relative to this module
 * (`../../storage/documents`); otherwise it lives under `STORAGE_DIR`.
 *
 * @param {string|null} filePath - Document path relative to the documents
 *   folder. It is passed through `normalizePath` before being resolved.
 * @returns {Promise<*>} The parsed JSON content of the file, or `null` when
 *   no file exists at the resolved location.
 * @throws {Error} When `filePath` is empty. `JSON.parse` errors propagate
 *   unchanged when the file content is not valid JSON.
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  const fullPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../storage/documents/${normalizePath(filePath)}`
        )
      : path.resolve(
          process.env.STORAGE_DIR,
          `documents/${normalizePath(filePath)}`
        );

  const fileExists = fs.existsSync(fullPath);
  if (!fileExists) return null;

  const data = fs.readFileSync(fullPath, "utf8");
  return JSON.parse(data);
}
/**
 * Builds a two-level listing of the `documents` storage folder.
 *
 * Each direct subfolder becomes a `folder` entry; each `.json` file inside it
 * becomes a `file` entry carrying every key of the stored JSON except
 * `pageContent`, plus a `cached` flag obtained from
 * `cachedVectorInformation("<folder>/<file>", true)`.
 * Top-level entries that are not directories (or end in `.md`) are ignored.
 * The documents folder is created when it does not exist yet.
 *
 * @returns {Promise<{name: string, type: string, items: object[]}>} The root
 *   `documents` folder entry with one item per subfolder.
 */
async function viewLocalFiles() {
  const folder =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/documents`)
      : path.resolve(process.env.STORAGE_DIR, `documents`);

  // `recursive` makes this a no-op when the folder exists and also creates a
  // missing storage root instead of throwing ENOENT on a fresh install.
  fs.mkdirSync(folder, { recursive: true });

  const directory = {
    name: "documents",
    type: "folder",
    items: [],
  };

  for (const file of fs.readdirSync(folder)) {
    if (path.extname(file) === ".md") continue;

    const folderPath = path.join(folder, file);
    const isFolder = fs.lstatSync(folderPath).isDirectory();
    if (!isFolder) continue;

    const subdocs = {
      name: file,
      type: "folder",
      items: [],
    };

    for (const subfile of fs.readdirSync(folderPath)) {
      if (path.extname(subfile) !== ".json") continue;

      const rawData = fs.readFileSync(path.join(folderPath, subfile), "utf8");
      const cachefilename = `${file}/${subfile}`;
      // Drop the (potentially large) page content; only metadata is listed.
      const { pageContent, ...metadata } = JSON.parse(rawData);

      subdocs.items.push({
        name: subfile,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(cachefilename, true),
      });
    }

    directory.items.push(subdocs);
  }

  return directory;
}
/**
 * Searches the vector-cache folder for existing information so we dont have
 * to re-embed a document and can instead push directly to vector db.
 *
 * The cache file is `<digest>.json`, where the digest is the UUIDv5 of
 * `filename` in the URL namespace.
 *
 * @param {string|null} filename - Name the cache entry was stored under.
 * @param {boolean} checkOnly - When true, only report whether a cache file
 *   exists and do not read it.
 * @returns {Promise<boolean|{exists: boolean, chunks: Array}>} A boolean when
 *   `checkOnly` is true; otherwise `{ exists, chunks }`, where `chunks` is the
 *   parsed cache file content (empty array when there is no cache entry).
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!filename) return checkOnly ? false : { exists: false, chunks: [] };

  const digest = uuidv5(filename, uuidv5.URL);
  const file =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/vector-cache/${digest}.json`)
      : path.resolve(process.env.STORAGE_DIR, `vector-cache/${digest}.json`);

  const exists = fs.existsSync(file);
  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  const rawData = fs.readFileSync(file, "utf8");
  return { exists: true, chunks: JSON.parse(rawData) };
}
/**
 * Writes vectorized data for a document into the vector-cache folder as
 * `<digest>.json`, where the digest is the UUIDv5 of `filename` in the URL
 * namespace. The folder is created when missing.
 *
 * @param {Array} vectorData - pre-chunked vectorized data for a given file
 *   that includes the proper metadata and chunk-size limit so it can be
 *   iterated and dumped into Pinecone, etc. Serialized with `JSON.stringify`.
 * @param {string|null} filename - the fullpath to the doc so we can compare
 *   by filename to find cached matches. Nothing is written when it is empty.
 * @returns {Promise<void>}
 */
async function storeVectorResult(vectorData = [], filename = null) {
  if (!filename) return;
  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );

  const folder =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/vector-cache`)
      : path.resolve(process.env.STORAGE_DIR, `vector-cache`);

  // `recursive` is a no-op for an existing folder and also creates a missing
  // storage root instead of throwing ENOENT.
  fs.mkdirSync(folder, { recursive: true });

  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(folder, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
}
2023-06-27 02:20:09 +02:00
/**
 * Purges a file from the documents/ folder.
 *
 * The target is resolved inside the documents folder and is only removed when
 * the resolved path really lies within that folder. `path.resolve` discards
 * the base when a later segment is absolute, so without this check an
 * absolute `filename` would delete an arbitrary file.
 *
 * @param {string|null} filename - Document path relative to the documents
 *   folder. It is passed through `normalizePath` before being resolved.
 * @returns {Promise<void>} Resolves without error when the name is empty, the
 *   path falls outside the documents folder, or the file does not exist.
 */
async function purgeSourceDocument(filename = null) {
  if (!filename) return;
  console.log(`Purging source document of ${filename}.`);

  const documentsFolder =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/documents`)
      : path.resolve(process.env.STORAGE_DIR, `documents`);
  const filePath = path.resolve(documentsFolder, normalizePath(filename));

  // An empty relative path is the documents folder itself; a leading `..`
  // segment or an absolute result (other drive on Windows) is outside of it.
  const relative = path.relative(documentsFolder, filePath);
  const isInsideDocuments =
    relative !== "" &&
    relative !== ".." &&
    !relative.startsWith(`..${path.sep}`) &&
    !path.isAbsolute(relative);
  if (!isInsideDocuments) {
    console.error(
      `Refusing to purge ${filename}: path is outside of the documents folder.`
    );
    return;
  }

  if (!fs.existsSync(filePath)) return;
  fs.rmSync(filePath);
}
/**
 * Purges a vector-cache file from the vector-cache/ folder.
 *
 * The file removed is `<digest>.json`, where the digest is the UUIDv5 of
 * `filename` in the URL namespace.
 *
 * @param {string|null} filename - Name the cache entry was stored under.
 * @returns {Promise<void>} Resolves without error when the name is empty or
 *   no cache file exists for it.
 */
async function purgeVectorCache(filename = null) {
  if (!filename) return;
  console.log(`Purging vector-cache of ${filename}.`);

  const digest = uuidv5(filename, uuidv5.URL);
  const filePath =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/vector-cache`, `${digest}.json`)
      : path.resolve(process.env.STORAGE_DIR, `vector-cache`, `${digest}.json`);

  if (!fs.existsSync(filePath)) return;
  fs.rmSync(filePath);
}
2024-01-16 23:58:49 +01:00
/**
 * Search for a specific document by its unique name in the entire `documents`
 * folder via iteration of all folders and checking if the expected file exists.
 *
 * The first subfolder containing the file wins. The returned entry carries
 * every key of the stored JSON except `pageContent`, plus a `cached` flag
 * obtained from `cachedVectorInformation("<folder>/<file>", true)`.
 *
 * @param {string|null} documentName - File name to look for; it is passed
 *   through `normalizePath` before being joined onto each subfolder.
 * @returns {Promise<object|null>} The file entry, or `null` when the name is
 *   empty, the documents folder does not exist, or no subfolder contains it.
 */
async function findDocumentInDocuments(documentName = null) {
  if (!documentName) return null;

  const documentsFolder =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/documents`)
      : path.resolve(process.env.STORAGE_DIR, `documents`);
  // Nothing can be found in a folder that has not been created yet.
  if (!fs.existsSync(documentsFolder)) return null;

  // Loop-invariant: normalize the requested name once.
  const targetFilename = normalizePath(documentName);

  for (const folder of fs.readdirSync(documentsFolder)) {
    const isFolder = fs
      .lstatSync(path.join(documentsFolder, folder))
      .isDirectory();
    if (!isFolder) continue;

    const targetFileLocation = path.join(
      documentsFolder,
      folder,
      targetFilename
    );
    if (!fs.existsSync(targetFileLocation)) continue;

    const rawData = fs.readFileSync(targetFileLocation, "utf8");
    const cachefilename = `${folder}/${targetFilename}`;
    // Drop the (potentially large) page content; only metadata is returned.
    const { pageContent, ...metadata } = JSON.parse(rawData);
    return {
      name: targetFilename,
      type: "file",
      ...metadata,
      cached: await cachedVectorInformation(cachefilename, true),
    };
  }

  return null;
}
2024-01-15 01:36:17 +01:00
/**
 * Normalizes a path and strips the run of parent-directory segments (`../`
 * or `..\`) left at its start after normalization.
 *
 * @param {string} filepath - Path to clean up.
 * @returns {string} The normalized path without leading `..` segments.
 */
function normalizePath(filepath = "") {
  // `path.normalize` collapses inner `..` segments, so any that remain are at
  // the very start of the string; that leading run is what gets removed.
  const leadingParentSegments = /^(\.\.(\/|\\|$))+/;
  const collapsed = path.normalize(filepath);
  return collapsed.replace(leadingParentSegments, "");
}
2023-06-04 04:28:07 +02:00
module . exports = {
2024-01-16 23:58:49 +01:00
findDocumentInDocuments ,
2023-06-04 04:28:07 +02:00
cachedVectorInformation ,
viewLocalFiles ,
2023-06-27 02:20:09 +02:00
purgeSourceDocument ,
purgeVectorCache ,
2023-06-04 04:28:07 +02:00
storeVectorResult ,
2023-06-08 06:31:35 +02:00
fileData ,
2024-01-15 01:36:17 +01:00
normalizePath ,
2023-06-08 06:31:35 +02:00
} ;