// Module dependencies: filesystem access, path handling, and name-based (v5) UUIDs.
const fs = require("fs");
const path = require("path");
const { v5: uuidv5 } = require("uuid");

/**
 * Reads and parses every `.json` file found directly inside a subfolder of the
 * documents storage directory.
 *
 * @param {string|null} folderName - Folder path relative to the documents directory.
 * @returns {Promise<object[]>} Parsed JSON contents, in the order `fs.readdirSync` lists the files.
 * @throws {Error} If `folderName` is missing, resolves outside of the documents
 *   directory, or the resolved folder does not exist.
 */
async function collectDocumentData(folderName = null) {
  if (!folderName) throw new Error("No docPath provided in request");

  const documentsRoot =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/documents`)
      : path.resolve(process.env.STORAGE_DIR, `documents`);
  // The leading `./` keeps an absolute-looking folderName anchored under documentsRoot.
  const folder = path.resolve(documentsRoot, `./${folderName}`);

  // Refuse `..` segments that would walk out of the documents directory.
  const relative = path.relative(documentsRoot, folder);
  const escapesRoot =
    relative === ".." ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);
  if (escapesRoot)
    throw new Error(
      `Invalid docPath ${folderName} - it resolves outside of the documents folder.`
    );

  if (!fs.existsSync(folder))
    throw new Error(
      `No documents folder for ${folderName} - did you run collector/main.py for this element?`
    );

  const documents = [];
  for (const file of fs.readdirSync(folder)) {
    if (path.extname(file) !== ".json") continue;
    const filePath = path.join(folder, file);
    const data = fs.readFileSync(filePath, "utf8");
    console.log(`Parsing document: ${file}`);
    documents.push(JSON.parse(data));
  }
  return documents;
}
/**
 * Reads and parses a single JSON document stored under the documents directory.
 * Takes the path of a file relative to the documents folder,
 * eg: youtube-subject/video-123.json
 *
 * @param {string|null} filePath - File path relative to the documents directory.
 * @returns {Promise<object|null>} The parsed JSON, or `null` when the file does not
 *   exist or the path resolves outside of the documents directory.
 * @throws {Error} If `filePath` is missing.
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  const documentsRoot =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/documents`)
      : path.resolve(process.env.STORAGE_DIR, `documents`);
  // The leading `./` keeps an absolute-looking filePath anchored under documentsRoot.
  const fullPath = path.resolve(documentsRoot, `./${filePath}`);

  // Treat `..` escapes like a missing file so nothing outside documents/ is read.
  const relative = path.relative(documentsRoot, fullPath);
  const escapesRoot =
    relative === ".." ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);
  if (escapesRoot) return null;

  if (!fs.existsSync(fullPath)) return null;

  const data = fs.readFileSync(fullPath, "utf8");
  return JSON.parse(data);
}
/**
 * Builds a two-level listing of the documents storage directory: each
 * subfolder becomes a "folder" entry whose items are its `.json` files.
 * The documents directory is created when it does not exist yet.
 *
 * Each file item carries the file's parsed JSON fields except `pageContent`,
 * plus a `cached` flag obtained from `cachedVectorInformation(<folder>/<file>, true)`.
 *
 * @returns {Promise<{name: string, type: string, items: object[]}>} The directory listing.
 */
async function viewLocalFiles() {
  const folder =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/documents`)
      : path.resolve(process.env.STORAGE_DIR, `documents`);
  // `recursive` also creates a missing storage parent instead of failing with ENOENT.
  if (!fs.existsSync(folder)) fs.mkdirSync(folder, { recursive: true });

  const directory = {
    name: "documents",
    type: "folder",
    items: [],
  };

  for (const file of fs.readdirSync(folder)) {
    if (path.extname(file) === ".md") continue;

    // Only subfolders are listed; loose files at the top level are ignored.
    const folderPath = path.join(folder, file);
    if (!fs.lstatSync(folderPath).isDirectory()) continue;

    const subdocs = {
      name: file,
      type: "folder",
      items: [],
    };

    for (const subfile of fs.readdirSync(folderPath)) {
      if (path.extname(subfile) !== ".json") continue;

      const filePath = path.join(folderPath, subfile);
      const rawData = fs.readFileSync(filePath, "utf8");
      const cachefilename = `${file}/${subfile}`;
      // Drop the document body; only the remaining fields go into the listing.
      const { pageContent, ...metadata } = JSON.parse(rawData);

      subdocs.items.push({
        name: subfile,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(cachefilename, true),
      });
    }

    directory.items.push(subdocs);
  }

  return directory;
}
/**
 * Searches the vector-cache folder for existing information so we don't have to
 * re-embed a document and can instead push directly to the vector db.
 *
 * The cache file name is the v5 UUID of `filename` (URL namespace) plus `.json`.
 *
 * @param {string|null} filename - Document name used as the cache key.
 * @param {boolean} checkOnly - When true, only report whether a cache file exists.
 * @returns {Promise<boolean|{exists: boolean, chunks: Array}>} A boolean when
 *   `checkOnly` is true; otherwise the existence flag and the parsed cache
 *   contents (empty array when nothing is cached).
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!filename) return checkOnly ? false : { exists: false, chunks: [] };

  const digest = uuidv5(filename, uuidv5.URL);
  const file =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/vector-cache/${digest}.json`)
      : path.resolve(process.env.STORAGE_DIR, `vector-cache/${digest}.json`);

  const exists = fs.existsSync(file);
  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  const rawData = fs.readFileSync(file, "utf8");
  return { exists: true, chunks: JSON.parse(rawData) };
}
/**
 * Writes vectorized data for a document into the vector-cache folder so the
 * document does not have to be embedded again.
 *
 * vectorData: pre-chunked vectorized data for a given file that includes the
 * proper metadata and chunk-size limit so it can be iterated and dumped into
 * Pinecone, etc.
 * filename is the fullpath to the doc so we can compare by filename to find
 * cached matches.
 *
 * The cache file name is the v5 UUID of `filename` (URL namespace) plus `.json`.
 *
 * @param {Array} vectorData - Data to serialize as JSON into the cache file.
 * @param {string|null} filename - Document name used as the cache key; nothing is written when missing.
 * @returns {Promise<void>}
 */
async function storeVectorResult(vectorData = [], filename = null) {
  if (!filename) return;
  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );

  const folder =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/vector-cache`)
      : path.resolve(process.env.STORAGE_DIR, `vector-cache`);
  // `recursive` also creates a missing storage parent instead of failing with ENOENT.
  if (!fs.existsSync(folder)) fs.mkdirSync(folder, { recursive: true });

  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(folder, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
  return;
}

/**
 * Purges a file from the documents/ folder.
 * Does nothing when `filename` is missing or the resolved path does not exist.
 *
 * @param {string|null} filename - Path of the file, resolved against the documents directory.
 * @returns {Promise<void>}
 */
async function purgeSourceDocument(filename = null) {
  if (!filename) return;
  console.log(`Purging source document of ${filename}.`);

  const filePath =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/documents`, filename)
      : path.resolve(process.env.STORAGE_DIR, `documents`, filename);

  if (!fs.existsSync(filePath)) return;
  fs.rmSync(filePath);
  return;
}
/**
 * Purges a vector-cache file from the vector-cache/ folder.
 * The cache file name is the v5 UUID of `filename` (URL namespace) plus `.json`.
 * Does nothing when `filename` is missing or no such cache file exists.
 *
 * @param {string|null} filename - Document name used as the cache key.
 * @returns {Promise<void>}
 */
async function purgeVectorCache(filename = null) {
  if (!filename) return;
  console.log(`Purging vector-cache of ${filename}.`);

  const digest = uuidv5(filename, uuidv5.URL);
  const filePath =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, `../../storage/vector-cache`, `${digest}.json`)
      : path.resolve(process.env.STORAGE_DIR, `vector-cache`, `${digest}.json`);

  if (!fs.existsSync(filePath)) return;
  fs.rmSync(filePath);
  return;
}

module . exports = {
cachedVectorInformation ,
collectDocumentData ,
viewLocalFiles ,
2023-06-27 02:20:09 +02:00
purgeSourceDocument ,
purgeVectorCache ,
2023-06-04 04:28:07 +02:00
storeVectorResult ,
2023-06-08 06:31:35 +02:00
fileData ,
} ;