2023-06-08 06:31:35 +02:00
const fs = require ( "fs" ) ;
const path = require ( "path" ) ;
const { v5 : uuidv5 } = require ( "uuid" ) ;
2024-02-21 22:15:45 +01:00
const { Document } = require ( "../../models/documents" ) ;
2024-06-21 22:38:50 +02:00
const { DocumentSyncQueue } = require ( "../../models/documentSyncQueue" ) ;
2024-01-19 21:56:00 +01:00
// Folder that holds the stored source documents. Development runs use the
// repository-relative storage folder; every other environment resolves the
// folder from the STORAGE_DIR environment variable.
const documentsPath =
  process.env.NODE_ENV !== "development"
    ? path.resolve(process.env.STORAGE_DIR, "documents")
    : path.resolve(__dirname, "../../storage/documents");

// Folder that holds the cached embedding results, resolved with the same
// development / STORAGE_DIR rule as the documents folder.
const vectorCachePath =
  process.env.NODE_ENV !== "development"
    ? path.resolve(process.env.STORAGE_DIR, "vector-cache")
    : path.resolve(__dirname, "../../storage/vector-cache");
2023-06-04 04:28:07 +02:00
/**
 * Reads and parses one stored document.
 * The path is expected to be relative to the documents folder,
 * eg: youtube-subject/video-123.json
 * @param {string|null} filePath - location of the document inside the documents folder
 * @returns {Promise<any|null>} - the parsed JSON content, or null when the file does not
 * exist or resolves to a location outside of the documents folder
 * @throws {Error} - when no path is given
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  const target = path.resolve(documentsPath, normalizePath(filePath));
  const isReadable = fs.existsSync(target) && isWithin(documentsPath, target);
  if (!isReadable) return null;

  return JSON.parse(fs.readFileSync(target, "utf8"));
}
/**
 * Builds the folder -> file tree of everything stored in the documents folder.
 * Only first-level folders and the `.json` files directly inside them are listed.
 * Each file entry carries the document's metadata (everything except `pageContent`)
 * plus `cached`, `canWatch`, `pinnedWorkspaces` and `watched`.
 * @returns {Promise<{name: string, type: string, items: object[]}>} - the root
 * "documents" folder; "custom-documents" is always the first child when present
 */
async function viewLocalFiles() {
  // `recursive` also creates missing parent folders and is a no-op when the
  // folder already exists, so no separate existence check is needed.
  fs.mkdirSync(documentsPath, { recursive: true });
  const liveSyncAvailable = await DocumentSyncQueue.enabled();
  const directory = {
    name: "documents",
    type: "folder",
    items: [],
  };

  for (const file of fs.readdirSync(documentsPath)) {
    if (path.extname(file) === ".md") continue;
    const folderPath = path.resolve(documentsPath, file);
    if (!fs.lstatSync(folderPath).isDirectory()) continue;

    const subdocs = {
      name: file,
      type: "folder",
      items: [],
    };
    // Maps `folder/file.json` (the value matched against `docpath` in the
    // database lookups below) to the plain `file.json` name used by the items.
    const filenames = {};

    for (const subfile of fs.readdirSync(folderPath)) {
      if (path.extname(subfile) !== ".json") continue;
      const filePath = path.join(folderPath, subfile);
      const cachefilename = `${file}/${subfile}`;

      // One unreadable or malformed file must not take down the whole listing.
      let metadata;
      try {
        const { pageContent, ...rest } = JSON.parse(
          fs.readFileSync(filePath, "utf8")
        );
        metadata = rest;
      } catch (error) {
        console.error(
          `Skipping unreadable document ${cachefilename}: ${error.message}`
        );
        continue;
      }

      subdocs.items.push({
        name: subfile,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(cachefilename, true),
        canWatch: liveSyncAvailable
          ? DocumentSyncQueue.canWatch(metadata)
          : false,
        // pinnedWorkspaces and watched are filled in below for the whole folder at once.
      });
      filenames[cachefilename] = subfile;
    }

    // Grab the pinned workspaces and watched documents for this folder's documents
    // in one go so the database is not re-queried for each file. The two lookups
    // are independent of each other, so they run concurrently.
    const [pinnedWorkspacesByDocument, watchedDocumentsFilenames] =
      await Promise.all([
        getPinnedWorkspacesByDocument(filenames),
        getWatchedDocumentFilenames(filenames),
      ]);
    for (const item of subdocs.items) {
      item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
      item.watched = Object.prototype.hasOwnProperty.call(
        watchedDocumentsFilenames,
        item.name
      );
    }
    directory.items.push(subdocs);
  }

  // Make sure custom-documents is always the first folder in picker
  directory.items = [
    directory.items.find((folder) => folder.name === "custom-documents"),
    ...directory.items.filter((folder) => folder.name !== "custom-documents"),
  ].filter((i) => !!i);

  return directory;
}
2024-09-25 00:55:54 +02:00
/**
 * Looks in the vector-cache folder for previously stored embedding results of a
 * document so it can be pushed directly to the vector db instead of being re-embedded.
 * @param {string|null} filename - the filename to check for cached vector information
 * @param {boolean} checkOnly - if true, only report whether a cache file exists and do not read it
 * @returns {Promise<boolean|{exists: boolean, chunks: any[]}>} - a boolean when `checkOnly`
 * is true, otherwise an object with the existence flag and the cached chunks
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!filename) return checkOnly ? false : { exists: false, chunks: [] };

  // The cache file is named after the v5 UUID derived from the filename.
  const cacheFile = path.resolve(
    vectorCachePath,
    `${uuidv5(filename, uuidv5.URL)}.json`
  );
  const isCached = fs.existsSync(cacheFile);
  if (checkOnly) return isCached;
  if (!isCached) return { exists: isCached, chunks: [] };

  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  const chunks = JSON.parse(fs.readFileSync(cacheFile, "utf8"));
  return { exists: true, chunks };
}
/**
 * Writes the vectorized chunks of a document to the vector-cache folder so the
 * document does not have to be embedded again.
 * @param {any[]} vectorData - pre-chunked vectorized data for the file; it is
 * serialized to JSON as-is
 * @param {string|null} filename - full path to the doc; the cache file name is the v5 UUID
 * derived from it so cached matches can be found by filename later
 * @returns {Promise<void>} - nothing is written when no filename is given
 */
async function storeVectorResult(vectorData = [], filename = null) {
  if (!filename) return;
  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );

  // `recursive` creates missing parent folders as well (e.g. a storage root that
  // does not exist yet) and does nothing when the folder is already there.
  fs.mkdirSync(vectorCachePath, { recursive: true });

  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
}
2023-06-27 02:20:09 +02:00
// Deletes a single source document from the documents/ folder.
// Nothing happens when no filename is given, or when the resolved target does
// not exist, lies outside of the documents folder, or is not a regular file.
async function purgeSourceDocument(filename = null) {
  if (!filename) return;

  const target = path.resolve(documentsPath, normalizePath(filename));
  const isPurgeable =
    fs.existsSync(target) &&
    isWithin(documentsPath, target) &&
    fs.lstatSync(target).isFile();
  if (!isPurgeable) return;

  console.log(`Purging source document of ${filename}.`);
  fs.rmSync(target);
}
// Deletes the cached embedding file of a document from the vector-cache/ folder.
// The cache file is located through the v5 UUID derived from the filename.
// Nothing happens when no filename is given or no such regular file exists.
async function purgeVectorCache(filename = null) {
  if (!filename) return;

  const cacheFile = path.resolve(
    vectorCachePath,
    `${uuidv5(filename, uuidv5.URL)}.json`
  );
  const isCacheFile =
    fs.existsSync(cacheFile) && fs.lstatSync(cacheFile).isFile();
  if (!isCacheFile) return;

  console.log(`Purging vector-cache of ${filename}.`);
  fs.rmSync(cacheFile);
}
2024-01-16 23:58:49 +01:00
/**
 * Search for a specific document by its unique name in the entire `documents`
 * folder by checking every first-level folder for the expected file.
 * @param {string|null} documentName - file name of the document, eg: video-123.json
 * @returns {Promise<object|null>} - the document's metadata (everything except
 * `pageContent`) plus `name`, `type` and `cached`, or null when it cannot be found
 */
async function findDocumentInDocuments(documentName = null) {
  if (!documentName) return null;

  // The normalized name is the same for every folder, so compute it once.
  const targetFilename = normalizePath(documentName);
  // Nothing can be found when the documents folder was never created.
  if (!fs.existsSync(documentsPath)) return null;

  for (const folder of fs.readdirSync(documentsPath)) {
    const folderPath = path.join(documentsPath, folder);
    if (!fs.lstatSync(folderPath).isDirectory()) continue;

    const targetFileLocation = path.join(folderPath, targetFilename);
    if (
      !fs.existsSync(targetFileLocation) ||
      !isWithin(documentsPath, targetFileLocation) ||
      // A directory with a matching name cannot be read as a document.
      !fs.lstatSync(targetFileLocation).isFile()
    )
      continue;

    const rawData = fs.readFileSync(targetFileLocation, "utf8");
    const cachefilename = `${folder}/${targetFilename}`;
    const { pageContent, ...metadata } = JSON.parse(rawData);
    return {
      name: targetFilename,
      type: "file",
      ...metadata,
      cached: await cachedVectorInformation(cachefilename, true),
    };
  }

  return null;
}
2024-01-19 21:56:00 +01:00
/**
 * Checks if a given path is strictly within another path.
 * @param {string} outer - The outer path (should be resolved).
 * @param {string} inner - The inner path (should be resolved).
 * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise.
 * The outer path itself (with or without a trailing separator) is not considered within.
 */
function isWithin(outer, inner) {
  if (outer === inner) return false;
  const rel = path.relative(outer, inner);

  // "" means both arguments point at the same location (e.g. a trailing separator).
  if (rel === "") return false;
  // path.relative can hand back an absolute path when no relative one exists
  // (e.g. different drives on Windows) - that is never inside `outer`.
  if (path.isAbsolute(rel)) return false;

  // Compare against the platform separator so "..\" on Windows is caught as well.
  // A plain name that merely begins with two dots ("..hidden") is still inside.
  return rel !== ".." && !rel.startsWith(`..${path.sep}`);
}
2024-01-15 01:36:17 +01:00
/**
 * Normalizes a user supplied relative path and strips every leading `../`
 * (or `..\`) segment so the result cannot climb out of the folder it is joined to.
 * @param {string} filepath - the path to normalize
 * @returns {string} - the normalized path without leading parent-folder segments
 * @throws {Error} - "Invalid path." when the input is not a string or when nothing
 * usable is left after normalization ("", "..", "." or "/")
 */
function normalizePath(filepath = "") {
  if (typeof filepath !== "string") throw new Error("Invalid path.");

  const result = path
    .normalize(filepath.trim())
    .replace(/^(\.\.(\/|\\|$))+/, "")
    .trim();

  // "" is what remains of inputs such as ".." or "../" once the leading
  // traversal segments are stripped - it points at no file and must be rejected.
  if (["", "..", ".", "/"].includes(result)) throw new Error("Invalid path.");
  return result;
}
2024-04-19 18:51:58 +02:00
/**
 * Check if the vector-cache folder contains any cached `.json` file.
 * Useful when the user is changing embedders, as this will break the previous cache.
 * @returns {boolean} - true when at least one `.json` file exists in the vector-cache
 * folder; false when there is none or the folder cannot be read
 */
function hasVectorCachedFiles() {
  try {
    return fs
      .readdirSync(vectorCachePath)
      .some((name) => name.endsWith(".json"));
  } catch (error) {
    // A missing folder just means nothing was cached yet. Anything else
    // (permissions, bad path, ...) is still reported as "no cache" but is
    // logged so the failure is not completely invisible.
    if (error?.code !== "ENOENT")
      console.error(`Could not read the vector-cache folder: ${error?.message}`);
    return false;
  }
}
2024-09-25 00:55:54 +02:00
/**
 * Looks up which workspaces have pinned each of the given documents.
 * @param {Record<string, string>} filenames - record whose keys are the docpaths to look up
 * (eg: `folder/file.json`) and whose values are the names the result is keyed by (eg: `file.json`)
 * @returns {Promise<Record<string, any[]>>} - a record of filenames and the unique workspaceIds
 * that pinned them; documents that are not pinned anywhere have no entry
 */
async function getPinnedWorkspacesByDocument(filenames = {}) {
  const docpaths = Object.keys(filenames);
  // Nothing to look up - skip the database round trip entirely.
  if (docpaths.length === 0) return {};

  const pinnedDocuments = await Document.where(
    {
      docpath: {
        in: docpaths,
      },
      pinned: true,
    },
    null,
    null,
    null,
    {
      workspaceId: true,
      docpath: true,
    }
  );

  return pinnedDocuments.reduce((result, { workspaceId, docpath }) => {
    const filename = filenames[docpath];
    if (!result[filename]) result[filename] = [];
    if (!result[filename].includes(workspaceId))
      result[filename].push(workspaceId);
    return result;
  }, {});
}
/**
 * Get a record of the filenames that are watched in at least one workspace.
 * Used to determine if a document should be flagged as watched.
 * @param {Record<string, string>} filenames - record whose keys are the docpaths to look up
 * (eg: `folder/file.json`) and whose values are the names the result is keyed by (eg: `file.json`)
 * @returns {Promise<Record<string, any>>} - a record keyed by filename; the value is the
 * workspaceId of one workspace watching it (the last matching row wins). Documents that
 * are not watched have no entry.
 */
async function getWatchedDocumentFilenames(filenames = {}) {
  const docpaths = Object.keys(filenames);
  // Nothing to look up - skip the database round trip entirely.
  if (docpaths.length === 0) return {};

  const watchedDocuments = await Document.where(
    {
      docpath: { in: docpaths },
      watched: true,
    },
    null,
    null,
    null,
    { workspaceId: true, docpath: true }
  );

  return watchedDocuments.reduce((result, { workspaceId, docpath }) => {
    const filename = filenames[docpath];
    result[filename] = workspaceId;
    return result;
  }, {});
}
2023-06-04 04:28:07 +02:00
module . exports = {
2024-01-16 23:58:49 +01:00
findDocumentInDocuments ,
2023-06-04 04:28:07 +02:00
cachedVectorInformation ,
viewLocalFiles ,
2023-06-27 02:20:09 +02:00
purgeSourceDocument ,
purgeVectorCache ,
2023-06-04 04:28:07 +02:00
storeVectorResult ,
2023-06-08 06:31:35 +02:00
fileData ,
2024-01-15 01:36:17 +01:00
normalizePath ,
2024-01-19 21:56:00 +01:00
isWithin ,
documentsPath ,
2024-04-19 18:51:58 +02:00
hasVectorCachedFiles ,
2023-06-08 06:31:35 +02:00
} ;