const lancedb = require("vectordb");
const { toChunks } = require("../../helpers");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { Configuration, OpenAIApi } = require("openai");
const { v4: uuidv4 } = require("uuid");

// Since we roll our own results for prompting we
// have to manually curate sources as well.
function curateLanceSources(sources = []) {
  const knownDocs = [];
  const documents = [];
  for (const source of sources) {
    const { text: _t, vector: _v, score: _s, ...metadata } = source;
    if (
      Object.keys(metadata).length > 0 &&
      !knownDocs.includes(metadata.title)
    ) {
      documents.push({ ...metadata });
      knownDocs.push(metadata.title);
    }
  }

  return documents;
}
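
// Illustrative example (hypothetical row): a raw LanceDB result such as
//   { text: "...", vector: [...], score: 0.12, title: "guide.txt", docId: "abc" }
// is curated down to the metadata-only entry { title: "guide.txt", docId: "abc" },
// and rows repeating an already-seen title are skipped.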
const LanceDb = {
  uri: `${
    !!process.env.STORAGE_DIR ? `${process.env.STORAGE_DIR}/` : "./storage/"
  }lancedb`,
name : "LanceDb" ,
connect : async function ( ) {
if ( process . env . VECTOR _DB !== "lancedb" )
throw new Error ( "LanceDB::Invalid ENV settings" ) ;
const client = await lancedb . connect ( this . uri ) ;
return { client } ;
} ,
heartbeat : async function ( ) {
await this . connect ( ) ;
return { heartbeat : Number ( new Date ( ) ) } ;
} ,
  totalIndicies: async function () {
    return 0; // Unsupported for LanceDB - so always zero
  },
  embeddingFunc: function () {
    return new lancedb.OpenAIEmbeddingFunction(
      "context",
      process.env.OPEN_AI_KEY
    );
  },
  embedder: function () {
    return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY });
  },
  openai: function () {
    const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY });
    const openai = new OpenAIApi(config);
    return openai;
  },
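  // Note on the response shape (OpenAI v3 SDK): createEmbedding resolves to
  // roughly { data: { data: [{ embedding: [0.01, ...] }] } }, hence the nested
  // destructuring in embedChunk below.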
  embedChunk: async function (openai, textChunk) {
    const {
      data: { data },
    } = await openai.createEmbedding({
      model: "text-embedding-ada-002",
      input: textChunk,
    });
    return data.length > 0 && data[0].hasOwnProperty("embedding")
      ? data[0].embedding
      : null;
  },
  getChatCompletion: async function (openai, messages = []) {
    const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
    const { data } = await openai.createChatCompletion({
      model,
      messages,
    });

    if (!data.hasOwnProperty("choices")) return null;
    return data.choices[0].message.content;
  },
  namespace: async function (client, namespace = null) {
    if (!namespace) throw new Error("No namespace value provided.");
    const collection = await client.openTable(namespace).catch(() => false);
    if (!collection) return null;

    return {
      ...collection,
    };
  },
  updateOrCreateCollection: async function (client, data = [], namespace) {
    if (await this.hasNamespace(namespace)) {
      const collection = await client.openTable(namespace);
      const result = await collection.add(data);
      console.log({ result });
      return true;
    }

    const result = await client.createTable(namespace, data);
    console.log({ result });
    return true;
  },
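  // Note on updateOrCreateCollection: when the table does not yet exist,
  // createTable(namespace, data) infers the table schema from the records
  // themselves, so every submission row should carry the same keys
  // (id, vector, and the flattened metadata fields).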
  hasNamespace: async function (namespace = null) {
    if (!namespace) return false;
    const { client } = await this.connect();
    const exists = await this.namespaceExists(client, namespace);
    return exists;
  },
  namespaceExists: async function (client, namespace = null) {
    if (!namespace) throw new Error("No namespace value provided.");
    const collections = await client.tableNames();
    return collections.includes(namespace);
  },
  deleteVectorsInNamespace: async function (client, namespace = null) {
    // LanceDB stores each table as a `<name>.lance` directory under the
    // connection uri, so removing that directory drops the namespace.
    const fs = require("fs");
    fs.rm(`${client.uri}/${namespace}.lance`, { recursive: true }, () => null);
    return true;
  },
  deleteDocumentFromNamespace: async function (_namespace, _docId) {
    console.error(
      `LanceDB:deleteDocumentFromNamespace - unsupported operation. No changes made to vector db.`
    );
    return false;
  },
  addDocumentToNamespace: async function (
    namespace,
    documentData = {},
    fullFilePath = null
  ) {
    const { DocumentVectors } = require("../../../models/vectors");
    try {
      const { pageContent, docId, ...metadata } = documentData;
      if (!pageContent || pageContent.length == 0) return false;

      console.log("Adding new vectorized document into namespace", namespace);
      const cacheResult = await cachedVectorInformation(fullFilePath);
      if (cacheResult.exists) {
        const { client } = await this.connect();
        const { chunks } = cacheResult;
        const documentVectors = [];
        const submissions = [];

        for (const chunkGroup of chunks) {
          chunkGroup.forEach((chunk) => {
            const id = uuidv4();
            const { id: _id, ...metadata } = chunk.metadata;
            documentVectors.push({ docId, vectorId: id });
            submissions.push({ id: id, vector: chunk.values, ...metadata });
          });
        }

        await this.updateOrCreateCollection(client, submissions, namespace);
        await DocumentVectors.bulkInsert(documentVectors);
        return true;
      }
      // If we are here then we are going to embed and store a novel document.
      // We have to do this manually as opposed to using LangChain's
      // `xyz.fromDocuments` because otherwise we cannot atomically control our
      // namespace to granularly find/remove documents from the vector db.
      const textSplitter = new RecursiveCharacterTextSplitter({
        chunkSize: 1000,
        chunkOverlap: 20,
      });
      const textChunks = await textSplitter.splitText(pageContent);

      console.log("Chunks created from document:", textChunks.length);
      const documentVectors = [];
      const vectors = [];
      const submissions = [];
      const openai = this.openai();

      for (const textChunk of textChunks) {
        const vectorValues = await this.embedChunk(openai, textChunk);

        if (!!vectorValues) {
          const vectorRecord = {
            id: uuidv4(),
            values: vectorValues,
            // [DO NOT REMOVE]
            // LangChain will be unable to find your text if you embed manually
            // and don't include the `text` key.
            // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64
            metadata: { ...metadata, text: textChunk },
          };
          vectors.push(vectorRecord);
          submissions.push({
            id: vectorRecord.id,
            vector: vectorRecord.values,
            ...vectorRecord.metadata,
          });
          documentVectors.push({ docId, vectorId: vectorRecord.id });
        } else {
          console.error(
            "Could not use OpenAI to embed document chunk! This document will not be recorded."
          );
        }
      }
      if (vectors.length > 0) {
        const chunks = [];
        for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);

        console.log("Inserting vectorized chunks into LanceDB collection.");
        const { client } = await this.connect();
        await this.updateOrCreateCollection(client, submissions, namespace);
        await storeVectorResult(chunks, fullFilePath);
      }

      await DocumentVectors.bulkInsert(documentVectors);
      return true;
    } catch (e) {
      console.error("addDocumentToNamespace", e.message);
      return false;
    }
  },
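  // Expected `documentData` shape for addDocumentToNamespace above
  // (illustrative; any fields beyond pageContent and docId are treated as
  // metadata and stored on every vector):
  //   { docId: "uuid", pageContent: "full document text...", title: "doc.txt" }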
  query: async function (reqBody = {}) {
    const { namespace = null, input } = reqBody;
    if (!namespace || !input) throw new Error("Invalid request body");

    const { client } = await this.connect();
    if (!(await this.namespaceExists(client, namespace))) {
      return {
        response: null,
        sources: [],
        message: "Invalid query - no documents found for workspace!",
      };
    }

    // LanceDB does not have langchainJS support so we roll our own here.
    const queryVector = await this.embedChunk(this.openai(), input);
    const collection = await client.openTable(namespace);
    const relevantResults = await collection
      .search(queryVector)
      .metricType("cosine")
      .limit(2)
      .execute();
    const messages = [
      {
        role: "system",
        content: `The following is a friendly conversation between a human and an AI. The AI is very casual and talkative and responds with a friendly tone. If the AI does not know the answer to a question, it truthfully says it does not know.
Relevant pieces of information for context of the current query:
${relevantResults.map((result) => result.text).join("\n\n")}`,
      },
      { role: "user", content: input },
    ];
    const responseText = await this.getChatCompletion(this.openai(), messages);

    return {
      response: responseText,
      sources: curateLanceSources(relevantResults),
      message: false,
    };
  },
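  // Illustrative query call (hypothetical workspace slug):
  //   const { response, sources } = await LanceDb.query({
  //     namespace: "my-workspace",
  //     input: "What does the onboarding doc say about SSO?",
  //   });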
"namespace-stats" : async function ( reqBody = { } ) {
const { namespace = null } = reqBody ;
if ( ! namespace ) throw new Error ( "namespace required" ) ;
const { client } = await this . connect ( ) ;
if ( ! ( await this . namespaceExists ( client , namespace ) ) )
throw new Error ( "Namespace by that name does not exist." ) ;
const stats = await this . namespace ( client , namespace ) ;
return stats
? stats
: { message : "No stats were able to be fetched from DB for namespace" } ;
} ,
"delete-namespace" : async function ( reqBody = { } ) {
const { namespace = null } = reqBody ;
const { client } = await this . connect ( ) ;
if ( ! ( await this . namespaceExists ( client , namespace ) ) )
throw new Error ( "Namespace by that name does not exist." ) ;
await this . deleteVectorsInNamespace ( client , namespace ) ;
return {
message : ` Namespace ${ namespace } was deleted. ` ,
} ;
} ,
reset : async function ( ) {
const { client } = await this . connect ( ) ;
const fs = require ( "fs" ) ;
fs . rm ( ` ${ client . uri } ` , { recursive : true } , ( ) => null ) ;
return { reset : true } ;
} ,
} ;
module.exports.LanceDb = LanceDb;
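
// Minimal usage sketch (assumes VECTOR_DB=lancedb and OPEN_AI_KEY are set in
// the environment; the require path, namespace, and documentData below are
// hypothetical):
//
//   const { LanceDb } = require("./lancedb");
//   await LanceDb.addDocumentToNamespace("my-workspace", documentData, filePath);
//   const { response, sources } = await LanceDb.query({
//     namespace: "my-workspace",
//     input: "Summarize the uploaded document.",
//   });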