const { PineconeClient } = require("@pinecone-database/pinecone");
const { PineconeStore } = require("langchain/vectorstores/pinecone");
const { OpenAI } = require("langchain/llms/openai");
const { VectorDBQAChain, LLMChain } = require("langchain/chains");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
const { VectorStoreRetrieverMemory } = require("langchain/memory");
const { PromptTemplate } = require("langchain/prompts");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { Configuration, OpenAIApi } = require("openai");
const { v4: uuidv4 } = require("uuid");
const { toChunks, curateSources } = require("../../helpers");
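
// Pinecone vector database provider. Implements the interface the app expects
// from a vector db: connecting to the index, embedding and upserting documents,
// managing namespaces, and answering `query`/`chat` requests against stored vectors.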
const Pinecone = {
  name: "Pinecone",
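  // Validates the ENV configuration, initializes the Pinecone client, and
  // confirms the target index is ready before handing back a handle to it.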
  connect: async function () {
    if (process.env.VECTOR_DB !== "pinecone")
      throw new Error("Pinecone::Invalid ENV settings");

    const client = new PineconeClient();
    await client.init({
      apiKey: process.env.PINECONE_API_KEY,
      environment: process.env.PINECONE_ENVIRONMENT,
    });
    const pineconeIndex = client.Index(process.env.PINECONE_INDEX);
    const { status } = await client.describeIndex({
      indexName: process.env.PINECONE_INDEX,
    });

    if (!status.ready) throw new Error("Pinecone::Index not ready.");
    return { client, pineconeIndex, indexName: process.env.PINECONE_INDEX };
  },
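  // Embedding client LangChain uses when reading vectors back out of the
  // index (see `PineconeStore.fromExistingIndex` in `query`/`chat`).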
  embedder: function () {
    return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY });
  },
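  // Raw OpenAI API client used for embedding document chunks on ingestion.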
  openai: function () {
    const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY });
    const openai = new OpenAIApi(config);
    return openai;
  },
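  // Embeds a single text chunk with `text-embedding-ada-002`, returning the
  // embedding vector or null when the API response carries no embedding.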
  embedChunk: async function (openai, textChunk) {
    const {
      data: { data },
    } = await openai.createEmbedding({
      model: "text-embedding-ada-002",
      input: textChunk,
    });
    return data.length > 0 && data[0].hasOwnProperty("embedding")
      ? data[0].embedding
      : null;
  },
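  // LangChain chat model wrapper; the model is configurable via OPEN_MODEL_PREF
  // and the temperature falls back to 0.7 when the caller passes none.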
  llm: function ({ temperature = 0.7 } = {}) {
    const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
    return new OpenAI({
      openAIApiKey: process.env.OPEN_AI_KEY,
      modelName: model,
      temperature,
    });
  },
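  // Sums the vector counts across every namespace in the index.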
  totalIndicies: async function () {
    const { pineconeIndex } = await this.connect();
    const { namespaces } = await pineconeIndex.describeIndexStats1();
    return Object.values(namespaces).reduce(
      (a, b) => a + (b?.vectorCount || 0),
      0
    );
  },
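  // Returns the stats object for a single namespace, or null if it does not exist.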
  namespace: async function (index, namespace = null) {
    if (!namespace) throw new Error("No namespace value provided.");
    const { namespaces } = await index.describeIndexStats1();
    return namespaces.hasOwnProperty(namespace) ? namespaces[namespace] : null;
  },
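  // Convenience existence check that connects first, so callers do not need
  // an index handle of their own.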
  hasNamespace: async function (namespace = null) {
    if (!namespace) return false;
    const { pineconeIndex } = await this.connect();
    return await this.namespaceExists(pineconeIndex, namespace);
  },
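  // Checks for a namespace against an already-connected index handle.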
  namespaceExists: async function (index, namespace = null) {
    if (!namespace) throw new Error("No namespace value provided.");
    const { namespaces } = await index.describeIndexStats1();
    return namespaces.hasOwnProperty(namespace);
  },
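  // Wipes every vector in a namespace via Pinecone's deleteAll.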
  deleteVectorsInNamespace: async function (index, namespace = null) {
    await index.delete1({ namespace, deleteAll: true });
    return true;
  },
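  // Embeds a document and upserts its vectors into the given namespace. Reuses
  // the local vector cache when this file was embedded before, so repeat adds
  // skip the OpenAI embedding step entirely. `documentData` must carry
  // `pageContent` and a `docId`; any remaining keys become vector metadata.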
  addDocumentToNamespace: async function (
    namespace,
    documentData = {},
    fullFilePath = null
  ) {
    const { DocumentVectors } = require("../../../models/vectors");
    try {
      const { pageContent, docId, ...metadata } = documentData;
      if (!pageContent || pageContent.length == 0) return false;

      console.log("Adding new vectorized document into namespace", namespace);
      const cacheResult = await cachedVectorInformation(fullFilePath);
      if (cacheResult.exists) {
        const { pineconeIndex } = await this.connect();
        const { chunks } = cacheResult;
        const documentVectors = [];

        for (const chunk of chunks) {
          // Before sending to Pinecone and saving the records to our db,
          // we need to assign a fresh id to each record stored in the cached file.
          const newChunks = chunk.map((record) => {
            const id = uuidv4();
            documentVectors.push({ docId, vectorId: id });
            return { ...record, id };
          });

          // Push chunks with new ids to Pinecone.
          await pineconeIndex.upsert({
            upsertRequest: {
              vectors: [...newChunks],
              namespace,
            },
          });
        }

        await DocumentVectors.bulkInsert(documentVectors);
        return true;
      }

      // If we reach this point we are embedding and storing a novel document.
      // We do this manually, as opposed to using LangChain's `PineconeStore.fromDocuments`,
      // because otherwise we cannot atomically control our namespace to granularly
      // find/remove documents from the vector db.
      // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167
      const textSplitter = new RecursiveCharacterTextSplitter({
        chunkSize: 1000,
        chunkOverlap: 20,
      });
      const textChunks = await textSplitter.splitText(pageContent);

      console.log("Chunks created from document:", textChunks.length);
      const documentVectors = [];
      const vectors = [];
      const openai = this.openai();

      for (const textChunk of textChunks) {
        const vectorValues = await this.embedChunk(openai, textChunk);
        if (!!vectorValues) {
          const vectorRecord = {
            id: uuidv4(),
            values: vectorValues,
            // [DO NOT REMOVE]
            // LangChain will be unable to find your text if you embed manually
            // and don't include the `text` key.
            // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64
            metadata: { ...metadata, text: textChunk },
          };
          vectors.push(vectorRecord);
          documentVectors.push({ docId, vectorId: vectorRecord.id });
        } else {
          console.error(
            "Could not use OpenAI to embed document chunk! This document will not be recorded."
          );
        }
      }

      if (vectors.length > 0) {
        const chunks = [];
        const { pineconeIndex } = await this.connect();
        console.log("Inserting vectorized chunks into Pinecone.");
        for (const chunk of toChunks(vectors, 100)) {
          chunks.push(chunk);
          await pineconeIndex.upsert({
            upsertRequest: {
              vectors: [...chunk],
              namespace,
            },
          });
        }
        await storeVectorResult(chunks, fullFilePath);
      }

      await DocumentVectors.bulkInsert(documentVectors);
      return true;
    } catch (e) {
      console.error("addDocumentToNamespace", e.message);
      return false;
    }
  },
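  // Removes a document's vectors from Pinecone and its records from our db.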
  deleteDocumentFromNamespace: async function (namespace, docId) {
    const { DocumentVectors } = require("../../../models/vectors");
    const { pineconeIndex } = await this.connect();
    if (!(await this.namespaceExists(pineconeIndex, namespace))) return;

    const knownDocuments = await DocumentVectors.where(`docId = '${docId}'`);
    if (knownDocuments.length === 0) return;

    const vectorIds = knownDocuments.map((doc) => doc.vectorId);
    await pineconeIndex.delete1({
      ids: vectorIds,
      namespace,
    });

    const indexes = knownDocuments.map((doc) => doc.id);
    await DocumentVectors.deleteIds(indexes);
    return true;
  },
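  // Request handler: reports stats for a single namespace.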
  "namespace-stats": async function (reqBody = {}) {
    const { namespace = null } = reqBody;
    if (!namespace) throw new Error("namespace required");
    const { pineconeIndex } = await this.connect();
    if (!(await this.namespaceExists(pineconeIndex, namespace)))
      throw new Error("Namespace by that name does not exist.");
    const stats = await this.namespace(pineconeIndex, namespace);
    return stats
      ? stats
      : { message: "No stats could be fetched from the vector db." };
  },
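  // Request handler: deletes a namespace and reports how many vectors went with it.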
  "delete-namespace": async function (reqBody = {}) {
    const { namespace = null } = reqBody;
    const { pineconeIndex } = await this.connect();
    if (!(await this.namespaceExists(pineconeIndex, namespace)))
      throw new Error("Namespace by that name does not exist.");

    const details = await this.namespace(pineconeIndex, namespace);
    await this.deleteVectorsInNamespace(pineconeIndex, namespace);
    return {
      message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.`,
    };
  },
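  // One-shot retrieval QA over the namespace: similarity search for the top 5
  // chunks, then a VectorDBQAChain answer with sources attached.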
  query: async function (reqBody = {}) {
    const { namespace = null, input, workspace = {} } = reqBody;
    if (!namespace || !input) throw new Error("Invalid request body");

    const { pineconeIndex } = await this.connect();
    if (!(await this.namespaceExists(pineconeIndex, namespace))) {
      return {
        response: null,
        sources: [],
        message: "Invalid query - no documents found for workspace!",
      };
    }

    const vectorStore = await PineconeStore.fromExistingIndex(this.embedder(), {
      pineconeIndex,
      namespace,
    });

    const model = this.llm({
      temperature: workspace?.openAiTemp,
    });
    const chain = VectorDBQAChain.fromLLM(model, vectorStore, {
      k: 5,
      returnSourceDocuments: true,
    });
    const response = await chain.call({ query: input });
    return {
      response: response.text,
      sources: curateSources(response.sourceDocuments),
      message: false,
    };
  },
  // Unlike `query`, this chat implementation also expands the chat's memory,
  // writing additional tokens back into the Pinecone namespace as the
  // conversation grows.
  chat: async function (reqBody = {}) {
    const { namespace = null, input, workspace = {} } = reqBody;
    if (!namespace || !input) throw new Error("Invalid request body");

    const { pineconeIndex } = await this.connect();
    if (!(await this.namespaceExists(pineconeIndex, namespace)))
      throw new Error(
        "Invalid namespace - has it been collected and seeded yet?"
      );

    const vectorStore = await PineconeStore.fromExistingIndex(this.embedder(), {
      pineconeIndex,
      namespace,
    });
    const memory = new VectorStoreRetrieverMemory({
      vectorStoreRetriever: vectorStore.asRetriever(1),
      memoryKey: "history",
    });
    const model = this.llm({
      temperature: workspace?.openAiTemp,
    });

    const prompt =
      PromptTemplate.fromTemplate(`The following is a friendly conversation between a human and an AI. The AI is very casual and talkative and responds with a friendly tone. If the AI does not know the answer to a question, it truthfully says it does not know.
  Relevant pieces of previous conversation:
  {history}
  Current conversation:
  Human: {input}
  AI:`);
    const chain = new LLMChain({ llm: model, prompt, memory });
    const response = await chain.call({ input });
    return { response: response.text, sources: [], message: false };
  },
};

module.exports.Pinecone = Pinecone;
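
// A minimal usage sketch, assuming the ENV vars read above (VECTOR_DB,
// PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_INDEX, OPEN_AI_KEY) are set
// and a workspace's documents were already seeded into a hypothetical
// "my-workspace" namespace; the request/response shapes mirror the handlers above:
//
//   const { Pinecone } = require("./pinecone");
//   const { response, sources, message } = await Pinecone.query({
//     namespace: "my-workspace",
//     input: "What does this document say about pricing?",
//     workspace: { openAiTemp: 0.7 },
//   });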