const { v4: uuidv4 } = require("uuid");
const { getVectorDbClass, getLLMProvider } = require("../helpers");
const { chatPrompt, sourceIdentifier } = require("./index");
const { EmbedChats } = require("../../models/embedChats");
const {
  convertToPromptHistory,
  writeResponseChunk,
} = require("../helpers/chat/responses");
const { DocumentManager } = require("../DocumentManager");

async function streamChatWithForEmbed(
  response,
  /** @type {import("@prisma/client").embed_configs & {workspace?: import("@prisma/client").workspaces}} */
  embed,
  /** @type {String} */
  message,
  /** @type {String} */
  sessionId,
  { promptOverride, modelOverride, temperatureOverride }
) {
  const chatMode = embed.chat_mode;
  const chatModel = embed.allow_model_override ? modelOverride : null;

  // If there are overrides in request & they are permitted, override the default workspace ref information.
  if (embed.allow_prompt_override)
    embed.workspace.openAiPrompt = promptOverride;
  if (embed.allow_temperature_override)
    embed.workspace.openAiTemp = parseFloat(temperatureOverride);

  const uuid = uuidv4();
  const LLMConnector = getLLMProvider({
    provider: embed?.workspace?.chatProvider,
    model: chatModel ?? embed.workspace?.chatModel,
  });
  const VectorDb = getVectorDbClass();
  const messageLimit = 20;
  const hasVectorizedSpace = await VectorDb.hasNamespace(embed.workspace.slug);
  const embeddingsCount = await VectorDb.namespaceCount(embed.workspace.slug);

  // User is trying to query-mode chat a workspace that has no data in it - so
  // we should exit early as no information can be found under these conditions.
  if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
    writeResponseChunk(response, {
      id: uuid,
      type: "textResponse",
      textResponse:
        "I do not have enough information to answer that. Try another question.",
      sources: [],
      close: true,
      error: null,
    });
    return;
  }

  let completeText;
  let contextTexts = [];
  let sources = [];
  let pinnedDocIdentifiers = [];
  const { rawHistory, chatHistory } = await recentEmbedChatHistory(
    sessionId,
    embed,
    messageLimit,
    chatMode
  );

  // See stream.js comment for more information on this implementation.
  await new DocumentManager({
    workspace: embed.workspace,
    maxTokens: LLMConnector.promptWindowLimit(),
  })
    .pinnedDocs()
    .then((pinnedDocs) => {
      pinnedDocs.forEach((doc) => {
        const { pageContent, ...metadata } = doc;
        pinnedDocIdentifiers.push(sourceIdentifier(doc));
        contextTexts.push(doc.pageContent);
        sources.push({
          text:
            pageContent.slice(0, 1_000) +
            "...continued on in source document...",
          ...metadata,
        });
      });
    });

  const vectorSearchResults =
    embeddingsCount !== 0
      ? await VectorDb.performSimilaritySearch({
          namespace: embed.workspace.slug,
          input: message,
          LLMConnector,
          similarityThreshold: embed.workspace?.similarityThreshold,
          topN: embed.workspace?.topN,
          filterIdentifiers: pinnedDocIdentifiers,
        })
      : {
          contextTexts: [],
          sources: [],
          message: null,
        };

  // Failed similarity search if it was run at all and failed.
  if (!!vectorSearchResults.message) {
    writeResponseChunk(response, {
      id: uuid,
      type: "abort",
      textResponse: null,
      sources: [],
      close: true,
      error: "Failed to connect to vector database provider.",
    });
    return;
  }

  contextTexts = [...contextTexts, ...vectorSearchResults.contextTexts];
  sources = [...sources, ...vectorSearchResults.sources];

  // If in query mode and no sources are found, do not
  // let the LLM try to hallucinate a response or use general knowledge
  if (
    chatMode === "query" &&
    sources.length === 0 &&
    pinnedDocIdentifiers.length === 0
  ) {
    writeResponseChunk(response, {
      id: uuid,
      type: "textResponse",
      textResponse:
        embed.workspace?.queryRefusalResponse ??
        "There is no relevant information in this workspace to answer your query.",
      sources: [],
      close: true,
      error: null,
    });
    return;
  }

  // Compress message to ensure prompt passes token limit with room for response
  // and build system messages based on inputs and history.
  const messages = await LLMConnector.compressMessages(
    {
      systemPrompt: chatPrompt(embed.workspace),
      userPrompt: message,
      contextTexts,
      chatHistory,
    },
    rawHistory
  );

  // If streaming is not explicitly enabled for connector
  // we do regular waiting of a response and send a single chunk.
  if (LLMConnector.streamingEnabled() !== true) {
    console.log(
      `\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
    );
    completeText = await LLMConnector.getChatCompletion(messages, {
      temperature: embed.workspace?.openAiTemp ?? LLMConnector.defaultTemp,
    });
    writeResponseChunk(response, {
      uuid,
      sources: [],
      type: "textResponseChunk",
      textResponse: completeText,
      close: true,
      error: false,
    });
  } else {
    const stream = await LLMConnector.streamGetChatCompletion(messages, {
      temperature: embed.workspace?.openAiTemp ?? LLMConnector.defaultTemp,
    });
    completeText = await LLMConnector.handleStream(response, stream, {
      uuid,
      sources: [],
    });
  }

  await EmbedChats.new({
    embedId: embed.id,
    prompt: message,
    response: { text: completeText, type: chatMode },
    connection_information: response.locals.connection
      ? { ...response.locals.connection }
      : {},
    sessionId,
  });
  return;
}

// On query we don't return message history. All other chat modes and when chatting
// with no embeddings we return history.
async function recentEmbedChatHistory(
  sessionId,
  embed,
  messageLimit = 20,
  chatMode = null
) {
  if (chatMode === "query") return { rawHistory: [], chatHistory: [] };
  const rawHistory = (
    await EmbedChats.forEmbedByUser(embed.id, sessionId, messageLimit, {
      id: "desc",
    })
  ).reverse();
  return { rawHistory, chatHistory: convertToPromptHistory(rawHistory) };
}

module.exports = {
  streamChatWithForEmbed,
};