const { v4: uuidv4 } = require("uuid");
const { DocumentManager } = require("../DocumentManager");
const { WorkspaceChats } = require("../../models/workspaceChats");
const { getVectorDbClass, getLLMProvider } = require("../helpers");
const { writeResponseChunk } = require("../helpers/chat/responses");
const { chatPrompt, sourceIdentifier, recentChatHistory } = require("./index");

/**
 * @typedef ResponseObject
 * @property {string} id - uuid of response
 * @property {string} type - Type of response
 * @property {string|null} textResponse - full text response
 * @property {object[]} sources
 * @property {boolean} close
 * @property {string|null} error
 */

/**
 * Handle synchronous chats with your workspace via the developer API endpoint
 * @param {{
 *  workspace: import("@prisma/client").workspaces,
 *  message: string,
 *  mode: "chat"|"query",
 *  user: import("@prisma/client").users|null,
 *  thread: import("@prisma/client").workspace_threads|null,
 *  sessionId: string|null,
 * }} parameters
 * @returns {Promise<ResponseObject>}
 */
async function chatSync({
  workspace,
  message = null,
  mode = "chat",
  user = null,
  thread = null,
  sessionId = null,
}) {
  const uuid = uuidv4();
  const chatMode = mode ?? "chat";
  const LLMConnector = getLLMProvider({
    provider: workspace?.chatProvider,
    model: workspace?.chatModel,
  });
  const VectorDb = getVectorDbClass();
  const messageLimit = workspace?.openAiHistory || 20;
  const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
  const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);

  // User is trying to query-mode chat a workspace that has no data in it - so
  // we should exit early as no information can be found under these conditions.
  if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
    const textResponse =
      workspace?.queryRefusalResponse ??
      "There is no relevant information in this workspace to answer your query.";

    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: String(message),
      response: {
        text: textResponse,
        sources: [],
        type: chatMode,
      },
      include: false,
      apiSessionId: sessionId,
    });

    return {
      id: uuid,
      type: "textResponse",
      sources: [],
      close: true,
      error: null,
      textResponse,
    };
  }

  // If we are here we know that we are in a workspace that is:
  // 1. Chatting in "chat" mode and may or may _not_ have embeddings
  // 2. Chatting in "query" mode and has at least 1 embedding
  let contextTexts = [];
  let sources = [];
  let pinnedDocIdentifiers = [];
  const { rawHistory, chatHistory } = await recentChatHistory({
    user,
    workspace,
    thread,
    messageLimit,
    apiSessionId: sessionId,
  });

  // Inject any pinned documents into the context ahead of the vector search and
  // track their identifiers so the search does not return duplicate chunks.
  await new DocumentManager({
    workspace,
    maxTokens: LLMConnector.promptWindowLimit(),
  })
    .pinnedDocs()
    .then((pinnedDocs) => {
      pinnedDocs.forEach((doc) => {
        const { pageContent, ...metadata } = doc;
        pinnedDocIdentifiers.push(sourceIdentifier(doc));
        contextTexts.push(doc.pageContent);
        sources.push({
          text:
            pageContent.slice(0, 1_000) +
            "...continued on in source document...",
          ...metadata,
        });
      });
    });

  const vectorSearchResults =
    embeddingsCount !== 0
      ? await VectorDb.performSimilaritySearch({
          namespace: workspace.slug,
          input: message,
          LLMConnector,
          similarityThreshold: workspace?.similarityThreshold,
          topN: workspace?.topN,
          filterIdentifiers: pinnedDocIdentifiers,
        })
      : {
          contextTexts: [],
          sources: [],
          message: null,
        };

  // Abort if the similarity search was run and failed.
  if (!!vectorSearchResults.message) {
    return {
      id: uuid,
      type: "abort",
      textResponse: null,
      sources: [],
      close: true,
      error: vectorSearchResults.message,
    };
  }

  const { fillSourceWindow } = require("../helpers/chat");
  const filledSources = fillSourceWindow({
    nDocs: workspace?.topN || 4,
    searchResults: vectorSearchResults.sources,
    history: rawHistory,
    filterIdentifiers: pinnedDocIdentifiers,
  });

  // Why does contextTexts get all the info, but sources only get the current search?
  // This is to give the LLM the ability to "comprehend" a contextual response without
  // populating the Citations under a response with documents the user "thinks" are irrelevant
  // due to how we manage backfilling of the context to keep chats with the LLM more correct in responses.
  // If a past citation was used to answer the question - that is visible in the history so it logically makes sense
  // and does not appear to the user that a new response used information that is otherwise irrelevant for a given prompt.
  // TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keeping answers highly accurate.
  contextTexts = [...contextTexts, ...filledSources.contextTexts];
  sources = [...sources, ...vectorSearchResults.sources];

  // If in query mode and no context chunks are found from search, backfill, or pins - do not
  // let the LLM try to hallucinate a response or use general knowledge and exit early.
  if (chatMode === "query" && contextTexts.length === 0) {
    const textResponse =
      workspace?.queryRefusalResponse ??
      "There is no relevant information in this workspace to answer your query.";

    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: message,
      response: {
        text: textResponse,
        sources: [],
        type: chatMode,
      },
      threadId: thread?.id || null,
      include: false,
      apiSessionId: sessionId,
      user,
    });

    return {
      id: uuid,
      type: "textResponse",
      sources: [],
      close: true,
      error: null,
      textResponse,
    };
  }

  // Compress & assemble messages to ensure the prompt passes the token limit with room for a response
  // and build system messages based on inputs and history.
  const messages = await LLMConnector.compressMessages(
    {
      systemPrompt: chatPrompt(workspace),
      userPrompt: message,
      contextTexts,
      chatHistory,
    },
    rawHistory
  );

  // Send the text completion.
  const textResponse = await LLMConnector.getChatCompletion(messages, {
    temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
  });

  if (!textResponse) {
    return {
      id: uuid,
      type: "abort",
      textResponse: null,
      sources: [],
      close: true,
      error: "No text completion could be completed with this input.",
    };
  }

  const { chat } = await WorkspaceChats.new({
    workspaceId: workspace.id,
    prompt: message,
    response: { text: textResponse, sources, type: chatMode },
    threadId: thread?.id || null,
    apiSessionId: sessionId,
    user,
  });

  return {
    id: uuid,
    type: "textResponse",
    close: true,
    error: null,
    chatId: chat.id,
    textResponse,
    sources,
  };
}

/**
 * Handle streamable HTTP chunks for chats with your workspace via the developer API endpoint
 * @param {{
 *  response: import("express").Response,
 *  workspace: import("@prisma/client").workspaces,
 *  message: string,
 *  mode: "chat"|"query",
 *  user: import("@prisma/client").users|null,
 *  thread: import("@prisma/client").workspace_threads|null,
 *  sessionId: string|null,
 * }} parameters
 * @returns {Promise<void>}
 */
async function streamChat({
  response,
  workspace,
  message = null,
  mode = "chat",
  user = null,
  thread = null,
  sessionId = null,
}) {
  const uuid = uuidv4();
  const chatMode = mode ?? "chat";
"chat"; const LLMConnector = getLLMProvider({ provider: workspace?.chatProvider, model: workspace?.chatModel, }); const VectorDb = getVectorDbClass(); const messageLimit = workspace?.openAiHistory || 20; const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug); const embeddingsCount = await VectorDb.namespaceCount(workspace.slug); // User is trying to query-mode chat a workspace that has no data in it - so // we should exit early as no information can be found under these conditions. if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") { const textResponse = workspace?.queryRefusalResponse ?? "There is no relevant information in this workspace to answer your query."; writeResponseChunk(response, { id: uuid, type: "textResponse", textResponse, sources: [], attachments: [], close: true, error: null, }); await WorkspaceChats.new({ workspaceId: workspace.id, prompt: message, response: { text: textResponse, sources: [], type: chatMode, attachments: [], }, threadId: thread?.id || null, apiSessionId: sessionId, include: false, user, }); return; } // If we are here we know that we are in a workspace that is: // 1. Chatting in "chat" mode and may or may _not_ have embeddings // 2. Chatting in "query" mode and has at least 1 embedding let completeText; let contextTexts = []; let sources = []; let pinnedDocIdentifiers = []; const { rawHistory, chatHistory } = await recentChatHistory({ user, workspace, thread, messageLimit, apiSessionId: sessionId, }); // Look for pinned documents and see if the user decided to use this feature. We will also do a vector search // as pinning is a supplemental tool but it should be used with caution since it can easily blow up a context window. // However we limit the maximum of appended context to 80% of its overall size, mostly because if it expands beyond this // it will undergo prompt compression anyway to make it work. If there is so much pinned that the context here is bigger than // what the model can support - it would get compressed anyway and that really is not the point of pinning. It is really best // suited for high-context models. await new DocumentManager({ workspace, maxTokens: LLMConnector.promptWindowLimit(), }) .pinnedDocs() .then((pinnedDocs) => { pinnedDocs.forEach((doc) => { const { pageContent, ...metadata } = doc; pinnedDocIdentifiers.push(sourceIdentifier(doc)); contextTexts.push(doc.pageContent); sources.push({ text: pageContent.slice(0, 1_000) + "...continued on in source document...", ...metadata, }); }); }); const vectorSearchResults = embeddingsCount !== 0 ? await VectorDb.performSimilaritySearch({ namespace: workspace.slug, input: message, LLMConnector, similarityThreshold: workspace?.similarityThreshold, topN: workspace?.topN, filterIdentifiers: pinnedDocIdentifiers, }) : { contextTexts: [], sources: [], message: null, }; // Failed similarity search if it was run at all and failed. if (!!vectorSearchResults.message) { writeResponseChunk(response, { id: uuid, type: "abort", textResponse: null, sources: [], close: true, error: vectorSearchResults.message, }); return; } const { fillSourceWindow } = require("../helpers/chat"); const filledSources = fillSourceWindow({ nDocs: workspace?.topN || 4, searchResults: vectorSearchResults.sources, history: rawHistory, filterIdentifiers: pinnedDocIdentifiers, }); // Why does contextTexts get all the info, but sources only get current search? 
  // This is to give the LLM the ability to "comprehend" a contextual response without
  // populating the Citations under a response with documents the user "thinks" are irrelevant
  // due to how we manage backfilling of the context to keep chats with the LLM more correct in responses.
  // If a past citation was used to answer the question - that is visible in the history so it logically makes sense
  // and does not appear to the user that a new response used information that is otherwise irrelevant for a given prompt.
  // TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keeping answers highly accurate.
  contextTexts = [...contextTexts, ...filledSources.contextTexts];
  sources = [...sources, ...vectorSearchResults.sources];

  // If in query mode and no context chunks are found from search, backfill, or pins - do not
  // let the LLM try to hallucinate a response or use general knowledge and exit early.
  if (chatMode === "query" && contextTexts.length === 0) {
    const textResponse =
      workspace?.queryRefusalResponse ??
      "There is no relevant information in this workspace to answer your query.";

    writeResponseChunk(response, {
      id: uuid,
      type: "textResponse",
      textResponse,
      sources: [],
      close: true,
      error: null,
    });

    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: message,
      response: {
        text: textResponse,
        sources: [],
        type: chatMode,
        attachments: [],
      },
      threadId: thread?.id || null,
      apiSessionId: sessionId,
      include: false,
      user,
    });
    return;
  }

  // Compress & assemble messages to ensure the prompt passes the token limit with room for a response
  // and build system messages based on inputs and history.
  const messages = await LLMConnector.compressMessages(
    {
      systemPrompt: chatPrompt(workspace),
      userPrompt: message,
      contextTexts,
      chatHistory,
    },
    rawHistory
  );

  // If streaming is not explicitly enabled for the connector
  // we do regular waiting of a response and send a single chunk.
  if (LLMConnector.streamingEnabled() !== true) {
    console.log(
      `\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
    );
    completeText = await LLMConnector.getChatCompletion(messages, {
      temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
    });
    writeResponseChunk(response, {
      uuid,
      sources,
      type: "textResponseChunk",
      textResponse: completeText,
      close: true,
      error: false,
    });
  } else {
    const stream = await LLMConnector.streamGetChatCompletion(messages, {
      temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
    });
    completeText = await LLMConnector.handleStream(response, stream, {
      uuid,
      sources,
    });
  }

  if (completeText?.length > 0) {
    const { chat } = await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: message,
      response: { text: completeText, sources, type: chatMode },
      threadId: thread?.id || null,
      apiSessionId: sessionId,
      user,
    });

    writeResponseChunk(response, {
      uuid,
      type: "finalizeResponseStream",
      close: true,
      error: false,
      chatId: chat.id,
    });
    return;
  }

  writeResponseChunk(response, {
    uuid,
    type: "finalizeResponseStream",
    close: true,
    error: false,
  });
  return;
}

module.exports.ApiChatHandler = {
  chatSync,
  streamChat,
};
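
/*
 * Example usage (illustrative sketch only): shows how a route handler might call the
 * exported API. The require path and argument values are hypothetical and depend on
 * where the caller lives; `workspace`, `user`, and `thread` are assumed to be records
 * already resolved by the endpoint's middleware.
 *
 *   const { ApiChatHandler } = require("../utils/chats/apiChatHandler");
 *   const result = await ApiChatHandler.chatSync({
 *     workspace,
 *     message: "What does the handbook say about PTO?",
 *     mode: "query",
 *     user,
 *     thread: null,
 *     sessionId: "api-session-abc123",
 *   });
 *   // result matches the shape returned above:
 *   // { id, type: "textResponse", textResponse, sources, close, error, chatId }
 */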