mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-19 04:30:10 +01:00
Refactor api endpoint chat handler to its own function (#2157)
remove legacy `chatWithWorkspace` and cleanup `index.js`
This commit is contained in:
parent
f7756d4758
commit
1f96b837b3
1
.vscode/settings.json
vendored
1
.vscode/settings.json
vendored
@ -43,6 +43,7 @@
|
||||
"searxng",
|
||||
"Serper",
|
||||
"Serply",
|
||||
"streamable",
|
||||
"textgenwebui",
|
||||
"togetherai",
|
||||
"Unembed",
|
||||
|
@ -4,19 +4,16 @@ const { Telemetry } = require("../../../models/telemetry");
|
||||
const { DocumentVectors } = require("../../../models/vectors");
|
||||
const { Workspace } = require("../../../models/workspace");
|
||||
const { WorkspaceChats } = require("../../../models/workspaceChats");
|
||||
const { chatWithWorkspace } = require("../../../utils/chats");
|
||||
const { getVectorDbClass } = require("../../../utils/helpers");
|
||||
const { multiUserMode, reqBody } = require("../../../utils/http");
|
||||
const { validApiKey } = require("../../../utils/middleware/validApiKey");
|
||||
const {
|
||||
streamChatWithWorkspace,
|
||||
VALID_CHAT_MODE,
|
||||
} = require("../../../utils/chats/stream");
|
||||
const { VALID_CHAT_MODE } = require("../../../utils/chats/stream");
|
||||
const { EventLogs } = require("../../../models/eventLogs");
|
||||
const {
|
||||
convertToChatHistory,
|
||||
writeResponseChunk,
|
||||
} = require("../../../utils/helpers/chat/responses");
|
||||
const { ApiChatHandler } = require("../../../utils/chats/apiChatHandler");
|
||||
|
||||
function apiWorkspaceEndpoints(app) {
|
||||
if (!app) return;
|
||||
@ -584,7 +581,7 @@ function apiWorkspaceEndpoints(app) {
|
||||
try {
|
||||
const { slug } = request.params;
|
||||
const { message, mode = "query" } = reqBody(request);
|
||||
const workspace = await Workspace.get({ slug });
|
||||
const workspace = await Workspace.get({ slug: String(slug) });
|
||||
|
||||
if (!workspace) {
|
||||
response.status(400).json({
|
||||
@ -612,9 +609,17 @@ function apiWorkspaceEndpoints(app) {
|
||||
return;
|
||||
}
|
||||
|
||||
const result = await chatWithWorkspace(workspace, message, mode);
|
||||
const result = await ApiChatHandler.chatSync({
|
||||
workspace,
|
||||
message,
|
||||
mode,
|
||||
user: null,
|
||||
thread: null,
|
||||
});
|
||||
|
||||
await Telemetry.sendTelemetry("sent_chat", {
|
||||
LLMSelection: process.env.LLM_PROVIDER || "openai",
|
||||
LLMSelection:
|
||||
workspace.chatProvider ?? process.env.LLM_PROVIDER ?? "openai",
|
||||
Embedder: process.env.EMBEDDING_ENGINE || "inherit",
|
||||
VectorDbSelection: process.env.VECTOR_DB || "lancedb",
|
||||
TTSSelection: process.env.TTS_PROVIDER || "native",
|
||||
@ -623,7 +628,7 @@ function apiWorkspaceEndpoints(app) {
|
||||
workspaceName: workspace?.name,
|
||||
chatModel: workspace?.chatModel || "System Default",
|
||||
});
|
||||
response.status(200).json({ ...result });
|
||||
return response.status(200).json({ ...result });
|
||||
} catch (e) {
|
||||
console.error(e.message, e);
|
||||
response.status(500).json({
|
||||
@ -702,7 +707,7 @@ function apiWorkspaceEndpoints(app) {
|
||||
try {
|
||||
const { slug } = request.params;
|
||||
const { message, mode = "query" } = reqBody(request);
|
||||
const workspace = await Workspace.get({ slug });
|
||||
const workspace = await Workspace.get({ slug: String(slug) });
|
||||
|
||||
if (!workspace) {
|
||||
response.status(400).json({
|
||||
@ -736,9 +741,17 @@ function apiWorkspaceEndpoints(app) {
|
||||
response.setHeader("Connection", "keep-alive");
|
||||
response.flushHeaders();
|
||||
|
||||
await streamChatWithWorkspace(response, workspace, message, mode);
|
||||
await ApiChatHandler.streamChat({
|
||||
response,
|
||||
workspace,
|
||||
message,
|
||||
mode,
|
||||
user: null,
|
||||
thread: null,
|
||||
});
|
||||
await Telemetry.sendTelemetry("sent_chat", {
|
||||
LLMSelection: process.env.LLM_PROVIDER || "openai",
|
||||
LLMSelection:
|
||||
workspace.chatProvider ?? process.env.LLM_PROVIDER ?? "openai",
|
||||
Embedder: process.env.EMBEDDING_ENGINE || "inherit",
|
||||
VectorDbSelection: process.env.VECTOR_DB || "lancedb",
|
||||
TTSSelection: process.env.TTS_PROVIDER || "native",
|
||||
|
@ -3,7 +3,6 @@ const { WorkspaceThread } = require("../../../models/workspaceThread");
|
||||
const { Workspace } = require("../../../models/workspace");
|
||||
const { validApiKey } = require("../../../utils/middleware/validApiKey");
|
||||
const { reqBody, multiUserMode } = require("../../../utils/http");
|
||||
const { chatWithWorkspace } = require("../../../utils/chats");
|
||||
const {
|
||||
streamChatWithWorkspace,
|
||||
VALID_CHAT_MODE,
|
||||
@ -16,6 +15,7 @@ const {
|
||||
} = require("../../../utils/helpers/chat/responses");
|
||||
const { WorkspaceChats } = require("../../../models/workspaceChats");
|
||||
const { User } = require("../../../models/user");
|
||||
const { ApiChatHandler } = require("../../../utils/chats/apiChatHandler");
|
||||
|
||||
function apiWorkspaceThreadEndpoints(app) {
|
||||
if (!app) return;
|
||||
@ -405,13 +405,13 @@ function apiWorkspaceThreadEndpoints(app) {
|
||||
}
|
||||
|
||||
const user = userId ? await User.get({ id: Number(userId) }) : null;
|
||||
const result = await chatWithWorkspace(
|
||||
const result = await ApiChatHandler.chatSync({
|
||||
workspace,
|
||||
message,
|
||||
mode,
|
||||
user,
|
||||
thread
|
||||
);
|
||||
thread,
|
||||
});
|
||||
await Telemetry.sendTelemetry("sent_chat", {
|
||||
LLMSelection: process.env.LLM_PROVIDER || "openai",
|
||||
Embedder: process.env.EMBEDDING_ENGINE || "inherit",
|
||||
@ -556,14 +556,14 @@ function apiWorkspaceThreadEndpoints(app) {
|
||||
response.setHeader("Connection", "keep-alive");
|
||||
response.flushHeaders();
|
||||
|
||||
await streamChatWithWorkspace(
|
||||
await ApiChatHandler.streamChat({
|
||||
response,
|
||||
workspace,
|
||||
message,
|
||||
mode,
|
||||
user,
|
||||
thread
|
||||
);
|
||||
thread,
|
||||
});
|
||||
await Telemetry.sendTelemetry("sent_chat", {
|
||||
LLMSelection: process.env.LLM_PROVIDER || "openai",
|
||||
Embedder: process.env.EMBEDDING_ENGINE || "inherit",
|
||||
|
481
server/utils/chats/apiChatHandler.js
Normal file
481
server/utils/chats/apiChatHandler.js
Normal file
@ -0,0 +1,481 @@
|
||||
const { v4: uuidv4 } = require("uuid");
|
||||
const { DocumentManager } = require("../DocumentManager");
|
||||
const { WorkspaceChats } = require("../../models/workspaceChats");
|
||||
const { getVectorDbClass, getLLMProvider } = require("../helpers");
|
||||
const { writeResponseChunk } = require("../helpers/chat/responses");
|
||||
const { chatPrompt, sourceIdentifier, recentChatHistory } = require("./index");
|
||||
|
||||
/**
|
||||
* @typedef ResponseObject
|
||||
* @property {string} id - uuid of response
|
||||
* @property {string} type - Type of response
|
||||
* @property {string|null} textResponse - full text response
|
||||
* @property {object[]} sources
|
||||
* @property {boolean} close
|
||||
* @property {string|null} error
|
||||
*/
|
||||
|
||||
/**
 * Handle synchronous chats with your workspace via the developer API endpoint.
 *
 * Flow implemented below:
 *  1. In "query" mode, when the workspace has no vectorized namespace or zero
 *     embeddings, the refusal text is persisted and returned without calling the LLM.
 *  2. Otherwise pinned documents and (when embeddings exist) similarity-search
 *     results are collected as context.
 *  3. A similarity search that reports a `message` yields an "abort" response.
 *  4. In "query" mode, when no context was found at all, the refusal text is
 *     persisted and returned.
 *  5. Otherwise the prompt is compressed, a completion is requested, the chat is
 *     persisted, and the response (including `chatId`) is returned.
 *
 * @param {{
 *  workspace: import("@prisma/client").workspaces,
 *  message:string,
 *  mode: "chat"|"query",
 *  user: import("@prisma/client").users|null,
 *  thread: import("@prisma/client").workspace_threads|null,
 * }} parameters
 * @returns {Promise<ResponseObject>}
 */
async function chatSync({
  workspace,
  message = null,
  mode = "chat",
  user = null,
  thread = null,
}) {
  const uuid = uuidv4();
  // The default parameter only covers `undefined`; an explicit `null` also falls back to "chat".
  const chatMode = mode ?? "chat";
  const LLMConnector = getLLMProvider({
    provider: workspace?.chatProvider,
    model: workspace?.chatModel,
  });
  const VectorDb = getVectorDbClass();
  const messageLimit = workspace?.openAiHistory || 20;
  const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
  const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
  const refusalResponse =
    workspace?.queryRefusalResponse ??
    "There is no relevant information in this workspace to answer your query.";

  // User is trying to query-mode chat a workspace that has no data in it - so
  // we should exit early as no information can be found under these conditions.
  if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: String(message),
      response: {
        text: refusalResponse,
        sources: [],
        type: chatMode,
      },
      // Keep the refusal attached to its thread and user, exactly like every
      // other chat persisted by this handler.
      threadId: thread?.id || null,
      include: false,
      user,
    });

    return {
      id: uuid,
      type: "textResponse",
      sources: [],
      close: true,
      error: null,
      textResponse: refusalResponse,
    };
  }

  // If we are here we know that we are in a workspace that is:
  // 1. Chatting in "chat" mode and may or may _not_ have embeddings
  // 2. Chatting in "query" mode and has at least 1 embedding
  let contextTexts = [];
  let sources = [];
  const pinnedDocIdentifiers = [];
  const { rawHistory, chatHistory } = await recentChatHistory({
    user,
    workspace,
    thread,
    messageLimit,
    chatMode,
  });

  // Pinned documents are always added to the context in full; the citation
  // shown to the caller only carries the first 1,000 characters of each.
  const pinnedDocs = await new DocumentManager({
    workspace,
    maxTokens: LLMConnector.promptWindowLimit(),
  }).pinnedDocs();
  for (const doc of pinnedDocs) {
    const { pageContent, ...metadata } = doc;
    pinnedDocIdentifiers.push(sourceIdentifier(doc));
    contextTexts.push(pageContent);
    sources.push({
      text: `${pageContent.slice(0, 1_000)}...continued on in source document...`,
      ...metadata,
    });
  }

  // Only hit the vector database when there is something to search; pinned
  // documents are passed as filterIdentifiers to the search.
  const vectorSearchResults =
    embeddingsCount !== 0
      ? await VectorDb.performSimilaritySearch({
          namespace: workspace.slug,
          input: message,
          LLMConnector,
          similarityThreshold: workspace?.similarityThreshold,
          topN: workspace?.topN,
          filterIdentifiers: pinnedDocIdentifiers,
        })
      : {
          contextTexts: [],
          sources: [],
          message: null,
        };

  // Failed similarity search if it was run at all and failed.
  if (vectorSearchResults.message) {
    return {
      id: uuid,
      type: "abort",
      textResponse: null,
      sources: [],
      close: true,
      error: vectorSearchResults.message,
    };
  }

  // Required at call time, as in the original implementation.
  const { fillSourceWindow } = require("../helpers/chat");
  const filledSources = fillSourceWindow({
    nDocs: workspace?.topN || 4,
    searchResults: vectorSearchResults.sources,
    history: rawHistory,
    filterIdentifiers: pinnedDocIdentifiers,
  });

  // Why does contextTexts get all the info, but sources only get the current search?
  // This gives the LLM the ability to "comprehend" a contextual response without
  // populating the citations under a response with documents the user "thinks" are irrelevant
  // due to how we manage backfilling of the context to keep chats with the LLM more correct in responses.
  // If a past citation was used to answer the question - that is visible in the history so it logically makes sense
  // and does not appear to the user that a new response used information that is otherwise irrelevant for a given prompt.
  // TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keeping answers highly accurate.
  contextTexts = [...contextTexts, ...filledSources.contextTexts];
  sources = [...sources, ...vectorSearchResults.sources];

  // If in query mode and no context chunks are found from search, backfill, or pins - do not
  // let the LLM try to hallucinate a response or use general knowledge and exit early
  if (chatMode === "query" && contextTexts.length === 0) {
    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: message,
      response: {
        text: refusalResponse,
        sources: [],
        type: chatMode,
      },
      threadId: thread?.id || null,
      include: false,
      user,
    });

    return {
      id: uuid,
      type: "textResponse",
      sources: [],
      close: true,
      error: null,
      textResponse: refusalResponse,
    };
  }

  // Compress & assemble the messages to ensure the prompt passes the token limit with room
  // for a response and build system messages based on inputs and history.
  const messages = await LLMConnector.compressMessages(
    {
      systemPrompt: chatPrompt(workspace),
      userPrompt: message,
      contextTexts,
      chatHistory,
    },
    rawHistory
  );

  // Send the text completion.
  const textResponse = await LLMConnector.getChatCompletion(messages, {
    temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
  });

  if (!textResponse) {
    return {
      id: uuid,
      type: "abort",
      textResponse: null,
      sources: [],
      close: true,
      error: "No text completion could be completed with this input.",
    };
  }

  const { chat } = await WorkspaceChats.new({
    workspaceId: workspace.id,
    prompt: message,
    response: { text: textResponse, sources, type: chatMode },
    threadId: thread?.id || null,
    user,
  });

  return {
    id: uuid,
    type: "textResponse",
    close: true,
    error: null,
    chatId: chat.id,
    textResponse,
    sources,
  };
}
|
||||
|
||||
/**
|
||||
* Handle streamable HTTP chunks for chats with your workspace via the developer API endpoint
|
||||
* @param {{
|
||||
* response: import("express").Response,
|
||||
* workspace: import("@prisma/client").workspaces,
|
||||
* message:string,
|
||||
* mode: "chat"|"query",
|
||||
* user: import("@prisma/client").users|null,
|
||||
* thread: import("@prisma/client").workspace_threads|null,
|
||||
* }} parameters
|
||||
* @returns {Promise<VoidFunction>}
|
||||
*/
|
||||
async function streamChat({
|
||||
response,
|
||||
workspace,
|
||||
message = null,
|
||||
mode = "chat",
|
||||
user = null,
|
||||
thread = null,
|
||||
}) {
|
||||
const uuid = uuidv4();
|
||||
const chatMode = mode ?? "chat";
|
||||
const LLMConnector = getLLMProvider({
|
||||
provider: workspace?.chatProvider,
|
||||
model: workspace?.chatModel,
|
||||
});
|
||||
|
||||
const VectorDb = getVectorDbClass();
|
||||
const messageLimit = workspace?.openAiHistory || 20;
|
||||
const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
|
||||
const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
|
||||
|
||||
// User is trying to query-mode chat a workspace that has no data in it - so
|
||||
// we should exit early as no information can be found under these conditions.
|
||||
if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
|
||||
const textResponse =
|
||||
workspace?.queryRefusalResponse ??
|
||||
"There is no relevant information in this workspace to answer your query.";
|
||||
writeResponseChunk(response, {
|
||||
id: uuid,
|
||||
type: "textResponse",
|
||||
textResponse,
|
||||
sources: [],
|
||||
attachments: [],
|
||||
close: true,
|
||||
error: null,
|
||||
});
|
||||
await WorkspaceChats.new({
|
||||
workspaceId: workspace.id,
|
||||
prompt: message,
|
||||
response: {
|
||||
text: textResponse,
|
||||
sources: [],
|
||||
type: chatMode,
|
||||
attachments: [],
|
||||
},
|
||||
threadId: thread?.id || null,
|
||||
include: false,
|
||||
user,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
// If we are here we know that we are in a workspace that is:
|
||||
// 1. Chatting in "chat" mode and may or may _not_ have embeddings
|
||||
// 2. Chatting in "query" mode and has at least 1 embedding
|
||||
let completeText;
|
||||
let contextTexts = [];
|
||||
let sources = [];
|
||||
let pinnedDocIdentifiers = [];
|
||||
const { rawHistory, chatHistory } = await recentChatHistory({
|
||||
user,
|
||||
workspace,
|
||||
thread,
|
||||
messageLimit,
|
||||
});
|
||||
|
||||
// Look for pinned documents and see if the user decided to use this feature. We will also do a vector search
|
||||
// as pinning is a supplemental tool but it should be used with caution since it can easily blow up a context window.
|
||||
// However we limit the maximum of appended context to 80% of its overall size, mostly because if it expands beyond this
|
||||
// it will undergo prompt compression anyway to make it work. If there is so much pinned that the context here is bigger than
|
||||
// what the model can support - it would get compressed anyway and that really is not the point of pinning. It is really best
|
||||
// suited for high-context models.
|
||||
await new DocumentManager({
|
||||
workspace,
|
||||
maxTokens: LLMConnector.promptWindowLimit(),
|
||||
})
|
||||
.pinnedDocs()
|
||||
.then((pinnedDocs) => {
|
||||
pinnedDocs.forEach((doc) => {
|
||||
const { pageContent, ...metadata } = doc;
|
||||
pinnedDocIdentifiers.push(sourceIdentifier(doc));
|
||||
contextTexts.push(doc.pageContent);
|
||||
sources.push({
|
||||
text:
|
||||
pageContent.slice(0, 1_000) +
|
||||
"...continued on in source document...",
|
||||
...metadata,
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
const vectorSearchResults =
|
||||
embeddingsCount !== 0
|
||||
? await VectorDb.performSimilaritySearch({
|
||||
namespace: workspace.slug,
|
||||
input: message,
|
||||
LLMConnector,
|
||||
similarityThreshold: workspace?.similarityThreshold,
|
||||
topN: workspace?.topN,
|
||||
filterIdentifiers: pinnedDocIdentifiers,
|
||||
})
|
||||
: {
|
||||
contextTexts: [],
|
||||
sources: [],
|
||||
message: null,
|
||||
};
|
||||
|
||||
// Failed similarity search if it was run at all and failed.
|
||||
if (!!vectorSearchResults.message) {
|
||||
writeResponseChunk(response, {
|
||||
id: uuid,
|
||||
type: "abort",
|
||||
textResponse: null,
|
||||
sources: [],
|
||||
close: true,
|
||||
error: vectorSearchResults.message,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const { fillSourceWindow } = require("../helpers/chat");
|
||||
const filledSources = fillSourceWindow({
|
||||
nDocs: workspace?.topN || 4,
|
||||
searchResults: vectorSearchResults.sources,
|
||||
history: rawHistory,
|
||||
filterIdentifiers: pinnedDocIdentifiers,
|
||||
});
|
||||
|
||||
// Why does contextTexts get all the info, but sources only get current search?
|
||||
// This is to give the ability of the LLM to "comprehend" a contextual response without
|
||||
// populating the Citations under a response with documents the user "thinks" are irrelevant
|
||||
// due to how we manage backfilling of the context to keep chats with the LLM more correct in responses.
|
||||
// If a past citation was used to answer the question - that is visible in the history so it logically makes sense
|
||||
// and does not appear to the user that a new response used information that is otherwise irrelevant for a given prompt.
|
||||
// TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keep answers highly accurate.
|
||||
contextTexts = [...contextTexts, ...filledSources.contextTexts];
|
||||
sources = [...sources, ...vectorSearchResults.sources];
|
||||
|
||||
// If in query mode and no context chunks are found from search, backfill, or pins - do not
|
||||
// let the LLM try to hallucinate a response or use general knowledge and exit early
|
||||
if (chatMode === "query" && contextTexts.length === 0) {
|
||||
const textResponse =
|
||||
workspace?.queryRefusalResponse ??
|
||||
"There is no relevant information in this workspace to answer your query.";
|
||||
writeResponseChunk(response, {
|
||||
id: uuid,
|
||||
type: "textResponse",
|
||||
textResponse,
|
||||
sources: [],
|
||||
close: true,
|
||||
error: null,
|
||||
});
|
||||
|
||||
await WorkspaceChats.new({
|
||||
workspaceId: workspace.id,
|
||||
prompt: message,
|
||||
response: {
|
||||
text: textResponse,
|
||||
sources: [],
|
||||
type: chatMode,
|
||||
attachments: [],
|
||||
},
|
||||
threadId: thread?.id || null,
|
||||
include: false,
|
||||
user,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
// Compress & Assemble message to ensure prompt passes token limit with room for response
|
||||
// and build system messages based on inputs and history.
|
||||
const messages = await LLMConnector.compressMessages(
|
||||
{
|
||||
systemPrompt: chatPrompt(workspace),
|
||||
userPrompt: message,
|
||||
contextTexts,
|
||||
chatHistory,
|
||||
},
|
||||
rawHistory
|
||||
);
|
||||
|
||||
// If streaming is not explicitly enabled for connector
|
||||
// we do regular waiting of a response and send a single chunk.
|
||||
if (LLMConnector.streamingEnabled() !== true) {
|
||||
console.log(
|
||||
`\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
|
||||
);
|
||||
completeText = await LLMConnector.getChatCompletion(messages, {
|
||||
temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
|
||||
});
|
||||
writeResponseChunk(response, {
|
||||
uuid,
|
||||
sources,
|
||||
type: "textResponseChunk",
|
||||
textResponse: completeText,
|
||||
close: true,
|
||||
error: false,
|
||||
});
|
||||
} else {
|
||||
const stream = await LLMConnector.streamGetChatCompletion(messages, {
|
||||
temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
|
||||
});
|
||||
completeText = await LLMConnector.handleStream(response, stream, {
|
||||
uuid,
|
||||
sources,
|
||||
});
|
||||
}
|
||||
|
||||
if (completeText?.length > 0) {
|
||||
const { chat } = await WorkspaceChats.new({
|
||||
workspaceId: workspace.id,
|
||||
prompt: message,
|
||||
response: { text: completeText, sources, type: chatMode },
|
||||
threadId: thread?.id || null,
|
||||
user,
|
||||
});
|
||||
|
||||
writeResponseChunk(response, {
|
||||
uuid,
|
||||
type: "finalizeResponseStream",
|
||||
close: true,
|
||||
error: false,
|
||||
chatId: chat.id,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
writeResponseChunk(response, {
|
||||
uuid,
|
||||
type: "finalizeResponseStream",
|
||||
close: true,
|
||||
error: false,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
module.exports.ApiChatHandler = {
|
||||
chatSync,
|
||||
streamChat,
|
||||
};
|
@ -1,9 +1,7 @@
|
||||
const { v4: uuidv4 } = require("uuid");
|
||||
const { WorkspaceChats } = require("../../models/workspaceChats");
|
||||
const { resetMemory } = require("./commands/reset");
|
||||
const { getVectorDbClass, getLLMProvider } = require("../helpers");
|
||||
const { convertToPromptHistory } = require("../helpers/chat/responses");
|
||||
const { DocumentManager } = require("../DocumentManager");
|
||||
const { SlashCommandPresets } = require("../../models/slashCommandsPresets");
|
||||
|
||||
const VALID_COMMANDS = {
|
||||
@ -34,216 +32,6 @@ async function grepCommand(message, user = null) {
|
||||
return updatedMessage;
|
||||
}
|
||||
|
||||
async function chatWithWorkspace(
|
||||
workspace,
|
||||
message,
|
||||
chatMode = "chat",
|
||||
user = null,
|
||||
thread = null
|
||||
) {
|
||||
const uuid = uuidv4();
|
||||
const updatedMessage = await grepCommand(message, user);
|
||||
|
||||
if (Object.keys(VALID_COMMANDS).includes(updatedMessage)) {
|
||||
return await VALID_COMMANDS[updatedMessage](workspace, message, uuid, user);
|
||||
}
|
||||
|
||||
const LLMConnector = getLLMProvider({
|
||||
provider: workspace?.chatProvider,
|
||||
model: workspace?.chatModel,
|
||||
});
|
||||
const VectorDb = getVectorDbClass();
|
||||
|
||||
const messageLimit = workspace?.openAiHistory || 20;
|
||||
const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
|
||||
const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
|
||||
|
||||
// User is trying to query-mode chat a workspace that has no data in it - so
|
||||
// we should exit early as no information can be found under these conditions.
|
||||
if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
|
||||
const textResponse =
|
||||
workspace?.queryRefusalResponse ??
|
||||
"There is no relevant information in this workspace to answer your query.";
|
||||
|
||||
await WorkspaceChats.new({
|
||||
workspaceId: workspace.id,
|
||||
prompt: message,
|
||||
response: {
|
||||
text: textResponse,
|
||||
sources: [],
|
||||
type: chatMode,
|
||||
},
|
||||
threadId: thread?.id || null,
|
||||
include: false,
|
||||
user,
|
||||
});
|
||||
|
||||
return {
|
||||
id: uuid,
|
||||
type: "textResponse",
|
||||
sources: [],
|
||||
close: true,
|
||||
error: null,
|
||||
textResponse,
|
||||
};
|
||||
}
|
||||
|
||||
// If we are here we know that we are in a workspace that is:
|
||||
// 1. Chatting in "chat" mode and may or may _not_ have embeddings
|
||||
// 2. Chatting in "query" mode and has at least 1 embedding
|
||||
let contextTexts = [];
|
||||
let sources = [];
|
||||
let pinnedDocIdentifiers = [];
|
||||
const { rawHistory, chatHistory } = await recentChatHistory({
|
||||
user,
|
||||
workspace,
|
||||
thread,
|
||||
messageLimit,
|
||||
chatMode,
|
||||
});
|
||||
|
||||
// See stream.js comment for more information on this implementation.
|
||||
await new DocumentManager({
|
||||
workspace,
|
||||
maxTokens: LLMConnector.promptWindowLimit(),
|
||||
})
|
||||
.pinnedDocs()
|
||||
.then((pinnedDocs) => {
|
||||
pinnedDocs.forEach((doc) => {
|
||||
const { pageContent, ...metadata } = doc;
|
||||
pinnedDocIdentifiers.push(sourceIdentifier(doc));
|
||||
contextTexts.push(doc.pageContent);
|
||||
sources.push({
|
||||
text:
|
||||
pageContent.slice(0, 1_000) +
|
||||
"...continued on in source document...",
|
||||
...metadata,
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
const vectorSearchResults =
|
||||
embeddingsCount !== 0
|
||||
? await VectorDb.performSimilaritySearch({
|
||||
namespace: workspace.slug,
|
||||
input: message,
|
||||
LLMConnector,
|
||||
similarityThreshold: workspace?.similarityThreshold,
|
||||
topN: workspace?.topN,
|
||||
filterIdentifiers: pinnedDocIdentifiers,
|
||||
})
|
||||
: {
|
||||
contextTexts: [],
|
||||
sources: [],
|
||||
message: null,
|
||||
};
|
||||
|
||||
// Failed similarity search if it was run at all and failed.
|
||||
if (!!vectorSearchResults.message) {
|
||||
return {
|
||||
id: uuid,
|
||||
type: "abort",
|
||||
textResponse: null,
|
||||
sources: [],
|
||||
close: true,
|
||||
error: vectorSearchResults.message,
|
||||
};
|
||||
}
|
||||
|
||||
const { fillSourceWindow } = require("../helpers/chat");
|
||||
const filledSources = fillSourceWindow({
|
||||
nDocs: workspace?.topN || 4,
|
||||
searchResults: vectorSearchResults.sources,
|
||||
history: rawHistory,
|
||||
filterIdentifiers: pinnedDocIdentifiers,
|
||||
});
|
||||
|
||||
// Why does contextTexts get all the info, but sources only get current search?
|
||||
// This is to give the ability of the LLM to "comprehend" a contextual response without
|
||||
// populating the Citations under a response with documents the user "thinks" are irrelevant
|
||||
// due to how we manage backfilling of the context to keep chats with the LLM more correct in responses.
|
||||
// If a past citation was used to answer the question - that is visible in the history so it logically makes sense
|
||||
// and does not appear to the user that a new response used information that is otherwise irrelevant for a given prompt.
|
||||
// TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keep answers highly accurate.
|
||||
contextTexts = [...contextTexts, ...filledSources.contextTexts];
|
||||
sources = [...sources, ...vectorSearchResults.sources];
|
||||
|
||||
// If in query mode and no context chunks are found from search, backfill, or pins - do not
|
||||
// let the LLM try to hallucinate a response or use general knowledge and exit early
|
||||
if (chatMode === "query" && contextTexts.length === 0) {
|
||||
const textResponse =
|
||||
workspace?.queryRefusalResponse ??
|
||||
"There is no relevant information in this workspace to answer your query.";
|
||||
|
||||
await WorkspaceChats.new({
|
||||
workspaceId: workspace.id,
|
||||
prompt: message,
|
||||
response: {
|
||||
text: textResponse,
|
||||
sources: [],
|
||||
type: chatMode,
|
||||
},
|
||||
threadId: thread?.id || null,
|
||||
include: false,
|
||||
user,
|
||||
});
|
||||
|
||||
return {
|
||||
id: uuid,
|
||||
type: "textResponse",
|
||||
sources: [],
|
||||
close: true,
|
||||
error: null,
|
||||
textResponse,
|
||||
};
|
||||
}
|
||||
|
||||
// Compress & Assemble message to ensure prompt passes token limit with room for response
|
||||
// and build system messages based on inputs and history.
|
||||
const messages = await LLMConnector.compressMessages(
|
||||
{
|
||||
systemPrompt: chatPrompt(workspace),
|
||||
userPrompt: updatedMessage,
|
||||
contextTexts,
|
||||
chatHistory,
|
||||
},
|
||||
rawHistory
|
||||
);
|
||||
|
||||
// Send the text completion.
|
||||
const textResponse = await LLMConnector.getChatCompletion(messages, {
|
||||
temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
|
||||
});
|
||||
|
||||
if (!textResponse) {
|
||||
return {
|
||||
id: uuid,
|
||||
type: "abort",
|
||||
textResponse: null,
|
||||
sources: [],
|
||||
close: true,
|
||||
error: "No text completion could be completed with this input.",
|
||||
};
|
||||
}
|
||||
|
||||
const { chat } = await WorkspaceChats.new({
|
||||
workspaceId: workspace.id,
|
||||
prompt: message,
|
||||
response: { text: textResponse, sources, type: chatMode },
|
||||
threadId: thread?.id || null,
|
||||
user,
|
||||
});
|
||||
return {
|
||||
id: uuid,
|
||||
type: "textResponse",
|
||||
close: true,
|
||||
error: null,
|
||||
chatId: chat.id,
|
||||
textResponse,
|
||||
sources,
|
||||
};
|
||||
}
|
||||
|
||||
async function recentChatHistory({
|
||||
user = null,
|
||||
workspace,
|
||||
|
Loading…
Reference in New Issue
Block a user