anything-llm/server/utils/chats/stream.js
Timothy Carambat c59ab9da0a
Refactor LLM chat backend (#717)
* refactor stream/chat/embed-stram to be a single execution logic path so that it is easier to maintain and build upon

* no thread in sync chat since only api uses it
adjust import locations
2024-02-14 12:32:07 -08:00

192 lines
5.1 KiB
JavaScript

const { v4: uuidv4 } = require("uuid");
const { WorkspaceChats } = require("../../models/workspaceChats");
const { getVectorDbClass, getLLMProvider } = require("../helpers");
const { writeResponseChunk } = require("../helpers/chat/responses");
const {
grepCommand,
VALID_COMMANDS,
chatPrompt,
recentChatHistory,
} = require("./index");
const VALID_CHAT_MODE = ["chat", "query"];
async function streamChatWithWorkspace(
response,
workspace,
message,
chatMode = "chat",
user = null,
thread = null
) {
const uuid = uuidv4();
const command = grepCommand(message);
if (!!command && Object.keys(VALID_COMMANDS).includes(command)) {
const data = await VALID_COMMANDS[command](
workspace,
message,
uuid,
user,
thread
);
writeResponseChunk(response, data);
return;
}
const LLMConnector = getLLMProvider(workspace?.chatModel);
const VectorDb = getVectorDbClass();
const { safe, reasons = [] } = await LLMConnector.isSafe(message);
if (!safe) {
writeResponseChunk(response, {
id: uuid,
type: "abort",
textResponse: null,
sources: [],
close: true,
error: `This message was moderated and will not be allowed. Violations for ${reasons.join(
", "
)} found.`,
});
return;
}
const messageLimit = workspace?.openAiHistory || 20;
const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
// User is trying to query-mode chat a workspace that has no data in it - so
// we should exit early as no information can be found under these conditions.
if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
writeResponseChunk(response, {
id: uuid,
type: "textResponse",
textResponse:
"There is no relevant information in this workspace to answer your query.",
sources: [],
close: true,
error: null,
});
return;
}
// If we are here we know that we are in a workspace that is:
// 1. Chatting in "chat" mode and may or may _not_ have embeddings
// 2. Chatting in "query" mode and has at least 1 embedding
let completeText;
const { rawHistory, chatHistory } = await recentChatHistory({
user,
workspace,
thread,
messageLimit,
chatMode,
});
const {
contextTexts = [],
sources = [],
message: error,
} = embeddingsCount !== 0 // if there no embeddings don't bother searching.
? await VectorDb.performSimilaritySearch({
namespace: workspace.slug,
input: message,
LLMConnector,
similarityThreshold: workspace?.similarityThreshold,
topN: workspace?.topN,
})
: {
contextTexts: [],
sources: [],
message: null,
};
// Failed similarity search if it was run at all and failed.
if (!!error) {
writeResponseChunk(response, {
id: uuid,
type: "abort",
textResponse: null,
sources: [],
close: true,
error,
});
return;
}
// If in query mode and no sources are found, do not
// let the LLM try to hallucinate a response or use general knowledge and exit early
if (chatMode === "query" && sources.length === 0) {
writeResponseChunk(response, {
id: uuid,
type: "textResponse",
textResponse:
"There is no relevant information in this workspace to answer your query.",
sources: [],
close: true,
error: null,
});
return;
}
// Compress & Assemble message to ensure prompt passes token limit with room for response
// and build system messages based on inputs and history.
const messages = await LLMConnector.compressMessages(
{
systemPrompt: chatPrompt(workspace),
userPrompt: message,
contextTexts,
chatHistory,
},
rawHistory
);
// If streaming is not explicitly enabled for connector
// we do regular waiting of a response and send a single chunk.
if (LLMConnector.streamingEnabled() !== true) {
console.log(
`\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
);
completeText = await LLMConnector.getChatCompletion(messages, {
temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
});
writeResponseChunk(response, {
uuid,
sources,
type: "textResponseChunk",
textResponse: completeText,
close: true,
error: false,
});
} else {
const stream = await LLMConnector.streamGetChatCompletion(messages, {
temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
});
completeText = await LLMConnector.handleStream(response, stream, {
uuid,
sources,
});
}
const { chat } = await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: message,
response: { text: completeText, sources, type: chatMode },
threadId: thread?.id || null,
user,
});
writeResponseChunk(response, {
uuid,
type: "finalizeResponseStream",
close: true,
error: false,
chatId: chat.id,
});
return;
}
module.exports = {
VALID_CHAT_MODE,
streamChatWithWorkspace,
};