anything-llm/server/utils/chats/stream.js

const { v4: uuidv4 } = require("uuid");
const { WorkspaceChats } = require("../../models/workspaceChats");
const { getVectorDbClass, getLLMProvider } = require("../helpers");
const {
  grepCommand,
  recentChatHistory,
  VALID_COMMANDS,
  chatPrompt,
} = require(".");

/**
 * Serialize `data` as JSON and write it to `response` as one
 * `data: <json>` line followed by a blank line.
 *
 * @param {{ write: Function }} response - Target whose `write` method receives the frame.
 * @param {*} data - Value to serialize with `JSON.stringify`.
 * @returns {void}
 */
function writeResponseChunk(response, data) {
  const payload = JSON.stringify(data);
  response.write(`data: ${payload}\n\n`);
}

/**
 * Answer a single chat message for a workspace, writing the reply to
 * `response` through `writeResponseChunk` and saving the exchange with
 * `WorkspaceChats.new`.
 *
 * Flow:
 *  1. If the message maps to an entry of VALID_COMMANDS, run it and stop.
 *  2. Abort if the LLM connector reports the message as not safe.
 *  3. Delegate to `streamEmptyEmbeddingChat` when the workspace has no
 *     namespace or no embeddings.
 *  4. Otherwise run a similarity search, build the prompt and reply either as
 *     a stream or as one single chunk, then persist the chat.
 *
 * @param {object} response - Output target handed to `writeResponseChunk`.
 * @param {object} workspace - Workspace record; `id`, `slug`, `openAiHistory`,
 *   `openAiTemp` and `similarityThreshold` are read here.
 * @param {string} message - The user's prompt.
 * @param {string} [chatMode="chat"] - Forwarded to `recentChatHistory` and
 *   stored as the `type` of the saved response.
 * @param {object|null} [user=null] - Forwarded to commands, the history lookup
 *   and the saved chat.
 * @returns {Promise<*>} Nothing meaningful; the no-embeddings branch resolves
 *   to whatever `streamEmptyEmbeddingChat` resolves to.
 */
async function streamChatWithWorkspace(
  response,
  workspace,
  message,
  chatMode = "chat",
  user = null
) {
  const uuid = uuidv4();

  // Emits the terminal "abort" chunk used by every failure path below.
  const abortWith = (error) =>
    writeResponseChunk(response, {
      id: uuid,
      type: "abort",
      textResponse: null,
      sources: [],
      close: true,
      error,
    });

  // Known commands are answered directly and never reach the LLM.
  const command = grepCommand(message);
  const isKnownCommand =
    !!command && Object.keys(VALID_COMMANDS).includes(command);
  if (isKnownCommand) {
    const commandResult = await VALID_COMMANDS[command](
      workspace,
      message,
      uuid,
      user
    );
    writeResponseChunk(response, commandResult);
    return;
  }

  const LLMConnector = getLLMProvider();
  const VectorDb = getVectorDbClass();

  const moderation = await LLMConnector.isSafe(message);
  const { safe, reasons = [] } = moderation;
  if (!safe) {
    const violations = reasons.join(", ");
    abortWith(
      `This message was moderated and will not be allowed. Violations for ${violations} found.`
    );
    return;
  }

  const messageLimit = workspace?.openAiHistory || 20;
  const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
  const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
  const hasEmbeddings = hasVectorizedSpace && embeddingsCount !== 0;
  if (!hasEmbeddings) {
    // Nothing to retrieve from - behave like a plain LLM chat interface.
    return await streamEmptyEmbeddingChat({
      response,
      uuid,
      user,
      message,
      workspace,
      messageLimit,
      LLMConnector,
    });
  }

  const history = await recentChatHistory(
    user,
    workspace,
    messageLimit,
    chatMode
  );
  const { rawHistory, chatHistory } = history;

  const searchResult = await VectorDb.performSimilaritySearch({
    namespace: workspace.slug,
    input: message,
    LLMConnector,
    similarityThreshold: workspace?.similarityThreshold,
  });
  const { contextTexts = [], sources = [], message: error } = searchResult;

  // A truthy `message` on the search result signals a failed search.
  if (!!error) {
    abortWith(error);
    return;
  }

  // Let the connector compress the prompt so it fits the token limit with
  // room left for the response, and assemble the final message list.
  const promptParts = {
    systemPrompt: chatPrompt(workspace),
    userPrompt: message,
    contextTexts,
    chatHistory,
  };
  const messages = await LLMConnector.compressMessages(promptParts, rawHistory);

  const temperature = workspace?.openAiTemp ?? 0.7;
  let completeText;

  if (LLMConnector.streamingEnabled() === true) {
    const stream = await LLMConnector.streamGetChatCompletion(messages, {
      temperature,
    });
    completeText = await handleStreamResponses(response, stream, {
      uuid,
      sources,
    });
  } else {
    // Streaming was not explicitly enabled: wait for the whole completion
    // and send it as one closing chunk.
    console.log(
      `\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
    );
    completeText = await LLMConnector.getChatCompletion(messages, {
      temperature,
    });
    writeResponseChunk(response, {
      uuid,
      sources,
      type: "textResponseChunk",
      textResponse: completeText,
      close: true,
      error: false,
    });
  }

  await WorkspaceChats.new({
    workspaceId: workspace.id,
    prompt: message,
    response: { text: completeText, sources, type: chatMode },
    user,
  });
}

/**
 * Answer a message for a workspace without any retrieved document context,
 * write the reply to `response`, and save the exchange with
 * `WorkspaceChats.new` (always stored with type "chat" and no sources).
 *
 * @param {object} params
 * @param {object} params.response - Output target handed to `writeResponseChunk`.
 * @param {string} params.uuid - Identifier attached to every emitted chunk.
 * @param {object|null} params.user - Forwarded to the history lookup and the saved chat.
 * @param {string} params.message - The user's prompt.
 * @param {object} params.workspace - Workspace record; `id` is read here and the
 *   whole object is passed to the connector.
 * @param {number} params.messageLimit - Forwarded to `recentChatHistory`.
 * @param {object} params.LLMConnector - Connector exposing `streamingEnabled`,
 *   `sendChat` and `streamChat`.
 * @returns {Promise<void>}
 */
async function streamEmptyEmbeddingChat({
  response,
  uuid,
  user,
  message,
  workspace,
  messageLimit,
  LLMConnector,
}) {
  const history = await recentChatHistory(user, workspace, messageLimit);
  const { rawHistory, chatHistory } = history;

  // Both connector entry points take the same positional arguments.
  const chatArgs = [chatHistory, message, workspace, rawHistory];
  let completeText;

  if (LLMConnector.streamingEnabled() === true) {
    const stream = await LLMConnector.streamChat(...chatArgs);
    completeText = await handleStreamResponses(response, stream, {
      uuid,
      sources: [],
    });
  } else {
    // Streaming was not explicitly enabled: wait for the full reply and
    // send it as one closing chunk.
    console.log(
      `\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
    );
    completeText = await LLMConnector.sendChat(...chatArgs);
    writeResponseChunk(response, {
      uuid,
      type: "textResponseChunk",
      textResponse: completeText,
      sources: [],
      close: true,
      error: false,
    });
  }

  await WorkspaceChats.new({
    workspaceId: workspace.id,
    prompt: message,
    response: { text: completeText, sources: [], type: "chat" },
    user,
  });
}

/**
 * Relay an LLM stream to the client chunk by chunk and resolve with the full
 * generated text once the stream is finished.
 *
 * Two stream shapes are supported:
 *  - an object with its own `data` property that emits "data" events carrying
 *    `data: <json>` lines, where tokens are read from
 *    `choices[0].delta.content` and the stream terminates on a non-null
 *    `choices[0].finish_reason` or a `[DONE]` line;
 *  - anything else, which is consumed with `for await` and is expected to
 *    yield objects with a `content` string.
 *
 * Every token is forwarded with `close: false` and empty `sources`; exactly
 * one final chunk with `close: true` and the real `sources` is written when
 * the stream completes.
 *
 * @param {object} response - Output target handed to `writeResponseChunk`.
 * @param {object} stream - Event-style stream (`stream.data.on`) or an async iterable.
 * @param {{ uuid?: string, sources?: Array }} [responseProps] - Values attached
 *   to emitted chunks; `uuid` defaults to a fresh v4 UUID, `sources` to `[]`.
 * @returns {Promise<string>} The concatenated text of all forwarded tokens.
 *   For async-iterable streams the promise rejects if iteration throws.
 */
function handleStreamResponses(response, stream, responseProps = {}) {
  const { uuid = uuidv4(), sources = [] } = responseProps;

  const sendToken = (textResponse) =>
    writeResponseChunk(response, {
      uuid,
      sources: [],
      type: "textResponseChunk",
      textResponse,
      close: false,
      error: false,
    });

  const sendClose = () =>
    writeResponseChunk(response, {
      uuid,
      sources,
      type: "textResponseChunk",
      textResponse: "",
      close: true,
      error: false,
    });

  // If stream is not a regular OpenAI Stream (like if using native model)
  // we can just iterate the stream content instead.
  if (!Object.prototype.hasOwnProperty.call(stream, "data")) {
    // An async function (rather than an async Promise executor) makes sure
    // an iteration failure rejects the returned promise instead of leaving it
    // pending with an unhandled rejection.
    return (async () => {
      let fullText = "";
      for await (const chunk of stream) {
        // Guard against chunks without content so the literal string
        // "undefined" never ends up in the saved response.
        const content = chunk.content ?? "";
        fullText += content;
        sendToken(content);
      }
      sendClose();
      return fullText;
    })();
  }

  return new Promise((resolve) => {
    let fullText = "";
    let pending = ""; // Incomplete JSON carried over from previous lines.
    let finished = false;

    // Close the client stream and settle exactly once, whichever
    // terminator (finish_reason, [DONE], end, error) arrives first.
    const finish = () => {
      if (finished) return;
      finished = true;
      sendClose();
      resolve(fullText);
    };

    stream.data.on("data", (data) => {
      if (finished) return;

      const lines = String(data ?? "")
        .split("\n")
        .filter((line) => line.trim() !== "");

      for (const line of lines) {
        const payload = line.replace(/^data: /, "");

        // It can be possible that the chunk decoding is running away
        // and the message fails to build due to string length.
        // In this case abort the chunk and reset so we can continue.
        // ref: https://github.com/Mintplex-Labs/anything-llm/issues/416
        let message;
        try {
          message = pending + payload;
        } catch (e) {
          console.error(`Chunk appending error`, e);
          pending = "";
          continue;
        }

        // [DONE] is not JSON, so it has to be recognised before the JSON
        // validity check or it would be buffered as an incomplete chunk.
        if (payload.trim() === "[DONE]" || message.trim() === "[DONE]") {
          finish();
          return;
        }

        // JSON chunk is incomplete and has not ended yet so we need to
        // stitch it together. `message` already contains the buffered
        // prefix, so it replaces the buffer instead of being appended to it.
        let json;
        try {
          json = JSON.parse(message);
        } catch {
          pending = message;
          continue;
        }
        pending = "";

        const token = json?.choices?.[0]?.delta?.content;
        const finishReason = json?.choices?.[0]?.finish_reason || null;

        if (token) {
          fullText += token;
          sendToken(token);
        }

        if (finishReason !== null) {
          // Nothing after the terminator may be forwarded to the client.
          finish();
          return;
        }
      }
    });

    // The upstream connection can end or fail without sending a terminator;
    // settle with what was received instead of hanging the request.
    stream.data.on("end", finish);
    stream.data.on("error", (e) => {
      console.error(`Stream error`, e);
      finish();
    });
  });
}

module.exports = {
  streamChatWithWorkspace,
  writeResponseChunk,
};
Enable chat streaming for LLMs (#354) * [Draft] Enable chat streaming for LLMs * stream only, move sendChat to deprecated * Update TODO deprecation comments update console output color for streaming disabled 2023-11-14 00:07:30 +01:00			`const { v4: uuidv4 } = require("uuid");`
			`const { WorkspaceChats } = require("../../models/workspaceChats");`
			`const { getVectorDbClass, getLLMProvider } = require("../helpers");`
			`const {`
			`grepCommand,`
			`recentChatHistory,`
			`VALID_COMMANDS,`
			`chatPrompt,`
			`} = require(".");`

			`function writeResponseChunk(response, data) {`
			response.write(`data: ${JSON.stringify(data)}\n\n`);
			`return;`
			`}`

			`async function streamChatWithWorkspace(`
			`response,`
			`workspace,`
			`message,`
			`chatMode = "chat",`
			`user = null`
			`) {`
			`const uuid = uuidv4();`
			`const command = grepCommand(message);`

			`if (!!command && Object.keys(VALID_COMMANDS).includes(command)) {`
			`const data = await VALID_COMMANDS[command](workspace, message, uuid, user);`
			`writeResponseChunk(response, data);`
			`return;`
			`}`

			`const LLMConnector = getLLMProvider();`
			`const VectorDb = getVectorDbClass();`
			`const { safe, reasons = [] } = await LLMConnector.isSafe(message);`
			`if (!safe) {`
			`writeResponseChunk(response, {`
			`id: uuid,`
			`type: "abort",`
			`textResponse: null,`
			`sources: [],`
			`close: true,`
			error: `This message was moderated and will not be allowed. Violations for ${reasons.join(
			`", "`
			)} found.`,
			`});`
			`return;`
			`}`

			`const messageLimit = workspace?.openAiHistory \|\| 20;`
			`const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);`
			`const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);`
			`if (!hasVectorizedSpace \|\| embeddingsCount === 0) {`
			`// If there are no embeddings - chat like a normal LLM chat interface.`
			`return await streamEmptyEmbeddingChat({`
			`response,`
			`uuid,`
			`user,`
			`message,`
			`workspace,`
			`messageLimit,`
			`LLMConnector,`
			`});`
			`}`

			`let completeText;`
			`const { rawHistory, chatHistory } = await recentChatHistory(`
			`user,`
			`workspace,`
			`messageLimit,`
			`chatMode`
			`);`
			`const {`
			`contextTexts = [],`
			`sources = [],`
			`message: error,`
			`} = await VectorDb.performSimilaritySearch({`
			`namespace: workspace.slug,`
			`input: message,`
			`LLMConnector,`
			`similarityThreshold: workspace?.similarityThreshold,`
			`});`

			`// Failed similarity search.`
			`if (!!error) {`
			`writeResponseChunk(response, {`
			`id: uuid,`
			`type: "abort",`
			`textResponse: null,`
			`sources: [],`
			`close: true,`
			`error,`
			`});`
			`return;`
			`}`

			`// Compress message to ensure prompt passes token limit with room for response`
			`// and build system messages based on inputs and history.`
			`const messages = await LLMConnector.compressMessages(`
			`{`
			`systemPrompt: chatPrompt(workspace),`
			`userPrompt: message,`
			`contextTexts,`
			`chatHistory,`
			`},`
			`rawHistory`
			`);`

			`// If streaming is not explicitly enabled for connector`
			`// we do regular waiting of a response and send a single chunk.`
			`if (LLMConnector.streamingEnabled() !== true) {`
			`console.log(`
			`\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
			`);`
			`completeText = await LLMConnector.getChatCompletion(messages, {`
			`temperature: workspace?.openAiTemp ?? 0.7,`
			`});`
			`writeResponseChunk(response, {`
			`uuid,`
			`sources,`
			`type: "textResponseChunk",`
			`textResponse: completeText,`
			`close: true,`
			`error: false,`
			`});`
			`} else {`
			`const stream = await LLMConnector.streamGetChatCompletion(messages, {`
			`temperature: workspace?.openAiTemp ?? 0.7,`
			`});`
			`completeText = await handleStreamResponses(response, stream, {`
			`uuid,`
			`sources,`
			`});`
			`}`

			`await WorkspaceChats.new({`
			`workspaceId: workspace.id,`
			`prompt: message,`
			`response: { text: completeText, sources, type: chatMode },`
			`user,`
			`});`
			`return;`
			`}`

			`async function streamEmptyEmbeddingChat({`
			`response,`
			`uuid,`
			`user,`
			`message,`
			`workspace,`
			`messageLimit,`
			`LLMConnector,`
			`}) {`
			`let completeText;`
			`const { rawHistory, chatHistory } = await recentChatHistory(`
			`user,`
			`workspace,`
			`messageLimit`
			`);`

			`// If streaming is not explicitly enabled for connector`
			`// we do regular waiting of a response and send a single chunk.`
			`if (LLMConnector.streamingEnabled() !== true) {`
			`console.log(`
			`\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
			`);`
			`completeText = await LLMConnector.sendChat(`
			`chatHistory,`
			`message,`
			`workspace,`
			`rawHistory`
			`);`
			`writeResponseChunk(response, {`
			`uuid,`
			`type: "textResponseChunk",`
			`textResponse: completeText,`
			`sources: [],`
			`close: true,`
			`error: false,`
			`});`
			`} else {`
			`const stream = await LLMConnector.streamChat(`
			`chatHistory,`
			`message,`
			`workspace,`
			`rawHistory`
			`);`
			`completeText = await handleStreamResponses(response, stream, {`
			`uuid,`
			`sources: [],`
			`});`
			`}`

			`await WorkspaceChats.new({`
			`workspaceId: workspace.id,`
			`prompt: message,`
			`response: { text: completeText, sources: [], type: "chat" },`
			`user,`
			`});`
			`return;`
			`}`

			`function handleStreamResponses(response, stream, responseProps) {`
			`const { uuid = uuidv4(), sources = [] } = responseProps;`
[Feature] AnythingLLM use locally hosted Llama.cpp and GGUF files for inferencing (#413) * Implement use of native embedder (all-Mini-L6-v2) stop showing prisma queries during dev * Add native embedder as an available embedder selection * wrap model loader in try/catch * print progress on download * add built-in LLM support (expiermental) * Update to progress output for embedder * move embedder selection options to component * saftey checks for modelfile * update ref * Hide selection when on hosted subdomain * update documentation hide localLlama when on hosted * saftey checks for storage of models * update dockerfile to pre-build Llama.cpp bindings * update lockfile * add langchain doc comment * remove extraneous --no-metal option * Show data handling for private LLM * persist model in memory for N+1 chats * update import update dev comment on token model size * update primary README * chore: more readme updates and remove screenshots - too much to maintain, just use the app! * remove screeshot link 2023-12-07 23:48:27 +01:00
			`// If stream is not a regular OpenAI Stream (like if using native model)`
			`// we can just iterate the stream content instead.`
			`if (!stream.hasOwnProperty("data")) {`
			`return new Promise(async (resolve) => {`
			`let fullText = "";`
			`for await (const chunk of stream) {`
			`fullText += chunk.content;`
			`writeResponseChunk(response, {`
			`uuid,`
			`sources: [],`
			`type: "textResponseChunk",`
			`textResponse: chunk.content,`
			`close: false,`
			`error: false,`
			`});`
			`}`

			`writeResponseChunk(response, {`
			`uuid,`
			`sources,`
			`type: "textResponseChunk",`
			`textResponse: "",`
			`close: true,`
			`error: false,`
			`});`
			`resolve(fullText);`
			`});`
			`}`

Enable chat streaming for LLMs (#354) * [Draft] Enable chat streaming for LLMs * stream only, move sendChat to deprecated * Update TODO deprecation comments update console output color for streaming disabled 2023-11-14 00:07:30 +01:00			`return new Promise((resolve) => {`
			`let fullText = "";`
			`let chunk = "";`
			`stream.data.on("data", (data) => {`
			`const lines = data`
			`?.toString()`
			`?.split("\n")`
			`.filter((line) => line.trim() !== "");`

			`for (const line of lines) {`
Support LocalAi as LLM provider by @tlandenberger (#373) * feature: add LocalAI as llm provider * update Onboarding/mgmt settings Grab models from models endpoint for localai merge with master * update streaming for complete chunk streaming update localAI LLM to be able to stream * force schema on URL --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> Co-authored-by: tlandenberger <tobiaslandenberger@gmail.com> 2023-11-14 21:31:44 +01:00			`let validJSON = false;`
Enable chat streaming for LLMs (#354) * [Draft] Enable chat streaming for LLMs * stream only, move sendChat to deprecated * Update TODO deprecation comments update console output color for streaming disabled 2023-11-14 00:07:30 +01:00			`const message = chunk + line.replace(/^data: /, "");`

			`// JSON chunk is incomplete and has not ended yet`
			`// so we need to stitch it together. You would think JSON`
			`// chunks would only come complete - but they don't!`
Support LocalAi as LLM provider by @tlandenberger (#373) * feature: add LocalAI as llm provider * update Onboarding/mgmt settings Grab models from models endpoint for localai merge with master * update streaming for complete chunk streaming update localAI LLM to be able to stream * force schema on URL --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> Co-authored-by: tlandenberger <tobiaslandenberger@gmail.com> 2023-11-14 21:31:44 +01:00			`try {`
			`JSON.parse(message);`
			`validJSON = true;`
			`} catch {}`

			`if (!validJSON) {`
patch: implement @lunamidori hotfix for LocalAI streaming chunk overflows (#433) * patch: implement @lunamidori hotfix for LocalAI streaming chunk overflows resolves #416 * change log to error log * log trace * lint 2023-12-13 01:20:06 +01:00			`// It can be possible that the chunk decoding is running away`
			`// and the message chunk fails to append due to string length.`
			`// In this case abort the chunk and reset so we can continue.`
			`// ref: https://github.com/Mintplex-Labs/anything-llm/issues/416`
			`try {`
			`chunk += message;`
			`} catch (e) {`
			console.error(`Chunk appending error`, e);
			`chunk = "";`
			`}`
Enable chat streaming for LLMs (#354) * [Draft] Enable chat streaming for LLMs * stream only, move sendChat to deprecated * Update TODO deprecation comments update console output color for streaming disabled 2023-11-14 00:07:30 +01:00			`continue;`
			`} else {`
			`chunk = "";`
			`}`

			`if (message == "[DONE]") {`
			`writeResponseChunk(response, {`
			`uuid,`
			`sources,`
			`type: "textResponseChunk",`
			`textResponse: "",`
			`close: true,`
			`error: false,`
			`});`
			`resolve(fullText);`
			`} else {`
Support LocalAi as LLM provider by @tlandenberger (#373) * feature: add LocalAI as llm provider * update Onboarding/mgmt settings Grab models from models endpoint for localai merge with master * update streaming for complete chunk streaming update localAI LLM to be able to stream * force schema on URL --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> Co-authored-by: tlandenberger <tobiaslandenberger@gmail.com> 2023-11-14 21:31:44 +01:00			`let finishReason = null;`
Enable chat streaming for LLMs (#354) * [Draft] Enable chat streaming for LLMs * stream only, move sendChat to deprecated * Update TODO deprecation comments update console output color for streaming disabled 2023-11-14 00:07:30 +01:00			`let token = "";`
			`try {`
			`const json = JSON.parse(message);`
			`token = json?.choices?.[0]?.delta?.content;`
Support LocalAi as LLM provider by @tlandenberger (#373) * feature: add LocalAI as llm provider * update Onboarding/mgmt settings Grab models from models endpoint for localai merge with master * update streaming for complete chunk streaming update localAI LLM to be able to stream * force schema on URL --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> Co-authored-by: tlandenberger <tobiaslandenberger@gmail.com> 2023-11-14 21:31:44 +01:00			`finishReason = json?.choices?.[0]?.finish_reason \|\| null;`
Enable chat streaming for LLMs (#354) * [Draft] Enable chat streaming for LLMs * stream only, move sendChat to deprecated * Update TODO deprecation comments update console output color for streaming disabled 2023-11-14 00:07:30 +01:00			`} catch {`
			`continue;`
			`}`

			`if (token) {`
			`fullText += token;`
			`writeResponseChunk(response, {`
			`uuid,`
			`sources: [],`
			`type: "textResponseChunk",`
			`textResponse: token,`
			`close: false,`
			`error: false,`
			`});`
			`}`

			`if (finishReason !== null) {`
			`writeResponseChunk(response, {`
			`uuid,`
			`sources,`
			`type: "textResponseChunk",`
			`textResponse: "",`
			`close: true,`
			`error: false,`
			`});`
			`resolve(fullText);`
			`}`
			`}`
			`}`
			`});`
			`});`
			`}`

			`module.exports = {`
			`streamChatWithWorkspace,`
			`writeResponseChunk,`
			`};`