Add OpenAI compatible API support (#1757)
commit 88a0335fd2 (parent 158ba83cab)
@@ -6,6 +6,7 @@ const { apiSystemEndpoints } = require("./system");
 const { apiWorkspaceEndpoints } = require("./workspace");
 const { apiWorkspaceThreadEndpoints } = require("./workspaceThread");
 const { apiUserManagementEndpoints } = require("./userManagement");
+const { apiOpenAICompatibleEndpoints } = require("./openai");

 // All endpoints must be documented and pass through the validApiKey Middleware.
 // How to JSDoc an endpoint
@@ -20,6 +21,7 @@ function developerEndpoints(app, router) {
   apiDocumentEndpoints(router);
   apiWorkspaceThreadEndpoints(router);
   apiUserManagementEndpoints(router);
+  apiOpenAICompatibleEndpoints(router);
 }

 module.exports = { developerEndpoints };
server/endpoints/api/openai/compatibility-test-script.cjs (new file, 64 lines)
@@ -0,0 +1,64 @@
const OpenAI = require("openai");

/**
 * @type {import("openai").OpenAI}
 */
const client = new OpenAI({
  baseURL: "http://localhost:3001/api/v1/openai",
  apiKey: "ENTER_ANYTHINGLLM_API_KEY_HERE",
});

(async () => {
  // Models endpoint testing.
  console.log("Fetching /models");
  const modelList = await client.models.list();
  for await (const model of modelList) {
    console.log({ model });
  }

  // Test sync chat completion
  console.log("Running synchronous chat message");
  const syncCompletion = await client.chat.completions.create({
    messages: [
      {
        role: "system",
        content: "You are a helpful assistant who only speaks like a pirate.",
      },
      { role: "user", content: "What is AnythingLLM?" },
      // {
      //   role: 'assistant',
      //   content: "Arrr, matey! AnythingLLM be a fine tool fer sailin' the treacherous sea o' information with a powerful language model at yer helm. It's a potent instrument to handle all manner o' tasks involvin' text, like answerin' questions, generating prose, or even havin' a chat with digital scallywags like meself. Be there any specific treasure ye seek in the realm o' AnythingLLM?"
      // },
      // { role: "user", content: "Why are you talking like a pirate?" },
    ],
    model: "anythingllm", // must be workspace-slug
  });
  console.log(syncCompletion.choices[0]);

  // Test sync chat streaming completion
  console.log("Running asynchronous chat message");
  const asyncCompletion = await client.chat.completions.create({
    messages: [
      {
        role: "system",
        content: "You are a helpful assistant who only speaks like a pirate.",
      },
      { role: "user", content: "What is AnythingLLM?" },
    ],
    model: "anythingllm", // must be workspace-slug
    stream: true,
  });

  let message = "";
  for await (const chunk of asyncCompletion) {
    message += chunk.choices[0].delta.content;
    console.log({ message });
  }

  // Vector DB functionality
  console.log("Fetching /vector_stores");
  const vectorDBList = await client.beta.vectorStores.list();
  for await (const db of vectorDBList) {
    console.log(db);
  }
})();
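The test script above exercises the models, chat completion, and vector store routes but not the embeddings route. A minimal sketch of such a check (illustrative only, not part of this commit) could sit inside the same async IIFE; it uses plain fetch with a Bearer API key and the `inputs` key that the embeddings handler in server/endpoints/api/openai/index.js below destructures from the request body:

  // Illustrative only - not part of this commit. Embeddings endpoint check.
  console.log("Fetching /embeddings");
  const embedRes = await fetch("http://localhost:3001/api/v1/openai/embeddings", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer ENTER_ANYTHINGLLM_API_KEY_HERE",
    },
    // The handler reads `inputs` (an array of strings) from the body.
    body: JSON.stringify({ inputs: ["This is my first string to embed"] }),
  });
  console.log(await embedRes.json()); // { object: "list", data: [...], model: ... }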
server/endpoints/api/openai/index.js (new file, 331 lines)
@@ -0,0 +1,331 @@
const { v4: uuidv4 } = require("uuid");
const { Document } = require("../../../models/documents");
const { Telemetry } = require("../../../models/telemetry");
const { Workspace } = require("../../../models/workspace");
const {
  getLLMProvider,
  getEmbeddingEngineSelection,
} = require("../../../utils/helpers");
const { reqBody } = require("../../../utils/http");
const { validApiKey } = require("../../../utils/middleware/validApiKey");
const { EventLogs } = require("../../../models/eventLogs");
const {
  OpenAICompatibleChat,
} = require("../../../utils/chats/openaiCompatible");

function apiOpenAICompatibleEndpoints(app) {
  if (!app) return;

  app.get("/v1/openai/models", [validApiKey], async (request, response) => {
    /*
    #swagger.tags = ['OpenAI Compatible Endpoints']
    #swagger.description = 'Get all available "models" which are workspaces you can use for chatting.'
    #swagger.responses[200] = {
      content: {
        "application/json": {
          "schema": {
            "type": "object",
            "example": {
              "models": [
                {
                  "name": "Sample workspace",
                  "model": "sample-workspace",
                  "llm": {
                    "provider": "ollama",
                    "model": "llama3:8b"
                  }
                },
                {
                  "name": "Second workspace",
                  "model": "workspace-2",
                  "llm": {
                    "provider": "openai",
                    "model": "gpt-3.5-turbo"
                  }
                }
              ]
            }
          }
        }
      }
    }
    #swagger.responses[403] = {
      schema: {
        "$ref": "#/definitions/InvalidAPIKey"
      }
    }
    */
    try {
      const data = [];
      const workspaces = await Workspace.where();
      for (const workspace of workspaces) {
        const provider = workspace?.chatProvider ?? process.env.LLM_PROVIDER;
        let LLMProvider = getLLMProvider({
          provider,
          model: workspace?.chatModel,
        });
        data.push({
          name: workspace.name,
          model: workspace.slug,
          llm: {
            provider: provider,
            model: LLMProvider.model,
          },
        });
      }
      return response.status(200).json({ data });
    } catch (e) {
      console.log(e.message, e);
      response.sendStatus(500).end();
    }
  });

  app.post(
    "/v1/openai/chat/completions",
    [validApiKey],
    async (request, response) => {
      /*
      #swagger.tags = ['OpenAI Compatible Endpoints']
      #swagger.description = 'Execute a chat with a workspace with OpenAI compatibility. Supports streaming as well. Model must be a workspace slug from /models.'
      #swagger.requestBody = {
        description: 'Send a prompt to the workspace with full use of documents as if sending a chat in AnythingLLM. Only supports some values of OpenAI API. See example below.',
        required: true,
        type: 'object',
        content: {
          "application/json": {
            example: {
              messages: [
                {"role":"system", content: "You are a helpful assistant"},
                {"role":"user", content: "What is AnythingLLM?"},
                {"role":"assistant", content: "AnythingLLM is...."},
                {"role":"user", content: "Follow up question..."}
              ],
              model: "sample-workspace",
              stream: true,
              temperature: 0.7
            }
          }
        }
      }
      #swagger.responses[403] = {
        schema: {
          "$ref": "#/definitions/InvalidAPIKey"
        }
      }
      */
      try {
        const {
          model,
          messages = [],
          temperature,
          stream = false,
        } = reqBody(request);
        const workspace = await Workspace.get({ slug: String(model) });
        if (!workspace) return response.status(401).end();

        const userMessage = messages.pop();
        if (userMessage.role !== "user") {
          return response.status(400).json({
            id: uuidv4(),
            type: "abort",
            textResponse: null,
            sources: [],
            close: true,
            error:
              "No user prompt found. Must be last element in message array with 'user' role.",
          });
        }

        const systemPrompt =
          messages.find((chat) => chat.role === "system")?.content ?? null;
        const history = messages.filter((chat) => chat.role !== "system") ?? [];

        if (!stream) {
          const chatResult = await OpenAICompatibleChat.chatSync({
            workspace,
            systemPrompt,
            history,
            prompt: userMessage.content,
            temperature: Number(temperature),
          });

          await Telemetry.sendTelemetry("sent_chat", {
            LLMSelection:
              workspace.chatProvider ?? process.env.LLM_PROVIDER ?? "openai",
            Embedder: process.env.EMBEDDING_ENGINE || "inherit",
            VectorDbSelection: process.env.VECTOR_DB || "lancedb",
          });
          await EventLogs.logEvent("api_sent_chat", {
            workspaceName: workspace?.name,
            chatModel: workspace?.chatModel || "System Default",
          });
          return response.status(200).json(chatResult);
        }

        response.setHeader("Cache-Control", "no-cache");
        response.setHeader("Content-Type", "text/event-stream");
        response.setHeader("Access-Control-Allow-Origin", "*");
        response.setHeader("Connection", "keep-alive");
        response.flushHeaders();

        await OpenAICompatibleChat.streamChat({
          workspace,
          systemPrompt,
          history,
          prompt: userMessage.content,
          temperature: Number(temperature),
          response,
        });
        await Telemetry.sendTelemetry("sent_chat", {
          LLMSelection: process.env.LLM_PROVIDER || "openai",
          Embedder: process.env.EMBEDDING_ENGINE || "inherit",
          VectorDbSelection: process.env.VECTOR_DB || "lancedb",
        });
        await EventLogs.logEvent("api_sent_chat", {
          workspaceName: workspace?.name,
          chatModel: workspace?.chatModel || "System Default",
        });
        response.end();
      } catch (e) {
        console.log(e.message, e);
        response.status(500).end();
      }
    }
  );

  app.post(
    "/v1/openai/embeddings",
    [validApiKey],
    async (request, response) => {
      /*
      #swagger.tags = ['OpenAI Compatible Endpoints']
      #swagger.description = 'Get the embeddings of any arbitrary text string. This will use the embedder provider set in the system. Please ensure the token length of each string fits within the context of your embedder model.'
      #swagger.requestBody = {
        description: 'The input string(s) to be embedded. If the text is too long for the embedder model context, it will fail to embed. The vector and associated chunk metadata will be returned in the array order provided',
        required: true,
        type: 'object',
        content: {
          "application/json": {
            example: {
              input: [
                "This is my first string to embed",
                "This is my second string to embed",
              ],
              model: null,
            }
          }
        }
      }
      #swagger.responses[403] = {
        schema: {
          "$ref": "#/definitions/InvalidAPIKey"
        }
      }
      */
      try {
        const { inputs = [] } = reqBody(request);
        const validArray = inputs.every((input) => typeof input === "string");
        if (!validArray)
          throw new Error("All inputs to be embedded must be strings.");

        const Embedder = getEmbeddingEngineSelection();
        const embeddings = await Embedder.embedChunks(inputs);
        const data = [];
        embeddings.forEach((embedding, index) => {
          data.push({
            object: "embedding",
            embedding,
            index,
          });
        });

        return response.status(200).json({
          object: "list",
          data,
          model: Embedder.model,
        });
      } catch (e) {
        console.log(e.message, e);
        response.status(500).end();
      }
    }
  );

  app.get(
    "/v1/openai/vector_stores",
    [validApiKey],
    async (request, response) => {
      /*
      #swagger.tags = ['OpenAI Compatible Endpoints']
      #swagger.description = 'List all the vector database collections connected to AnythingLLM. These are essentially workspaces but return their unique vector db identifier - this is the same as the workspace slug.'
      #swagger.responses[200] = {
        content: {
          "application/json": {
            "schema": {
              "type": "object",
              "example": {
                "data": [
                  {
                    "id": "slug-here",
                    "object": "vector_store",
                    "name": "My workspace",
                    "file_counts": {
                      "total": 3
                    },
                    "provider": "LanceDB"
                  }
                ]
              }
            }
          }
        }
      }
      #swagger.responses[403] = {
        schema: {
          "$ref": "#/definitions/InvalidAPIKey"
        }
      }
      */
      try {
        // We dump all in the first response and despite saying there is
        // not more data the library still checks with a query param so if
        // we detect one - respond with nothing.
        if (Object.keys(request?.query ?? {}).length !== 0) {
          return response.status(200).json({
            data: [],
            has_more: false,
          });
        }

        const data = [];
        const VectorDBProvider = process.env.VECTOR_DB || "lancedb";
        const workspaces = await Workspace.where();

        for (const workspace of workspaces) {
          data.push({
            id: workspace.slug,
            object: "vector_store",
            name: workspace.name,
            file_counts: {
              total: await Document.count({
                workspaceId: Number(workspace.id),
              }),
            },
            provider: VectorDBProvider,
          });
        }
        return response.status(200).json({
          first_id: [...data].splice(0)?.[0]?.id,
          last_id: [...data].splice(-1)?.[0]?.id ?? data.splice(1)?.[0]?.id,
          data,
          has_more: false,
        });
      } catch (e) {
        console.log(e.message, e);
        response.status(500).end();
      }
    }
  );
}

module.exports = { apiOpenAICompatibleEndpoints };
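For reference (not part of the diff), the /v1/openai/models handler above returns the workspaces wrapped in a `data` array; with the sample values from its Swagger comment, a response would look roughly like:

  {
    "data": [
      {
        "name": "Sample workspace",
        "model": "sample-workspace",
        "llm": { "provider": "ollama", "model": "llama3:8b" }
      }
    ]
  }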
@@ -37,6 +37,7 @@ const endpointsFiles = [
   "../endpoints/api/system/index.js",
   "../endpoints/api/workspaceThread/index.js",
   "../endpoints/api/userManagement/index.js",
+  "../endpoints/api/openai/index.js",
 ];

 swaggerAutogen(outputFile, endpointsFiles, doc).then(({ data }) => {
@@ -2907,6 +2907,235 @@
          }
        }
      }
    },
    "/v1/openai/models": {
      "get": {
        "tags": [
          "OpenAI Compatible Endpoints"
        ],
        "description": "Get all available \"models\" which are workspaces you can use for chatting.",
        "parameters": [],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "example": {
                    "models": [
                      {
                        "name": "Sample workspace",
                        "model": "sample-workspace",
                        "llm": {
                          "provider": "ollama",
                          "model": "llama3:8b"
                        }
                      },
                      {
                        "name": "Second workspace",
                        "model": "workspace-2",
                        "llm": {
                          "provider": "openai",
                          "model": "gpt-3.5-turbo"
                        }
                      }
                    ]
                  }
                }
              }
            }
          },
          "403": {
            "description": "Forbidden",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/InvalidAPIKey"
                }
              },
              "application/xml": {
                "schema": {
                  "$ref": "#/components/schemas/InvalidAPIKey"
                }
              }
            }
          },
          "500": {
            "description": "Internal Server Error"
          }
        }
      }
    },
    "/v1/openai/chat/completions": {
      "post": {
        "tags": [
          "OpenAI Compatible Endpoints"
        ],
        "description": "Execute a chat with a workspace with OpenAI compatibility. Supports streaming as well. Model must be a workspace slug from /models.",
        "parameters": [],
        "responses": {
          "200": {
            "description": "OK"
          },
          "400": {
            "description": "Bad Request"
          },
          "401": {
            "description": "Unauthorized"
          },
          "403": {
            "description": "Forbidden",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/InvalidAPIKey"
                }
              },
              "application/xml": {
                "schema": {
                  "$ref": "#/components/schemas/InvalidAPIKey"
                }
              }
            }
          },
          "500": {
            "description": "Internal Server Error"
          }
        },
        "requestBody": {
          "description": "Send a prompt to the workspace with full use of documents as if sending a chat in AnythingLLM. Only supports some values of OpenAI API. See example below.",
          "required": true,
          "type": "object",
          "content": {
            "application/json": {
              "example": {
                "messages": [
                  {
                    "role": "system",
                    "content": "You are a helpful assistant"
                  },
                  {
                    "role": "user",
                    "content": "What is AnythingLLM?"
                  },
                  {
                    "role": "assistant",
                    "content": "AnythingLLM is...."
                  },
                  {
                    "role": "user",
                    "content": "Follow up question..."
                  }
                ],
                "model": "sample-workspace",
                "stream": true,
                "temperature": 0.7
              }
            }
          }
        }
      }
    },
    "/v1/openai/embeddings": {
      "post": {
        "tags": [
          "OpenAI Compatible Endpoints"
        ],
        "description": "Get the embeddings of any arbitrary text string. This will use the embedder provider set in the system. Please ensure the token length of each string fits within the context of your embedder model.",
        "parameters": [],
        "responses": {
          "200": {
            "description": "OK"
          },
          "403": {
            "description": "Forbidden",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/InvalidAPIKey"
                }
              },
              "application/xml": {
                "schema": {
                  "$ref": "#/components/schemas/InvalidAPIKey"
                }
              }
            }
          },
          "500": {
            "description": "Internal Server Error"
          }
        },
        "requestBody": {
          "description": "The input string(s) to be embedded. If the text is too long for the embedder model context, it will fail to embed. The vector and associated chunk metadata will be returned in the array order provided",
          "required": true,
          "type": "object",
          "content": {
            "application/json": {
              "example": {
                "input": [
                  "This is my first string to embed",
                  "This is my second string to embed"
                ],
                "model": null
              }
            }
          }
        }
      }
    },
    "/v1/openai/vector_stores": {
      "get": {
        "tags": [
          "OpenAI Compatible Endpoints"
        ],
        "description": "List all the vector database collections connected to AnythingLLM. These are essentially workspaces but return their unique vector db identifier - this is the same as the workspace slug.",
        "parameters": [],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "type": "object",
                  "example": {
                    "data": [
                      {
                        "id": "slug-here",
                        "object": "vector_store",
                        "name": "My workspace",
                        "file_counts": {
                          "total": 3
                        },
                        "provider": "LanceDB"
                      }
                    ]
                  }
                }
              }
            }
          },
          "403": {
            "description": "Forbidden",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/InvalidAPIKey"
                }
              },
              "application/xml": {
                "schema": {
                  "$ref": "#/components/schemas/InvalidAPIKey"
                }
              }
            }
          },
          "500": {
            "description": "Internal Server Error"
          }
        }
      }
    }
  },
  "components": {
server/utils/chats/openaiCompatible.js (new file, 477 lines)
@@ -0,0 +1,477 @@
const { v4: uuidv4 } = require("uuid");
const { DocumentManager } = require("../DocumentManager");
const { WorkspaceChats } = require("../../models/workspaceChats");
const { getVectorDbClass, getLLMProvider } = require("../helpers");
const { writeResponseChunk } = require("../helpers/chat/responses");
const { chatPrompt, sourceIdentifier } = require("./index");

const { PassThrough } = require("stream");

async function chatSync({
  workspace,
  systemPrompt = null,
  history = [],
  prompt = null,
  temperature = null,
}) {
  const uuid = uuidv4();
  const chatMode = workspace?.chatMode ?? "chat";
  const LLMConnector = getLLMProvider({
    provider: workspace?.chatProvider,
    model: workspace?.chatModel,
  });
  const VectorDb = getVectorDbClass();
  const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
  const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);

  // User is trying to query-mode chat a workspace that has no data in it - so
  // we should exit early as no information can be found under these conditions.
  if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
    const textResponse =
      workspace?.queryRefusalResponse ??
      "There is no relevant information in this workspace to answer your query.";

    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: String(prompt),
      response: {
        text: textResponse,
        sources: [],
        type: chatMode,
      },
      include: false,
    });

    return formatJSON(
      {
        id: uuid,
        type: "textResponse",
        sources: [],
        close: true,
        error: null,
        textResponse,
      },
      { model: workspace.slug, finish_reason: "abort" }
    );
  }

  // If we are here we know that we are in a workspace that is:
  // 1. Chatting in "chat" mode and may or may _not_ have embeddings
  // 2. Chatting in "query" mode and has at least 1 embedding
  let contextTexts = [];
  let sources = [];
  let pinnedDocIdentifiers = [];
  await new DocumentManager({
    workspace,
    maxTokens: LLMConnector.promptWindowLimit(),
  })
    .pinnedDocs()
    .then((pinnedDocs) => {
      pinnedDocs.forEach((doc) => {
        const { pageContent, ...metadata } = doc;
        pinnedDocIdentifiers.push(sourceIdentifier(doc));
        contextTexts.push(doc.pageContent);
        sources.push({
          text:
            pageContent.slice(0, 1_000) +
            "...continued on in source document...",
          ...metadata,
        });
      });
    });

  const vectorSearchResults =
    embeddingsCount !== 0
      ? await VectorDb.performSimilaritySearch({
          namespace: workspace.slug,
          input: prompt,
          LLMConnector,
          similarityThreshold: workspace?.similarityThreshold,
          topN: workspace?.topN,
          filterIdentifiers: pinnedDocIdentifiers,
        })
      : {
          contextTexts: [],
          sources: [],
          message: null,
        };

  // Failed similarity search if it was run at all and failed.
  if (!!vectorSearchResults.message) {
    return formatJSON(
      {
        id: uuid,
        type: "abort",
        textResponse: null,
        sources: [],
        close: true,
        error: vectorSearchResults.message,
      },
      { model: workspace.slug, finish_reason: "abort" }
    );
  }

  // For OpenAI Compatible chats, we cannot do backfilling so we simply aggregate results here.
  contextTexts = [...contextTexts];
  sources = [...sources];

  // If in query mode and no context chunks are found from search, backfill, or pins - do not
  // let the LLM try to hallucinate a response or use general knowledge and exit early
  if (chatMode === "query" && contextTexts.length === 0) {
    const textResponse =
      workspace?.queryRefusalResponse ??
      "There is no relevant information in this workspace to answer your query.";

    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: prompt,
      response: {
        text: textResponse,
        sources: [],
        type: chatMode,
      },
      include: false,
    });

    return formatJSON(
      {
        id: uuid,
        type: "textResponse",
        sources: [],
        close: true,
        error: null,
        textResponse,
      },
      { model: workspace.slug, finish_reason: "no_content" }
    );
  }

  // Compress & Assemble message to ensure prompt passes token limit with room for response
  // and build system messages based on inputs and history.
  const messages = await LLMConnector.compressMessages({
    systemPrompt: systemPrompt ?? chatPrompt(workspace),
    userPrompt: prompt,
    contextTexts,
    chatHistory: history,
  });

  // Send the text completion.
  const textResponse = await LLMConnector.getChatCompletion(messages, {
    temperature:
      temperature ?? workspace?.openAiTemp ?? LLMConnector.defaultTemp,
  });

  if (!textResponse) {
    return formatJSON(
      {
        id: uuid,
        type: "textResponse",
        sources: [],
        close: true,
        error: "No text completion could be completed with this input.",
        textResponse: null,
      },
      { model: workspace.slug, finish_reason: "no_content" }
    );
  }

  const { chat } = await WorkspaceChats.new({
    workspaceId: workspace.id,
    prompt: prompt,
    response: { text: textResponse, sources, type: chatMode },
  });

  return formatJSON(
    {
      id: uuid,
      type: "textResponse",
      close: true,
      error: null,
      chatId: chat.id,
      textResponse,
      sources,
    },
    { model: workspace.slug, finish_reason: "stop" }
  );
}

async function streamChat({
  workspace,
  response,
  systemPrompt = null,
  history = [],
  prompt = null,
  temperature = null,
}) {
  const uuid = uuidv4();
  const chatMode = workspace?.chatMode ?? "chat";
  const LLMConnector = getLLMProvider({
    provider: workspace?.chatProvider,
    model: workspace?.chatModel,
  });
  const VectorDb = getVectorDbClass();
  const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
  const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);

  // We don't want to write a new method for every LLM to support openAI calls
  // via the `handleStreamResponseV2` method handler. So here we create a passthrough
  // that on writes to the main response, transforms the chunk to OpenAI format.
  // The chunk is coming in the format from `writeResponseChunk` but in the AnythingLLM
  // response chunk schema, so we here we mutate each chunk.
  const responseInterceptor = new PassThrough({});
  responseInterceptor.on("data", (chunk) => {
    try {
      const originalData = JSON.parse(chunk.toString().split("data: ")[1]);
      const modified = formatJSON(originalData, {
        chunked: true,
        model: workspace.slug,
      }); // rewrite to OpenAI format
      response.write(`data: ${JSON.stringify(modified)}\n\n`);
    } catch (e) {
      console.error(e);
    }
  });

  // User is trying to query-mode chat a workspace that has no data in it - so
  // we should exit early as no information can be found under these conditions.
  if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
    const textResponse =
      workspace?.queryRefusalResponse ??
      "There is no relevant information in this workspace to answer your query.";

    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: String(prompt),
      response: {
        text: textResponse,
        sources: [],
        type: chatMode,
      },
      include: false,
    });

    writeResponseChunk(
      response,
      formatJSON(
        {
          id: uuid,
          type: "textResponse",
          sources: [],
          close: true,
          error: null,
          textResponse,
        },
        { chunked: true, model: workspace.slug, finish_reason: "abort" }
      )
    );
    return;
  }

  // If we are here we know that we are in a workspace that is:
  // 1. Chatting in "chat" mode and may or may _not_ have embeddings
  // 2. Chatting in "query" mode and has at least 1 embedding
  let contextTexts = [];
  let sources = [];
  let pinnedDocIdentifiers = [];
  await new DocumentManager({
    workspace,
    maxTokens: LLMConnector.promptWindowLimit(),
  })
    .pinnedDocs()
    .then((pinnedDocs) => {
      pinnedDocs.forEach((doc) => {
        const { pageContent, ...metadata } = doc;
        pinnedDocIdentifiers.push(sourceIdentifier(doc));
        contextTexts.push(doc.pageContent);
        sources.push({
          text:
            pageContent.slice(0, 1_000) +
            "...continued on in source document...",
          ...metadata,
        });
      });
    });

  const vectorSearchResults =
    embeddingsCount !== 0
      ? await VectorDb.performSimilaritySearch({
          namespace: workspace.slug,
          input: prompt,
          LLMConnector,
          similarityThreshold: workspace?.similarityThreshold,
          topN: workspace?.topN,
          filterIdentifiers: pinnedDocIdentifiers,
        })
      : {
          contextTexts: [],
          sources: [],
          message: null,
        };

  // Failed similarity search if it was run at all and failed.
  if (!!vectorSearchResults.message) {
    writeResponseChunk(
      response,
      formatJSON(
        {
          id: uuid,
          type: "abort",
          textResponse: null,
          sources: [],
          close: true,
          error: vectorSearchResults.message,
        },
        { chunked: true, model: workspace.slug, finish_reason: "abort" }
      )
    );
    return;
  }

  // For OpenAI Compatible chats, we cannot do backfilling so we simply aggregate results here.
  contextTexts = [...contextTexts];
  sources = [...sources];

  // If in query mode and no context chunks are found from search, backfill, or pins - do not
  // let the LLM try to hallucinate a response or use general knowledge and exit early
  if (chatMode === "query" && contextTexts.length === 0) {
    const textResponse =
      workspace?.queryRefusalResponse ??
      "There is no relevant information in this workspace to answer your query.";

    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: prompt,
      response: {
        text: textResponse,
        sources: [],
        type: chatMode,
      },
      include: false,
    });

    writeResponseChunk(
      response,
      formatJSON(
        {
          id: uuid,
          type: "textResponse",
          sources: [],
          close: true,
          error: null,
          textResponse,
        },
        { chunked: true, model: workspace.slug, finish_reason: "no_content" }
      )
    );
    return;
  }

  // Compress & Assemble message to ensure prompt passes token limit with room for response
  // and build system messages based on inputs and history.
  const messages = await LLMConnector.compressMessages({
    systemPrompt: systemPrompt ?? chatPrompt(workspace),
    userPrompt: prompt,
    contextTexts,
    chatHistory: history,
  });

  if (!LLMConnector.streamingEnabled()) {
    writeResponseChunk(
      response,
      formatJSON(
        {
          id: uuid,
          type: "textResponse",
          sources: [],
          close: true,
          error: "Streaming is not available for the connected LLM Provider",
          textResponse: null,
        },
        {
          chunked: true,
          model: workspace.slug,
          finish_reason: "streaming_disabled",
        }
      )
    );
    return;
  }

  const stream = await LLMConnector.streamGetChatCompletion(messages, {
    temperature:
      temperature ?? workspace?.openAiTemp ?? LLMConnector.defaultTemp,
  });
  const completeText = await LLMConnector.handleStream(
    responseInterceptor,
    stream,
    {
      uuid,
      sources,
    }
  );

  if (completeText?.length > 0) {
    const { chat } = await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: prompt,
      response: { text: completeText, sources, type: chatMode },
    });

    writeResponseChunk(
      response,
      formatJSON(
        {
          uuid,
          type: "finalizeResponseStream",
          close: true,
          error: false,
          chatId: chat.id,
          textResponse: "",
        },
        { chunked: true, model: workspace.slug, finish_reason: "stop" }
      )
    );
    return;
  }

  writeResponseChunk(
    response,
    formatJSON(
      {
        uuid,
        type: "finalizeResponseStream",
        close: true,
        error: false,
        textResponse: "",
      },
      { chunked: true, model: workspace.slug, finish_reason: "stop" }
    )
  );
  return;
}

function formatJSON(chat, { chunked = false, model, finish_reason = null }) {
  const data = {
    id: chat.uuid ?? chat.id,
    object: "chat.completion",
    created: Number(new Date()),
    model: model,
    choices: [
      {
        [chunked ? "delta" : "message"]: {
          role: "assistant",
          content: chat.textResponse,
        },
        logprobs: null,
        finish_reason: finish_reason,
      },
    ],
  };

  return data;
}

module.exports.OpenAICompatibleChat = {
  chatSync,
  streamChat,
};
@@ -61,109 +61,6 @@ function handleDefaultStreamResponseV2(response, stream, responseProps) {
   });
 }

-// TODO: Fully remove - deprecated.
-// The default way to handle a stream response. Functions best with OpenAI.
-// Currently used for LMStudio, LocalAI, Mistral API, and OpenAI
-function handleDefaultStreamResponse(response, stream, responseProps) {
-  const { uuid = uuidv4(), sources = [] } = responseProps;
-
-  return new Promise((resolve) => {
-    let fullText = "";
-    let chunk = "";
-
-    // Establish listener to early-abort a streaming response
-    // in case things go sideways or the user does not like the response.
-    // We preserve the generated text but continue as if chat was completed
-    // to preserve previously generated content.
-    const handleAbort = () => clientAbortedHandler(resolve, fullText);
-    response.on("close", handleAbort);
-
-    stream.data.on("data", (data) => {
-      const lines = data
-        ?.toString()
-        ?.split("\n")
-        .filter((line) => line.trim() !== "");
-
-      for (const line of lines) {
-        let validJSON = false;
-        const message = chunk + line.replace(/^data: /, "");
-
-        // JSON chunk is incomplete and has not ended yet
-        // so we need to stitch it together. You would think JSON
-        // chunks would only come complete - but they don't!
-        try {
-          JSON.parse(message);
-          validJSON = true;
-        } catch {}
-
-        if (!validJSON) {
-          // It can be possible that the chunk decoding is running away
-          // and the message chunk fails to append due to string length.
-          // In this case abort the chunk and reset so we can continue.
-          // ref: https://github.com/Mintplex-Labs/anything-llm/issues/416
-          try {
-            chunk += message;
-          } catch (e) {
-            console.error(`Chunk appending error`, e);
-            chunk = "";
-          }
-          continue;
-        } else {
-          chunk = "";
-        }
-
-        if (message == "[DONE]") {
-          writeResponseChunk(response, {
-            uuid,
-            sources,
-            type: "textResponseChunk",
-            textResponse: "",
-            close: true,
-            error: false,
-          });
-          response.removeListener("close", handleAbort);
-          resolve(fullText);
-        } else {
-          let finishReason = null;
-          let token = "";
-          try {
-            const json = JSON.parse(message);
-            token = json?.choices?.[0]?.delta?.content;
-            finishReason = json?.choices?.[0]?.finish_reason || null;
-          } catch {
-            continue;
-          }
-
-          if (token) {
-            fullText += token;
-            writeResponseChunk(response, {
-              uuid,
-              sources: [],
-              type: "textResponseChunk",
-              textResponse: token,
-              close: false,
-              error: false,
-            });
-          }
-
-          if (finishReason !== null) {
-            writeResponseChunk(response, {
-              uuid,
-              sources,
-              type: "textResponseChunk",
-              textResponse: "",
-              close: true,
-              error: false,
-            });
-            response.removeListener("close", handleAbort);
-            resolve(fullText);
-          }
-        }
-      }
-    });
-  });
-}
-
 function convertToChatHistory(history = []) {
   const formattedHistory = [];
   history.forEach((history) => {
@@ -211,7 +108,6 @@ function writeResponseChunk(response, data) {

 module.exports = {
   handleDefaultStreamResponseV2,
-  handleDefaultStreamResponse,
   convertToChatHistory,
   convertToPromptHistory,
   writeResponseChunk,