Infinite prompt input and compression implementation (#332)

* WIP on continuous prompt window summary

* wip

* Move chat out of VDB
simplify chat interface
normalize LLM model interface
have compression abstraction
Cleanup compressor
TODO: Anthropic stuff

* Implement compression for Anthropic
Fix LanceDB sources

* Clean up vector DBs and check that Lance, Chroma, and Pinecone are returning valid metadata sources

* Resolve Weaviate citation sources not working with schema

* comment cleanup
Timothy Carambat 2023-11-06 13:13:53 -08:00 committed by GitHub
parent 0751fb1fdd
commit be9d8b0397
23 changed files with 837 additions and 445 deletions

View File

@ -24,7 +24,7 @@ export default function AnthropicAiOptions({ settings, showAlert = false }) {
<div className="w-full flex items-center gap-4">
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-4">
Anthropic Claude-2 API Key
Anthropic API Key
</label>
<input
type="password"
@ -48,7 +48,7 @@ export default function AnthropicAiOptions({ settings, showAlert = false }) {
required={true}
className="bg-zinc-900 border border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
>
{["claude-2"].map((model) => {
{["claude-2", "claude-instant-1"].map((model) => {
return (
<option key={model} value={model}>
{model}

View File

@ -49,6 +49,23 @@ export default function AzureAiOptions({ settings }) {
/>
</div>
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-4">
Chat Model Token Limit
</label>
<select
name="AzureOpenAiTokenLimit"
defaultValue={settings?.AzureOpenAiTokenLimit || 4096}
className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5"
required={true}
>
<option value={4096}>4,096 (gpt-3.5-turbo)</option>
<option value={16384}>16,384 (gpt-3.5-16k)</option>
<option value={8192}>8,192 (gpt-4)</option>
<option value={32768}>32,768 (gpt-4-32k)</option>
</select>
</div>
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-4">
Embedding Deployment Name

View File

@ -224,7 +224,6 @@ export default function WorkspaceSettings({ workspace }) {
</div>
<textarea
name="openAiPrompt"
maxLength={500}
rows={5}
defaultValue={chatPrompt(workspace)}
className="bg-zinc-900 text-white text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5"

View File

@ -55,7 +55,6 @@ export default function PromptInput({
onKeyDown={captureEnter}
onChange={onChange}
required={true}
maxLength={240}
disabled={inputDisabled}
onFocus={() => setFocused(true)}
onBlur={(e) => {

View File

@ -71,6 +71,7 @@ function chatEndpoints(app) {
});
response.status(200).json({ ...result });
} catch (e) {
console.error(e);
response.status(500).json({
id: uuidv4(),
type: "abort",

View File

@ -0,0 +1,69 @@
const prisma = require("../utils/prisma");
const CacheData = {
new: async function (inputs = {}) {
try {
const cache = await prisma.cache_data.create({
data: inputs,
});
return { cache, message: null };
} catch (error) {
console.error(error.message);
return { cache: null, message: error.message };
}
},
get: async function (clause = {}, limit = null, orderBy = null) {
try {
const cache = await prisma.cache_data.findFirst({
where: clause,
...(limit !== null ? { take: limit } : {}),
...(orderBy !== null ? { orderBy } : {}),
});
return cache || null;
} catch (error) {
console.error(error.message);
return null;
}
},
delete: async function (clause = {}) {
try {
await prisma.cache_data.deleteMany({
where: clause,
});
return true;
} catch (error) {
console.error(error.message);
return false;
}
},
where: async function (clause = {}, limit = null, orderBy = null) {
try {
const caches = await prisma.cache_data.findMany({
where: clause,
...(limit !== null ? { take: limit } : {}),
...(orderBy !== null ? { orderBy } : {}),
});
return caches;
} catch (error) {
console.error(error.message);
return [];
}
},
count: async function (clause = {}) {
try {
const count = await prisma.cache_data.count({
where: clause,
});
return count;
} catch (error) {
console.error(error.message);
return 0;
}
},
};
module.exports = { CacheData };

View File

@ -65,6 +65,7 @@ const SystemSettings = {
AzureOpenAiKey: !!process.env.AZURE_OPENAI_KEY,
AzureOpenAiModelPref: process.env.OPEN_MODEL_PREF,
AzureOpenAiEmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
AzureOpenAiTokenLimit: process.env.AZURE_OPENAI_TOKEN_LIMIT || 4096,
}
: {}),

View File

@ -36,6 +36,7 @@
"express": "^4.18.2",
"extract-zip": "^2.0.1",
"graphql": "^16.7.1",
"js-tiktoken": "^1.0.7",
"jsonwebtoken": "^8.5.1",
"langchain": "^0.0.90",
"mime": "^3.0.0",

View File

@ -0,0 +1,11 @@
-- CreateTable
CREATE TABLE "cache_data" (
"id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"name" TEXT NOT NULL,
"data" TEXT NOT NULL,
"belongsTo" TEXT,
"byId" INTEGER,
"expiresAt" DATETIME,
"createdAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"lastUpdatedAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
);

View File

@ -116,3 +116,14 @@ model workspace_users {
workspaces workspaces @relation(fields: [workspace_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
users users @relation(fields: [user_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
}
model cache_data {
id Int @id @default(autoincrement())
name String
data String
belongsTo String?
byId Int?
expiresAt DateTime?
createdAt DateTime @default(now())
lastUpdatedAt DateTime @default(now())
}

View File

@ -12,6 +12,12 @@ class AnthropicLLM {
apiKey: process.env.ANTHROPIC_API_KEY,
});
this.anthropic = anthropic;
this.model = process.env.ANTHROPIC_MODEL_PREF;
this.limits = {
history: this.promptWindowLimit() * 0.15,
system: this.promptWindowLimit() * 0.15,
user: this.promptWindowLimit() * 0.7,
};
if (!embedder)
throw new Error(
@ -21,8 +27,19 @@ class AnthropicLLM {
this.answerKey = v4().split("-")[0];
}
isValidChatModel(modelName = "") {
const validModels = ["claude-2"];
promptWindowLimit() {
switch (this.model) {
case "claude-instant-1":
return 72_000;
case "claude-2":
return 100_000;
default:
return 72_000; // assume a claude-instant-1 model
}
}
isValidChatCompletionModel(modelName = "") {
const validModels = ["claude-2", "claude-instant-1"];
return validModels.includes(modelName);
}
@ -62,24 +79,25 @@ class AnthropicLLM {
\n\nAssistant:`;
}
// This is the interface used when no embeddings are present in the workspace
// This is just having a conversation with the LLM as one would normally.
async sendChat(chatHistory = [], prompt, workspace = {}) {
const model = process.env.ANTHROPIC_MODEL_PREF || "claude-2";
if (!this.isValidChatModel(model))
async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
if (!this.isValidChatCompletionModel(this.model))
throw new Error(
`Anthropic chat: ${model} is not valid for chat completion!`
`Anthropic chat: ${this.model} is not valid for chat completion!`
);
const { content, error } = await this.anthropic.completions
.create({
model: "claude-2",
max_tokens_to_sample: 300,
prompt: this.constructPrompt({
const compressedPrompt = await this.compressMessages(
{
systemPrompt: chatPrompt(workspace),
userPrompt: prompt,
chatHistory,
}),
},
rawHistory
);
const { content, error } = await this.anthropic.completions
.create({
model: this.model,
max_tokens_to_sample: 300,
prompt: compressedPrompt,
})
.then((res) => {
const { completion } = res;
@ -100,15 +118,14 @@ class AnthropicLLM {
}
async getChatCompletion(prompt = "", _opts = {}) {
const model = process.env.ANTHROPIC_MODEL_PREF || "claude-2";
if (!this.isValidChatModel(model))
if (!this.isValidChatCompletionModel(this.model))
throw new Error(
`Anthropic chat: ${model} is not valid for chat completion!`
`Anthropic chat: ${this.model} is not valid for chat completion!`
);
const { content, error } = await this.anthropic.completions
.create({
model: "claude-2",
model: this.model,
max_tokens_to_sample: 300,
prompt,
})
@ -130,6 +147,16 @@ class AnthropicLLM {
return content;
}
async compressMessages(promptArgs = {}, rawHistory = []) {
const { messageStringCompressor } = require("../../helpers/chat");
const compressedPrompt = await messageStringCompressor(
this,
promptArgs,
rawHistory
);
return compressedPrompt;
}
// Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
async embedTextInput(textInput) {
return await this.embedder.embedTextInput(textInput);

View File

@ -1,4 +1,5 @@
const { AzureOpenAiEmbedder } = require("../../EmbeddingEngines/azureOpenAi");
const { chatPrompt } = require("../../chats");
class AzureOpenAiLLM extends AzureOpenAiEmbedder {
constructor() {
@ -13,9 +14,24 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
process.env.AZURE_OPENAI_ENDPOINT,
new AzureKeyCredential(process.env.AZURE_OPENAI_KEY)
);
this.model = process.env.OPEN_MODEL_PREF;
this.limits = {
history: this.promptWindowLimit() * 0.15,
system: this.promptWindowLimit() * 0.15,
user: this.promptWindowLimit() * 0.7,
};
}
isValidChatModel(_modelName = "") {
// Ensure the user selected a proper value for the token limit
// could be any of these https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4-models
// and if undefined - assume it is the lowest end.
promptWindowLimit() {
return !!process.env.AZURE_OPENAI_TOKEN_LIMIT
? Number(process.env.AZURE_OPENAI_TOKEN_LIMIT)
: 4096;
}
isValidChatCompletionModel(_modelName = "") {
// The Azure user names their "models" as deployments and they can be any name
// so we rely on the user to put in the correct deployment as only they would
// know it.
@ -31,7 +47,7 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
const prompt = {
role: "system",
content: `${systemPrompt}
Context:
Context:
${contextTexts
.map((text, i) => {
return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
@ -46,26 +62,25 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
return { safe: true, reasons: [] };
}
async sendChat(chatHistory = [], prompt, workspace = {}) {
const model = process.env.OPEN_MODEL_PREF;
if (!model)
async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
if (!this.model)
throw new Error(
"No OPEN_MODEL_PREF ENV defined. This must be the name of a deployment on your Azure account for an LLM chat model like GPT-3.5."
);
const textResponse = await this.openai
.getChatCompletions(
model,
[
{ role: "system", content: "" },
...chatHistory,
{ role: "user", content: prompt },
],
const messages = await this.compressMessages(
{
systemPrompt: chatPrompt(workspace),
userPrompt: prompt,
chatHistory,
},
rawHistory
);
const textResponse = await this.openai
.getChatCompletions(this.model, messages, {
temperature: Number(workspace?.openAiTemp ?? 0.7),
n: 1,
}
)
})
.then((res) => {
if (!res.hasOwnProperty("choices"))
throw new Error("OpenAI chat: No results!");
@ -83,18 +98,23 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
}
async getChatCompletion(messages = [], { temperature = 0.7 }) {
const model = process.env.OPEN_MODEL_PREF;
if (!model)
if (!this.model)
throw new Error(
"No OPEN_MODEL_PREF ENV defined. This must be the name of a deployment on your Azure account for an LLM chat model like GPT-3.5."
);
const data = await this.openai.getChatCompletions(model, messages, {
const data = await this.openai.getChatCompletions(this.model, messages, {
temperature,
});
if (!data.hasOwnProperty("choices")) return null;
return data.choices[0].message.content;
}
async compressMessages(promptArgs = {}, rawHistory = []) {
const { messageArrayCompressor } = require("../../helpers/chat");
const messageArray = this.constructPrompt(promptArgs);
return await messageArrayCompressor(this, messageArray, rawHistory);
}
}
module.exports = {

View File

@ -1,4 +1,5 @@
const { OpenAiEmbedder } = require("../../EmbeddingEngines/openAi");
const { chatPrompt } = require("../../chats");
class OpenAiLLM extends OpenAiEmbedder {
constructor() {
@ -10,6 +11,23 @@ class OpenAiLLM extends OpenAiEmbedder {
apiKey: process.env.OPEN_AI_KEY,
});
this.openai = new OpenAIApi(config);
this.model = process.env.OPEN_MODEL_PREF;
this.limits = {
history: this.promptWindowLimit() * 0.15,
system: this.promptWindowLimit() * 0.15,
user: this.promptWindowLimit() * 0.7,
};
}
promptWindowLimit() {
switch (this.model) {
case "gpt-3.5-turbo":
return 4096;
case "gpt-4":
return 8192;
default:
return 4096; // assume a fine-tune 3.5
}
}
async isValidChatCompletionModel(modelName = "") {
@ -33,7 +51,7 @@ class OpenAiLLM extends OpenAiEmbedder {
const prompt = {
role: "system",
content: `${systemPrompt}
Context:
Context:
${contextTexts
.map((text, i) => {
return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
@ -75,7 +93,7 @@ class OpenAiLLM extends OpenAiEmbedder {
return { safe: false, reasons };
}
async sendChat(chatHistory = [], prompt, workspace = {}) {
async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
const model = process.env.OPEN_MODEL_PREF;
if (!(await this.isValidChatCompletionModel(model)))
throw new Error(
@ -87,11 +105,14 @@ class OpenAiLLM extends OpenAiEmbedder {
model,
temperature: Number(workspace?.openAiTemp ?? 0.7),
n: 1,
messages: [
{ role: "system", content: "" },
...chatHistory,
{ role: "user", content: prompt },
],
messages: await this.compressMessages(
{
systemPrompt: chatPrompt(workspace),
userPrompt: prompt,
chatHistory,
},
rawHistory
),
})
.then((json) => {
const res = json.data;
@ -111,14 +132,13 @@ class OpenAiLLM extends OpenAiEmbedder {
}
async getChatCompletion(messages = null, { temperature = 0.7 }) {
const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
if (!(await this.isValidChatCompletionModel(model)))
if (!(await this.isValidChatCompletionModel(this.model)))
throw new Error(
`OpenAI chat: ${model} is not valid for chat completion!`
`OpenAI chat: ${this.model} is not valid for chat completion!`
);
const { data } = await this.openai.createChatCompletion({
model,
model: this.model,
messages,
temperature,
});
@ -126,6 +146,12 @@ class OpenAiLLM extends OpenAiEmbedder {
if (!data.hasOwnProperty("choices")) return null;
return data.choices[0].message.content;
}
async compressMessages(promptArgs = {}, rawHistory = []) {
const { messageArrayCompressor } = require("../../helpers/chat");
const messageArray = this.constructPrompt(promptArgs);
return await messageArrayCompressor(this, messageArray, rawHistory);
}
}
module.exports = {

View File

@ -91,65 +91,35 @@ async function chatWithWorkspace(
const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
if (!hasVectorizedSpace || embeddingsCount === 0) {
const rawHistory = (
user
? await WorkspaceChats.forWorkspaceByUser(
workspace.id,
user.id,
messageLimit,
{ id: "desc" }
)
: await WorkspaceChats.forWorkspace(workspace.id, messageLimit, {
id: "desc",
})
).reverse();
const chatHistory = convertToPromptHistory(rawHistory);
const response = await LLMConnector.sendChat(
chatHistory,
message,
workspace
);
const data = { text: response, sources: [], type: "chat" };
await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: message,
response: data,
// If there are no embeddings - chat like a normal LLM chat interface.
return await emptyEmbeddingChat({
uuid,
user,
});
return {
id: uuid,
type: "textResponse",
textResponse: response,
sources: [],
close: true,
error: null,
};
} else {
const rawHistory = (
user
? await WorkspaceChats.forWorkspaceByUser(
workspace.id,
user.id,
message,
workspace,
messageLimit,
{ id: "desc" }
)
: await WorkspaceChats.forWorkspace(workspace.id, messageLimit, {
id: "desc",
})
).reverse();
const chatHistory = convertToPromptHistory(rawHistory);
LLMConnector,
});
}
const { rawHistory, chatHistory } = await recentChatHistory(
user,
workspace,
messageLimit,
chatMode
);
const {
response,
sources,
contextTexts = [],
sources = [],
message: error,
} = await VectorDb[chatMode]({
} = await VectorDb.performSimilaritySearch({
namespace: workspace.slug,
input: message,
workspace,
chatHistory,
LLMConnector,
});
if (!response) {
// Failed similarity search.
if (!!error) {
return {
id: uuid,
type: "abort",
@ -160,22 +130,107 @@ async function chatWithWorkspace(
};
}
const data = { text: response, sources, type: chatMode };
// Compress messages to ensure the prompt passes the token limit with room for a response,
// and build system messages based on inputs and history.
const messages = await LLMConnector.compressMessages(
{
systemPrompt: chatPrompt(workspace),
userPrompt: message,
contextTexts,
chatHistory,
},
rawHistory
);
// Send the text completion.
const textResponse = await LLMConnector.getChatCompletion(messages, {
temperature: workspace?.openAiTemp ?? 0.7,
});
if (!textResponse) {
return {
id: uuid,
type: "abort",
textResponse: null,
sources: [],
close: true,
error: "No text completion could be completed with this input.",
};
}
await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: message,
response: data,
response: { text: textResponse, sources, type: chatMode },
user,
});
return {
id: uuid,
type: "textResponse",
textResponse: response,
sources,
close: true,
textResponse,
sources,
error,
};
}
}
// On query we don't return message history. All other chat modes, and chatting
// with no embeddings, return history.
async function recentChatHistory(
user = null,
workspace,
messageLimit = 20,
chatMode = null
) {
if (chatMode === "query") return { rawHistory: [], chatHistory: [] };
const rawHistory = (
user
? await WorkspaceChats.forWorkspaceByUser(
workspace.id,
user.id,
messageLimit,
{ id: "desc" }
)
: await WorkspaceChats.forWorkspace(workspace.id, messageLimit, {
id: "desc",
})
).reverse();
return { rawHistory, chatHistory: convertToPromptHistory(rawHistory) };
}
async function emptyEmbeddingChat({
uuid,
user,
message,
workspace,
messageLimit,
LLMConnector,
}) {
const { rawHistory, chatHistory } = await recentChatHistory(
user,
workspace,
messageLimit
);
const textResponse = await LLMConnector.sendChat(
chatHistory,
message,
workspace,
rawHistory
);
await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: message,
response: { text: textResponse, sources: [], type: "chat" },
user,
});
return {
id: uuid,
type: "textResponse",
sources: [],
close: true,
error: null,
textResponse,
};
}
function chatPrompt(workspace) {
@ -186,6 +241,7 @@ function chatPrompt(workspace) {
}
module.exports = {
convertToPromptHistory,
convertToChatHistory,
chatWithWorkspace,
chatPrompt,

View File

@ -0,0 +1,325 @@
const { convertToPromptHistory } = require("../../chats");
const { TokenManager } = require("../tiktoken");
/*
What is the message Array compressor?
TLDR: So anyway, i started blasting (your prompts & stuff)
messageArrayCompressor arose out of a need for users to be able to insert unlimited token prompts
and also maintain coherent history, system instructions and context, if applicable.
We took an opinionated approach that, after much back-testing, we found retains a highly coherent answer
under most conditions a user would encounter while using this specific system. While other systems may
use a more advanced model for compressing message history or simplify text through a recursive approach - ours is much simpler.
We "cannonball" the input.
Cannonball (verb): To ensure a prompt fits through a model window we blast a hole in the center of any inputs blocking our path to doing so.
This starts by dissecting the input into tokens and deleting from the middle out, bi-directionally, until the prompt window is satisfied.
You may think: "Doesn't this result in massive data loss?" - yes & no.
Under the use cases we expect the tool to be used for, which is mostly chatting with documents, we are able to use this approach with minimal blowback
on the quality of responses.
We accomplish this by taking a rate-limit approach that is proportional to the model capacity. Since we support more than OpenAI models, this needs to
be generic, and reliance on a "better summary" model is not a luxury we can afford. The added latency overhead during prompting is also unacceptable.
In general:
system: at best 15% of token capacity
history: at best 15% of token capacity
prompt: at best 70% of token capacity.
We handle overflows by taking an aggressive path for two main cases.
1. Very large user prompt
- Likely uninterested in context, history, or even the system prompt. This is a "standalone" prompt that hijacks the whole thread.
- We run this prompt on its own since a prompt that is over 70% of context window certainly is standalone.
2. Context window is exceeded in regular use.
- We do not touch prompt since it is very likely to be <70% of window.
- We check that the system prompt is not outrageous - if it is, we cannonball it and keep context if present.
- We check a sliding window of history, only allowing up to 15% of the history to pass through if it fits, with a
preference for recent history if we can cannonball to fit it, otherwise it is omitted.
We end up with a rather large prompt that fits through a given window with a lot of room for response in most use-cases.
We also take the approach that history is the least important and most flexible of the items in this array of responses.
There is a supplemental version of this function that also returns a formatted string for models like Claude-2
*/
async function messageArrayCompressor(llm, messages = [], rawHistory = []) {
// Assume the response will be at least 600 tokens. If the total prompt + reply is over the limit we need to proactively
// run the compressor to ensure the prompt has enough space for a reply.
// Realistically, most users will not be impacted by this.
const tokenBuffer = 600;
const tokenManager = new TokenManager(llm.model);
// If no work needs to be done, just pass through.
if (tokenManager.statsFrom(messages) + tokenBuffer < llm.promptWindowLimit())
return messages;
const system = messages.shift();
const user = messages.pop();
const userPromptSize = tokenManager.countFromString(user.content);
// The user prompt is the main focus here - we prioritize it and allow
// it to hijack the entire conversation thread. We are going to
// cannonball the prompt through so that at least 20% of
// the token supply remains for the reply.
if (userPromptSize > llm.limits.user) {
return [
{
role: "user",
content: cannonball({
input: user.content,
targetTokenSize: llm.promptWindowLimit() * 0.8,
tiktokenInstance: tokenManager,
}),
},
];
}
const compressedSystem = new Promise(async (resolve) => {
const count = tokenManager.countFromString(system.content);
if (count < llm.limits.system) {
resolve(system);
return;
}
// Split context from system prompt - cannonball since it's over the window.
// We assume the context + user prompt are small enough to fit.
const [prompt, context = ""] = system.content.split("Context:");
system.content = `${cannonball({
input: prompt,
targetTokenSize: llm.limits.system,
tiktokenInstance: tokenManager,
})}${context ? `\nContext: ${context}` : ""}`;
resolve(system);
});
// Prompt is allowed to take up to 70% of the window - we know it's under
// if we are here, so pass through.
const compressedPrompt = new Promise(async (resolve) => resolve(user));
// We always aggressively compress history because it is the least
// important data to retain in full-fidelity.
const compressedHistory = new Promise((resolve) => {
const eligibleHistoryItems = [];
var historyTokenCount = 0;
for (const [i, history] of rawHistory.reverse().entries()) {
const [user, assistant] = convertToPromptHistory([history]);
const [userTokens, assistantTokens] = [
tokenManager.countFromString(user.content),
tokenManager.countFromString(assistant.content),
];
const total = userTokens + assistantTokens;
// If during the loop the token cost of adding this history
// is small, we can add it to history and move on to the next.
if (historyTokenCount + total < llm.limits.history) {
eligibleHistoryItems.unshift(user, assistant);
historyTokenCount += total;
continue;
}
// If we reach here, the overhead of adding this history item would
// push us over the limit. So now we prioritize
// the most recent 3 message pairs - if we are already past those, exit the loop and stop
// trying to make history work.
if (i > 2) break;
// We are over the limit and we are within the first 3 most recent chats.
// so now we cannonball them to make them fit into the window.
// max size = llm.limits.history; each component of the message can at most
// be 50% of the history. We cannonball whichever one is the problem.
// The math isn't perfect for tokens, so we have to add a fudge factor for safety.
const maxTargetSize = Math.floor(llm.limits.history / 2.2);
if (userTokens > maxTargetSize) {
user.content = cannonball({
input: user.content,
targetTokenSize: maxTargetSize,
tiktokenInstance: tokenManager,
});
}
if (assistantTokens > maxTargetSize) {
assistant.content = cannonball({
input: assistant.content,
targetTokenSize: maxTargetSize,
tiktokenInstance: tokenManager,
});
}
const newTotal = tokenManager.statsFrom([user, assistant]);
if (historyTokenCount + newTotal > llm.limits.history) continue;
eligibleHistoryItems.unshift(user, assistant);
historyTokenCount += newTotal;
}
resolve(eligibleHistoryItems);
});
const [cSystem, cHistory, cPrompt] = await Promise.all([
compressedSystem,
compressedHistory,
compressedPrompt,
]);
return [cSystem, ...cHistory, cPrompt];
}
// Implementation of messageArrayCompressor, but for string only completion models
async function messageStringCompressor(llm, promptArgs = {}, rawHistory = []) {
const tokenBuffer = 600;
const tokenManager = new TokenManager(llm.model);
const initialPrompt = llm.constructPrompt(promptArgs);
if (
tokenManager.statsFrom(initialPrompt) + tokenBuffer <
llm.promptWindowLimit()
)
return initialPrompt;
const system = promptArgs.systemPrompt;
const user = promptArgs.userPrompt;
const userPromptSize = tokenManager.countFromString(user);
// The user prompt is the main focus here - we prioritize it and allow
// it to hijack the entire conversation thread. We are going to
// cannonball the prompt through so that at least 20% of
// the token supply remains for the reply.
if (userPromptSize > llm.limits.user) {
return llm.constructPrompt({
userPrompt: cannonball({
input: user,
targetTokenSize: llm.promptWindowLimit() * 0.8,
tiktokenInstance: tokenManager,
}),
});
}
const compressedSystem = new Promise(async (resolve) => {
const count = tokenManager.countFromString(system);
if (count < llm.limits.system) {
resolve(system);
return;
}
resolve(
cannonball({
input: system,
targetTokenSize: llm.limits.system,
tiktokenInstance: tokenManager,
})
);
});
// Prompt is allowed to take up to 70% of the window - we know it's under
// if we are here, so pass through.
const compressedPrompt = new Promise(async (resolve) => resolve(user));
// We always aggressively compress history because it is the least
// important data to retain in full-fidelity.
const compressedHistory = new Promise((resolve) => {
const eligibleHistoryItems = [];
var historyTokenCount = 0;
for (const [i, history] of rawHistory.reverse().entries()) {
const [user, assistant] = convertToPromptHistory([history]);
const [userTokens, assistantTokens] = [
tokenManager.countFromString(user.content),
tokenManager.countFromString(assistant.content),
];
const total = userTokens + assistantTokens;
// If during the loop the token cost of adding this history
// is small, we can add it to history and move on to the next.
if (historyTokenCount + total < llm.limits.history) {
eligibleHistoryItems.unshift(user, assistant);
historyTokenCount += total;
continue;
}
// If we reach here, the overhead of adding this history item would
// push us over the limit. So now we prioritize
// the most recent 3 message pairs - if we are already past those, exit the loop and stop
// trying to make history work.
if (i > 2) break;
// We are over the limit and we are within the first 3 most recent chats.
// so now we cannonball them to make them fit into the window.
// max size = llm.limits.history; each component of the message can at most
// be 50% of the history. We cannonball whichever one is the problem.
// The math isn't perfect for tokens, so we have to add a fudge factor for safety.
const maxTargetSize = Math.floor(llm.limits.history / 2.2);
if (userTokens > maxTargetSize) {
user.content = cannonball({
input: user.content,
targetTokenSize: maxTargetSize,
tiktokenInstance: tokenManager,
});
}
if (assistantTokens > maxTargetSize) {
assistant.content = cannonball({
input: assistant.content,
targetTokenSize: maxTargetSize,
tiktokenInstance: tokenManager,
});
}
const newTotal = tokenManager.statsFrom([user, assistant]);
if (historyTokenCount + newTotal > llm.limits.history) continue;
eligibleHistoryItems.unshift(user, assistant);
historyTokenCount += newTotal;
}
resolve(eligibleHistoryItems);
});
const [cSystem, cHistory, cPrompt] = await Promise.all([
compressedSystem,
compressedHistory,
compressedPrompt,
]);
return llm.constructPrompt({
systemPrompt: cSystem,
contextTexts: promptArgs?.contextTexts || [],
chatHistory: cHistory,
userPrompt: cPrompt,
});
}
// Cannonball prompting: aka where we shoot a proportionally big cannonball through a proportionally large prompt.
// Nobody should be sending prompts this big, but there is no reason not to allow it as long as the results remain good.
function cannonball({
input = "",
targetTokenSize = 0,
tiktokenInstance = null,
ellipsesStr = null,
}) {
if (!input || !targetTokenSize) return input;
const tokenManager = tiktokenInstance || new TokenManager();
const truncText = ellipsesStr || "\n\n--prompt truncated for brevity--\n\n";
const initialInputSize = tokenManager.countFromString(input);
if (initialInputSize < targetTokenSize) return input;
// The delta is the token difference between the prompt's current size
// and where we ideally need to land.
const delta = initialInputSize - targetTokenSize;
const tokenChunks = tokenManager.tokensFromString(input);
const middleIdx = Math.floor(tokenChunks.length / 2);
// Middle-truncate the text, going left and right of the midpoint.
const leftChunks = tokenChunks.slice(0, middleIdx - Math.round(delta / 2));
const rightChunks = tokenChunks.slice(middleIdx + Math.round(delta / 2));
const truncatedText =
tokenManager.bytesFromTokens(leftChunks) +
truncText +
tokenManager.bytesFromTokens(rightChunks);
console.log(
`Cannonball results ${initialInputSize} -> ${tokenManager.countFromString(
truncatedText
)} tokens.`
);
return truncatedText;
}
module.exports = {
messageArrayCompressor,
messageStringCompressor,
};
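
To make the budget split and "cannonball" truncation described above concrete, here is a minimal, self-contained sketch (illustrative only - the numbers mirror the 15/15/70 split and the middle-out slicing in cannonball(), but the names below are hypothetical and not part of this commit):

// Rough sketch of the token budgets for a claude-2-sized window and the
// middle-out truncation that cannonball() performs on the token array.
const promptWindowLimit = 100_000; // e.g. claude-2
const limits = {
  history: promptWindowLimit * 0.15, // 15,000 tokens
  system: promptWindowLimit * 0.15, // 15,000 tokens
  user: promptWindowLimit * 0.7, // 70,000 tokens
};

// Middle-out truncation over an array of tokens, mirroring cannonball():
// drop ~delta tokens from the center so the result fits the target size.
function middleOut(tokens, targetTokenSize) {
  if (tokens.length <= targetTokenSize) return tokens;
  const delta = tokens.length - targetTokenSize;
  const middleIdx = Math.floor(tokens.length / 2);
  return [
    ...tokens.slice(0, middleIdx - Math.round(delta / 2)),
    ...tokens.slice(middleIdx + Math.round(delta / 2)),
  ];
}

console.log(limits.user); // 70000
console.log(middleOut([...Array(10).keys()], 6)); // [ 0, 1, 2, 7, 8, 9 ]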

View File

@ -0,0 +1,57 @@
const { getEncodingNameForModel, getEncoding } = require("js-tiktoken");
class TokenManager {
constructor(model = "gpt-3.5-turbo") {
this.model = model;
this.encoderName = this.getEncodingFromModel(model);
this.encoder = getEncoding(this.encoderName);
this.buffer = 50;
}
getEncodingFromModel(model) {
try {
return getEncodingNameForModel(model);
} catch {
return "cl100k_base";
}
}
tokensFromString(input = "") {
const tokens = this.encoder.encode(input);
return tokens;
}
bytesFromTokens(tokens = []) {
const bytes = this.encoder.decode(tokens);
return bytes;
}
countFromString(input = "") {
const tokens = this.encoder.encode(input);
return tokens.length;
}
statsFrom(input) {
if (typeof input === "string") return this.countFromString(input);
// What is going on here?
// https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb Item 6.
// The only option is to estimate. From repeated testing using the static values in the code we are always 2 off,
// which means as of Nov 1, 2023 the additional factor on ln: 476 changed from 3 to 5.
if (Array.isArray(input)) {
const perMessageFactorTokens = input.length * 3;
const tokensFromContent = input.reduce(
(a, b) => a + this.countFromString(b.content),
0
);
const diffCoefficient = 5;
return perMessageFactorTokens + tokensFromContent + diffCoefficient;
}
throw new Error("Not a supported tokenized format.");
}
}
module.exports = {
TokenManager,
};
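
As a quick, hypothetical sanity check of the estimate in statsFrom() above (the require path and sample messages are assumptions, not part of this commit), the array form works out to 3 tokens per message, plus the token count of each message's content, plus the diffCoefficient of 5:

// Hypothetical walkthrough of the statsFrom() estimate for a message array.
const { TokenManager } = require("./tiktoken"); // path assumed
const tm = new TokenManager("gpt-3.5-turbo");
const messages = [
  { role: "system", content: "You are a helpful assistant." },
  { role: "user", content: "Hello there!" },
];
const contentTokens = messages.reduce(
  (sum, msg) => sum + tm.countFromString(msg.content),
  0
);
// statsFrom(messages) === messages.length * 3 + contentTokens + 5
console.log(tm.statsFrom(messages) === messages.length * 3 + contentTokens + 5); // true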

View File

@ -17,6 +17,10 @@ const KEY_MAPPING = {
envKey: "AZURE_OPENAI_ENDPOINT",
checks: [isNotEmpty, validAzureURL],
},
AzureOpenAiTokenLimit: {
envKey: "AZURE_OPENAI_TOKEN_LIMIT",
checks: [validOpenAiTokenLimit],
},
AzureOpenAiKey: {
envKey: "AZURE_OPENAI_KEY",
checks: [isNotEmpty],
@ -137,7 +141,7 @@ function supportedLLM(input = "") {
}
function validAnthropicModel(input = "") {
const validModels = ["claude-2"];
const validModels = ["claude-2", "claude-instant-1"];
return validModels.includes(input)
? null
: `Invalid Model type. Must be one of ${validModels.join(", ")}.`;
@ -174,6 +178,14 @@ function validAzureURL(input = "") {
}
}
function validOpenAiTokenLimit(input = "") {
const tokenLimit = Number(input);
if (isNaN(tokenLimit)) return "Token limit is not a number";
if (![4_096, 16_384, 8_192, 32_768].includes(tokenLimit))
return "Invalid OpenAI token limit.";
return null;
}
function requiresForceMode(_, forceModeEnabled = false) {
return forceModeEnabled === true ? null : "Cannot set this setting.";
}

View File

@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid");
const { toChunks, getLLMProvider } = require("../../helpers");
const { chatPrompt } = require("../../chats");
const Chroma = {
name: "Chroma",
@ -253,92 +252,35 @@ const Chroma = {
await DocumentVectors.deleteIds(indexes);
return true;
},
query: async function (reqBody = {}) {
const { namespace = null, input, workspace = {} } = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
return {
response: null,
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const LLMConnector = getLLMProvider();
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse(
client,
namespace,
queryVector
);
const memory = LLMConnector.constructPrompt({
systemPrompt: chatPrompt(workspace),
contextTexts: contextTexts,
userPrompt: input,
});
const responseText = await LLMConnector.getChatCompletion(memory, {
temperature: workspace?.openAiTemp ?? 0.7,
});
// When we roll our own response we have separate metadata and texts,
// so for source collection we need to combine them.
const sources = sourceDocuments.map((metadata, i) => {
return { metadata: { ...metadata, text: contextTexts[i] } };
});
return {
response: responseText,
sources: this.curateSources(sources),
message: false,
};
},
// This implementation of chat uses the chat history and modifies the system prompt at execution
// this is improved over the regular langchain implementation so that chats do not directly modify embeddings
// because then multi-user support will have all conversations mutating the base vector collection to which then
// the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
chat: async function (reqBody = {}) {
const {
performSimilaritySearch: async function ({
namespace = null,
input,
workspace = {},
chatHistory = [],
} = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
input = "",
LLMConnector = null,
}) {
if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch.");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
return {
response: null,
contextTexts: [],
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const LLMConnector = getLLMProvider();
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse(
client,
namespace,
queryVector
);
const memory = LLMConnector.constructPrompt({
systemPrompt: chatPrompt(workspace),
contextTexts: contextTexts,
userPrompt: input,
chatHistory,
});
const responseText = await LLMConnector.getChatCompletion(memory, {
temperature: workspace?.openAiTemp ?? 0.7,
});
// When we roll our own response we have separate metadata and texts,
// so for source collection we need to combine them.
const sources = sourceDocuments.map((metadata, i) => {
return { metadata: { ...metadata, text: contextTexts[i] } };
});
return {
response: responseText,
contextTexts,
sources: this.curateSources(sources),
message: false,
};

View File

@ -4,7 +4,6 @@ const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid");
const { chatPrompt } = require("../../chats");
const LanceDb = {
uri: `${
@ -226,83 +225,36 @@ const LanceDb = {
return false;
}
},
query: async function (reqBody = {}) {
const { namespace = null, input, workspace = {} } = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
return {
response: null,
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const LLMConnector = getLLMProvider();
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse(
client,
namespace,
queryVector
);
const memory = LLMConnector.constructPrompt({
systemPrompt: chatPrompt(workspace),
contextTexts: contextTexts,
userPrompt: input,
});
const responseText = await LLMConnector.getChatCompletion(memory, {
temperature: workspace?.openAiTemp ?? 0.7,
});
return {
response: responseText,
sources: this.curateSources(sourceDocuments),
message: false,
};
},
// This implementation of chat uses the chat history and modifies the system prompt at execution
// this is improved over the regular langchain implementation so that chats do not directly modify embeddings
// because then multi-user support will have all conversations mutating the base vector collection to which then
// the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
chat: async function (reqBody = {}) {
const {
performSimilaritySearch: async function ({
namespace = null,
input,
workspace = {},
chatHistory = [],
} = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
input = "",
LLMConnector = null,
}) {
if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch.");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
return {
response: null,
contextTexts: [],
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const LLMConnector = getLLMProvider();
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse(
client,
namespace,
queryVector
);
const memory = LLMConnector.constructPrompt({
systemPrompt: chatPrompt(workspace),
contextTexts: contextTexts,
userPrompt: input,
chatHistory,
});
const responseText = await LLMConnector.getChatCompletion(memory, {
temperature: workspace?.openAiTemp ?? 0.7,
});
const sources = sourceDocuments.map((metadata, i) => {
return { metadata: { ...metadata, text: contextTexts[i] } };
});
return {
response: responseText,
sources: this.curateSources(sourceDocuments),
contextTexts,
sources: this.curateSources(sources),
message: false,
};
},
@ -337,9 +289,13 @@ const LanceDb = {
curateSources: function (sources = []) {
const documents = [];
for (const source of sources) {
const { text, vector: _v, score: _s, ...metadata } = source;
const { text, vector: _v, score: _s, ...rest } = source;
const metadata = rest.hasOwnProperty("metadata") ? rest.metadata : rest;
if (Object.keys(metadata).length > 0) {
documents.push({ ...metadata, text });
documents.push({
...metadata,
...(text ? { text } : {}),
});
}
}
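
For illustration, a hypothetical sketch of the source-shape handling above (the sample data is made up and this is not part of the commit): curateSources now accepts either a record wrapped in a metadata key or a flat search result, strips vector/score noise, and emits a flattened document either way.

// Mirrors the normalization in curateSources() for the two shapes it can receive.
function flattenSource(source) {
  const { text, vector: _v, score: _s, ...rest } = source;
  const metadata = rest.hasOwnProperty("metadata") ? rest.metadata : rest;
  return { ...metadata, ...(text ? { text } : {}) };
}

console.log(flattenSource({ metadata: { title: "report.pdf", text: "chunk text" } }));
// -> { title: 'report.pdf', text: 'chunk text' }
console.log(flattenSource({ title: "notes.txt", text: "a chunk", vector: [0.1], score: 0.9 }));
// -> { title: 'notes.txt', text: 'a chunk' }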

View File

@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid");
const { toChunks, getLLMProvider } = require("../../helpers");
const { chatPrompt } = require("../../chats");
const Pinecone = {
name: "Pinecone",
@ -222,80 +221,33 @@ const Pinecone = {
message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.`,
};
},
query: async function (reqBody = {}) {
const { namespace = null, input, workspace = {} } = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
const { pineconeIndex } = await this.connect();
if (!(await this.namespaceExists(pineconeIndex, namespace))) {
return {
response: null,
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const LLMConnector = getLLMProvider();
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse(
pineconeIndex,
namespace,
queryVector
);
const memory = LLMConnector.constructPrompt({
systemPrompt: chatPrompt(workspace),
contextTexts: contextTexts,
userPrompt: input,
});
const responseText = await LLMConnector.getChatCompletion(memory, {
temperature: workspace?.openAiTemp ?? 0.7,
});
return {
response: responseText,
sources: this.curateSources(sourceDocuments),
message: false,
};
},
// This implementation of chat uses the chat history and modifies the system prompt at execution
// this is improved over the regular langchain implementation so that chats do not directly modify embeddings
// because then multi-user support will have all conversations mutating the base vector collection to which then
// the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
chat: async function (reqBody = {}) {
const {
performSimilaritySearch: async function ({
namespace = null,
input,
workspace = {},
chatHistory = [],
} = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
input = "",
LLMConnector = null,
}) {
if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch.");
const { pineconeIndex } = await this.connect();
if (!(await this.namespaceExists(pineconeIndex, namespace)))
throw new Error(
"Invalid namespace - has it been collected and seeded yet?"
"Invalid namespace - has it been collected and populated yet?"
);
const LLMConnector = getLLMProvider();
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse(
pineconeIndex,
namespace,
queryVector
);
const memory = LLMConnector.constructPrompt({
systemPrompt: chatPrompt(workspace),
contextTexts: contextTexts,
userPrompt: input,
chatHistory,
});
const responseText = await LLMConnector.getChatCompletion(memory, {
temperature: workspace?.openAiTemp ?? 0.7,
});
const sources = sourceDocuments.map((metadata, i) => {
return { ...metadata, text: contextTexts[i] };
});
return {
response: responseText,
sources: this.curateSources(sourceDocuments),
contextTexts,
sources: this.curateSources(sources),
message: false,
};
},

View File

@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid");
const { toChunks, getLLMProvider } = require("../../helpers");
const { chatPrompt } = require("../../chats");
const QDrant = {
name: "QDrant",
@ -262,83 +261,36 @@ const QDrant = {
await DocumentVectors.deleteIds(indexes);
return true;
},
query: async function (reqBody = {}) {
const { namespace = null, input, workspace = {} } = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
return {
response: null,
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const LLMConnector = getLLMProvider();
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse(
client,
namespace,
queryVector
);
const memory = LLMConnector.constructPrompt({
systemPrompt: chatPrompt(workspace),
contextTexts: contextTexts,
userPrompt: input,
});
const responseText = await LLMConnector.getChatCompletion(memory, {
temperature: workspace?.openAiTemp ?? 0.7,
});
return {
response: responseText,
sources: this.curateSources(sourceDocuments),
message: false,
};
},
// This implementation of chat uses the chat history and modifies the system prompt at execution
// this is improved over the regular langchain implementation so that chats do not directly modify embeddings
// because then multi-user support will have all conversations mutating the base vector collection to which then
// the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
chat: async function (reqBody = {}) {
const {
performSimilaritySearch: async function ({
namespace = null,
input,
workspace = {},
chatHistory = [],
} = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
input = "",
LLMConnector = null,
}) {
if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch.");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
return {
response: null,
contextTexts: [],
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const LLMConnector = getLLMProvider();
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse(
client,
namespace,
queryVector
);
const memory = LLMConnector.constructPrompt({
systemPrompt: chatPrompt(workspace),
contextTexts: contextTexts,
userPrompt: input,
chatHistory,
});
const responseText = await LLMConnector.getChatCompletion(memory, {
temperature: workspace?.openAiTemp ?? 0.7,
});
const sources = sourceDocuments.map((metadata, i) => {
return { ...metadata, text: contextTexts[i] };
});
return {
response: responseText,
sources: this.curateSources(sourceDocuments),
contextTexts,
sources: this.curateSources(sources),
message: false,
};
},
@ -377,8 +329,11 @@ const QDrant = {
const documents = [];
for (const source of sources) {
if (Object.keys(source).length > 0) {
const metadata = source.hasOwnProperty("metadata")
? source.metadata
: source;
documents.push({
...source,
...metadata,
});
}
}

View File

@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid");
const { toChunks, getLLMProvider } = require("../../helpers");
const { chatPrompt } = require("../../chats");
const { camelCase } = require("../../helpers/camelcase");
const Weaviate = {
@ -333,83 +332,36 @@ const Weaviate = {
await DocumentVectors.deleteIds(indexes);
return true;
},
query: async function (reqBody = {}) {
const { namespace = null, input, workspace = {} } = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
return {
response: null,
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const LLMConnector = getLLMProvider();
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse(
client,
namespace,
queryVector
);
const memory = LLMConnector.constructPrompt({
systemPrompt: chatPrompt(workspace),
contextTexts: contextTexts,
userPrompt: input,
});
const responseText = await LLMConnector.getChatCompletion(memory, {
temperature: workspace?.openAiTemp ?? 0.7,
});
return {
response: responseText,
sources: this.curateSources(sourceDocuments),
message: false,
};
},
// This implementation of chat uses the chat history and modifies the system prompt at execution
// this is improved over the regular langchain implementation so that chats do not directly modify embeddings
// because then multi-user support will have all conversations mutating the base vector collection to which then
// the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
chat: async function (reqBody = {}) {
const {
performSimilaritySearch: async function ({
namespace = null,
input,
workspace = {},
chatHistory = [],
} = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
input = "",
LLMConnector = null,
}) {
if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch.");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
return {
response: null,
contextTexts: [],
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const LLMConnector = getLLMProvider();
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse(
client,
namespace,
queryVector
);
const memory = LLMConnector.constructPrompt({
systemPrompt: chatPrompt(workspace),
contextTexts: contextTexts,
userPrompt: input,
chatHistory,
});
const responseText = await LLMConnector.getChatCompletion(memory, {
temperature: workspace?.openAiTemp ?? 0.7,
});
const sources = sourceDocuments.map((metadata, i) => {
return { ...metadata, text: contextTexts[i] };
});
return {
response: responseText,
sources: this.curateSources(sourceDocuments),
contextTexts,
sources: this.curateSources(sources),
message: false,
};
},
@ -445,7 +397,10 @@ const Weaviate = {
const documents = [];
for (const source of sources) {
if (Object.keys(source).length > 0) {
documents.push(source);
const metadata = source.hasOwnProperty("metadata")
? source.metadata
: source;
documents.push({ ...metadata });
}
}

View File

@ -1556,7 +1556,7 @@ isomorphic-fetch@^3.0.0:
node-fetch "^2.6.1"
whatwg-fetch "^3.4.1"
js-tiktoken@^1.0.6:
js-tiktoken@^1.0.6, js-tiktoken@^1.0.7:
version "1.0.7"
resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.7.tgz#56933fcd2093e8304060dfde3071bda91812e6f5"
integrity sha512-biba8u/clw7iesNEWLOLwrNGoBP2lA+hTaBLs/D45pJdUPFXyxD6nhcDVtADChghv4GgyAiMKYMiRx7x6h7Biw==