diff --git a/docker/.env.example b/docker/.env.example
index 55f3b2627..2f9e23288 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -219,6 +219,11 @@ GID='1000'
# TTS_OPEN_AI_KEY=sk-example
# TTS_OPEN_AI_VOICE_MODEL=nova
+# TTS_PROVIDER="generic-openai"
+# TTS_OPEN_AI_COMPATIBLE_KEY=sk-example
+# TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=nova
+# TTS_OPEN_AI_COMPATIBLE_ENDPOINT="https://api.openai.com/v1"
+
# TTS_PROVIDER="elevenlabs"
# TTS_ELEVEN_LABS_KEY=
# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
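For reference, once uncommented these settings might look like the following for a locally hosted OpenAI compatible TTS server (host and port are illustrative):

```
TTS_PROVIDER="generic-openai"
TTS_OPEN_AI_COMPATIBLE_ENDPOINT="http://localhost:7851/v1"
TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL="alloy"
# TTS_OPEN_AI_COMPATIBLE_KEY may stay unset if the service requires no auth
```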
diff --git a/frontend/src/components/TextToSpeech/OpenAiGenericOptions/index.jsx b/frontend/src/components/TextToSpeech/OpenAiGenericOptions/index.jsx
new file mode 100644
index 000000000..2247544cd
--- /dev/null
+++ b/frontend/src/components/TextToSpeech/OpenAiGenericOptions/index.jsx
@@ -0,0 +1,69 @@
+import React from "react";
+
+export default function OpenAiGenericTextToSpeechOptions({ settings }) {
+  return (
+    <div className="flex gap-x-4">
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Base URL
+        </label>
+        <input
+          type="url"
+          name="TTSOpenAICompatibleEndpoint"
+          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg block w-full p-2.5"
+          placeholder="http://localhost:7851/v1"
+          defaultValue={settings?.TTSOpenAICompatibleEndpoint}
+          required={true}
+          autoComplete="off"
+          spellCheck={false}
+        />
+        <p className="text-xs text-white/60 mt-2">
+          This should be the base URL of the OpenAI compatible TTS service you
+          will generate TTS responses from.
+        </p>
+      </div>
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          API Key
+        </label>
+        <input
+          type="password"
+          name="TTSOpenAICompatibleKey"
+          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg block w-full p-2.5"
+          placeholder="API Key"
+          defaultValue={settings?.TTSOpenAICompatibleKey ? "*".repeat(20) : ""}
+          autoComplete="off"
+          spellCheck={false}
+        />
+        <p className="text-xs text-white/60 mt-2">
+          Some TTS services require an API key to generate TTS responses -
+          this is optional if your service does not require one.
+        </p>
+      </div>
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Voice Model
+        </label>
+        <input
+          type="text"
+          name="TTSOpenAICompatibleVoiceModel"
+          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg block w-full p-2.5"
+          placeholder="Your voice model identifier"
+          defaultValue={settings?.TTSOpenAICompatibleVoiceModel}
+          required={true}
+          autoComplete="off"
+          spellCheck={false}
+        />
+        <p className="text-xs text-white/60 mt-2">
+          Most TTS services have several voice models available; this is
+          the identifier for the voice model you want to use.
+        </p>
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
index 88d063387..31ac70670 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
@@ -23,6 +23,7 @@ export default function TTSMessage({ slug, chatId, message }) {
switch (provider) {
case "openai":
+ case "generic-openai":
case "elevenlabs":
      return <AsyncTTSMessage slug={slug} chatId={chatId} />;
case "piper_local":
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx
index 3fe0a7ac2..c311dd6ed 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx
@@ -76,11 +76,13 @@ const HistoricalMessage = ({
-          <TTSMessage slug={workspace?.slug} chatId={chatId} message={message} />
+          {role === "assistant" && (
+            <TTSMessage slug={workspace?.slug} chatId={chatId} message={message} />
+          )}
{isEditing ? (
diff --git a/frontend/src/media/ttsproviders/generic-openai.png b/frontend/src/media/ttsproviders/generic-openai.png
new file mode 100644
index 000000000..302f5dbee
Binary files /dev/null and b/frontend/src/media/ttsproviders/generic-openai.png differ
diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
index e053c5475..6c4c3b450 100644
--- a/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
+++ b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
@@ -8,10 +8,13 @@ import OpenAiLogo from "@/media/llmprovider/openai.png";
import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png";
import ElevenLabsIcon from "@/media/ttsproviders/elevenlabs.png";
import PiperTTSIcon from "@/media/ttsproviders/piper.png";
+import GenericOpenAiLogo from "@/media/ttsproviders/generic-openai.png";
+
import BrowserNative from "@/components/TextToSpeech/BrowserNative";
import OpenAiTTSOptions from "@/components/TextToSpeech/OpenAiOptions";
import ElevenLabsTTSOptions from "@/components/TextToSpeech/ElevenLabsOptions";
import PiperTTSOptions from "@/components/TextToSpeech/PiperTTSOptions";
+import OpenAiGenericTTSOptions from "@/components/TextToSpeech/OpenAiGenericOptions";
const PROVIDERS = [
{
@@ -42,6 +45,14 @@ const PROVIDERS = [
    options: (settings) => <PiperTTSOptions settings={settings} />,
description: "Run TTS models locally in your browser privately.",
},
+ {
+ name: "OpenAI Compatible",
+ value: "generic-openai",
+ logo: GenericOpenAiLogo,
+    options: (settings) => <OpenAiGenericTTSOptions settings={settings} />,
+ description:
+ "Connect to an OpenAI compatible TTS service running locally or remotely.",
+ },
];
export default function TextToSpeechProvider({ settings }) {
diff --git a/frontend/src/pages/WorkspaceSettings/AgentConfig/AgentModelSelection/index.jsx b/frontend/src/pages/WorkspaceSettings/AgentConfig/AgentModelSelection/index.jsx
index 5be1f184b..df850d33a 100644
--- a/frontend/src/pages/WorkspaceSettings/AgentConfig/AgentModelSelection/index.jsx
+++ b/frontend/src/pages/WorkspaceSettings/AgentConfig/AgentModelSelection/index.jsx
@@ -5,14 +5,30 @@ import paths from "@/utils/paths";
import { useTranslation } from "react-i18next";
import { Link, useParams } from "react-router-dom";
-// These models do NOT support function calling
+/**
+ * These models either do NOT support function calling
+ * or do not support system prompts, and therefore
+ * cannot be used for agents.
+ * @param {string} provider - The AI provider.
+ * @param {string} model - The model name.
+ * @returns {boolean} Whether the model is supported for agents.
+ */
function supportedModel(provider, model = "") {
- if (provider !== "openai") return true;
- return (
- ["gpt-3.5-turbo-0301", "gpt-4-turbo-2024-04-09", "gpt-4-turbo"].includes(
- model
- ) === false
- );
+ if (provider === "openai") {
+ return (
+ [
+ "gpt-3.5-turbo-0301",
+ "gpt-4-turbo-2024-04-09",
+ "gpt-4-turbo",
+ "o1-preview",
+ "o1-preview-2024-09-12",
+ "o1-mini",
+ "o1-mini-2024-09-12",
+ ].includes(model) === false
+ );
+ }
+
+ return true;
}
export default function AgentModelSelection({
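For a quick sanity check, the refactored helper now behaves as follows (illustrative calls; model ids beyond those listed in the function are assumptions):

```js
supportedModel("openai", "o1-mini"); // false - o1 models cannot be used for agents
supportedModel("openai", "gpt-4o"); // true
supportedModel("anthropic", "claude-3-5-sonnet"); // true - only OpenAI is filtered
```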
diff --git a/server/.env.example b/server/.env.example
index e6a3871d6..3f60b0e5b 100644
--- a/server/.env.example
+++ b/server/.env.example
@@ -213,6 +213,11 @@ TTS_PROVIDER="native"
# TTS_ELEVEN_LABS_KEY=
# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
+# TTS_PROVIDER="generic-openai"
+# TTS_OPEN_AI_COMPATIBLE_KEY=sk-example
+# TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=nova
+# TTS_OPEN_AI_COMPATIBLE_ENDPOINT="https://api.openai.com/v1"
+
# CLOUD DEPLOYMENT VARIABLES ONLY
# AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
# STORAGE_DIR= # absolute filesystem path with no trailing slash
diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
index 0c67bf2f4..c69794b48 100644
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@@ -221,12 +221,18 @@ const SystemSettings = {
TextToSpeechProvider: process.env.TTS_PROVIDER || "native",
TTSOpenAIKey: !!process.env.TTS_OPEN_AI_KEY,
TTSOpenAIVoiceModel: process.env.TTS_OPEN_AI_VOICE_MODEL,
+
// Eleven Labs TTS
TTSElevenLabsKey: !!process.env.TTS_ELEVEN_LABS_KEY,
TTSElevenLabsVoiceModel: process.env.TTS_ELEVEN_LABS_VOICE_MODEL,
// Piper TTS
TTSPiperTTSVoiceModel:
process.env.TTS_PIPER_VOICE_MODEL ?? "en_US-hfc_female-medium",
+ // OpenAI Generic TTS
+ TTSOpenAICompatibleKey: !!process.env.TTS_OPEN_AI_COMPATIBLE_KEY,
+ TTSOpenAICompatibleVoiceModel:
+ process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL,
+ TTSOpenAICompatibleEndpoint: process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT,
// --------------------------------------------------------
// Agent Settings & Configs
diff --git a/server/utils/AiProviders/bedrock/index.js b/server/utils/AiProviders/bedrock/index.js
index 28d0c2ce3..c271f7297 100644
--- a/server/utils/AiProviders/bedrock/index.js
+++ b/server/utils/AiProviders/bedrock/index.js
@@ -7,6 +7,20 @@ const { NativeEmbedder } = require("../../EmbeddingEngines/native");
// Docs: https://js.langchain.com/v0.2/docs/integrations/chat/bedrock_converse
class AWSBedrockLLM {
+ /**
+   * These models do not support system prompts.
+   * This is not explicitly documented, but we have observed that they ignore
+   * the system prompt in their responses and crash when one is provided.
+   * We can add more models to this list as we discover them or as new models are added.
+   * We may also want to make this list user-configurable for custom Bedrock models.
+ */
+ noSystemPromptModels = [
+ "amazon.titan-text-express-v1",
+ "amazon.titan-text-lite-v1",
+ "cohere.command-text-v14",
+ "cohere.command-light-text-v14",
+ ];
+
constructor(embedder = null, modelPreference = null) {
if (!process.env.AWS_BEDROCK_LLM_ACCESS_KEY_ID)
throw new Error("No AWS Bedrock LLM profile id was set.");
@@ -59,6 +73,22 @@ class AWSBedrockLLM {
for (const chat of chats) {
if (!roleToMessageMap.hasOwnProperty(chat.role)) continue;
+
+      // When a model does not support system prompts, simulate one by
+      // injecting a Human/AI message pair in its place. This lets the model
+      // respond without crashing while still receiving the injected context.
+ if (
+ this.noSystemPromptModels.includes(this.model) &&
+ chat.role === "system"
+ ) {
+ this.#log(
+ `Model does not support system prompts! Simulating system prompt via Human/AI message pairs.`
+ );
+ langchainChats.push(new HumanMessage({ content: chat.content }));
+ langchainChats.push(new AIMessage({ content: "Okay." }));
+ continue;
+ }
+
const MessageClass = roleToMessageMap[chat.role];
langchainChats.push(new MessageClass({ content: chat.content }));
}
@@ -78,6 +108,10 @@ class AWSBedrockLLM {
);
}
+ #log(text, ...args) {
+ console.log(`\x1b[32m[AWSBedrock]\x1b[0m ${text}`, ...args);
+ }
+
streamingEnabled() {
return "streamGetChatCompletion" in this;
}
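Stripped of the surrounding loop, the simulation above reduces to replacing each system chat with a Human/AI pair. A minimal standalone sketch, assuming LangChain's message classes (which this file already uses):

```js
const { AIMessage, HumanMessage } = require("@langchain/core/messages");

// A "system" chat becomes a Human message plus a stub AI acknowledgement, so
// models that reject system prompts still receive the injected context.
function simulateSystemPrompt(content) {
  return [new HumanMessage({ content }), new AIMessage({ content: "Okay." })];
}
```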
diff --git a/server/utils/AiProviders/modelMap.js b/server/utils/AiProviders/modelMap.js
index 99d78dc14..84e480b31 100644
--- a/server/utils/AiProviders/modelMap.js
+++ b/server/utils/AiProviders/modelMap.js
@@ -52,6 +52,10 @@ const MODEL_MAP = {
"gpt-4-turbo-preview": 128_000,
"gpt-4": 8_192,
"gpt-4-32k": 32_000,
+ "o1-preview": 128_000,
+ "o1-preview-2024-09-12": 128_000,
+ "o1-mini": 128_000,
+ "o1-mini-2024-09-12": 128_000,
},
deepseek: {
"deepseek-chat": 128_000,
diff --git a/server/utils/AiProviders/openAi/index.js b/server/utils/AiProviders/openAi/index.js
index b0e52dc2b..4f6bc2219 100644
--- a/server/utils/AiProviders/openAi/index.js
+++ b/server/utils/AiProviders/openAi/index.js
@@ -23,6 +23,14 @@ class OpenAiLLM {
this.defaultTemp = 0.7;
}
+ /**
+ * Check if the model is an o1 model.
+ * @returns {boolean}
+ */
+ get isO1Model() {
+ return this.model.startsWith("o1");
+ }
+
#appendContext(contextTexts = []) {
if (!contextTexts || !contextTexts.length) return "";
return (
@@ -36,6 +44,7 @@ class OpenAiLLM {
}
streamingEnabled() {
+ if (this.isO1Model) return false;
return "streamGetChatCompletion" in this;
}
@@ -98,8 +107,11 @@ class OpenAiLLM {
userPrompt = "",
attachments = [], // This is the specific attachment for only this prompt
}) {
+    // o1 models do not support the "system" role.
+    // As a workaround, we send the system prompt with the "user" role for now.
+    // https://community.openai.com/t/o1-models-do-not-support-system-role-in-chat-completion/953880
const prompt = {
- role: "system",
+ role: this.isO1Model ? "user" : "system",
content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
};
return [
@@ -122,7 +134,7 @@ class OpenAiLLM {
.create({
model: this.model,
messages,
- temperature,
+ temperature: this.isO1Model ? 1 : temperature, // o1 models only accept temperature 1
})
.catch((e) => {
throw new Error(e.message);
@@ -143,7 +155,7 @@ class OpenAiLLM {
model: this.model,
stream: true,
messages,
- temperature,
+ temperature: this.isO1Model ? 1 : temperature, // o1 models only accept temperature 1
});
return streamRequest;
}
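Taken together, the o1 guards above change a request in three ways: the system prompt travels under the "user" role, temperature is pinned to 1, and streaming is disabled. A condensed standalone sketch of the resulting request shape (not the class itself):

```js
const isO1Model = (model) => model.startsWith("o1");

// Builds the non-streaming chat completion payload the guards above produce.
function buildO1AwareRequest(model, systemPrompt, userPrompt, temperature = 0.7) {
  return {
    model,
    messages: [
      { role: isO1Model(model) ? "user" : "system", content: systemPrompt },
      { role: "user", content: userPrompt },
    ],
    temperature: isO1Model(model) ? 1 : temperature, // o1 models only accept 1
  };
}
```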
diff --git a/server/utils/TextToSpeech/index.js b/server/utils/TextToSpeech/index.js
index 155fc9540..5ed5684de 100644
--- a/server/utils/TextToSpeech/index.js
+++ b/server/utils/TextToSpeech/index.js
@@ -7,6 +7,9 @@ function getTTSProvider() {
case "elevenlabs":
const { ElevenLabsTTS } = require("./elevenLabs");
return new ElevenLabsTTS();
+ case "generic-openai":
+ const { GenericOpenAiTTS } = require("./openAiGeneric");
+ return new GenericOpenAiTTS();
default:
throw new Error("ENV: No TTS_PROVIDER value found in environment!");
}
diff --git a/server/utils/TextToSpeech/openAiGeneric/index.js b/server/utils/TextToSpeech/openAiGeneric/index.js
new file mode 100644
index 000000000..df39e6348
--- /dev/null
+++ b/server/utils/TextToSpeech/openAiGeneric/index.js
@@ -0,0 +1,50 @@
+class GenericOpenAiTTS {
+ constructor() {
+ if (!process.env.TTS_OPEN_AI_COMPATIBLE_KEY)
+ this.#log(
+ "No OpenAI compatible API key was set. You might need to set this to use your OpenAI compatible TTS service."
+ );
+ if (!process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL)
+ this.#log(
+ "No OpenAI compatible voice model was set. We will use the default voice model 'alloy'. This may not exist for your selected endpoint."
+ );
+ if (!process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT)
+ throw new Error(
+ "No OpenAI compatible endpoint was set. Please set this to use your OpenAI compatible TTS service."
+ );
+
+ const { OpenAI: OpenAIApi } = require("openai");
+ this.openai = new OpenAIApi({
+ apiKey: process.env.TTS_OPEN_AI_COMPATIBLE_KEY || null,
+ baseURL: process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT,
+ });
+ this.voice = process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL ?? "alloy";
+ }
+
+ #log(text, ...args) {
+ console.log(`\x1b[32m[OpenAiGenericTTS]\x1b[0m ${text}`, ...args);
+ }
+
+ /**
+ * Generates a buffer from the given text input using the OpenAI compatible TTS service.
+ * @param {string} textInput - The text to be converted to audio.
+   * @returns {Promise<Buffer|null>} A buffer containing the audio data, or null if generation failed.
+ */
+ async ttsBuffer(textInput) {
+ try {
+ const result = await this.openai.audio.speech.create({
+ model: "tts-1",
+ voice: this.voice,
+ input: textInput,
+ });
+ return Buffer.from(await result.arrayBuffer());
+ } catch (e) {
+ console.error(e);
+ }
+ return null;
+ }
+}
+
+module.exports = {
+ GenericOpenAiTTS,
+};
diff --git a/server/utils/helpers/customModels.js b/server/utils/helpers/customModels.js
index f3430cecc..086144bfe 100644
--- a/server/utils/helpers/customModels.js
+++ b/server/utils/helpers/customModels.js
@@ -128,7 +128,7 @@ async function openAiModels(apiKey = null) {
});
const gpts = allModels
- .filter((model) => model.id.startsWith("gpt"))
+ .filter((model) => model.id.startsWith("gpt") || model.id.startsWith("o1"))
.filter(
(model) => !model.id.includes("vision") && !model.id.includes("instruct")
)
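The widened filter admits o1 models while the second filter still drops vision and instruct variants; for example (illustrative ids):

```js
["gpt-4o", "o1-mini", "gpt-4-vision-preview", "gpt-3.5-turbo-instruct", "babbage-002"]
  .filter((id) => id.startsWith("gpt") || id.startsWith("o1"))
  .filter((id) => !id.includes("vision") && !id.includes("instruct"));
// => ["gpt-4o", "o1-mini"]
```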
diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
index 160e85d44..294214a0b 100644
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -506,6 +506,20 @@ const KEY_MAPPING = {
checks: [],
},
+ // OpenAI Generic TTS
+ TTSOpenAICompatibleKey: {
+ envKey: "TTS_OPEN_AI_COMPATIBLE_KEY",
+ checks: [],
+ },
+ TTSOpenAICompatibleVoiceModel: {
+ envKey: "TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL",
+ checks: [isNotEmpty],
+ },
+ TTSOpenAICompatibleEndpoint: {
+ envKey: "TTS_OPEN_AI_COMPATIBLE_ENDPOINT",
+ checks: [isValidURL],
+ },
+
// DeepSeek Options
DeepSeekApiKey: {
envKey: "DEEPSEEK_API_KEY",
@@ -589,6 +603,7 @@ function supportedTTSProvider(input = "") {
"openai",
"elevenlabs",
"piper_local",
+ "generic-openai",
].includes(input);
return validSelection ? null : `${input} is not a valid TTS provider.`;
}
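Net effect of the new mappings: the endpoint must be a valid URL, the voice model must be non-empty, and the key is accepted as-is; the provider gate itself behaves like this (illustrative):

```js
supportedTTSProvider("generic-openai"); // null -> valid selection
supportedTTSProvider("espeak"); // "espeak is not a valid TTS provider."
```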