diff --git a/docker/.env.example b/docker/.env.example
index 55f3b2627..2f9e23288 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -219,6 +219,11 @@ GID='1000'
 # TTS_OPEN_AI_KEY=sk-example
 # TTS_OPEN_AI_VOICE_MODEL=nova
 
+# TTS_PROVIDER="generic-openai"
+# TTS_OPEN_AI_COMPATIBLE_KEY=sk-example
+# TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=nova
+# TTS_OPEN_AI_COMPATIBLE_ENDPOINT="https://api.openai.com/v1"
+
 # TTS_PROVIDER="elevenlabs"
 # TTS_ELEVEN_LABS_KEY=
 # TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
diff --git a/frontend/src/components/TextToSpeech/OpenAiGenericOptions/index.jsx b/frontend/src/components/TextToSpeech/OpenAiGenericOptions/index.jsx
new file mode 100644
index 000000000..2247544cd
--- /dev/null
+++ b/frontend/src/components/TextToSpeech/OpenAiGenericOptions/index.jsx
@@ -0,0 +1,69 @@
+import React from "react";
+
+export default function OpenAiGenericTextToSpeechOptions({ settings }) {
+  return (
+    <div className="w-full flex flex-col gap-y-7">
+      <div className="flex gap-x-4">
+        <div className="flex flex-col w-60">
+          <label className="text-white text-sm font-semibold block mb-3">
+            Base URL
+          </label>
+          <input
+            type="url"
+            name="TTSOpenAICompatibleEndpoint"
+            className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+            placeholder="https://api.openai.com/v1"
+            defaultValue={settings?.TTSOpenAICompatibleEndpoint}
+            required={true}
+            autoComplete="off"
+            spellCheck={false}
+          />
+          <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
+            This should be the base URL of the OpenAI compatible TTS service you
+            will generate TTS responses from.
+          </p>
+        </div>
+        <div className="flex flex-col w-60">
+          <label className="text-white text-sm font-semibold block mb-3">
+            API Key
+          </label>
+          <input
+            type="password"
+            name="TTSOpenAICompatibleKey"
+            className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+            placeholder="API Key"
+            defaultValue={settings?.TTSOpenAICompatibleKey ? "*".repeat(20) : ""}
+            autoComplete="off"
+            spellCheck={false}
+          />
+          <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
+            Some TTS services require an API key to generate TTS responses -
+            this is optional if your service does not require one.
+          </p>
+        </div>
+        <div className="flex flex-col w-60">
+          <label className="text-white text-sm font-semibold block mb-3">
+            Voice Model
+          </label>
+          <input
+            type="text"
+            name="TTSOpenAICompatibleVoiceModel"
+            className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+            placeholder="Your voice model identifier"
+            defaultValue={settings?.TTSOpenAICompatibleVoiceModel}
+            required={true}
+            autoComplete="off"
+            spellCheck={false}
+          />
+          <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
+            Most TTS services will have several voice models available, this is
+            the identifier for the voice model you want to use.
+          </p>
+        </div>
+      </div>
+    </div>
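Note: "OpenAI compatible" here means the service accepts OpenAI's `POST {baseURL}/audio/speech` request shape and returns raw audio bytes. A minimal sketch of the call the server-side provider (added later in this patch) will end up making — the endpoint, key, and voice values below are illustrative placeholders, not defaults:

```js
// Sketch of the HTTP request an OpenAI-compatible TTS service must accept.
async function probeSpeechEndpoint() {
  const response = await fetch("https://api.openai.com/v1/audio/speech", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TTS_OPEN_AI_COMPATIBLE_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: "tts-1", // hard-coded by the GenericOpenAiTTS provider below
      voice: "nova", // maps to TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL
      input: "Hello from AnythingLLM!",
    }),
  });
  return Buffer.from(await response.arrayBuffer()); // raw audio bytes (e.g. mp3)
}
```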
+  );
+}
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
index 88d063387..31ac70670 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
@@ -23,6 +23,7 @@ export default function TTSMessage({ slug, chatId, message }) {
   switch (provider) {
     case "openai":
+    case "generic-openai":
     case "elevenlabs":
       return <AsyncTTSMessage slug={slug} chatId={chatId} />;
     case "piper_local":
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx
index 3fe0a7ac2..c311dd6ed 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx
@@ -76,11 +76,13 @@ const HistoricalMessage = ({
-          <TTSMessage slug={workspace?.slug} chatId={chatId} message={message} />
+          {role === "assistant" && (
+            <TTSMessage slug={workspace?.slug} chatId={chatId} message={message} />
+          )}
         </div>
         {isEditing ? (
diff --git a/frontend/src/media/ttsproviders/generic-openai.png b/frontend/src/media/ttsproviders/generic-openai.png
new file mode 100644
index 000000000..302f5dbee
Binary files /dev/null and b/frontend/src/media/ttsproviders/generic-openai.png differ
diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
index e053c5475..6c4c3b450 100644
--- a/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
+++ b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
@@ -8,10 +8,13 @@ import OpenAiLogo from "@/media/llmprovider/openai.png";
 import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png";
 import ElevenLabsIcon from "@/media/ttsproviders/elevenlabs.png";
 import PiperTTSIcon from "@/media/ttsproviders/piper.png";
+import GenericOpenAiLogo from "@/media/ttsproviders/generic-openai.png";
+
 import BrowserNative from "@/components/TextToSpeech/BrowserNative";
 import OpenAiTTSOptions from "@/components/TextToSpeech/OpenAiOptions";
 import ElevenLabsTTSOptions from "@/components/TextToSpeech/ElevenLabsOptions";
 import PiperTTSOptions from "@/components/TextToSpeech/PiperTTSOptions";
+import OpenAiGenericTTSOptions from "@/components/TextToSpeech/OpenAiGenericOptions";
 
 const PROVIDERS = [
   {
@@ -42,6 +45,14 @@ const PROVIDERS = [
     options: (settings) => <PiperTTSOptions settings={settings} />,
     description: "Run TTS models locally in your browser privately.",
   },
+  {
+    name: "OpenAI Compatible",
+    value: "generic-openai",
+    logo: GenericOpenAiLogo,
+    options: (settings) => <OpenAiGenericTTSOptions settings={settings} />,
+    description:
+      "Connect to an OpenAI compatible TTS service running locally or remotely.",
+  },
 ];
 
 export default function TextToSpeechProvider({ settings }) {
diff --git a/frontend/src/pages/WorkspaceSettings/AgentConfig/AgentModelSelection/index.jsx b/frontend/src/pages/WorkspaceSettings/AgentConfig/AgentModelSelection/index.jsx
index 5be1f184b..df850d33a 100644
--- a/frontend/src/pages/WorkspaceSettings/AgentConfig/AgentModelSelection/index.jsx
+++ b/frontend/src/pages/WorkspaceSettings/AgentConfig/AgentModelSelection/index.jsx
@@ -5,14 +5,30 @@ import paths from "@/utils/paths";
 import { useTranslation } from "react-i18next";
 import { Link, useParams } from "react-router-dom";
 
-// These models do NOT support function calling
+/**
+ * These models do NOT support function calling
+ * or do not support system prompts
+ * and therefore are not supported for agents.
+ * @param {string} provider - The AI provider.
+ * @param {string} model - The model name.
+ * @returns {boolean} Whether the model is supported for agents.
+ */
 function supportedModel(provider, model = "") {
-  if (provider !== "openai") return true;
-  return (
-    ["gpt-3.5-turbo-0301", "gpt-4-turbo-2024-04-09", "gpt-4-turbo"].includes(
-      model
-    ) === false
-  );
+  if (provider === "openai") {
+    return (
+      [
+        "gpt-3.5-turbo-0301",
+        "gpt-4-turbo-2024-04-09",
+        "gpt-4-turbo",
+        "o1-preview",
+        "o1-preview-2024-09-12",
+        "o1-mini",
+        "o1-mini-2024-09-12",
+      ].includes(model) === false
+    );
+  }
+
+  return true;
 }
 
 export default function AgentModelSelection({
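A quick illustration of the refactored guard's behavior (the calls are hypothetical — `supportedModel` is file-local in this module):

```js
// Hypothetical calls against the function above:
supportedModel("anthropic", "claude-3-5-sonnet-20240620"); // true — non-OpenAI providers always pass
supportedModel("openai", "gpt-4o"); // true — not on the block list
supportedModel("openai", "o1-mini"); // false — o1 models lack system-prompt/function-calling support
```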
diff --git a/server/.env.example b/server/.env.example
index e6a3871d6..3f60b0e5b 100644
--- a/server/.env.example
+++ b/server/.env.example
@@ -213,6 +213,11 @@ TTS_PROVIDER="native"
 # TTS_ELEVEN_LABS_KEY=
 # TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
 
+# TTS_PROVIDER="generic-openai"
+# TTS_OPEN_AI_COMPATIBLE_KEY=sk-example
+# TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=nova
+# TTS_OPEN_AI_COMPATIBLE_ENDPOINT="https://api.openai.com/v1"
+
 # CLOUD DEPLOYMENT VARIRABLES ONLY
 # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
 # STORAGE_DIR= # absolute filesystem path with no trailing slash
diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
index 0c67bf2f4..c69794b48 100644
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@@ -221,12 +221,18 @@ const SystemSettings = {
     TextToSpeechProvider: process.env.TTS_PROVIDER || "native",
     TTSOpenAIKey: !!process.env.TTS_OPEN_AI_KEY,
     TTSOpenAIVoiceModel: process.env.TTS_OPEN_AI_VOICE_MODEL,
+    // Eleven Labs TTS
     TTSElevenLabsKey: !!process.env.TTS_ELEVEN_LABS_KEY,
     TTSElevenLabsVoiceModel: process.env.TTS_ELEVEN_LABS_VOICE_MODEL,
     // Piper TTS
     TTSPiperTTSVoiceModel:
       process.env.TTS_PIPER_VOICE_MODEL ?? "en_US-hfc_female-medium",
+    // OpenAI Generic TTS
+    TTSOpenAICompatibleKey: !!process.env.TTS_OPEN_AI_COMPATIBLE_KEY,
+    TTSOpenAICompatibleVoiceModel:
+      process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL,
+    TTSOpenAICompatibleEndpoint: process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT,
 
     // --------------------------------------------------------
     // Agent Settings & Configs
diff --git a/server/utils/AiProviders/bedrock/index.js b/server/utils/AiProviders/bedrock/index.js
index 28d0c2ce3..c271f7297 100644
--- a/server/utils/AiProviders/bedrock/index.js
+++ b/server/utils/AiProviders/bedrock/index.js
@@ -7,6 +7,20 @@ const { NativeEmbedder } = require("../../EmbeddingEngines/native");
 
 // Docs: https://js.langchain.com/v0.2/docs/integrations/chat/bedrock_converse
 class AWSBedrockLLM {
+  /**
+   * These models do not support system prompts.
+   * It is not explicitly stated, but it is observed that they do not use the system prompt
+   * in their responses and will crash when a system prompt is provided.
+   * We can add more models to this list as we discover them or new models are added.
+   * We may want to extend this list or make a user-config if using custom bedrock models.
+   */
+  noSystemPromptModels = [
+    "amazon.titan-text-express-v1",
+    "amazon.titan-text-lite-v1",
+    "cohere.command-text-v14",
+    "cohere.command-light-text-v14",
+  ];
+
   constructor(embedder = null, modelPreference = null) {
     if (!process.env.AWS_BEDROCK_LLM_ACCESS_KEY_ID)
       throw new Error("No AWS Bedrock LLM profile id was set.");
@@ -59,6 +73,22 @@ class AWSBedrockLLM {
     for (const chat of chats) {
       if (!roleToMessageMap.hasOwnProperty(chat.role)) continue;
+
+      // When a model does not support system prompts, we need to handle it.
+      // We will add a new message that simulates the system prompt via a user message and AI response.
+      // This will allow the model to respond without crashing, but we can still inject context.
+      if (
+        this.noSystemPromptModels.includes(this.model) &&
+        chat.role === "system"
+      ) {
+        this.#log(
+          `Model does not support system prompts! Simulating system prompt via Human/AI message pairs.`
+        );
+        langchainChats.push(new HumanMessage({ content: chat.content }));
+        langchainChats.push(new AIMessage({ content: "Okay." }));
+        continue;
+      }
+
       const MessageClass = roleToMessageMap[chat.role];
       langchainChats.push(new MessageClass({ content: chat.content }));
     }
@@ -78,6 +108,10 @@ class AWSBedrockLLM {
     );
   }
 
+  #log(text, ...args) {
+    console.log(`\x1b[32m[AWSBedrock]\x1b[0m ${text}`, ...args);
+  }
+
   streamingEnabled() {
     return "streamGetChatCompletion" in this;
   }
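The net effect of that guard: for the listed Titan/Cohere models, a system message is replaced by a synthetic user/assistant exchange before the history reaches LangChain. A sketch, assuming the usual `@langchain/core/messages` import for `HumanMessage`/`AIMessage` and an illustrative history:

```js
const { HumanMessage, AIMessage } = require("@langchain/core/messages");

// Illustrative input history containing a system prompt:
const chats = [
  { role: "system", content: "You answer strictly from the provided context." },
  { role: "user", content: "What is our refund policy?" },
];

// For e.g. "amazon.titan-text-lite-v1", the loop above emits the equivalent of:
const langchainChats = [
  new HumanMessage({ content: "You answer strictly from the provided context." }),
  new AIMessage({ content: "Okay." }), // synthetic acknowledgement of the "system" turn
  new HumanMessage({ content: "What is our refund policy?" }),
];
```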
diff --git a/server/utils/AiProviders/modelMap.js b/server/utils/AiProviders/modelMap.js
index 99d78dc14..84e480b31 100644
--- a/server/utils/AiProviders/modelMap.js
+++ b/server/utils/AiProviders/modelMap.js
@@ -52,6 +52,10 @@ const MODEL_MAP = {
     "gpt-4-turbo-preview": 128_000,
     "gpt-4": 8_192,
     "gpt-4-32k": 32_000,
+    "o1-preview": 128_000,
+    "o1-preview-2024-09-12": 128_000,
+    "o1-mini": 128_000,
+    "o1-mini-2024-09-12": 128_000,
   },
   deepseek: {
     "deepseek-chat": 128_000,
diff --git a/server/utils/AiProviders/openAi/index.js b/server/utils/AiProviders/openAi/index.js
index b0e52dc2b..4f6bc2219 100644
--- a/server/utils/AiProviders/openAi/index.js
+++ b/server/utils/AiProviders/openAi/index.js
@@ -23,6 +23,14 @@ class OpenAiLLM {
     this.defaultTemp = 0.7;
   }
 
+  /**
+   * Check if the model is an o1 model.
+   * @returns {boolean}
+   */
+  get isO1Model() {
+    return this.model.startsWith("o1");
+  }
+
   #appendContext(contextTexts = []) {
     if (!contextTexts || !contextTexts.length) return "";
     return (
@@ -36,6 +44,7 @@ class OpenAiLLM {
   }
 
   streamingEnabled() {
+    if (this.isO1Model) return false;
     return "streamGetChatCompletion" in this;
   }
 
@@ -98,8 +107,11 @@ class OpenAiLLM {
     userPrompt = "",
     attachments = [], // This is the specific attachment for only this prompt
   }) {
+    // o1 models do not support the "system" role.
+    // To work around this, we use the "user" role as a replacement for now.
+    // https://community.openai.com/t/o1-models-do-not-support-system-role-in-chat-completion/953880
     const prompt = {
-      role: "system",
+      role: this.isO1Model ? "user" : "system",
       content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
     };
     return [
@@ -122,7 +134,7 @@ class OpenAiLLM {
       .create({
         model: this.model,
         messages,
-        temperature,
+        temperature: this.isO1Model ? 1 : temperature, // o1 models only accept temperature 1
       })
       .catch((e) => {
         throw new Error(e.message);
@@ -143,7 +155,7 @@ class OpenAiLLM {
       model: this.model,
       stream: true,
       messages,
-      temperature,
+      temperature: this.isO1Model ? 1 : temperature, // o1 models only accept temperature 1
     });
     return streamRequest;
   }
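Taken together, the three o1 accommodations in this file change the effective request in three ways: no `system` role, temperature forced to 1, and no streaming. A sketch of the resulting chat-completion payload (model name and messages are illustrative; `openai` is the SDK client this class constructs):

```js
// Illustrative effective payload when this.model is "o1-mini":
await openai.chat.completions.create({
  model: "o1-mini",
  messages: [
    // the system prompt is demoted to a "user" message by constructPrompt():
    { role: "user", content: "You are a helpful assistant. <appended context>" },
    { role: "user", content: "Summarize this workspace." },
  ],
  temperature: 1, // forced regardless of the workspace's configured temperature
  // and no `stream: true` — streamingEnabled() now returns false for o1 models
});
```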
diff --git a/server/utils/TextToSpeech/index.js b/server/utils/TextToSpeech/index.js
index 155fc9540..5ed5684de 100644
--- a/server/utils/TextToSpeech/index.js
+++ b/server/utils/TextToSpeech/index.js
@@ -7,6 +7,9 @@ function getTTSProvider() {
     case "elevenlabs":
       const { ElevenLabsTTS } = require("./elevenLabs");
       return new ElevenLabsTTS();
+    case "generic-openai":
+      const { GenericOpenAiTTS } = require("./openAiGeneric");
+      return new GenericOpenAiTTS();
     default:
       throw new Error("ENV: No TTS_PROVIDER value found in environment!");
   }
diff --git a/server/utils/TextToSpeech/openAiGeneric/index.js b/server/utils/TextToSpeech/openAiGeneric/index.js
new file mode 100644
index 000000000..df39e6348
--- /dev/null
+++ b/server/utils/TextToSpeech/openAiGeneric/index.js
@@ -0,0 +1,50 @@
+class GenericOpenAiTTS {
+  constructor() {
+    if (!process.env.TTS_OPEN_AI_COMPATIBLE_KEY)
+      this.#log(
+        "No OpenAI compatible API key was set. You might need to set this to use your OpenAI compatible TTS service."
+      );
+    if (!process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL)
+      this.#log(
+        "No OpenAI compatible voice model was set. We will use the default voice model 'alloy'. This may not exist for your selected endpoint."
+      );
+    if (!process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT)
+      throw new Error(
+        "No OpenAI compatible endpoint was set. Please set this to use your OpenAI compatible TTS service."
+      );
+
+    const { OpenAI: OpenAIApi } = require("openai");
+    this.openai = new OpenAIApi({
+      apiKey: process.env.TTS_OPEN_AI_COMPATIBLE_KEY || null,
+      baseURL: process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT,
+    });
+    this.voice = process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL ?? "alloy";
+  }
+
+  #log(text, ...args) {
+    console.log(`\x1b[32m[OpenAiGenericTTS]\x1b[0m ${text}`, ...args);
+  }
+
+  /**
+   * Generates a buffer from the given text input using the OpenAI compatible TTS service.
+   * @param {string} textInput - The text to be converted to audio.
+   * @returns {Promise<Buffer|null>} A buffer containing the audio data, or null on failure.
+   */
+  async ttsBuffer(textInput) {
+    try {
+      const result = await this.openai.audio.speech.create({
+        model: "tts-1",
+        voice: this.voice,
+        input: textInput,
+      });
+      return Buffer.from(await result.arrayBuffer());
+    } catch (e) {
+      console.error(e);
+    }
+    return null;
+  }
+}
+
+module.exports = {
+  GenericOpenAiTTS,
+};
diff --git a/server/utils/helpers/customModels.js b/server/utils/helpers/customModels.js
index f3430cecc..086144bfe 100644
--- a/server/utils/helpers/customModels.js
+++ b/server/utils/helpers/customModels.js
@@ -128,7 +128,7 @@ async function openAiModels(apiKey = null) {
   });
 
   const gpts = allModels
-    .filter((model) => model.id.startsWith("gpt"))
+    .filter((model) => model.id.startsWith("gpt") || model.id.startsWith("o1"))
     .filter(
      (model) => !model.id.includes("vision") && !model.id.includes("instruct")
     )
diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
index 160e85d44..294214a0b 100644
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -506,6 +506,20 @@ const KEY_MAPPING = {
     checks: [],
   },
 
+  // OpenAI Generic TTS
+  TTSOpenAICompatibleKey: {
+    envKey: "TTS_OPEN_AI_COMPATIBLE_KEY",
+    checks: [],
+  },
+  TTSOpenAICompatibleVoiceModel: {
+    envKey: "TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL",
+    checks: [isNotEmpty],
+  },
+  TTSOpenAICompatibleEndpoint: {
+    envKey: "TTS_OPEN_AI_COMPATIBLE_ENDPOINT",
+    checks: [isValidURL],
+  },
+
   // DeepSeek Options
   DeepSeekApiKey: {
     envKey: "DEEPSEEK_API_KEY",
@@ -589,6 +603,7 @@ function supportedTTSProvider(input = "") {
     "openai",
     "elevenlabs",
     "piper_local",
+    "generic-openai",
   ].includes(input);
   return validSelection ? null : `${input} is not a valid TTS provider.`;
 }
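End to end: saving the three UI fields walks through `KEY_MAPPING` into the `TTS_OPEN_AI_COMPATIBLE_*` env vars, which `GenericOpenAiTTS` reads at construction. A hedged usage sketch of the new provider — the endpoint URL, key, and output path are illustrative, not defaults:

```js
const fs = require("fs");
const { GenericOpenAiTTS } = require("./server/utils/TextToSpeech/openAiGeneric");

// Illustrative env — normally written by updateENV.js from the UI payload
// { TTSOpenAICompatibleEndpoint, TTSOpenAICompatibleKey, TTSOpenAICompatibleVoiceModel }.
process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT = "http://localhost:8880/v1";
process.env.TTS_OPEN_AI_COMPATIBLE_KEY = "sk-example";
process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL = "nova";

(async () => {
  const tts = new GenericOpenAiTTS();
  const buffer = await tts.ttsBuffer("Hello from AnythingLLM!");
  if (buffer) fs.writeFileSync("speech.mp3", buffer); // ttsBuffer returns null on failure
})();
```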