diff --git a/docker/.env.example b/docker/.env.example
index 55f3b2627..2f9e23288 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -219,6 +219,11 @@ GID='1000'
 # TTS_OPEN_AI_KEY=sk-example
 # TTS_OPEN_AI_VOICE_MODEL=nova
 
+# TTS_PROVIDER="generic-openai"
+# TTS_OPEN_AI_COMPATIBLE_KEY=sk-example
+# TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=nova
+# TTS_OPEN_AI_COMPATIBLE_ENDPOINT="https://api.openai.com/v1"
+
 # TTS_PROVIDER="elevenlabs"
 # TTS_ELEVEN_LABS_KEY=
 # TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
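The sample values above point at OpenAI's own API, but the purpose of the provider is that any OpenAI-compatible speech server can be used. As a purely illustrative sketch (the host, port, and voice identifier below are hypothetical placeholders, not values shipped with this change), a self-hosted service could be configured like so:

    # Hypothetical self-hosted endpoint -- replace with your service's OpenAI-compatible base URL.
    TTS_PROVIDER="generic-openai"
    TTS_OPEN_AI_COMPATIBLE_ENDPOINT="http://localhost:8880/v1"
    # Leave the key empty if your service does not require one.
    TTS_OPEN_AI_COMPATIBLE_KEY=
    # Use whatever voice identifier your service actually exposes.
    TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=alloy
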
diff --git a/frontend/src/components/TextToSpeech/OpenAiGenericOptions/index.jsx b/frontend/src/components/TextToSpeech/OpenAiGenericOptions/index.jsx
new file mode 100644
index 000000000..2247544cd
--- /dev/null
+++ b/frontend/src/components/TextToSpeech/OpenAiGenericOptions/index.jsx
@@ -0,0 +1,69 @@
+import React from "react";
+
+export default function OpenAiGenericTextToSpeechOptions({ settings }) {
+  return (
+    <div className="flex flex-col gap-y-7">
+      <div className="flex gap-[36px] mt-1.5 flex-wrap">
+        <div className="flex flex-col w-60">
+          <label className="text-white text-sm font-semibold block mb-3">
+            Base URL
+          </label>
+          <input
+            type="url"
+            name="TTSOpenAICompatibleEndpoint"
+            className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+            placeholder="http://localhost:7851/v1"
+            defaultValue={settings?.TTSOpenAICompatibleEndpoint}
+            required={true}
+            autoComplete="off"
+            spellCheck={false}
+          />
+          <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
+            This should be the base URL of the OpenAI compatible TTS service you
+            will generate TTS responses from.
+          </p>
+        </div>
+        <div className="flex flex-col w-60">
+          <label className="text-white text-sm font-semibold block mb-3">
+            API Key
+          </label>
+          <input
+            type="password"
+            name="TTSOpenAICompatibleKey"
+            className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+            placeholder="API Key"
+            defaultValue={settings?.TTSOpenAICompatibleKey ? "*".repeat(20) : ""}
+            autoComplete="off"
+            spellCheck={false}
+          />
+          <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
+            Some TTS services require an API key to generate TTS responses -
+            this is optional if your service does not require one.
+          </p>
+        </div>
+        <div className="flex flex-col w-60">
+          <label className="text-white text-sm font-semibold block mb-3">
+            Voice Model
+          </label>
+          <input
+            type="text"
+            name="TTSOpenAICompatibleVoiceModel"
+            className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+            placeholder="Your voice model identifier"
+            defaultValue={settings?.TTSOpenAICompatibleVoiceModel}
+            required={true}
+            autoComplete="off"
+            spellCheck={false}
+          />
+          <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
+            Most TTS services will have several voice models available; this is
+            the identifier for the voice model you want to use.
+          </p>
+        </div>
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
index 88d063387..31ac70670 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
@@ -23,6 +23,7 @@ export default function TTSMessage({ slug, chatId, message }) {
   switch (provider) {
     case "openai":
+    case "generic-openai":
     case "elevenlabs":
       return <AsyncTTSMessage slug={slug} chatId={chatId} message={message} />;
     case "piper_local":
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx
index f1d7bbe1d..b7da93750 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/index.jsx
@@ -81,11 +81,13 @@ const HistoricalMessage = ({
-          <TTSMessage
-            slug={workspace?.slug}
-            chatId={chatId}
-            message={message}
-          />
+          {role === "assistant" && (
+            <TTSMessage
+              slug={workspace?.slug}
+              chatId={chatId}
+              message={message}
+            />
+          )}
         </div>
         {isEditing ? (
diff --git a/frontend/src/media/ttsproviders/generic-openai.png b/frontend/src/media/ttsproviders/generic-openai.png
new file mode 100644
index 000000000..302f5dbee
Binary files /dev/null and b/frontend/src/media/ttsproviders/generic-openai.png differ
diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
index 0ebab72de..f337dc01d 100644
--- a/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
+++ b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
@@ -8,10 +8,13 @@ import OpenAiLogo from "@/media/llmprovider/openai.png";
 import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png";
 import ElevenLabsIcon from "@/media/ttsproviders/elevenlabs.png";
 import PiperTTSIcon from "@/media/ttsproviders/piper.png";
+import GenericOpenAiLogo from "@/media/ttsproviders/generic-openai.png";
+
 import BrowserNative from "@/components/TextToSpeech/BrowserNative";
 import OpenAiTTSOptions from "@/components/TextToSpeech/OpenAiOptions";
 import ElevenLabsTTSOptions from "@/components/TextToSpeech/ElevenLabsOptions";
 import PiperTTSOptions from "@/components/TextToSpeech/PiperTTSOptions";
+import OpenAiGenericTTSOptions from "@/components/TextToSpeech/OpenAiGenericOptions";
 
 const PROVIDERS = [
   {
@@ -42,6 +45,14 @@ const PROVIDERS = [
     options: (settings) => <PiperTTSOptions settings={settings} />,
     description: "Run TTS models locally in your browser privately.",
   },
+  {
+    name: "OpenAI Compatible",
+    value: "generic-openai",
+    logo: GenericOpenAiLogo,
+    options: (settings) => <OpenAiGenericTTSOptions settings={settings} />,
+    description:
+      "Connect to an OpenAI compatible TTS service running locally or remotely.",
+  },
 ];
 
 export default function TextToSpeechProvider({ settings }) {
diff --git a/server/.env.example b/server/.env.example
index e6a3871d6..3f60b0e5b 100644
--- a/server/.env.example
+++ b/server/.env.example
@@ -213,6 +213,11 @@ TTS_PROVIDER="native"
 # TTS_ELEVEN_LABS_KEY=
 # TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
 
+# TTS_PROVIDER="generic-openai"
+# TTS_OPEN_AI_COMPATIBLE_KEY=sk-example
+# TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=nova
+# TTS_OPEN_AI_COMPATIBLE_ENDPOINT="https://api.openai.com/v1"
+
 # CLOUD DEPLOYMENT VARIRABLES ONLY
 # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
 # STORAGE_DIR= # absolute filesystem path with no trailing slash
diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
index 0c67bf2f4..c69794b48 100644
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@@ -221,12 +221,18 @@
     TextToSpeechProvider: process.env.TTS_PROVIDER || "native",
     TTSOpenAIKey: !!process.env.TTS_OPEN_AI_KEY,
     TTSOpenAIVoiceModel: process.env.TTS_OPEN_AI_VOICE_MODEL,
+
     // Eleven Labs TTS
     TTSElevenLabsKey: !!process.env.TTS_ELEVEN_LABS_KEY,
     TTSElevenLabsVoiceModel: process.env.TTS_ELEVEN_LABS_VOICE_MODEL,
     // Piper TTS
     TTSPiperTTSVoiceModel:
       process.env.TTS_PIPER_VOICE_MODEL ?? "en_US-hfc_female-medium",
+    // OpenAI Generic TTS
+    TTSOpenAICompatibleKey: !!process.env.TTS_OPEN_AI_COMPATIBLE_KEY,
+    TTSOpenAICompatibleVoiceModel:
+      process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL,
+    TTSOpenAICompatibleEndpoint: process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT,
 
     // --------------------------------------------------------
     // Agent Settings & Configs
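Worth noting in the systemSettings.js hunk above: the API key is coerced with `!!`, so the settings UI only ever learns whether a key is set, while the endpoint and voice model are returned verbatim. A rough sketch of the shape the frontend receives for these fields (the values are illustrative, not real output):

    // Hypothetical settings payload for the generic-openai TTS fields.
    const ttsSettingsPreview = {
      TTSOpenAICompatibleKey: true, // only "a key exists"; the secret itself never leaves the server
      TTSOpenAICompatibleVoiceModel: "nova",
      TTSOpenAICompatibleEndpoint: "https://api.openai.com/v1",
    };
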
"en_US-hfc_female-medium", + // OpenAI Generic TTS + TTSOpenAICompatibleKey: !!process.env.TTS_OPEN_AI_COMPATIBLE_KEY, + TTSOpenAICompatibleVoiceModel: + process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL, + TTSOpenAICompatibleEndpoint: process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT, // -------------------------------------------------------- // Agent Settings & Configs diff --git a/server/utils/TextToSpeech/index.js b/server/utils/TextToSpeech/index.js index 155fc9540..5ed5684de 100644 --- a/server/utils/TextToSpeech/index.js +++ b/server/utils/TextToSpeech/index.js @@ -7,6 +7,9 @@ function getTTSProvider() { case "elevenlabs": const { ElevenLabsTTS } = require("./elevenLabs"); return new ElevenLabsTTS(); + case "generic-openai": + const { GenericOpenAiTTS } = require("./openAiGeneric"); + return new GenericOpenAiTTS(); default: throw new Error("ENV: No TTS_PROVIDER value found in environment!"); } diff --git a/server/utils/TextToSpeech/openAiGeneric/index.js b/server/utils/TextToSpeech/openAiGeneric/index.js new file mode 100644 index 000000000..df39e6348 --- /dev/null +++ b/server/utils/TextToSpeech/openAiGeneric/index.js @@ -0,0 +1,50 @@ +class GenericOpenAiTTS { + constructor() { + if (!process.env.TTS_OPEN_AI_COMPATIBLE_KEY) + this.#log( + "No OpenAI compatible API key was set. You might need to set this to use your OpenAI compatible TTS service." + ); + if (!process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL) + this.#log( + "No OpenAI compatible voice model was set. We will use the default voice model 'alloy'. This may not exist for your selected endpoint." + ); + if (!process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT) + throw new Error( + "No OpenAI compatible endpoint was set. Please set this to use your OpenAI compatible TTS service." + ); + + const { OpenAI: OpenAIApi } = require("openai"); + this.openai = new OpenAIApi({ + apiKey: process.env.TTS_OPEN_AI_COMPATIBLE_KEY || null, + baseURL: process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT, + }); + this.voice = process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL ?? "alloy"; + } + + #log(text, ...args) { + console.log(`\x1b[32m[OpenAiGenericTTS]\x1b[0m ${text}`, ...args); + } + + /** + * Generates a buffer from the given text input using the OpenAI compatible TTS service. + * @param {string} textInput - The text to be converted to audio. + * @returns {Promise} A buffer containing the audio data. + */ + async ttsBuffer(textInput) { + try { + const result = await this.openai.audio.speech.create({ + model: "tts-1", + voice: this.voice, + input: textInput, + }); + return Buffer.from(await result.arrayBuffer()); + } catch (e) { + console.error(e); + } + return null; + } +} + +module.exports = { + GenericOpenAiTTS, +}; diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index 160e85d44..294214a0b 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -506,6 +506,20 @@ const KEY_MAPPING = { checks: [], }, + // OpenAI Generic TTS + TTSOpenAICompatibleKey: { + envKey: "TTS_OPEN_AI_COMPATIBLE_KEY", + checks: [], + }, + TTSOpenAICompatibleVoiceModel: { + envKey: "TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL", + checks: [isNotEmpty], + }, + TTSOpenAICompatibleEndpoint: { + envKey: "TTS_OPEN_AI_COMPATIBLE_ENDPOINT", + checks: [isValidURL], + }, + // DeepSeek Options DeepSeekApiKey: { envKey: "DEEPSEEK_API_KEY", @@ -589,6 +603,7 @@ function supportedTTSProvider(input = "") { "openai", "elevenlabs", "piper_local", + "generic-openai", ].includes(input); return validSelection ? 
diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
index 160e85d44..294214a0b 100644
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -506,6 +506,20 @@ const KEY_MAPPING = {
     checks: [],
   },
+
+  // OpenAI Generic TTS
+  TTSOpenAICompatibleKey: {
+    envKey: "TTS_OPEN_AI_COMPATIBLE_KEY",
+    checks: [],
+  },
+  TTSOpenAICompatibleVoiceModel: {
+    envKey: "TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL",
+    checks: [isNotEmpty],
+  },
+  TTSOpenAICompatibleEndpoint: {
+    envKey: "TTS_OPEN_AI_COMPATIBLE_ENDPOINT",
+    checks: [isValidURL],
+  },
 
   // DeepSeek Options
   DeepSeekApiKey: {
     envKey: "DEEPSEEK_API_KEY",
     checks: [isNotEmpty],
@@ -589,6 +603,7 @@ function supportedTTSProvider(input = "") {
     "openai",
     "elevenlabs",
     "piper_local",
+    "generic-openai",
   ].includes(input);
   return validSelection ? null : `${input} is not a valid TTS provider.`;
 }
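Finally, for anyone implementing or debugging a compatible backend, the request the openai client issues for speech generation boils down to a single POST against {baseURL}/audio/speech. The following is a hedged, dependency-free sketch of that call using Node 18+'s built-in fetch; it is not code from this change, and the field names simply follow the public OpenAI audio API that a compatible server is expected to accept.

    // Rough sketch of the HTTP call made under the hood for speech generation.
    const endpoint = process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT; // e.g. "http://localhost:8880/v1" (placeholder)
    const apiKey = process.env.TTS_OPEN_AI_COMPATIBLE_KEY || "";

    async function speak(text) {
      const response = await fetch(`${endpoint}/audio/speech`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          ...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}),
        },
        body: JSON.stringify({
          model: "tts-1", // hardcoded in GenericOpenAiTTS above
          voice: process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL ?? "alloy",
          input: text,
        }),
      });
      if (!response.ok) throw new Error(`TTS request failed: ${response.status}`);
      // The response body is raw audio bytes (mp3 by default on OpenAI's service).
      return Buffer.from(await response.arrayBuffer());
    }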