TTS OpenAI-compatible endpoints (#2487)

* Update OpenAI TTS config to allow a custom BaseURL

* Uncheck config file

* Break generic OpenAI TTS into its own provider

* Add space

* Hide TTS on user messages

---------

Co-authored-by: Adam <phazei@gmail.com>
Timothy Carambat 2024-10-15 21:39:31 -07:00 committed by GitHub
parent fa528e0cf3
commit 3dc0f3f490
11 changed files with 172 additions and 5 deletions

@@ -219,6 +219,11 @@ GID='1000'
# TTS_OPEN_AI_KEY=sk-example
# TTS_OPEN_AI_VOICE_MODEL=nova
# TTS_PROVIDER="generic-openai"
# TTS_OPEN_AI_COMPATIBLE_KEY=sk-example
# TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=nova
# TTS_OPEN_AI_COMPATIBLE_ENDPOINT="https://api.openai.com/v1"
# TTS_PROVIDER="elevenlabs"
# TTS_ELEVEN_LABS_KEY=
# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
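
Taken together, switching to this provider means uncommenting the generic-openai block as a group; a minimal illustrative example (the key is optional for services that do not require one, and the localhost endpoint is only a placeholder, matching the settings UI hint further down):

TTS_PROVIDER="generic-openai"
TTS_OPEN_AI_COMPATIBLE_KEY=sk-example
TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=nova
TTS_OPEN_AI_COMPATIBLE_ENDPOINT="http://localhost:7851/v1"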

@@ -0,0 +1,69 @@
import React from "react";
export default function OpenAiGenericTextToSpeechOptions({ settings }) {
return (
<div className="w-full flex flex-col gap-y-7">
<div className="flex gap-x-4">
<div className="flex flex-col w-60">
<div className="flex justify-between items-center mb-2">
<label className="text-white text-sm font-semibold">Base URL</label>
</div>
<input
type="url"
name="TTSOpenAICompatibleEndpoint"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="http://localhost:7851/v1"
defaultValue={settings?.TTSOpenAICompatibleEndpoint}
required={false}
autoComplete="off"
spellCheck={false}
/>
<p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
This should be the base URL of the OpenAI compatible TTS service you
will generate TTS responses from.
</p>
</div>
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-3">
API Key
</label>
<input
type="password"
name="TTSOpenAICompatibleKey"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="API Key"
defaultValue={
settings?.TTSOpenAICompatibleKey ? "*".repeat(20) : ""
}
autoComplete="off"
spellCheck={false}
/>
<p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
Some TTS services require an API key to generate TTS responses -
this is optional if your service does not require one.
</p>
</div>
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-3">
Voice Model
</label>
<input
type="text"
name="TTSOpenAICompatibleVoiceModel"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="Your voice model identifier"
defaultValue={settings?.TTSOpenAICompatibleVoiceModel}
required={true}
autoComplete="off"
spellCheck={false}
/>
<p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
Most TTS services have several voice models available; this is
the identifier for the voice model you want to use.
</p>
</div>
</div>
</div>
);
}

@@ -23,6 +23,7 @@ export default function TTSMessage({ slug, chatId, message }) {
switch (provider) {
case "openai":
case "generic-openai":
case "elevenlabs":
return <AsyncTTSMessage slug={slug} chatId={chatId} />;
case "piper_local":

@@ -81,11 +81,13 @@ const HistoricalMessage = ({
<div className="flex flex-col items-center">
<ProfileImage role={role} workspace={workspace} />
<div className="mt-1 -mb-10">
{role === "assistant" && (
<TTSMessage
slug={workspace?.slug}
chatId={chatId}
message={message}
/>
)}
</div>
</div>
{isEditing ? (

Binary file not shown (new image added, 29 KiB).

@@ -8,10 +8,13 @@ import OpenAiLogo from "@/media/llmprovider/openai.png";
import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png";
import ElevenLabsIcon from "@/media/ttsproviders/elevenlabs.png";
import PiperTTSIcon from "@/media/ttsproviders/piper.png";
import GenericOpenAiLogo from "@/media/ttsproviders/generic-openai.png";
import BrowserNative from "@/components/TextToSpeech/BrowserNative";
import OpenAiTTSOptions from "@/components/TextToSpeech/OpenAiOptions";
import ElevenLabsTTSOptions from "@/components/TextToSpeech/ElevenLabsOptions";
import PiperTTSOptions from "@/components/TextToSpeech/PiperTTSOptions";
import OpenAiGenericTTSOptions from "@/components/TextToSpeech/OpenAiGenericOptions";
const PROVIDERS = [
{
@@ -42,6 +45,14 @@ const PROVIDERS = [
options: (settings) => <PiperTTSOptions settings={settings} />,
description: "Run TTS models locally in your browser privately.",
},
{
name: "OpenAI Compatible",
value: "generic-openai",
logo: GenericOpenAiLogo,
options: (settings) => <OpenAiGenericTTSOptions settings={settings} />,
description:
"Connect to an OpenAI compatible TTS service running locally or remotely.",
},
];
export default function TextToSpeechProvider({ settings }) {

@@ -213,6 +213,11 @@ TTS_PROVIDER="native"
# TTS_ELEVEN_LABS_KEY=
# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
# TTS_PROVIDER="generic-openai"
# TTS_OPEN_AI_COMPATIBLE_KEY=sk-example
# TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL=nova
# TTS_OPEN_AI_COMPATIBLE_ENDPOINT="https://api.openai.com/v1"
# CLOUD DEPLOYMENT VARIABLES ONLY
# AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
# STORAGE_DIR= # absolute filesystem path with no trailing slash

@@ -221,12 +221,18 @@ const SystemSettings = {
TextToSpeechProvider: process.env.TTS_PROVIDER || "native",
TTSOpenAIKey: !!process.env.TTS_OPEN_AI_KEY,
TTSOpenAIVoiceModel: process.env.TTS_OPEN_AI_VOICE_MODEL,
// Eleven Labs TTS
TTSElevenLabsKey: !!process.env.TTS_ELEVEN_LABS_KEY,
TTSElevenLabsVoiceModel: process.env.TTS_ELEVEN_LABS_VOICE_MODEL,
// Piper TTS
TTSPiperTTSVoiceModel:
process.env.TTS_PIPER_VOICE_MODEL ?? "en_US-hfc_female-medium",
// OpenAI Generic TTS
TTSOpenAICompatibleKey: !!process.env.TTS_OPEN_AI_COMPATIBLE_KEY,
TTSOpenAICompatibleVoiceModel:
process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL,
TTSOpenAICompatibleEndpoint: process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT,
// --------------------------------------------------------
// Agent Settings & Configs

@@ -7,6 +7,9 @@ function getTTSProvider() {
case "elevenlabs":
const { ElevenLabsTTS } = require("./elevenLabs");
return new ElevenLabsTTS();
case "generic-openai":
const { GenericOpenAiTTS } = require("./openAiGeneric");
return new GenericOpenAiTTS();
default:
throw new Error("ENV: No TTS_PROVIDER value found in environment!");
}
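
As with the existing providers, the new case is reached purely through the TTS_PROVIDER environment variable (the default-case error above implies the switch reads it); a hedged sketch of the selection, using placeholder values:

// Sketch only: with TTS_PROVIDER="generic-openai" the factory returns the new provider.
process.env.TTS_PROVIDER = "generic-openai";
process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT = "http://localhost:7851/v1"; // required, or the constructor throws
const provider = getTTSProvider();
console.log(provider.constructor.name); // "GenericOpenAiTTS"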

@@ -0,0 +1,50 @@
class GenericOpenAiTTS {
constructor() {
if (!process.env.TTS_OPEN_AI_COMPATIBLE_KEY)
this.#log(
"No OpenAI compatible API key was set. You might need to set this to use your OpenAI compatible TTS service."
);
if (!process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL)
this.#log(
"No OpenAI compatible voice model was set. We will use the default voice model 'alloy'. This may not exist for your selected endpoint."
);
if (!process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT)
throw new Error(
"No OpenAI compatible endpoint was set. Please set this to use your OpenAI compatible TTS service."
);
const { OpenAI: OpenAIApi } = require("openai");
this.openai = new OpenAIApi({
apiKey: process.env.TTS_OPEN_AI_COMPATIBLE_KEY || null,
baseURL: process.env.TTS_OPEN_AI_COMPATIBLE_ENDPOINT,
});
this.voice = process.env.TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL ?? "alloy";
}
#log(text, ...args) {
console.log(`\x1b[32m[OpenAiGenericTTS]\x1b[0m ${text}`, ...args);
}
/**
* Generates a buffer from the given text input using the OpenAI compatible TTS service.
* @param {string} textInput - The text to be converted to audio.
* @returns {Promise<Buffer>} A buffer containing the audio data.
*/
async ttsBuffer(textInput) {
try {
const result = await this.openai.audio.speech.create({
model: "tts-1",
voice: this.voice,
input: textInput,
});
return Buffer.from(await result.arrayBuffer());
} catch (e) {
console.error(e);
}
return null;
}
}
module.exports = {
GenericOpenAiTTS,
};
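
For reference, a minimal sketch of exercising the provider directly and saving the result to disk; the input text and output filename are illustrative, and the three TTS_OPEN_AI_COMPATIBLE_* variables are assumed to be set:

// Sketch: generate speech with GenericOpenAiTTS and write the audio buffer to a file.
const fs = require("fs");
const { GenericOpenAiTTS } = require("./openAiGeneric");

(async () => {
  const tts = new GenericOpenAiTTS();
  const buffer = await tts.ttsBuffer("Hello from AnythingLLM!");
  if (buffer) fs.writeFileSync("tts-output.mp3", buffer); // ttsBuffer returns null if the request failed
})();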

@@ -506,6 +506,20 @@ const KEY_MAPPING = {
checks: [],
},
// OpenAI Generic TTS
TTSOpenAICompatibleKey: {
envKey: "TTS_OPEN_AI_COMPATIBLE_KEY",
checks: [],
},
TTSOpenAICompatibleVoiceModel: {
envKey: "TTS_OPEN_AI_COMPATIBLE_VOICE_MODEL",
checks: [isNotEmpty],
},
TTSOpenAICompatibleEndpoint: {
envKey: "TTS_OPEN_AI_COMPATIBLE_ENDPOINT",
checks: [isValidURL],
},
// DeepSeek Options
DeepSeekApiKey: {
envKey: "DEEPSEEK_API_KEY",
@@ -589,6 +603,7 @@ function supportedTTSProvider(input = "") {
"openai",
"elevenlabs",
"piper_local",
"generic-openai",
].includes(input);
return validSelection ? null : `${input} is not a valid TTS provider.`;
}
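
The checks arrays above refer to validator helpers that already exist in updateENV.js and are not part of this diff; conceptually each returns null when the value passes and an error string otherwise, roughly like this sketch (assumed behavior, not the repo's exact implementations):

// Sketch of the validator contract assumed by the KEY_MAPPING checks above.
function isNotEmpty(input = "") {
  return !input || !input.length ? "Value cannot be empty" : null;
}

function isValidURL(input = "") {
  try {
    new URL(input); // throws on malformed URLs
    return null;
  } catch {
    return "URL is not a valid URL.";
  }
}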