Add Speech-to-text and Text-to-speech providers (#1394)

* Add Speech-to-text and Text-to-speech providers * add files and update comment * update comments * patch: bad playerRef check
2024-11-10 17:00:11 +01:00 · 2024-05-14 11:57:21 -07:00 · 2024-05-14 11:57:21 -07:00 · b6be43be95
commit b6be43be95
parent d71db22799
33 changed files with 1234 additions and 68 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -11,6 +11,7 @@
    "cooldowns",
    "Deduplicator",
    "Dockerized",
    "elevenlabs",
    "Embeddable",
    "epub",
    "GROQ",
--- a/docker/.env.example
+++ b/docker/.env.example
@ -171,6 +171,19 @@ GID='1000'
 # WHISPER_PROVIDER="openai"
 # OPEN_AI_KEY=sk-xxxxxxxx
 ###########################################
 ######## TTS/STT Model Selection ##########
 ###########################################
 # TTS_PROVIDER="native"
 # TTS_PROVIDER="openai"
 # TTS_OPEN_AI_KEY=sk-example
 # TTS_OPEN_AI_VOICE_MODEL=nova
 # TTS_PROVIDER="elevenlabs"
 # TTS_ELEVEN_LABS_KEY=
 # TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
 # CLOUD DEPLOYMENT VARIABLES ONLY
 # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
 # DISABLE_TELEMETRY="false"
--- a/frontend/package.json
+++ b/frontend/package.json
@ -28,6 +28,7 @@
    "react-dropzone": "^14.2.3",
    "react-loading-skeleton": "^3.1.0",
    "react-router-dom": "^6.3.0",
    "react-speech-recognition": "^3.10.0",
    "react-tag-input-component": "^2.0.2",
    "react-toastify": "^9.1.3",
    "react-tooltip": "^5.25.2",
--- a/frontend/src/App.jsx
+++ b/frontend/src/App.jsx
@ -32,6 +32,9 @@ const GeneralLLMPreference = lazy(
 const GeneralTranscriptionPreference = lazy(
  () => import("@/pages/GeneralSettings/TranscriptionPreference")
 );
 // Lazily imported page component for the audio preference settings screen.
 const GeneralAudioPreference = lazy(
  () => import("@/pages/GeneralSettings/AudioPreference")
 );
 const GeneralEmbeddingPreference = lazy(
  () => import("@/pages/GeneralSettings/EmbeddingPreference")
 );
@ -85,6 +88,10 @@ export default function App() {
                  <AdminRoute Component={GeneralTranscriptionPreference} />
                }
              />
              <Route
                path="/settings/audio-preference"
                element={<AdminRoute Component={GeneralAudioPreference} />}
              />
              <Route
                path="/settings/embedding-preference"
                element={<AdminRoute Component={GeneralEmbeddingPreference} />}
--- a/frontend/src/components/SettingsSidebar/index.jsx
+++ b/frontend/src/components/SettingsSidebar/index.jsx
@ -21,6 +21,7 @@ import {
  ClosedCaptioning,
  EyeSlash,
  SplitVertical,
  Microphone,
 } from "@phosphor-icons/react";
 import useUser from "@/hooks/useUser";
 import { USER_BACKGROUND_COLOR } from "@/utils/constants";
@ -280,6 +281,14 @@ const SidebarOptions = ({ user = null }) => (
      flex={true}
      allowedRole={["admin"]}
    />
    <Option
      href={paths.settings.audioPreference()}
      btnText="Voice and Speech Support"
      icon={<Microphone className="h-5 w-5 flex-shrink-0" />}
      user={user}
      flex={true}
      allowedRole={["admin"]}
    />
    <Option
      href={paths.settings.transcriptionPreference()}
      btnText="Transcription Model"
--- a/frontend/src/components/SpeechToText/BrowserNative/index.jsx
+++ b/frontend/src/components/SpeechToText/BrowserNative/index.jsx
@ -0,0 +1,9 @@
 export default function BrowserNative() {
  return (
    <div className="w-full h-10 items-center flex">
      <p className="text-sm font-base text-white text-opacity-60">
        There is no configuration needed for this provider.
      </p>
    </div>
  );
 }
--- a/frontend/src/components/TextToSpeech/BrowserNative/index.jsx
+++ b/frontend/src/components/TextToSpeech/BrowserNative/index.jsx
@ -0,0 +1,9 @@
 export default function BrowserNative() {
  return (
    <div className="w-full h-10 items-center flex">
      <p className="text-sm font-base text-white text-opacity-60">
        There is no configuration needed for this provider.
      </p>
    </div>
  );
 }
--- a/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx
+++ b/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx
@ -0,0 +1,107 @@
 import { useState, useEffect } from "react";
 import System from "@/models/system";
 export default function ElevenLabsOptions({ settings }) {
  const [inputValue, setInputValue] = useState(settings?.TTSElevenLabsKey);
  const [openAIKey, setOpenAIKey] = useState(settings?.TTSElevenLabsKey);
  return (
    <div className="flex gap-x-4">
      <div className="flex flex-col w-60">
        <label className="text-white text-sm font-semibold block mb-4">
          API Key
        </label>
        <input
          type="password"
          name="TTSElevenLabsKey"
          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
          placeholder="ElevenLabs API Key"
          defaultValue={settings?.TTSElevenLabsKey ? "*".repeat(20) : ""}
          required={true}
          autoComplete="off"
          spellCheck={false}
          onChange={(e) => setInputValue(e.target.value)}
          onBlur={() => setOpenAIKey(inputValue)}
        />
      </div>
      {!settings?.credentialsOnly && (
        <ElevenLabsModelSelection settings={settings} apiKey={openAIKey} />
      )}
    </div>
  );
 }
 function ElevenLabsModelSelection({ apiKey, settings }) {
  const [groupedModels, setGroupedModels] = useState({});
  const [loading, setLoading] = useState(true);
  useEffect(() => {
    async function findCustomModels() {
      setLoading(true);
      const { models } = await System.customModels(
        "elevenlabs-tts",
        typeof apiKey === "boolean" ? null : apiKey
      );
      if (models?.length > 0) {
        const modelsByOrganization = models.reduce((acc, model) => {
          acc[model.organization] = acc[model.organization] || [];
          acc[model.organization].push(model);
          return acc;
        }, {});
        setGroupedModels(modelsByOrganization);
      }
      setLoading(false);
    }
    findCustomModels();
  }, [apiKey]);
  if (loading) {
    return (
      <div className="flex flex-col w-60">
        <label className="text-white text-sm font-semibold block mb-4">
          Chat Model Selection
        </label>
        <select
          name="TTSElevenLabsVoiceModel"
          disabled={true}
          className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
        >
          <option disabled={true} selected={true}>
            -- loading available models --
          </option>
        </select>
      </div>
    );
  }
  return (
    <div className="flex flex-col w-60">
      <label className="text-white text-sm font-semibold block mb-4">
        Chat Model Selection
      </label>
      <select
        name="TTSElevenLabsVoiceModel"
        required={true}
        className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
      >
        {Object.keys(groupedModels)
          .sort()
          .map((organization) => (
            <optgroup key={organization} label={organization}>
              {groupedModels[organization].map((model) => (
                <option
                  key={model.id}
                  value={model.id}
                  selected={settings?.OpenAiModelPref === model.id}
                >
                  {model.name}
                </option>
              ))}
            </optgroup>
          ))}
      </select>
    </div>
  );
 }
--- a/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx
+++ b/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx
@ -0,0 +1,45 @@
 /**
  * Upper-cases the first character and lower-cases the rest of every run that
  * begins at a word character and extends to the next whitespace.
  *
  * @param {string} string - text to convert.
  * @returns {string} the converted text.
  */
 function toProperCase(string) {
  const capitalize = (word) =>
    `${word[0].toUpperCase()}${word.slice(1).toLowerCase()}`;
  // Wrapped so only the matched text is forwarded, not the offset arguments.
  return string.replace(/\w\S*/g, (match) => capitalize(match));
 }
 export default function OpenAiTextToSpeechOptions({ settings }) {
  const apiKey = settings?.TTSOpenAIKey;
  return (
    <div className="flex gap-x-4">
      <div className="flex flex-col w-60">
        <label className="text-white text-sm font-semibold block mb-4">
          API Key
        </label>
        <input
          type="password"
          name="TTSOpenAIKey"
          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
          placeholder="OpenAI API Key"
          defaultValue={apiKey ? "*".repeat(20) : ""}
          required={true}
          autoComplete="off"
          spellCheck={false}
        />
      </div>
      <div className="flex flex-col w-60">
        <label className="text-white text-sm font-semibold block mb-4">
          Voice Model
        </label>
        <select
          name="TTSOpenAIVoiceModel"
          defaultValue={settings?.TTSOpenAIVoiceModel ?? "alloy"}
          className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
        >
          {["alloy", "echo", "fable", "onyx", "nova", "shimmer"].map(
            (voice) => {
              return <option value={voice}>{toProperCase(voice)}</option>;
            }
          )}
        </select>
      </div>
    </div>
  );
 }
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx
@ -0,0 +1,94 @@
 import { useEffect, useState, useRef } from "react";
 import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react";
 import { Tooltip } from "react-tooltip";
 import Workspace from "@/models/workspace";
 import showToast from "@/utils/toast";
 export default function AsyncTTSMessage({ slug, chatId }) {
  const playerRef = useRef(null);
  const [speaking, setSpeaking] = useState(false);
  const [loading, setLoading] = useState(false);
  const [audioSrc, setAudioSrc] = useState(null);
  function speakMessage() {
    if (speaking) {
      playerRef?.current?.pause();
      return;
    }
    try {
      if (!audioSrc) {
        setLoading(true);
        Workspace.ttsMessage(slug, chatId)
          .then((audioBlob) => {
            if (!audioBlob)
              throw new Error("Failed to load or play TTS message response.");
            setAudioSrc(audioBlob);
          })
          .catch((e) => showToast(e.message, "error", { clear: true }))
          .finally(() => setLoading(false));
      } else {
        playerRef.current.play();
      }
    } catch (e) {
      console.error(e);
      setLoading(false);
      setSpeaking(false);
    }
  }
  useEffect(() => {
    function setupPlayer() {
      if (!playerRef?.current) return;
      playerRef.current.addEventListener("play", () => {
        setSpeaking(true);
      });
      playerRef.current.addEventListener("pause", () => {
        playerRef.current.currentTime = 0;
        setSpeaking(false);
      });
    }
    setupPlayer();
  }, []);
  if (!chatId) return null;
  return (
    <div className="mt-3 relative">
      <button
        onClick={speakMessage}
        data-tooltip-id="message-to-speech"
        data-tooltip-content={
          speaking ? "Pause TTS speech of message" : "TTS Speak message"
        }
        className="border-none text-zinc-300"
        aria-label={speaking ? "Pause speech" : "Speak message"}
      >
        {speaking ? (
          <PauseCircle size={18} className="mb-1" />
        ) : (
          <>
            {loading ? (
              <CircleNotch size={18} className="mb-1 animate-spin" />
            ) : (
              <SpeakerHigh size={18} className="mb-1" />
            )}
          </>
        )}
        <audio
          ref={playerRef}
          hidden={true}
          src={audioSrc}
          autoPlay={true}
          controls={false}
        />
      </button>
      <Tooltip
        id="message-to-speech"
        place="bottom"
        delayShow={300}
        className="tooltip !text-xs"
      />
    </div>
  );
 }
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
@ -0,0 +1,23 @@
 import { useEffect, useState } from "react";
 import NativeTTSMessage from "./native";
 import AsyncTTSMessage from "./asyncTts";
 import System from "@/models/system";
 export default function TTSMessage({ slug, chatId, message }) {
  const [provider, setProvider] = useState("native");
  const [loading, setLoading] = useState(true);
  useEffect(() => {
    async function getSettings() {
      const _settings = await System.keys();
      setProvider(_settings?.TextToSpeechProvider ?? "native");
      setLoading(false);
    }
    getSettings();
  }, []);
  if (loading) return null;
  if (provider !== "native")
    return <AsyncTTSMessage slug={slug} chatId={chatId} />;
  return <NativeTTSMessage message={message} />;
 }
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx
@ -0,0 +1,61 @@
 import React, { useEffect, useState } from "react";
 import { SpeakerHigh, PauseCircle } from "@phosphor-icons/react";
 import { Tooltip } from "react-tooltip";
 export default function NativeTTSMessage({ message }) {
  const [speaking, setSpeaking] = useState(false);
  const [supported, setSupported] = useState(false);
  useEffect(() => {
    setSupported("speechSynthesis" in window);
  }, []);
  function endSpeechUtterance() {
    window.speechSynthesis?.cancel();
    setSpeaking(false);
    return;
  }
  function speakMessage() {
    // if the user is pausing this particular message
    // while the synth is speaking we can end it.
    // If they are clicking another message's TTS
    // we need to ignore that until they pause the one that is playing.
    if (window.speechSynthesis.speaking && speaking) {
      endSpeechUtterance();
      return;
    }
    if (window.speechSynthesis.speaking && !speaking) return;
    const utterance = new SpeechSynthesisUtterance(message);
    utterance.addEventListener("end", endSpeechUtterance);
    window.speechSynthesis.speak(utterance);
    setSpeaking(true);
  }
  if (!supported) return null;
  return (
    <div className="mt-3 relative">
      <button
        onClick={speakMessage}
        data-tooltip-id="message-to-speech"
        data-tooltip-content={
          speaking ? "Pause TTS speech of message" : "TTS Speak message"
        }
        className="border-none text-zinc-300"
        aria-label={speaking ? "Pause speech" : "Speak message"}
      >
        {speaking ? (
          <PauseCircle size={18} className="mb-1" />
        ) : (
          <SpeakerHigh size={18} className="mb-1" />
        )}
      </button>
      <Tooltip
        id="message-to-speech"
        place="bottom"
        delayShow={300}
        className="tooltip !text-xs"
      />
    </div>
  );
 }
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx
@ -1,4 +1,4 @@
-import React, { memo, useEffect, useState } from "react";
+import React, { memo, useState } from "react";
 import useCopyText from "@/hooks/useCopyText";
 import {
  Check,
@ -6,11 +6,10 @@ import {
  ThumbsUp,
  ThumbsDown,
  ArrowsClockwise,
  SpeakerHigh,
  PauseCircle,
 } from "@phosphor-icons/react";
 import { Tooltip } from "react-tooltip";
 import Workspace from "@/models/workspace";
 import TTSMessage from "./TTSButton";
 const Actions = ({
  message,
@ -60,7 +59,7 @@ const Actions = ({
          </>
        )}
      </div>
-      <TTSMessage message={message} />
+      <TTSMessage slug={slug} chatId={chatId} message={message} />
    </div>
  );
 };
@ -149,62 +148,4 @@ function RegenerateMessage({ regenerateMessage, chatId }) {
  );
 }
 // Previous inline TTS button of this file, shown on the removed side of this
 // hunk (62 lines shrink to 4). The same change imports TTSMessage from
 // "./TTSButton" instead. It speaks `message` through window.speechSynthesis
 // and renders nothing when that API is unavailable.
 function TTSMessage({ message }) {
  const [speaking, setSpeaking] = useState(false);
  const [supported, setSupported] = useState(false);
  useEffect(() => {
    setSupported("speechSynthesis" in window);
  }, []);
  // Cancels whatever the synth is speaking and resets this button's state.
  function endSpeechUtterance() {
    window.speechSynthesis?.cancel();
    setSpeaking(false);
    return;
  }
  function speakMessage() {
    // if the user is pausing this particular message
    // while the synth is speaking we can end it.
    // If they are clicking another message's TTS
    // we need to ignore that until they pause the one that is playing.
    if (window.speechSynthesis.speaking && speaking) {
      endSpeechUtterance();
      return;
    }
    if (window.speechSynthesis.speaking && !speaking) return;
    const utterance = new SpeechSynthesisUtterance(message);
    utterance.addEventListener("end", endSpeechUtterance);
    window.speechSynthesis.speak(utterance);
    setSpeaking(true);
  }
  if (!supported) return null;
  return (
    <div className="mt-3 relative">
      <button
        onClick={speakMessage}
        data-tooltip-id="message-to-speech"
        data-tooltip-content={
          speaking ? "Pause TTS speech of message" : "TTS Speak message"
        }
        className="border-none text-zinc-300"
        aria-label={speaking ? "Pause speech" : "Speak message"}
      >
        {speaking ? (
          <PauseCircle size={18} className="mb-1" />
        ) : (
          <SpeakerHigh size={18} className="mb-1" />
        )}
      </button>
      <Tooltip
        id="message-to-speech"
        place="bottom"
        delayShow={300}
        className="tooltip !text-xs"
      />
    </div>
  );
 }
 export default memo(Actions);
--- a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx
@ -0,0 +1,82 @@
 import { useEffect } from "react";
 import { Microphone } from "@phosphor-icons/react";
 import { Tooltip } from "react-tooltip";
 import _regeneratorRuntime from "regenerator-runtime";
 import SpeechRecognition, {
  useSpeechRecognition,
 } from "react-speech-recognition";
 let timeout;
 const SILENCE_INTERVAL = 3_200; // wait in seconds of silence before closing.
 export default function SpeechToText({ sendCommand }) {
  const {
    transcript,
    listening,
    resetTranscript,
    browserSupportsSpeechRecognition,
    browserSupportsContinuousListening,
    isMicrophoneAvailable,
  } = useSpeechRecognition({
    clearTranscriptOnListen: true,
  });
  function startSTTSession() {
    if (!isMicrophoneAvailable) {
      alert(
        "AnythingLLM does not have access to microphone. Please enable for this site to use this feature."
      );
      return;
    }
    resetTranscript();
    SpeechRecognition.startListening({
      continuous: browserSupportsContinuousListening,
      language: window?.navigator?.language ?? "en-US",
    });
  }
  function endTTSSession() {
    SpeechRecognition.stopListening();
    if (transcript.length > 0) {
      sendCommand(transcript, true);
    }
    resetTranscript();
    clearTimeout(timeout);
  }
  useEffect(() => {
    if (transcript?.length > 0) {
      sendCommand(transcript, false);
      clearTimeout(timeout);
      timeout = setTimeout(() => {
        endTTSSession();
      }, SILENCE_INTERVAL);
    }
  }, [transcript]);
  if (!browserSupportsSpeechRecognition) return null;
  return (
    <div
      id="text-size-btn"
      data-tooltip-id="tooltip-text-size-btn"
      data-tooltip-content="Speak your prompt"
      aria-label="Speak your prompt"
      onClick={listening ? endTTSSession : startSTTSession}
      className={`relative flex justify-center items-center opacity-60 hover:opacity-100 cursor-pointer ${
        !!listening ? "!opacity-100" : ""
      }`}
    >
      <Microphone
        weight="fill"
        className="w-6 h-6 pointer-events-none text-white"
      />
      <Tooltip
        id="tooltip-text-size-btn"
        place="top"
        delayShow={300}
        className="tooltip !text-xs z-99"
      />
    </div>
  );
 }
--- a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx
@ -12,6 +12,7 @@ import AvailableAgentsButton, {
  useAvailableAgents,
 } from "./AgentMenu";
 import TextSizeButton from "./TextSizeMenu";
 import SpeechToText from "./SpeechToText";
 export const PROMPT_INPUT_EVENT = "set_prompt_input";
 export default function PromptInput({
@ -34,6 +35,7 @@ export default function PromptInput({
  function handlePromptUpdate(e) {
    setPromptInput(e?.detail ?? "");
  }
  useEffect(() => {
    if (!!window)
      window.addEventListener(PROMPT_INPUT_EVENT, handlePromptUpdate);
@ -156,6 +158,9 @@ export default function PromptInput({
                />
                <TextSizeButton />
              </div>
              <div className="flex gap-x-2">
                <SpeechToText sendCommand={sendCommand} />
              </div>
            </div>
          </div>
        </div>
--- a/frontend/src/media/ttsproviders/elevenlabs.png
+++ b/frontend/src/media/ttsproviders/elevenlabs.png
--- a/frontend/src/models/system.js
+++ b/frontend/src/models/system.js
@ -332,7 +332,7 @@ const System = {
      })
      .then((blob) => (blob ? URL.createObjectURL(blob) : null))
      .catch((e) => {
-        console.log(e);
+        // console.log(e);
        return null;
      });
  },
--- a/frontend/src/models/workspace.js
+++ b/frontend/src/models/workspace.js
@ -272,6 +272,21 @@ const Workspace = {
        return false;
      });
  },
  ttsMessage: async function (slug, chatId) {
    return await fetch(`${API_BASE}/workspace/${slug}/tts/${chatId}`, {
      method: "GET",
      cache: "no-cache",
      headers: baseHeaders(),
    })
      .then((res) => {
        if (res.ok && res.status !== 204) return res.blob();
        throw new Error("Failed to fetch TTS.");
      })
      .then((blob) => (blob ? URL.createObjectURL(blob) : null))
      .catch((e) => {
        return null;
      });
  },
  threads: WorkspaceThread,
  uploadPfp: async function (formData, slug) {
@ -302,7 +317,7 @@ const Workspace = {
      })
      .then((blob) => (blob ? URL.createObjectURL(blob) : null))
      .catch((e) => {
-        console.log(e);
+        // console.log(e);
        return null;
      });
  },
--- a/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx
+++ b/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx
@ -0,0 +1,45 @@
 import React, { useEffect, useState, useRef } from "react";
 import { isMobile } from "react-device-detect";
 import Sidebar from "@/components/SettingsSidebar";
 import System from "@/models/system";
 import PreLoader from "@/components/Preloader";
 import SpeechToTextProvider from "./stt";
 import TextToSpeechProvider from "./tts";
 export default function AudioPreference() {
  const [settings, setSettings] = useState(null);
  const [loading, setLoading] = useState(true);
  useEffect(() => {
    async function fetchKeys() {
      const _settings = await System.keys();
      setSettings(_settings);
      setLoading(false);
    }
    fetchKeys();
  }, []);
  return (
    <div className="w-screen h-screen overflow-hidden bg-sidebar flex">
      <Sidebar />
      {loading ? (
        <div
          style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
          className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
        >
          <div className="w-full h-full flex justify-center items-center">
            <PreLoader />
          </div>
        </div>
      ) : (
        <div
          style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
          className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
        >
          <SpeechToTextProvider settings={settings} />
          <TextToSpeechProvider settings={settings} />
        </div>
      )}
    </div>
  );
 }
--- a/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx
+++ b/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx
@ -0,0 +1,191 @@
 import React, { useEffect, useState, useRef } from "react";
 import System from "@/models/system";
 import showToast from "@/utils/toast";
 import LLMItem from "@/components/LLMSelection/LLMItem";
 import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react";
 import CTAButton from "@/components/lib/CTAButton";
 import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png";
 import BrowserNative from "@/components/SpeechToText/BrowserNative";
 const PROVIDERS = [
  {
    name: "System native",
    value: "native",
    logo: AnythingLLMIcon,
    options: (settings) => <BrowserNative settings={settings} />,
    description: "Uses your browser's built in STT service if supported.",
  },
 ];
 export default function SpeechToTextProvider({ settings }) {
  const [saving, setSaving] = useState(false);
  const [hasChanges, setHasChanges] = useState(false);
  const [searchQuery, setSearchQuery] = useState("");
  const [filteredProviders, setFilteredProviders] = useState([]);
  const [selectedProvider, setSelectedProvider] = useState(
    settings?.SpeechToTextProvider || "native"
  );
  const [searchMenuOpen, setSearchMenuOpen] = useState(false);
  const searchInputRef = useRef(null);
  const handleSubmit = async (e) => {
    e.preventDefault();
    const form = e.target;
    const data = { SpeechToTextProvider: selectedProvider };
    const formData = new FormData(form);
    for (var [key, value] of formData.entries()) data[key] = value;
    const { error } = await System.updateSystem(data);
    setSaving(true);
    if (error) {
      showToast(`Failed to save preferences: ${error}`, "error");
    } else {
      showToast("Speech-to-text preferences saved successfully.", "success");
    }
    setSaving(false);
    setHasChanges(!!error);
  };
  const updateProviderChoice = (selection) => {
    setSearchQuery("");
    setSelectedProvider(selection);
    setSearchMenuOpen(false);
    setHasChanges(true);
  };
  const handleXButton = () => {
    if (searchQuery.length > 0) {
      setSearchQuery("");
      if (searchInputRef.current) searchInputRef.current.value = "";
    } else {
      setSearchMenuOpen(!searchMenuOpen);
    }
  };
  useEffect(() => {
    const filtered = PROVIDERS.filter((provider) =>
      provider.name.toLowerCase().includes(searchQuery.toLowerCase())
    );
    setFilteredProviders(filtered);
  }, [searchQuery, selectedProvider]);
  const selectedProviderObject = PROVIDERS.find(
    (provider) => provider.value === selectedProvider
  );
  return (
    <form onSubmit={handleSubmit} className="flex w-full">
      <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16">
        <div className="w-full flex flex-col gap-y-1 pb-6 border-white border-b-2 border-opacity-10">
          <div className="flex gap-x-4 items-center">
            <p className="text-lg leading-6 font-bold text-white">
              Speech-to-text Preference
            </p>
          </div>
          <p className="text-xs leading-[18px] font-base text-white text-opacity-60">
            Here you can specify what kind of text-to-speech and speech-to-text
            providers you would want to use in your AnythingLLM experience. By
            default, we use the browser's built in support for these services,
            but you may want to use others.
          </p>
        </div>
        <div className="w-full justify-end flex">
          {hasChanges && (
            <CTAButton
              onClick={() => handleSubmit()}
              className="mt-3 mr-0 -mb-14 z-10"
            >
              {saving ? "Saving..." : "Save changes"}
            </CTAButton>
          )}
        </div>
        <div className="text-base font-bold text-white mt-6 mb-4">Provider</div>
        <div className="relative">
          {searchMenuOpen && (
            <div
              className="fixed top-0 left-0 w-full h-full bg-black bg-opacity-70 backdrop-blur-sm z-10"
              onClick={() => setSearchMenuOpen(false)}
            />
          )}
          {searchMenuOpen ? (
            <div className="absolute top-0 left-0 w-full max-w-[640px] max-h-[310px] overflow-auto white-scrollbar min-h-[64px] bg-[#18181B] rounded-lg flex flex-col justify-between cursor-pointer border-2 border-[#46C8FF] z-20">
              <div className="w-full flex flex-col gap-y-1">
                <div className="flex items-center sticky top-0 border-b border-[#9CA3AF] mx-4 bg-[#18181B]">
                  <MagnifyingGlass
                    size={20}
                    weight="bold"
                    className="absolute left-4 z-30 text-white -ml-4 my-2"
                  />
                  <input
                    type="text"
                    name="stt-provider-search"
                    autoComplete="off"
                    placeholder="Search speech to text providers"
                    className="-ml-4 my-2 bg-transparent z-20 pl-12 h-[38px] w-full px-4 py-1 text-sm outline-none focus:border-white text-white placeholder:text-white placeholder:font-medium"
                    onChange={(e) => setSearchQuery(e.target.value)}
                    ref={searchInputRef}
                    onKeyDown={(e) => {
                      if (e.key === "Enter") e.preventDefault();
                    }}
                  />
                  <X
                    size={20}
                    weight="bold"
                    className="cursor-pointer text-white hover:text-[#9CA3AF]"
                    onClick={handleXButton}
                  />
                </div>
                <div className="flex-1 pl-4 pr-2 flex flex-col gap-y-1 overflow-y-auto white-scrollbar pb-4">
                  {filteredProviders.map((provider) => (
                    <LLMItem
                      key={provider.name}
                      name={provider.name}
                      value={provider.value}
                      image={provider.logo}
                      description={provider.description}
                      checked={selectedProvider === provider.value}
                      onClick={() => updateProviderChoice(provider.value)}
                    />
                  ))}
                </div>
              </div>
            </div>
          ) : (
            <button
              className="w-full max-w-[640px] h-[64px] bg-[#18181B] rounded-lg flex items-center p-[14px] justify-between cursor-pointer border-2 border-transparent hover:border-[#46C8FF] transition-all duration-300"
              type="button"
              onClick={() => setSearchMenuOpen(true)}
            >
              <div className="flex gap-x-4 items-center">
                <img
                  src={selectedProviderObject.logo}
                  alt={`${selectedProviderObject.name} logo`}
                  className="w-10 h-10 rounded-md"
                />
                <div className="flex flex-col text-left">
                  <div className="text-sm font-semibold text-white">
                    {selectedProviderObject.name}
                  </div>
                  <div className="mt-1 text-xs text-[#D2D5DB]">
                    {selectedProviderObject.description}
                  </div>
                </div>
              </div>
              <CaretUpDown size={24} weight="bold" className="text-white" />
            </button>
          )}
        </div>
        <div
          onChange={() => setHasChanges(true)}
          className="mt-4 flex flex-col gap-y-1"
        >
          {selectedProvider &&
            PROVIDERS.find(
              (provider) => provider.value === selectedProvider
            )?.options(settings)}
        </div>
      </div>
    </form>
  );
 }
--- a/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
+++ b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
@ -0,0 +1,209 @@
 import React, { useEffect, useState, useRef } from "react";
 import System from "@/models/system";
 import showToast from "@/utils/toast";
 import LLMItem from "@/components/LLMSelection/LLMItem";
 import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react";
 import CTAButton from "@/components/lib/CTAButton";
 import OpenAiLogo from "@/media/llmprovider/openai.png";
 import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png";
 import ElevenLabsIcon from "@/media/ttsproviders/elevenlabs.png";
 import BrowserNative from "@/components/TextToSpeech/BrowserNative";
 import OpenAiTTSOptions from "@/components/TextToSpeech/OpenAiOptions";
 import ElevenLabsTTSOptions from "@/components/TextToSpeech/ElevenLabsOptions";
 const PROVIDERS = [
  {
    name: "System native",
    value: "native",
    logo: AnythingLLMIcon,
    options: (settings) => <BrowserNative settings={settings} />,
    description: "Uses your browser's built in TTS service if supported.",
  },
  {
    name: "OpenAI",
    value: "openai",
    logo: OpenAiLogo,
    options: (settings) => <OpenAiTTSOptions settings={settings} />,
    description: "Use OpenAI's text to speech voices.",
  },
  {
    name: "ElevenLabs",
    value: "elevenlabs",
    logo: ElevenLabsIcon,
    options: (settings) => <ElevenLabsTTSOptions settings={settings} />,
    description: "Use ElevenLabs's text to speech voices and technology.",
  },
 ];
 export default function TextToSpeechProvider({ settings }) {
  const [saving, setSaving] = useState(false);
  const [hasChanges, setHasChanges] = useState(false);
  const [searchQuery, setSearchQuery] = useState("");
  const [filteredProviders, setFilteredProviders] = useState([]);
  const [selectedProvider, setSelectedProvider] = useState(
    settings?.TextToSpeechProvider || "native"
  );
  const [searchMenuOpen, setSearchMenuOpen] = useState(false);
  const searchInputRef = useRef(null);
  const handleSubmit = async (e) => {
    e.preventDefault();
    const form = e.target;
    const data = { TextToSpeechProvider: selectedProvider };
    const formData = new FormData(form);
    for (var [key, value] of formData.entries()) data[key] = value;
    const { error } = await System.updateSystem(data);
    setSaving(true);
    if (error) {
      showToast(`Failed to save preferences: ${error}`, "error");
    } else {
      showToast("Text-to-speech preferences saved successfully.", "success");
    }
    setSaving(false);
    setHasChanges(!!error);
  };
  const updateProviderChoice = (selection) => {
    setSearchQuery("");
    setSelectedProvider(selection);
    setSearchMenuOpen(false);
    setHasChanges(true);
  };
  const handleXButton = () => {
    if (searchQuery.length > 0) {
      setSearchQuery("");
      if (searchInputRef.current) searchInputRef.current.value = "";
    } else {
      setSearchMenuOpen(!searchMenuOpen);
    }
  };
  useEffect(() => {
    const filtered = PROVIDERS.filter((provider) =>
      provider.name.toLowerCase().includes(searchQuery.toLowerCase())
    );
    setFilteredProviders(filtered);
  }, [searchQuery, selectedProvider]);
  const selectedProviderObject = PROVIDERS.find(
    (provider) => provider.value === selectedProvider
  );
  return (
    <form onSubmit={handleSubmit} className="flex w-full">
      <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16">
        <div className="w-full flex flex-col gap-y-1 pb-6 border-white border-b-2 border-opacity-10">
          <div className="flex gap-x-4 items-center">
            <p className="text-lg leading-6 font-bold text-white">
              Text-to-speech Preference
            </p>
          </div>
          <p className="text-xs leading-[18px] font-base text-white text-opacity-60">
            Here you can specify what kind of text-to-speech providers you would
            want to use in your AnythingLLM experience. By default, we use the
            browser's built in support for these services, but you may want to
            use others.
          </p>
        </div>
        <div className="w-full justify-end flex">
          {hasChanges && (
            <CTAButton
              onClick={() => handleSubmit()}
              className="mt-3 mr-0 -mb-14 z-10"
            >
              {saving ? "Saving..." : "Save changes"}
            </CTAButton>
          )}
        </div>
        <div className="text-base font-bold text-white mt-6 mb-4">Provider</div>
        <div className="relative">
          {searchMenuOpen && (
            <div
              className="fixed top-0 left-0 w-full h-full bg-black bg-opacity-70 backdrop-blur-sm z-10"
              onClick={() => setSearchMenuOpen(false)}
            />
          )}
          {searchMenuOpen ? (
            <div className="absolute top-0 left-0 w-full max-w-[640px] max-h-[310px] overflow-auto white-scrollbar min-h-[64px] bg-[#18181B] rounded-lg flex flex-col justify-between cursor-pointer border-2 border-[#46C8FF] z-20">
              <div className="w-full flex flex-col gap-y-1">
                <div className="flex items-center sticky top-0 border-b border-[#9CA3AF] mx-4 bg-[#18181B]">
                  <MagnifyingGlass
                    size={20}
                    weight="bold"
                    className="absolute left-4 z-30 text-white -ml-4 my-2"
                  />
                  <input
                    type="text"
                    name="tts-provider-search"
                    autoComplete="off"
                    placeholder="Search text to speech providers"
                    className="-ml-4 my-2 bg-transparent z-20 pl-12 h-[38px] w-full px-4 py-1 text-sm outline-none focus:border-white text-white placeholder:text-white placeholder:font-medium"
                    onChange={(e) => setSearchQuery(e.target.value)}
                    ref={searchInputRef}
                    onKeyDown={(e) => {
                      if (e.key === "Enter") e.preventDefault();
                    }}
                  />
                  <X
                    size={20}
                    weight="bold"
                    className="cursor-pointer text-white hover:text-[#9CA3AF]"
                    onClick={handleXButton}
                  />
                </div>
                <div className="flex-1 pl-4 pr-2 flex flex-col gap-y-1 overflow-y-auto white-scrollbar pb-4">
                  {filteredProviders.map((provider) => (
                    <LLMItem
                      key={provider.name}
                      name={provider.name}
                      value={provider.value}
                      image={provider.logo}
                      description={provider.description}
                      checked={selectedProvider === provider.value}
                      onClick={() => updateProviderChoice(provider.value)}
                    />
                  ))}
                </div>
              </div>
            </div>
          ) : (
            <button
              className="w-full max-w-[640px] h-[64px] bg-[#18181B] rounded-lg flex items-center p-[14px] justify-between cursor-pointer border-2 border-transparent hover:border-[#46C8FF] transition-all duration-300"
              type="button"
              onClick={() => setSearchMenuOpen(true)}
            >
              <div className="flex gap-x-4 items-center">
                <img
                  src={selectedProviderObject.logo}
                  alt={`${selectedProviderObject.name} logo`}
                  className="w-10 h-10 rounded-md"
                />
                <div className="flex flex-col text-left">
                  <div className="text-sm font-semibold text-white">
                    {selectedProviderObject.name}
                  </div>
                  <div className="mt-1 text-xs text-[#D2D5DB]">
                    {selectedProviderObject.description}
                  </div>
                </div>
              </div>
              <CaretUpDown size={24} weight="bold" className="text-white" />
            </button>
          )}
        </div>
        <div
          onChange={() => setHasChanges(true)}
          className="mt-4 flex flex-col gap-y-1"
        >
          {selectedProvider &&
            PROVIDERS.find(
              (provider) => provider.value === selectedProvider
            )?.options(settings)}
        </div>
      </div>
    </form>
  );
 }
--- a/frontend/src/utils/paths.js
+++ b/frontend/src/utils/paths.js
@ -98,6 +98,9 @@ export default {
    transcriptionPreference: () => {
      return "/settings/transcription-preference";
    },
    audioPreference: () => {
      return "/settings/audio-preference";
    },
    embedder: {
      modelPreference: () => "/settings/embedding-preference",
      chunkingPreference: () => "/settings/text-splitter-preference",
--- a/frontend/yarn.lock
+++ b/frontend/yarn.lock
@ -2841,6 +2841,11 @@ react-smooth@^4.0.0:
    prop-types "^15.8.1"
    react-transition-group "^4.4.5"
 react-speech-recognition@^3.10.0:
  version "3.10.0"
  resolved "https://registry.yarnpkg.com/react-speech-recognition/-/react-speech-recognition-3.10.0.tgz#7aa43bb28d78b92671864dabba3a70489ccad27b"
  integrity sha512-EVSr4Ik8l9urwdPiK2r0+ADrLyDDrjB0qBRdUWO+w2MfwEBrj6NuRmy1GD3x7BU/V6/hab0pl8Lupen0zwlJyw==
 react-tag-input-component@^2.0.2:
  version "2.0.2"
  resolved "https://registry.yarnpkg.com/react-tag-input-component/-/react-tag-input-component-2.0.2.tgz#f62f013c6a535141dd1c6c3a88858223170150f1"
--- a/server/.env.example
+++ b/server/.env.example
@ -168,6 +168,19 @@ WHISPER_PROVIDER="local"
 # WHISPER_PROVIDER="openai"
 # OPEN_AI_KEY=sk-xxxxxxxx
 ###########################################
 ######## TTS/STT Model Selection ##########
 ###########################################
 TTS_PROVIDER="native"
 # TTS_PROVIDER="openai"
 # TTS_OPEN_AI_KEY=sk-example
 # TTS_OPEN_AI_VOICE_MODEL=nova
 # TTS_PROVIDER="elevenlabs"
 # TTS_ELEVEN_LABS_KEY=
 # TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
 # CLOUD DEPLOYMENT VARIABLES ONLY
 # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
 # STORAGE_DIR= # absolute filesystem path with no trailing slash
--- a/server/endpoints/workspaces.js
+++ b/server/endpoints/workspaces.js
@ -1,6 +1,11 @@
 const path = require("path");
 const fs = require("fs");
-const { reqBody, multiUserMode, userFromSession } = require("../utils/http");
+const {
  reqBody,
  multiUserMode,
  userFromSession,
  safeJsonParse,
 } = require("../utils/http");
 const { normalizePath } = require("../utils/files");
 const { Workspace } = require("../models/workspace");
 const { Document } = require("../models/documents");
@ -25,6 +30,7 @@ const {
  determineWorkspacePfpFilepath,
  fetchPfp,
 } = require("../utils/files/pfp");
 const { getTTSProvider } = require("../utils/TextToSpeech");
 function workspaceEndpoints(app) {
  if (!app) return;
@ -506,6 +512,48 @@ function workspaceEndpoints(app) {
    }
  );
  // GET /workspace/:slug/tts/:chatId
  // Responds with the `text` of a stored workspace chat response rendered as
  // speech ("audio/mpeg") by the configured TTS provider. Generated audio is
  // kept in `responseCache` under "<workspace slug>:<chat id>" so a repeated
  // request does not call the provider again.
  // Status codes: 200 audio, 204 nothing to speak / provider returned no
  // audio, 404 chat not found in this workspace, 500 unexpected failure.
  app.get(
    "/workspace/:slug/tts/:chatId",
    [validatedRequest, flexUserRoleValid([ROLES.all]), validWorkspaceSlug],
    async function (request, response) {
      try {
        const { chatId } = request.params;
        const workspace = response.locals.workspace;
        const cacheKey = `${workspace.slug}:${chatId}`;
        const wsChat = await WorkspaceChats.get({
          id: Number(chatId),
          workspaceId: workspace.id,
        });

        // Guard against a missing record before touching `wsChat.response`;
        // presumably `get` yields a nullish value when no chat matches —
        // verify against the WorkspaceChats model.
        if (!wsChat) return response.sendStatus(404);

        const cachedResponse = responseCache.get(cacheKey);
        if (cachedResponse) {
          response.writeHead(200, {
            "Content-Type": cachedResponse.mime || "audio/mpeg",
          });
          response.end(cachedResponse.buffer);
          return;
        }

        // sendStatus() already finishes the response, no extra end() needed.
        const text = safeJsonParse(wsChat.response, null)?.text;
        if (!text) return response.sendStatus(204);

        const TTSProvider = getTTSProvider();
        const buffer = await TTSProvider.ttsBuffer(text);
        if (buffer === null) return response.sendStatus(204);

        responseCache.set(cacheKey, { buffer, mime: "audio/mpeg" });
        response.writeHead(200, {
          "Content-Type": "audio/mpeg",
        });
        response.end(buffer);
        return;
      } catch (error) {
        console.error("Error processing the TTS request:", error);
        // A JSON error can only be sent while the headers are still unsent.
        if (response.headersSent) return response.end();
        response.status(500).json({ message: "TTS could not be completed" });
      }
    }
  );
  app.get(
    "/workspace/:slug/pfp",
    [validatedRequest, flexUserRoleValid([ROLES.all])],
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@ -131,6 +131,17 @@ const SystemSettings = {
      // --------------------------------------------------------
      WhisperProvider: process.env.WHISPER_PROVIDER || "local",
      // --------------------------------------------------------
      // TTS/STT  Selection Settings & Configs
      // - Supported providers: the browser's native built-in TTS, OpenAI, and ElevenLabs
      // --------------------------------------------------------
      TextToSpeechProvider: process.env.TTS_PROVIDER || "native",
      TTSOpenAIKey: !!process.env.TTS_OPEN_AI_KEY,
      TTSOpenAIVoiceModel: process.env.TTS_OPEN_AI_VOICE_MODEL,
      // Eleven Labs TTS
      TTSElevenLabsKey: !!process.env.TTS_ELEVEN_LABS_KEY,
      TTSElevenLabsVoiceModel: process.env.TTS_ELEVEN_LABS_VOICE_MODEL,
      // --------------------------------------------------------
      // Agent Settings & Configs
      // --------------------------------------------------------
--- a/server/package.json
+++ b/server/package.json
@ -44,6 +44,7 @@
    "cohere-ai": "^7.9.5",
    "cors": "^2.8.5",
    "dotenv": "^16.0.3",
    "elevenlabs": "^0.5.0",
    "express": "^4.18.2",
    "express-ws": "^5.0.2",
    "extract-json-from-string": "^1.0.1",
--- a/server/utils/TextToSpeech/elevenLabs/index.js
+++ b/server/utils/TextToSpeech/elevenLabs/index.js
@ -0,0 +1,54 @@
 const { ElevenLabsClient, stream } = require("elevenlabs");
 /**
  * Text-to-speech provider backed by the ElevenLabs API.
  *
  * Configuration is read from the environment:
  * - TTS_ELEVEN_LABS_KEY (required to construct an instance)
  * - TTS_ELEVEN_LABS_VOICE_MODEL (optional voice id)
  */
 class ElevenLabsTTS {
  /**
   * @throws {Error} When TTS_ELEVEN_LABS_KEY is not set.
   */
  constructor() {
    if (!process.env.TTS_ELEVEN_LABS_KEY)
      throw new Error("No ElevenLabs API key was set.");
    this.elevenLabs = new ElevenLabsClient({
      apiKey: process.env.TTS_ELEVEN_LABS_KEY,
    });

    // Rachel as default voice
    // https://api.elevenlabs.io/v1/voices
    this.voiceId =
      process.env.TTS_ELEVEN_LABS_VOICE_MODEL ?? "21m00Tcm4TlvDq8ikWAM";
    this.modelId = "eleven_multilingual_v2";
  }

  /**
   * Lists the voices available to an API key.
   * Best-effort: any failure is logged and an empty list is returned.
   *
   * @param {string|null} apiKey - Key to use; falls back to
   *   TTS_ELEVEN_LABS_KEY when null/undefined.
   * @returns {Promise<Array>} The `voices` array of the API response, or [].
   */
  static async voices(apiKey = null) {
    try {
      const client = new ElevenLabsClient({
        apiKey: apiKey ?? process.env.TTS_ELEVEN_LABS_KEY ?? null,
      });
      return (await client.voices.getAll())?.voices ?? [];
    } catch (e) {
      // Callers treat an empty list as "no voices"; keep the failure visible.
      console.error("ElevenLabs voice listing failed:", e?.message ?? e);
    }
    return [];
  }

  /**
   * Collects every chunk emitted by a stream into a single Buffer.
   *
   * @param {Object} readable - Emitter of "data", "end" and "error" events.
   * @returns {Promise<Buffer>} Resolves on "end", rejects on "error".
   */
  #stream2buffer(readable) {
    return new Promise((resolve, reject) => {
      const chunks = [];
      readable.on("data", (chunk) => chunks.push(chunk));
      readable.on("end", () => resolve(Buffer.concat(chunks)));
      readable.on("error", (err) => reject(err));
    });
  }

  /**
   * Generates speech for the given text with the configured voice and model.
   *
   * @param {string} textInput - Text to convert to speech.
   * @returns {Promise<Buffer|null>} Audio bytes, or null when generation or
   *   streaming failed (the error is logged).
   */
  async ttsBuffer(textInput) {
    try {
      const audio = await this.elevenLabs.generate({
        voice: this.voiceId,
        text: textInput,
        // Use the instance's model so it has a single source of truth.
        model_id: this.modelId,
      });
      // Awaited inside the try so a stream error is caught here as well.
      return await this.#stream2buffer(audio);
    } catch (e) {
      console.error(e);
    }
    return null;
  }
 }
 module.exports = {
  ElevenLabsTTS,
 };
--- a/server/utils/TextToSpeech/index.js
+++ b/server/utils/TextToSpeech/index.js
@ -0,0 +1,15 @@
 /**
  * Instantiates the server-side text-to-speech provider selected by the
  * TTS_PROVIDER environment variable ("openai" when unset or empty).
  *
  * Provider modules are required lazily so only the selected one is loaded.
  *
  * @returns {Object} An `OpenAiTTS` or `ElevenLabsTTS` instance.
  * @throws {Error} When TTS_PROVIDER names a provider that has no server-side
  *   implementation here, or when the provider's constructor throws.
  */
 function getTTSProvider() {
  const provider = process.env.TTS_PROVIDER || "openai";
  switch (provider) {
    // Braces give each case its own scope for the `const` declarations.
    case "openai": {
      const { OpenAiTTS } = require("./openAi");
      return new OpenAiTTS();
    }
    case "elevenlabs": {
      const { ElevenLabsTTS } = require("./elevenLabs");
      return new ElevenLabsTTS();
    }
    default:
      // Reached for any other configured value, so report that value.
      throw new Error(
        `ENV: TTS_PROVIDER "${provider}" has no server-side TTS implementation!`
      );
  }
 }
 module.exports = { getTTSProvider };
--- a/server/utils/TextToSpeech/openAi/index.js
+++ b/server/utils/TextToSpeech/openAi/index.js
@ -0,0 +1,29 @@
 /**
  * Text-to-speech provider that uses OpenAI's speech endpoint.
  * Reads TTS_OPEN_AI_KEY (required) and TTS_OPEN_AI_VOICE_MODEL (optional,
  * "alloy" when not defined) from the environment.
  */
 class OpenAiTTS {
  /**
   * @throws {Error} When TTS_OPEN_AI_KEY is not set.
   */
  constructor() {
    if (!process.env.TTS_OPEN_AI_KEY) {
      throw new Error("No OpenAI API key was set.");
    }

    // Loaded here rather than at module scope, so the SDK is only required
    // once a provider instance is actually created.
    const { OpenAI: OpenAIApi } = require("openai");
    const clientOptions = { apiKey: process.env.TTS_OPEN_AI_KEY };
    this.openai = new OpenAIApi(clientOptions);
    this.voice = process.env.TTS_OPEN_AI_VOICE_MODEL ?? "alloy";
  }

  /**
   * Converts text to speech with the "tts-1" model and the configured voice.
   *
   * @param {string} textInput - Text to speak.
   * @returns {Promise<Buffer|null>} Audio bytes, or null when the request
   *   failed (the error is logged).
   */
  async ttsBuffer(textInput) {
    try {
      const speechRequest = {
        model: "tts-1",
        voice: this.voice,
        input: textInput,
      };
      const speech = await this.openai.audio.speech.create(speechRequest);
      const audioBytes = await speech.arrayBuffer();
      return Buffer.from(audioBytes);
    } catch (e) {
      console.error(e);
      return null;
    }
  }
 }
 module.exports = {
  OpenAiTTS,
 };
--- a/server/utils/helpers/customModels.js
+++ b/server/utils/helpers/customModels.js
@ -4,6 +4,7 @@ const {
 } = require("../AiProviders/openRouter");
 const { perplexityModels } = require("../AiProviders/perplexity");
 const { togetherAiModels } = require("../AiProviders/togetherAi");
 const { ElevenLabsTTS } = require("../TextToSpeech/elevenLabs");
 const SUPPORT_CUSTOM_MODELS = [
  "openai",
  "localai",
@ -15,6 +16,7 @@ const SUPPORT_CUSTOM_MODELS = [
  "openrouter",
  "lmstudio",
  "koboldcpp",
  "elevenlabs-tts",
 ];
 async function getCustomModels(provider = "", apiKey = null, basePath = null) {
@ -42,6 +44,8 @@ async function getCustomModels(provider = "", apiKey = null, basePath = null) {
      return await getLMStudioModels(basePath);
    case "koboldcpp":
      return await getKoboldCPPModels(basePath);
    case "elevenlabs-tts":
      return await getElevenLabsModels(apiKey);
    default:
      return { models: [], error: "Invalid provider for custom models" };
  }
@ -321,6 +325,32 @@ function nativeLLMModels() {
  return { models: files, error: null };
 }
 /**
  * Lists ElevenLabs voices in the `{ models, error }` shape used by
  * getCustomModels.
  *
  * When no voices come back, a single default entry ("Rachel") is returned.
  * When voices are returned and an apiKey was supplied, that key is written
  * to process.env.TTS_ELEVEN_LABS_KEY.
  *
  * @param {string|null} apiKey - ElevenLabs API key to list voices with.
  * @returns {Promise<{models: Array<{id: string, organization: string, name: string}>, error: null}>}
  */
 async function getElevenLabsModels(apiKey = null) {
  const voices = await ElevenLabsTTS.voices(apiKey);
  const models = voices.map((voice) => ({
    id: voice.voice_id,
    organization: voice.category,
    name: voice.name,
  }));

  if (models.length === 0) {
    const fallbackVoice = {
      id: "21m00Tcm4TlvDq8ikWAM",
      organization: "premade",
      name: "Rachel (default)",
    };
    return { models: [fallbackVoice], error: null };
  }

  // At least one voice was returned at this point.
  if (apiKey) process.env.TTS_ELEVEN_LABS_KEY = apiKey;
  return { models, error: null };
 }
 module.exports = {
  getCustomModels,
 };
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@ -366,6 +366,32 @@ const KEY_MAPPING = {
    envKey: "AGENT_SERPER_DEV_KEY",
    checks: [],
  },
  // TTS/STT Integration ENVS
  TextToSpeechProvider: {
    envKey: "TTS_PROVIDER",
    checks: [supportedTTSProvider],
  },
  // TTS OpenAI
  TTSOpenAIKey: {
    envKey: "TTS_OPEN_AI_KEY",
    checks: [validOpenAIKey],
  },
  TTSOpenAIVoiceModel: {
    envKey: "TTS_OPEN_AI_VOICE_MODEL",
    checks: [],
  },
  // TTS ElevenLabs
  TTSElevenLabsKey: {
    envKey: "TTS_ELEVEN_LABS_KEY",
    checks: [isNotEmpty],
  },
  TTSElevenLabsVoiceModel: {
    envKey: "TTS_ELEVEN_LABS_VOICE_MODEL",
    checks: [],
  },
 };
 function isNotEmpty(input = "") {
@ -419,6 +445,11 @@ function validOllamaLLMBasePath(input = "") {
  }
 }
 /**
  * Validation check for the TTS provider setting.
  *
  * @param {string} input - Candidate provider value.
  * @returns {string|null} null when the value is an accepted provider,
  *   otherwise an error message naming the rejected value.
  */
 function supportedTTSProvider(input = "") {
  switch (input) {
    case "native":
    case "openai":
    case "elevenlabs":
      return null;
    default:
      return `${input} is not a valid TTS provider.`;
  }
 }
 function supportedLLM(input = "") {
  const validSelection = [
    "openai",
--- a/server/yarn.lock
+++ b/server/yarn.lock
@ -1901,6 +1901,11 @@ combined-stream@^1.0.8:
  dependencies:
    delayed-stream "~1.0.0"
 command-exists@^1.2.9:
  version "1.2.9"
  resolved "https://registry.yarnpkg.com/command-exists/-/command-exists-1.2.9.tgz#c50725af3808c8ab0260fd60b01fbfa25b954f69"
  integrity sha512-LTQ/SGc+s0Xc0Fu5WaKnR0YiygZkm9eKFvyS+fRsU7/ZWFF8ykFM6Pc9aCVf1+xasOOZpO3BAVgVrKvsqKHV7w==
 command-line-args@5.2.1, command-line-args@^5.2.1:
  version "5.2.1"
  resolved "https://registry.yarnpkg.com/command-line-args/-/command-line-args-5.2.1.tgz#c44c32e437a57d7c51157696893c5909e9cec42e"
@ -2255,6 +2260,18 @@ ee-first@1.1.1:
  resolved "https://registry.yarnpkg.com/ee-first/-/ee-first-1.1.1.tgz#590c61156b0ae2f4f0255732a158b266bc56b21d"
  integrity sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==
 elevenlabs@^0.5.0:
  version "0.5.0"
  resolved "https://registry.yarnpkg.com/elevenlabs/-/elevenlabs-0.5.0.tgz#07eb1a943b0ab99b925875bd5c57833a3a024e58"
  integrity sha512-jfex4ecuWIlyAUuMrMJAJNa5MLziqYQOCDw4ZYuoc9PCYLxtHwaYBWpZoDhnYMcceLI7rRRvmbLMcT9HlVMfHA==
  dependencies:
    command-exists "^1.2.9"
    execa "^5.1.1"
    form-data "4.0.0"
    node-fetch "2.7.0"
    qs "6.11.2"
    url-join "4.0.1"
 emoji-regex@^10.2.1:
  version "10.3.0"
  resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-10.3.0.tgz#76998b9268409eb3dae3de989254d456e70cfe23"
@ -2605,6 +2622,21 @@ eventemitter3@^4.0.4:
  resolved "https://registry.yarnpkg.com/eventemitter3/-/eventemitter3-4.0.7.tgz#2de9b68f6528d5644ef5c59526a1b4a07306169f"
  integrity sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==
 execa@^5.1.1:
  version "5.1.1"
  resolved "https://registry.yarnpkg.com/execa/-/execa-5.1.1.tgz#f80ad9cbf4298f7bd1d4c9555c21e93741c411dd"
  integrity sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==
  dependencies:
    cross-spawn "^7.0.3"
    get-stream "^6.0.0"
    human-signals "^2.1.0"
    is-stream "^2.0.0"
    merge-stream "^2.0.0"
    npm-run-path "^4.0.1"
    onetime "^5.1.2"
    signal-exit "^3.0.3"
    strip-final-newline "^2.0.0"
 expand-template@^2.0.3:
  version "2.0.3"
  resolved "https://registry.yarnpkg.com/expand-template/-/expand-template-2.0.3.tgz#6e14b3fcee0f3a6340ecb57d2e8918692052a47c"
@ -3024,6 +3056,11 @@ get-stream@^5.1.0:
  dependencies:
    pump "^3.0.0"
 get-stream@^6.0.0:
  version "6.0.1"
  resolved "https://registry.yarnpkg.com/get-stream/-/get-stream-6.0.1.tgz#a262d8eef67aced57c2852ad6167526a43cbf7b7"
  integrity sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==
 get-symbol-description@^1.0.2:
  version "1.0.2"
  resolved "https://registry.yarnpkg.com/get-symbol-description/-/get-symbol-description-1.0.2.tgz#533744d5aa20aca4e079c8e5daf7fd44202821f5"
@ -3297,6 +3334,11 @@ https-proxy-agent@^7.0.0:
    agent-base "^7.0.2"
    debug "4"
 human-signals@^2.1.0:
  version "2.1.0"
  resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0"
  integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==
 humanize-ms@^1.2.1:
  version "1.2.1"
  resolved "https://registry.yarnpkg.com/humanize-ms/-/humanize-ms-1.2.1.tgz#c46e3159a293f6b896da29316d8b6fe8bb79bbed"
@ -4092,6 +4134,11 @@ merge-descriptors@1.0.1:
  resolved "https://registry.yarnpkg.com/merge-descriptors/-/merge-descriptors-1.0.1.tgz#b00aaa556dd8b44568150ec9d1b953f3f90cbb61"
  integrity sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w==
 merge-stream@^2.0.0:
  version "2.0.0"
  resolved "https://registry.yarnpkg.com/merge-stream/-/merge-stream-2.0.0.tgz#52823629a14dd00c9770fb6ad47dc6310f2c1f60"
  integrity sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==
 methods@~1.1.2:
  version "1.1.2"
  resolved "https://registry.yarnpkg.com/methods/-/methods-1.1.2.tgz#5529a4d67654134edcc5266656835b0f851afcee"
@ -4455,6 +4502,13 @@ normalize-path@^3.0.0, normalize-path@~3.0.0:
  resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65"
  integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==
 npm-run-path@^4.0.1:
  version "4.0.1"
  resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea"
  integrity sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==
  dependencies:
    path-key "^3.0.0"
 npmlog@^5.0.1:
  version "5.0.1"
  resolved "https://registry.yarnpkg.com/npmlog/-/npmlog-5.0.1.tgz#f06678e80e29419ad67ab964e0fa69959c1eb8b0"
@ -4593,7 +4647,7 @@ one-time@^1.0.0:
  dependencies:
    fn.name "1.x.x"
-onetime@^5.1.0:
+onetime@^5.1.0, onetime@^5.1.2:
  version "5.1.2"
  resolved "https://registry.yarnpkg.com/onetime/-/onetime-5.1.2.tgz#d0e96ebb56b07476df1dd9c4806e5237985ca45e"
  integrity sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==
@ -4774,7 +4828,7 @@ path-is-absolute@^1.0.0:
  resolved "https://registry.yarnpkg.com/path-is-absolute/-/path-is-absolute-1.0.1.tgz#174b9268735534ffbc7ace6bf53a5a9e1b5c5f5f"
  integrity sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==
-path-key@^3.1.0:
+path-key@^3.0.0, path-key@^3.1.0:
  version "3.1.1"
  resolved "https://registry.yarnpkg.com/path-key/-/path-key-3.1.1.tgz#581f6ade658cbba65a0d3380de7753295054f375"
  integrity sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==
@ -5322,7 +5376,7 @@ side-channel@^1.0.4, side-channel@^1.0.6:
    get-intrinsic "^1.2.4"
    object-inspect "^1.13.1"
-signal-exit@^3.0.0, signal-exit@^3.0.2, signal-exit@^3.0.7:
+signal-exit@^3.0.0, signal-exit@^3.0.2, signal-exit@^3.0.3, signal-exit@^3.0.7:
  version "3.0.7"
  resolved "https://registry.yarnpkg.com/signal-exit/-/signal-exit-3.0.7.tgz#a9a1767f8af84155114eaabd73f99273c8f59ad9"
  integrity sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==
@ -5559,6 +5613,11 @@ strip-ansi@^7.0.1, strip-ansi@^7.1.0:
  dependencies:
    ansi-regex "^6.0.1"
 strip-final-newline@^2.0.0:
  version "2.0.0"
  resolved "https://registry.yarnpkg.com/strip-final-newline/-/strip-final-newline-2.0.0.tgz#89b852fb2fcbe936f6f4b3187afb0a12c1ab58ad"
  integrity sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==
 strip-json-comments@^3.1.1:
  version "3.1.1"
  resolved "https://registry.yarnpkg.com/strip-json-comments/-/strip-json-comments-3.1.1.tgz#31f1281b3832630434831c310c01cccda8cbe006"