Implement support for HuggingFace Inference Endpoints (#680)

Timothy Carambat 2024-02-06 09:17:51 -08:00 committed by GitHub
parent 1846a99b93
commit 2bc11d3f1a
12 changed files with 424 additions and 1 deletion

View File

@@ -48,6 +48,11 @@ GID='1000'
# MISTRAL_API_KEY='example-mistral-ai-api-key'
# MISTRAL_MODEL_PREF='mistral-tiny'
# LLM_PROVIDER='huggingface'
# HUGGING_FACE_LLM_ENDPOINT=https://uuid-here.us-east-1.aws.endpoints.huggingface.cloud
# HUGGING_FACE_LLM_API_KEY=hf_xxxxxx
# HUGGING_FACE_LLM_TOKEN_LIMIT=8000
###########################################
######## Embedding API SELECTION ##########
###########################################

View File

@@ -0,0 +1,56 @@
export default function HuggingFaceOptions({ settings }) {
return (
<div className="w-full flex flex-col">
<div className="w-full flex items-center gap-4">
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-4">
HuggingFace Inference Endpoint
</label>
<input
type="url"
name="HuggingFaceLLMEndpoint"
className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5"
placeholder="https://example.endpoints.huggingface.cloud"
defaultValue={settings?.HuggingFaceLLMEndpoint}
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-4">
HuggingFace Access Token
</label>
<input
type="password"
name="HuggingFaceLLMAccessToken"
className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5"
placeholder="HuggingFace Access Token"
defaultValue={
settings?.HuggingFaceLLMAccessToken ? "*".repeat(20) : ""
}
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
<div className="flex flex-col w-60">
<label className="text-white text-sm font-semibold block mb-4">
Model Token Limit
</label>
<input
type="number"
name="HuggingFaceLLMTokenLimit"
className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5"
placeholder="4096"
min={1}
onScroll={(e) => e.target.blur()}
defaultValue={settings?.HuggingFaceLLMTokenLimit}
required={true}
autoComplete="off"
/>
</div>
</div>
</div>
);
}

Binary file added: HuggingFace provider logo (PNG, 17 KiB; not shown).

View File

@@ -13,6 +13,7 @@ import LMStudioLogo from "@/media/llmprovider/lmstudio.png";
import LocalAiLogo from "@/media/llmprovider/localai.png";
import TogetherAILogo from "@/media/llmprovider/togetherai.png";
import MistralLogo from "@/media/llmprovider/mistral.jpeg";
import HuggingFaceLogo from "@/media/llmprovider/huggingface.png";
import PreLoader from "@/components/Preloader";
import OpenAiOptions from "@/components/LLMSelection/OpenAiOptions";
import AzureAiOptions from "@/components/LLMSelection/AzureAiOptions";
@@ -24,6 +25,7 @@ import GeminiLLMOptions from "@/components/LLMSelection/GeminiLLMOptions";
import OllamaLLMOptions from "@/components/LLMSelection/OllamaLLMOptions";
import TogetherAiOptions from "@/components/LLMSelection/TogetherAiOptions";
import MistralOptions from "@/components/LLMSelection/MistralOptions";
import HuggingFaceOptions from "@/components/LLMSelection/HuggingFaceOptions";
import LLMItem from "@/components/LLMSelection/LLMItem";
import { MagnifyingGlass } from "@phosphor-icons/react";
@@ -107,6 +109,14 @@ export default function GeneralLLMPreference() {
options: <GeminiLLMOptions settings={settings} />,
description: "Google's largest and most capable AI model",
},
{
name: "HuggingFace",
value: "huggingface",
logo: HuggingFaceLogo,
options: <HuggingFaceOptions settings={settings} />,
description:
"Access 150,000+ open-source LLMs and the world's AI community",
},
{
name: "Ollama",
value: "ollama",

View File

@@ -10,6 +10,7 @@ import TogetherAILogo from "@/media/llmprovider/togetherai.png";
import LMStudioLogo from "@/media/llmprovider/lmstudio.png";
import LocalAiLogo from "@/media/llmprovider/localai.png";
import MistralLogo from "@/media/llmprovider/mistral.jpeg";
import HuggingFaceLogo from "@/media/llmprovider/huggingface.png";
import ZillizLogo from "@/media/vectordbs/zilliz.png";
import AstraDBLogo from "@/media/vectordbs/astraDB.png";
import ChromaLogo from "@/media/vectordbs/chroma.png";
@@ -101,6 +102,13 @@ const LLM_SELECTION_PRIVACY = {
],
logo: MistralLogo,
},
huggingface: {
name: "HuggingFace",
description: [
"Your prompts and document text used in response are sent to your HuggingFace managed endpoint",
],
logo: HuggingFaceLogo,
},
};
const VECTOR_DB_PRIVACY = {

View File

@@ -10,6 +10,7 @@ import LocalAiLogo from "@/media/llmprovider/localai.png";
import TogetherAILogo from "@/media/llmprovider/togetherai.png";
import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png";
import MistralLogo from "@/media/llmprovider/mistral.jpeg";
import HuggingFaceLogo from "@/media/llmprovider/huggingface.png";
import OpenAiOptions from "@/components/LLMSelection/OpenAiOptions";
import AzureAiOptions from "@/components/LLMSelection/AzureAiOptions";
import AnthropicAiOptions from "@/components/LLMSelection/AnthropicAiOptions";
@@ -19,6 +20,7 @@ import NativeLLMOptions from "@/components/LLMSelection/NativeLLMOptions";
import GeminiLLMOptions from "@/components/LLMSelection/GeminiLLMOptions";
import OllamaLLMOptions from "@/components/LLMSelection/OllamaLLMOptions";
import MistralOptions from "@/components/LLMSelection/MistralOptions";
import HuggingFaceOptions from "@/components/LLMSelection/HuggingFaceOptions";
import LLMItem from "@/components/LLMSelection/LLMItem";
import System from "@/models/system";
import paths from "@/utils/paths";
@@ -82,6 +84,14 @@ export default function LLMPreference({
options: <GeminiLLMOptions settings={settings} />,
description: "Google's largest and most capable AI model",
},
{
name: "HuggingFace",
value: "huggingface",
logo: HuggingFaceLogo,
options: <HuggingFaceOptions settings={settings} />,
description:
"Access 150,000+ open-source LLMs and the world's AI community",
},
{
name: "Ollama",
value: "ollama",

View File

@@ -45,6 +45,11 @@ JWT_SECRET="my-random-string-for-seeding" # Please generate random string at lea
# MISTRAL_API_KEY='example-mistral-ai-api-key'
# MISTRAL_MODEL_PREF='mistral-tiny'
# LLM_PROVIDER='huggingface'
# HUGGING_FACE_LLM_ENDPOINT=https://uuid-here.us-east-1.aws.endpoints.huggingface.cloud
# HUGGING_FACE_LLM_API_KEY=hf_xxxxxx
# HUGGING_FACE_LLM_TOKEN_LIMIT=8000
###########################################
######## Embedding API SELECTION ##########
###########################################

View File

@@ -194,6 +194,20 @@ const SystemSettings = {
AzureOpenAiEmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
}
: {}),
...(llmProvider === "huggingface"
? {
HuggingFaceLLMEndpoint: process.env.HUGGING_FACE_LLM_ENDPOINT,
HuggingFaceLLMAccessToken: !!process.env.HUGGING_FACE_LLM_API_KEY,
HuggingFaceLLMTokenLimit: process.env.HUGGING_FACE_LLM_TOKEN_LIMIT,
// For embedding credentials when HuggingFace is selected.
OpenAiKey: !!process.env.OPEN_AI_KEY,
AzureOpenAiEndpoint: process.env.AZURE_OPENAI_ENDPOINT,
AzureOpenAiKey: !!process.env.AZURE_OPENAI_KEY,
AzureOpenAiEmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
}
: {}),
};
},
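
Note: the huggingface branch above reports the access token only as a boolean, so the raw secret never reaches the frontend (which is why the options form masks it with a row of asterisks). A minimal sketch of the resulting settings fragment, assuming the placeholder values from the .env examples, illustrative only:

// Illustrative shape; values assume the .env placeholders, not real output.
{
  HuggingFaceLLMEndpoint: "https://uuid-here.us-east-1.aws.endpoints.huggingface.cloud",
  HuggingFaceLLMTokenLimit: "8000", // env values arrive as strings
  HuggingFaceLLMAccessToken: true, // !!process.env.HUGGING_FACE_LLM_API_KEY
}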

View File

@@ -0,0 +1,185 @@
const { NativeEmbedder } = require("../../EmbeddingEngines/native");
const { OpenAiEmbedder } = require("../../EmbeddingEngines/openAi");
const { chatPrompt } = require("../../chats");
class HuggingFaceLLM {
constructor(embedder = null, _modelPreference = null) {
const { Configuration, OpenAIApi } = require("openai");
if (!process.env.HUGGING_FACE_LLM_ENDPOINT)
throw new Error("No HuggingFace Inference Endpoint was set.");
if (!process.env.HUGGING_FACE_LLM_API_KEY)
throw new Error("No HuggingFace Access Token was set.");
const config = new Configuration({
basePath: `${process.env.HUGGING_FACE_LLM_ENDPOINT}/v1`,
apiKey: process.env.HUGGING_FACE_LLM_API_KEY,
});
this.openai = new OpenAIApi(config);
// When using the HF inference server, the model param is not required, so
// we can stub it here. HF Endpoints can only run one model at a time.
// We set it to "tgi" so the HF endpoint accepts the OpenAI message format.
this.model = "tgi";
this.limits = {
history: this.promptWindowLimit() * 0.15,
system: this.promptWindowLimit() * 0.15,
user: this.promptWindowLimit() * 0.7,
};
if (!embedder)
console.warn(
"No embedding provider defined for HuggingFaceLLM - falling back to Native for embedding!"
);
this.embedder = !embedder ? new NativeEmbedder() : embedder;
this.defaultTemp = 0.2;
}
#appendContext(contextTexts = []) {
if (!contextTexts || !contextTexts.length) return "";
return (
"\nContext:\n" +
contextTexts
.map((text, i) => {
return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
})
.join("")
);
}
streamingEnabled() {
return "streamChat" in this && "streamGetChatCompletion" in this;
}
promptWindowLimit() {
const limit = process.env.HUGGING_FACE_LLM_TOKEN_LIMIT || 4096;
if (!limit || isNaN(Number(limit)))
throw new Error("No HuggingFace token context limit was set.");
return Number(limit);
}
async isValidChatCompletionModel(_ = "") {
return true;
}
constructPrompt({
systemPrompt = "",
contextTexts = [],
chatHistory = [],
userPrompt = "",
}) {
// System prompts are not supported for HF model chats, so we emulate one below
const prompt = {
role: "user",
content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
};
const assistantResponse = {
role: "assistant",
content: "Okay, I will follow those instructions",
};
return [
prompt,
assistantResponse,
...chatHistory,
{ role: "user", content: userPrompt },
];
}
async isSafe(_input = "") {
// Not implemented so must be stubbed
return { safe: true, reasons: [] };
}
async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
const textResponse = await this.openai
.createChatCompletion({
model: this.model,
temperature: Number(workspace?.openAiTemp ?? this.defaultTemp),
n: 1,
messages: await this.compressMessages(
{
systemPrompt: chatPrompt(workspace),
userPrompt: prompt,
chatHistory,
},
rawHistory
),
})
.then((json) => {
const res = json.data;
if (!res.hasOwnProperty("choices"))
throw new Error("HuggingFace chat: No results!");
if (res.choices.length === 0)
throw new Error("HuggingFace chat: No results length!");
return res.choices[0].message.content;
})
.catch((error) => {
throw new Error(
`HuggingFace::createChatCompletion failed with: ${error.message}`
);
});
return textResponse;
}
async streamChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
const streamRequest = await this.openai.createChatCompletion(
{
model: this.model,
stream: true,
temperature: Number(workspace?.openAiTemp ?? this.defaultTemp),
n: 1,
messages: await this.compressMessages(
{
systemPrompt: chatPrompt(workspace),
userPrompt: prompt,
chatHistory,
},
rawHistory
),
},
{ responseType: "stream" }
);
return { type: "huggingFaceStream", stream: streamRequest };
}
async getChatCompletion(messages = null, { temperature = 0.7 }) {
const { data } = await this.openai.createChatCompletion({
model: this.model,
messages,
temperature,
});
if (!data.hasOwnProperty("choices")) return null;
return data.choices[0].message.content;
}
async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
const streamRequest = await this.openai.createChatCompletion(
{
model: this.model,
stream: true,
messages,
temperature,
},
{ responseType: "stream" }
);
return { type: "huggingFaceStream", stream: streamRequest };
}
// Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
async embedTextInput(textInput) {
return await this.embedder.embedTextInput(textInput);
}
async embedChunks(textChunks = []) {
return await this.embedder.embedChunks(textChunks);
}
async compressMessages(promptArgs = {}, rawHistory = []) {
const { messageArrayCompressor } = require("../../helpers/chat");
const messageArray = this.constructPrompt(promptArgs);
return await messageArrayCompressor(this, messageArray, rawHistory);
}
}
module.exports = {
HuggingFaceLLM,
};
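
The class above is a thin wrapper that points the openai v3 SDK at the endpoint's OpenAI-compatible /v1 route. For reference, a minimal standalone sketch of the same call, assuming the placeholder endpoint and token from the .env examples (not real credentials):

// Minimal sketch of the request HuggingFaceLLM wraps (openai v3 SDK).
const { Configuration, OpenAIApi } = require("openai");

const client = new OpenAIApi(
  new Configuration({
    basePath: "https://uuid-here.us-east-1.aws.endpoints.huggingface.cloud/v1", // assumed placeholder
    apiKey: "hf_xxxxxx", // assumed placeholder token
  })
);

client
  .createChatCompletion({
    model: "tgi", // stub name; the endpoint runs a single model regardless
    messages: [{ role: "user", content: "Hello!" }],
    temperature: 0.2,
  })
  .then(({ data }) => console.log(data.choices[0].message.content))
  .catch((e) => console.error(e.message));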

View File

@@ -383,6 +383,112 @@ function handleStreamResponses(response, stream, responseProps) {
});
}
if (stream.type === "huggingFaceStream") {
return new Promise((resolve) => {
let fullText = "";
let chunk = "";
stream.stream.data.on("data", (data) => {
const lines = data
?.toString()
?.split("\n")
.filter((line) => line.trim() !== "");
for (const line of lines) {
let validJSON = false;
const message = chunk + line.replace(/^data:/, "");
if (message !== "[DONE]") {
// JSON chunk is incomplete and has not ended yet
// so we need to stitch it together. You would think JSON
// chunks would only come complete - but they don't!
try {
JSON.parse(message);
validJSON = true;
} catch {
console.log("Failed to parse message", message);
}
if (!validJSON) {
// It can be possible that the chunk decoding is running away
// and the message chunk fails to append due to string length.
// In this case abort the chunk and reset so we can continue.
// ref: https://github.com/Mintplex-Labs/anything-llm/issues/416
try {
chunk += message;
} catch (e) {
console.error(`Chunk appending error`, e);
chunk = "";
}
continue;
} else {
chunk = "";
}
}
if (message == "[DONE]") {
writeResponseChunk(response, {
uuid,
sources,
type: "textResponseChunk",
textResponse: "",
close: true,
error: false,
});
resolve(fullText);
} else {
let error = null;
let finishReason = null;
let token = "";
try {
const json = JSON.parse(message);
error = json?.error || null;
token = json?.choices?.[0]?.delta?.content;
finishReason = json?.choices?.[0]?.finish_reason || null;
} catch {
continue;
}
if (!!error) {
writeResponseChunk(response, {
uuid,
sources: [],
type: "textResponseChunk",
textResponse: null,
close: true,
error,
});
resolve("");
return;
}
if (token) {
fullText += token;
writeResponseChunk(response, {
uuid,
sources: [],
type: "textResponseChunk",
textResponse: token,
close: false,
error: false,
});
}
if (finishReason !== null) {
writeResponseChunk(response, {
uuid,
sources,
type: "textResponseChunk",
textResponse: "",
close: true,
error: false,
});
resolve(fullText);
}
}
}
});
});
}
// If stream is not a regular OpenAI Stream (like if using native model, Ollama, or most LangChain interfaces)
// we can just iterate the stream content instead.
if (!stream.hasOwnProperty("data")) {
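
The huggingFaceStream branch above stitches partial chunks because the endpoint streams OpenAI-style server-sent events whose data: payloads can be split across network reads. Judging from the fields the parser consumes, each reassembled line is shaped roughly like the following (illustrative, not captured output):

data: {"choices":[{"delta":{"content":"Hel"},"finish_reason":null}]}
data: {"choices":[{"delta":{"content":"lo!"},"finish_reason":"stop"}]}
data: [DONE]

The chunk buffer is only reset once a line parses as valid JSON, which is why partial payloads are appended rather than dropped.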

View File

@@ -64,6 +64,9 @@ function getLLMProvider(modelPreference = null) {
case "native":
const { NativeLLM } = require("../AiProviders/native");
return new NativeLLM(embedder, modelPreference);
case "huggingface":
const { HuggingFaceLLM } = require("../AiProviders/huggingface");
return new HuggingFaceLLM(embedder, modelPreference);
default:
throw new Error("ENV: No LLM_PROVIDER value found in environment!");
}

View File

@@ -95,6 +95,7 @@ const KEY_MAPPING = {
checks: [nonZero],
},
// Mistral AI API Settings
MistralApiKey: {
envKey: "MISTRAL_API_KEY",
checks: [isNotEmpty],
@@ -109,12 +110,25 @@
envKey: "NATIVE_LLM_MODEL_PREF",
checks: [isDownloadedModel],
}, },
NativeLLMTokenLimit: {
envKey: "NATIVE_LLM_MODEL_TOKEN_LIMIT",
checks: [nonZero],
},
// Hugging Face LLM Inference Settings
HuggingFaceLLMEndpoint: {
envKey: "HUGGING_FACE_LLM_ENDPOINT",
checks: [isNotEmpty, isValidURL, validHuggingFaceEndpoint],
},
HuggingFaceLLMAccessToken: {
envKey: "HUGGING_FACE_LLM_API_KEY",
checks: [isNotEmpty],
},
HuggingFaceLLMTokenLimit: {
envKey: "HUGGING_FACE_LLM_TOKEN_LIMIT",
checks: [nonZero],
},
EmbeddingEngine: {
envKey: "EMBEDDING_ENGINE",
checks: [supportedEmbeddingModel],
@@ -299,6 +313,7 @@ function supportedLLM(input = "") {
"native",
"togetherai",
"mistral",
"huggingface",
].includes(input);
return validSelection ? null : `${input} is not a valid LLM provider.`;
}
@@ -396,6 +411,12 @@ function validDockerizedUrl(input = "") {
return null;
}
function validHuggingFaceEndpoint(input = "") {
return input.slice(-6) !== ".cloud"
? `Your HF Endpoint should end in ".cloud"`
: null;
}
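
A quick illustration of the new validator with hypothetical inputs:

// Hypothetical inputs; only the ".cloud" suffix is checked.
validHuggingFaceEndpoint("https://uuid.us-east-1.aws.endpoints.huggingface.cloud"); // null (valid)
validHuggingFaceEndpoint("https://api.example.com"); // 'Your HF Endpoint should end in ".cloud"'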
// If the LLMProvider has changed we need to reset all workspace model preferences to
// null since the provider<>model name combination will be invalid for whatever the new
// provider is.