anything-llm/collector/utils/WhisperProviders/localWhisper.js

const fs = require("fs");
const path = require("path");
const { v4 } = require("uuid");

/**
 * Transcription provider that runs the `Xenova/whisper-small` model locally
 * through `@xenova/transformers`.
 *
 * Model files are cached under `<STORAGE_DIR>/models` when `STORAGE_DIR` is
 * set, otherwise under `../../../server/storage/models` relative to this file.
 */
class LocalWhisper {
  constructor() {
    // Model Card: https://huggingface.co/Xenova/whisper-small
    this.model = "Xenova/whisper-small";
    this.cacheDir = path.resolve(
      process.env.STORAGE_DIR
        ? path.resolve(process.env.STORAGE_DIR, `models`)
        : path.resolve(__dirname, `../../../server/storage/models`)
    );

    this.modelPath = path.resolve(this.cacheDir, "Xenova", "whisper-small");

    // Make directory when it does not exist in existing installations
    if (!fs.existsSync(this.cacheDir))
      fs.mkdirSync(this.cacheDir, { recursive: true });

    this.#log("Initialized.");
  }

  /**
   * Writes a message to stdout prefixed with a green `[LocalWhisper]` tag.
   * @param {string} text - Message to print.
   * @param {...any} args - Extra values forwarded to `console.log`.
   */
  #log(text, ...args) {
    console.log(`\x1b[32m[LocalWhisper]\x1b[0m ${text}`, ...args);
  }

  /**
   * Runs ffmpeg to transcode `sourcePath` into a .wav file at `outputFile`.
   * @param {string} sourcePath - Path of the input media file.
   * @param {string} outputFile - Path the .wav output is written to.
   * @returns {Promise<boolean>} `true` when ffmpeg emits `end`, `false` when it emits `error`.
   */
  #transcodeToWav(sourcePath, outputFile) {
    const ffmpeg = require("fluent-ffmpeg");
    // `new Promise` is appropriate here: it adapts ffmpeg's event-based API.
    return new Promise((resolve) => {
      ffmpeg(sourcePath)
        .toFormat("wav")
        .on("error", (error) => {
          this.#log(`Conversion Error! ${error.message}`);
          resolve(false);
        })
        .on("progress", (progress) =>
          this.#log(`Conversion Processing! ${progress.targetSize}KB converted`)
        )
        .on("end", () => {
          this.#log(`Conversion Complete! File converted to .wav!`);
          resolve(true);
        })
        .save(outputFile);
    });
  }

  /**
   * Loads the bytes of a .wav representation of `sourcePath`.
   * Files whose extension is already `.wav` are read directly; anything else
   * is first transcoded into a temporary file under `../../storage/tmp`.
   * @param {string} sourcePath - Path of the input media file.
   * @returns {Promise<Buffer>} Raw .wav file contents.
   * @throws {Error} When the ffmpeg conversion fails or the file cannot be read.
   */
  async #readWavBuffer(sourcePath) {
    const fileExtension = path.extname(sourcePath).toLowerCase();
    if (fileExtension === ".wav") return await fs.promises.readFile(sourcePath);

    this.#log(
      `File conversion required! ${fileExtension} file detected - converting to .wav`
    );
    const outFolder = path.resolve(__dirname, `../../storage/tmp`);
    if (!fs.existsSync(outFolder)) fs.mkdirSync(outFolder, { recursive: true });

    const outputFile = path.resolve(outFolder, `${v4()}.wav`);
    try {
      const success = await this.#transcodeToWav(sourcePath, outputFile);
      if (!success)
        throw new Error(
          "[Conversion Failed]: Could not convert file to .wav format!"
        );
      return await fs.promises.readFile(outputFile);
    } finally {
      // Always remove the temporary file, including a partial one left behind
      // by a failed conversion. `force` makes a missing file a no-op.
      fs.rmSync(outputFile, { force: true });
    }
  }

  /**
   * Produces the sample data handed to the transcriber: the input is loaded
   * as .wav, converted to 32-bit float at a 16000 sample rate, and reduced to
   * a single channel.
   * @param {string} sourcePath - Path of the input media file.
   * @returns {Promise<any|null>} The samples returned by `wavefile`, or `null`
   * when any step fails (the error is written to stderr).
   */
  async #convertToWavAudioData(sourcePath) {
    try {
      const wavefile = require("wavefile");
      const buffer = await this.#readWavBuffer(sourcePath);

      const wavFile = new wavefile.WaveFile(buffer);
      wavFile.toBitDepth("32f");
      wavFile.toSampleRate(16000);

      let audioData = wavFile.getSamples();
      if (Array.isArray(audioData)) {
        if (audioData.length > 1) {
          const SCALING_FACTOR = Math.sqrt(2);

          // Merge channels into first channel to save memory
          // (only the first two channels contribute to the result).
          for (let i = 0; i < audioData[0].length; ++i) {
            audioData[0][i] =
              (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
          }
        }
        audioData = audioData[0];
      }

      return audioData;
    } catch (error) {
      console.error(`convertToWavAudioData`, error);
      return null;
    }
  }

  /**
   * Builds the `automatic-speech-recognition` pipeline for `this.model`,
   * using `this.cacheDir` as the cache directory. A download-progress logger
   * is attached only when `this.modelPath` does not exist yet.
   * @returns {Promise<Function>} The pipeline returned by `@xenova/transformers`.
   * @throws Rethrows whatever error the library import or pipeline creation raised.
   */
  async client() {
    const needsDownload = !fs.existsSync(this.modelPath);
    if (needsDownload) {
      this.#log(
        `The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~250MB)`
      );
    }

    try {
      // Convert ESM to CommonJS via import so we can load this library.
      const { pipeline } = await import("@xenova/transformers");
      const options = { cache_dir: this.cacheDir };
      if (needsDownload) {
        // Show download progress if we need to download any files
        options.progress_callback = (data) => {
          if (!Object.prototype.hasOwnProperty.call(data, "progress")) return;
          console.log(
            `\x1b[34m[Embedding - Downloading Model Files]\x1b[0m ${data.file} ${~~data?.progress}%`
          );
        };
      }
      return await pipeline("automatic-speech-recognition", this.model, options);
    } catch (error) {
      this.#log("Failed to load the native whisper model:", error);
      throw error;
    }
  }

  /**
   * Transcribes an audio/video file to text.
   * Model loading and audio conversion are started concurrently.
   * @param {string} fullFilePath - Path of the file to transcribe.
   * @param {string} filename - Name used in log and error messages.
   * @returns {Promise<{content: string|null, error: string|null}>} The
   * transcript in `content` on success; otherwise `content` is `null` and
   * `error` holds a message. This method does not reject.
   */
  async processFile(fullFilePath, filename) {
    try {
      // Pass the promises directly so a rejection from `client()` reaches the
      // catch below instead of leaving a wrapper promise pending forever.
      const [audioData, transcriber] = await Promise.all([
        this.#convertToWavAudioData(fullFilePath),
        this.client(),
      ]);

      if (!audioData) {
        this.#log(`Failed to parse content from ${filename}.`);
        return {
          content: null,
          error: `Failed to parse content from ${filename}.`,
        };
      }

      this.#log(`Transcribing audio data to text...`);
      const { text } = await transcriber(audioData, {
        chunk_length_s: 30,
        stride_length_s: 5,
      });

      return { content: text, error: null };
    } catch (error) {
      return { content: null, error: error.message };
    }
  }
}

// Public surface of this module: the local Whisper transcription provider.
module.exports = { LocalWhisper };
feat: Embed on-instance Whisper model for audio/mp4 transcribing (#449) * feat: Embed on-instance Whisper model for audio/mp4 transcribing resolves #329 * additional logging * add placeholder for tmp folder in collector storage Add cleanup of hotdir and tmp on collector boot to prevent hanging files split loading of model and file conversion into concurrency * update README * update model size * update supported filetypes 2023-12-15 20:20:13 +01:00			`const fs = require("fs");`
Support external transcription providers (#909) * Support External Transcription providers * patch files * update docs * fix return data 2024-03-14 23:43:26 +01:00			`const path = require("path");`
			`const { v4 } = require("uuid");`
feat: Embed on-instance Whisper model for audio/mp4 transcribing (#449) * feat: Embed on-instance Whisper model for audio/mp4 transcribing resolves #329 * additional logging * add placeholder for tmp folder in collector storage Add cleanup of hotdir and tmp on collector boot to prevent hanging files split loading of model and file conversion into concurrency * update README * update model size * update supported filetypes 2023-12-15 20:20:13 +01:00
			`class LocalWhisper {`
			`constructor() {`
			`// Model Card: https://huggingface.co/Xenova/whisper-small`
			`this.model = "Xenova/whisper-small";`
			`this.cacheDir = path.resolve(`
			`process.env.STORAGE_DIR`
			? path.resolve(process.env.STORAGE_DIR, `models`)
			: path.resolve(__dirname, `../../../server/storage/models`)
			`);`

			`this.modelPath = path.resolve(this.cacheDir, "Xenova", "whisper-small");`

			`// Make directory when it does not exist in existing installations`
			`if (!fs.existsSync(this.cacheDir))`
			`fs.mkdirSync(this.cacheDir, { recursive: true });`
Support external transcription providers (#909) * Support External Transcription providers * patch files * update docs * fix return data 2024-03-14 23:43:26 +01:00
			`this.#log("Initialized.");`
			`}`

			`#log(text, ...args) {`
			console.log(`\x1b[32m[LocalWhisper]\x1b[0m ${text}`, ...args);
			`}`

			`async #convertToWavAudioData(sourcePath) {`
			`try {`
			`let buffer;`
			`const wavefile = require("wavefile");`
			`const ffmpeg = require("fluent-ffmpeg");`
			const outFolder = path.resolve(__dirname, `../../storage/tmp`);
			`if (!fs.existsSync(outFolder))`
			`fs.mkdirSync(outFolder, { recursive: true });`

			`const fileExtension = path.extname(sourcePath).toLowerCase();`
			`if (fileExtension !== ".wav") {`
			`this.#log(`
			`File conversion required! ${fileExtension} file detected - converting to .wav`
			`);`
			const outputFile = path.resolve(outFolder, `${v4()}.wav`);
			`const convert = new Promise((resolve) => {`
			`ffmpeg(sourcePath)`
			`.toFormat("wav")`
			`.on("error", (error) => {`
			this.#log(`Conversion Error! ${error.message}`);
			`resolve(false);`
			`})`
			`.on("progress", (progress) =>`
			`this.#log(`
			`Conversion Processing! ${progress.targetSize}KB converted`
			`)`
			`)`
			`.on("end", () => {`
			this.#log(`Conversion Complete! File converted to .wav!`);
			`resolve(true);`
			`})`
			`.save(outputFile);`
			`});`
			`const success = await convert;`
			`if (!success)`
			`throw new Error(`
			`"[Conversion Failed]: Could not convert file to .wav format!"`
			`);`

			`const chunks = [];`
			`const stream = fs.createReadStream(outputFile);`
			`for await (let chunk of stream) chunks.push(chunk);`
			`buffer = Buffer.concat(chunks);`
			`fs.rmSync(outputFile);`
			`} else {`
			`const chunks = [];`
			`const stream = fs.createReadStream(sourcePath);`
			`for await (let chunk of stream) chunks.push(chunk);`
			`buffer = Buffer.concat(chunks);`
			`}`

			`const wavFile = new wavefile.WaveFile(buffer);`
			`wavFile.toBitDepth("32f");`
			`wavFile.toSampleRate(16000);`

			`let audioData = wavFile.getSamples();`
			`if (Array.isArray(audioData)) {`
			`if (audioData.length > 1) {`
			`const SCALING_FACTOR = Math.sqrt(2);`

			`// Merge channels into first channel to save memory`
			`for (let i = 0; i < audioData[0].length; ++i) {`
			`audioData[0][i] =`
			`(SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;`
			`}`
			`}`
			`audioData = audioData[0];`
			`}`

			`return audioData;`
			`} catch (error) {`
			console.error(`convertToWavAudioData`, error);
			`return null;`
			`}`
feat: Embed on-instance Whisper model for audio/mp4 transcribing (#449) * feat: Embed on-instance Whisper model for audio/mp4 transcribing resolves #329 * additional logging * add placeholder for tmp folder in collector storage Add cleanup of hotdir and tmp on collector boot to prevent hanging files split loading of model and file conversion into concurrency * update README * update model size * update supported filetypes 2023-12-15 20:20:13 +01:00			`}`

			`async client() {`
			`if (!fs.existsSync(this.modelPath)) {`
Support external transcription providers (#909) * Support External Transcription providers * patch files * update docs * fix return data 2024-03-14 23:43:26 +01:00			`this.#log(`
			`The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~250MB)`
feat: Embed on-instance Whisper model for audio/mp4 transcribing (#449) * feat: Embed on-instance Whisper model for audio/mp4 transcribing resolves #329 * additional logging * add placeholder for tmp folder in collector storage Add cleanup of hotdir and tmp on collector boot to prevent hanging files split loading of model and file conversion into concurrency * update README * update model size * update supported filetypes 2023-12-15 20:20:13 +01:00			`);`
			`}`

			`try {`
			`// Convert ESM to CommonJS via import so we can load this library.`
			`const pipeline = (...args) =>`
			`import("@xenova/transformers").then(({ pipeline }) =>`
			`pipeline(...args)`
			`);`
			`return await pipeline("automatic-speech-recognition", this.model, {`
			`cache_dir: this.cacheDir,`
			`...(!fs.existsSync(this.modelPath)`
			`? {`
			`// Show download progress if we need to download any files`
			`progress_callback: (data) => {`
			`if (!data.hasOwnProperty("progress")) return;`
			`console.log(`
			`\x1b[34m[Embedding - Downloading Model Files]\x1b[0m ${
			`data.file`
			} ${~~data?.progress}%`
			`);`
			`},`
			`}`
			`: {}),`
			`});`
			`} catch (error) {`
Support external transcription providers (#909) * Support External Transcription providers * patch files * update docs * fix return data 2024-03-14 23:43:26 +01:00			`this.#log("Failed to load the native whisper model:", error);`
feat: Embed on-instance Whisper model for audio/mp4 transcribing (#449) * feat: Embed on-instance Whisper model for audio/mp4 transcribing resolves #329 * additional logging * add placeholder for tmp folder in collector storage Add cleanup of hotdir and tmp on collector boot to prevent hanging files split loading of model and file conversion into concurrency * update README * update model size * update supported filetypes 2023-12-15 20:20:13 +01:00			`throw error;`
			`}`
			`}`
Support external transcription providers (#909) * Support External Transcription providers * patch files * update docs * fix return data 2024-03-14 23:43:26 +01:00
			`async processFile(fullFilePath, filename) {`
			`try {`
			`const transcriberPromise = new Promise((resolve) =>`
			`this.client().then((client) => resolve(client))`
			`);`
			`const audioDataPromise = new Promise((resolve) =>`
			`this.#convertToWavAudioData(fullFilePath).then((audioData) =>`
			`resolve(audioData)`
			`)`
			`);`
			`const [audioData, transcriber] = await Promise.all([`
			`audioDataPromise,`
			`transcriberPromise,`
			`]);`

			`if (!audioData) {`
			this.#log(`Failed to parse content from ${filename}.`);
			`return {`
			`content: null,`
			error: `Failed to parse content from ${filename}.`,
			`};`
			`}`

			this.#log(`Transcribing audio data to text...`);
			`const { text } = await transcriber(audioData, {`
			`chunk_length_s: 30,`
			`stride_length_s: 5,`
			`});`

			`return { content: text, error: null };`
			`} catch (error) {`
			`return { content: null, error: error.message };`
			`}`
			`}`
feat: Embed on-instance Whisper model for audio/mp4 transcribing (#449) * feat: Embed on-instance Whisper model for audio/mp4 transcribing resolves #329 * additional logging * add placeholder for tmp folder in collector storage Add cleanup of hotdir and tmp on collector boot to prevent hanging files split loading of model and file conversion into concurrency * update README * update model size * update supported filetypes 2023-12-15 20:20:13 +01:00			`}`

			`module.exports = {`
			`LocalWhisper,`
			`};`