anything-llm/collector/utils/WhisperProviders/localWhisper.js
Timothy Carambat 0ada882991
Support external transcription providers (#909)
* Support External Transcription providers

* patch files

* update docs

* fix return data
2024-03-14 15:43:26 -07:00

178 lines
5.4 KiB
JavaScript

const fs = require("fs");
const path = require("path");
const { v4 } = require("uuid");
class LocalWhisper {
constructor() {
// Model Card: https://huggingface.co/Xenova/whisper-small
this.model = "Xenova/whisper-small";
this.cacheDir = path.resolve(
process.env.STORAGE_DIR
? path.resolve(process.env.STORAGE_DIR, `models`)
: path.resolve(__dirname, `../../../server/storage/models`)
);
this.modelPath = path.resolve(this.cacheDir, "Xenova", "whisper-small");
// Make directory when it does not exist in existing installations
if (!fs.existsSync(this.cacheDir))
fs.mkdirSync(this.cacheDir, { recursive: true });
this.#log("Initialized.");
}
#log(text, ...args) {
console.log(`\x1b[32m[LocalWhisper]\x1b[0m ${text}`, ...args);
}
async #convertToWavAudioData(sourcePath) {
try {
let buffer;
const wavefile = require("wavefile");
const ffmpeg = require("fluent-ffmpeg");
const outFolder = path.resolve(__dirname, `../../storage/tmp`);
if (!fs.existsSync(outFolder))
fs.mkdirSync(outFolder, { recursive: true });
const fileExtension = path.extname(sourcePath).toLowerCase();
if (fileExtension !== ".wav") {
this.#log(
`File conversion required! ${fileExtension} file detected - converting to .wav`
);
const outputFile = path.resolve(outFolder, `${v4()}.wav`);
const convert = new Promise((resolve) => {
ffmpeg(sourcePath)
.toFormat("wav")
.on("error", (error) => {
this.#log(`Conversion Error! ${error.message}`);
resolve(false);
})
.on("progress", (progress) =>
this.#log(
`Conversion Processing! ${progress.targetSize}KB converted`
)
)
.on("end", () => {
this.#log(`Conversion Complete! File converted to .wav!`);
resolve(true);
})
.save(outputFile);
});
const success = await convert;
if (!success)
throw new Error(
"[Conversion Failed]: Could not convert file to .wav format!"
);
const chunks = [];
const stream = fs.createReadStream(outputFile);
for await (let chunk of stream) chunks.push(chunk);
buffer = Buffer.concat(chunks);
fs.rmSync(outputFile);
} else {
const chunks = [];
const stream = fs.createReadStream(sourcePath);
for await (let chunk of stream) chunks.push(chunk);
buffer = Buffer.concat(chunks);
}
const wavFile = new wavefile.WaveFile(buffer);
wavFile.toBitDepth("32f");
wavFile.toSampleRate(16000);
let audioData = wavFile.getSamples();
if (Array.isArray(audioData)) {
if (audioData.length > 1) {
const SCALING_FACTOR = Math.sqrt(2);
// Merge channels into first channel to save memory
for (let i = 0; i < audioData[0].length; ++i) {
audioData[0][i] =
(SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
}
}
audioData = audioData[0];
}
return audioData;
} catch (error) {
console.error(`convertToWavAudioData`, error);
return null;
}
}
async client() {
if (!fs.existsSync(this.modelPath)) {
this.#log(
`The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~250MB)`
);
}
try {
// Convert ESM to CommonJS via import so we can load this library.
const pipeline = (...args) =>
import("@xenova/transformers").then(({ pipeline }) =>
pipeline(...args)
);
return await pipeline("automatic-speech-recognition", this.model, {
cache_dir: this.cacheDir,
...(!fs.existsSync(this.modelPath)
? {
// Show download progress if we need to download any files
progress_callback: (data) => {
if (!data.hasOwnProperty("progress")) return;
console.log(
`\x1b[34m[Embedding - Downloading Model Files]\x1b[0m ${
data.file
} ${~~data?.progress}%`
);
},
}
: {}),
});
} catch (error) {
this.#log("Failed to load the native whisper model:", error);
throw error;
}
}
async processFile(fullFilePath, filename) {
try {
const transcriberPromise = new Promise((resolve) =>
this.client().then((client) => resolve(client))
);
const audioDataPromise = new Promise((resolve) =>
this.#convertToWavAudioData(fullFilePath).then((audioData) =>
resolve(audioData)
)
);
const [audioData, transcriber] = await Promise.all([
audioDataPromise,
transcriberPromise,
]);
if (!audioData) {
this.#log(`Failed to parse content from ${filename}.`);
return {
content: null,
error: `Failed to parse content from ${filename}.`,
};
}
this.#log(`Transcribing audio data to text...`);
const { text } = await transcriber(audioData, {
chunk_length_s: 30,
stride_length_s: 5,
});
return { content: text, error: null };
} catch (error) {
return { content: null, error: error.message };
}
}
}
module.exports = {
LocalWhisper,
};