anything-llm/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
Sean Hatfield be3b0b4916
Youtube loader whitespace fix (#2051)
youtube loader whitespace fix
2024-08-06 10:16:17 -07:00

118 lines
3.6 KiB
JavaScript

const { parse } = require("node-html-parser");
const RE_YOUTUBE =
/(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
const USER_AGENT =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";
class YoutubeTranscriptError extends Error {
constructor(message) {
super(`[YoutubeTranscript] ${message}`);
}
}
/**
* Class to retrieve transcript if exist
*/
class YoutubeTranscript {
/**
* Fetch transcript from YTB Video
* @param videoId Video url or video identifier
* @param config Object with lang param (eg: en, es, hk, uk) format.
* Will just the grab first caption if it can find one, so no special lang caption support.
*/
static async fetchTranscript(videoId, config = {}) {
const identifier = this.retrieveVideoId(videoId);
const lang = config?.lang ?? "en";
try {
const transcriptUrl = await fetch(
`https://www.youtube.com/watch?v=${identifier}`,
{
headers: {
"User-Agent": USER_AGENT,
},
}
)
.then((res) => res.text())
.then((html) => parse(html))
.then((html) => this.#parseTranscriptEndpoint(html, lang));
if (!transcriptUrl)
throw new Error("Failed to locate a transcript for this video!");
// Result is hopefully some XML.
const transcriptXML = await fetch(transcriptUrl)
.then((res) => res.text())
.then((xml) => parse(xml));
let transcript = "";
const chunks = transcriptXML.getElementsByTagName("text");
for (const chunk of chunks) {
// Add space after each text chunk
transcript += chunk.textContent + " ";
}
// Trim extra whitespace
return transcript.trim().replace(/\s+/g, " ");
} catch (e) {
throw new YoutubeTranscriptError(e);
}
}
static #parseTranscriptEndpoint(document, langCode = null) {
try {
// Get all script tags on document page
const scripts = document.getElementsByTagName("script");
// find the player data script.
const playerScript = scripts.find((script) =>
script.textContent.includes("var ytInitialPlayerResponse = {")
);
const dataString =
playerScript.textContent
?.split("var ytInitialPlayerResponse = ")?.[1] //get the start of the object {....
?.split("};")?.[0] + // chunk off any code after object closure.
"}"; // add back that curly brace we just cut.
const data = JSON.parse(dataString.trim()); // Attempt a JSON parse
const availableCaptions =
data?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
// If languageCode was specified then search for it's code, otherwise get the first.
let captionTrack = availableCaptions?.[0];
if (langCode)
captionTrack =
availableCaptions.find((track) =>
track.languageCode.includes(langCode)
) ?? availableCaptions?.[0];
return captionTrack?.baseUrl;
} catch (e) {
console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`);
return null;
}
}
/**
* Retrieve video id from url or string
* @param videoId video url or video id
*/
static retrieveVideoId(videoId) {
if (videoId.length === 11) {
return videoId;
}
const matchId = videoId.match(RE_YOUTUBE);
if (matchId && matchId.length) {
return matchId[1];
}
throw new YoutubeTranscriptError(
"Impossible to retrieve Youtube video ID."
);
}
}
module.exports = {
YoutubeTranscript,
YoutubeTranscriptError,
};