mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-11 01:10:11 +01:00
Remove YoutubeLoader dependency (#1050)
* WIP data connector redesign * new UI for data connectors complete * remove old data connector page/cleanup imports * cleanup of UI and imports * Remove Youtube Transcript dep and move in-house * lang pref default to en --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
This commit is contained in:
parent
004b1f8db5
commit
1f8ab0d245
@ -34,6 +34,7 @@
|
||||
"mime": "^3.0.0",
|
||||
"moment": "^2.29.4",
|
||||
"multer": "^1.4.5-lts.1",
|
||||
"node-html-parser": "^6.1.13",
|
||||
"officeparser": "^4.0.5",
|
||||
"openai": "^3.2.1",
|
||||
"pdf-parse": "^1.1.1",
|
||||
@ -42,7 +43,6 @@
|
||||
"url-pattern": "^1.0.3",
|
||||
"uuid": "^9.0.0",
|
||||
"wavefile": "^11.0.0",
|
||||
"youtube-transcript": "^1.0.6",
|
||||
"youtubei.js": "^9.1.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
@ -0,0 +1,90 @@
|
||||
/*
|
||||
* This is just a custom implementation of the Langchain JS YouTubeLoader class
|
||||
* as the dependency for YoutubeTranscript is quite fickle and it's a rat race to keep it up
|
||||
* and instead of waiting for patches we can just bring this simple script in-house and at least
|
||||
* be able to patch it since it's so flaky. When we have more connectors we can kill this because
|
||||
* it will be a pain to maintain over time.
|
||||
*/
|
||||
class YoutubeLoader {
  #videoId;
  #language;
  #addVideoInfo;

  /**
   * @param {Object} [config]
   * @param {string} config.videoId The YouTube video id. Required.
   * @param {string|null} [config.language=null] Preferred caption language code,
   * forwarded as `lang` to the transcript fetcher.
   * @param {boolean} [config.addVideoInfo=false] When true, `load()` also fetches
   * description, title, view count and author into the document metadata.
   * @throws {Error} When `videoId` is missing or falsy.
   */
  constructor({ videoId = null, language = null, addVideoInfo = false } = {}) {
    if (!videoId) throw new Error("Invalid video id!");
    this.#videoId = videoId;
    this.#language = language;
    this.#addVideoInfo = addVideoInfo;
  }

  /**
   * Extracts the videoId from a YouTube video URL.
   * @param {string|URL} url The URL of the YouTube video.
   * @returns {string} The 11 character videoId of the YouTube video.
   * @throws {Error} When no 11 character id can be extracted from the url.
   */
  static getVideoID(url) {
    // Coerce so that null/undefined (or a URL object) fails with the documented
    // error below instead of a TypeError on `.match`.
    const match = String(url ?? "").match(
      // The dot in `youtu.be` is escaped so look-alike hosts ("youtuXbe/") are rejected.
      /.*(?:youtu\.be\/|v\/|u\/\w\/|embed\/|watch\?v=)([^#&?]*).*/
    );
    if (match !== null && match[1].length === 11) return match[1];
    throw new Error("Failed to get youtube video id from the url");
  }

  /**
   * Creates a new instance of the YoutubeLoader class from a YouTube video
   * URL.
   * @param {string|URL} url The URL of the YouTube video.
   * @param {Object} [config] Optional configuration options for the YoutubeLoader
   * instance, excluding the videoId (any `videoId` in config is overridden).
   * @returns {YoutubeLoader} A new instance of the YoutubeLoader class.
   * @throws {Error} When the video id cannot be extracted from the url.
   */
  static createFromUrl(url, config = {}) {
    const videoId = YoutubeLoader.getVideoID(url);
    return new YoutubeLoader({ ...config, videoId });
  }

  /**
   * Loads the transcript and (optionally) video metadata for the configured
   * YouTube video. The transcript comes from the local `./youtube-transcript`
   * module; the metadata comes from the `youtubei.js` library.
   * @returns {Promise<Array<{pageContent: string, metadata: Object}>>} Langchain
   * like document list holding exactly 1 element with `pageContent` (the transcript)
   * and `metadata` (always `source`; plus `description`, `title`, `view_count`
   * and `author` when `addVideoInfo` is enabled).
   * @throws {Error} When the transcript is empty or any fetch step fails; the
   * underlying error is attached as `cause`.
   */
  async load() {
    const metadata = { source: this.#videoId };
    let transcript;

    try {
      // Required lazily, at call time, exactly like the original implementation.
      const { YoutubeTranscript } = require("./youtube-transcript");
      transcript = await YoutubeTranscript.fetchTranscript(this.#videoId, {
        lang: this.#language,
      });
      if (!transcript) throw new Error("Transcription not found");

      if (this.#addVideoInfo) {
        const { Innertube } = require("youtubei.js");
        const youtube = await Innertube.create();
        const info = (await youtube.getBasicInfo(this.#videoId)).basic_info;
        metadata.description = info.short_description;
        metadata.title = info.title;
        metadata.view_count = info.view_count;
        metadata.author = info.author;
      }
    } catch (e) {
      // Keep the same message callers already see, but preserve the root cause.
      throw new Error(
        `Failed to get YouTube video transcription: ${e?.message}`,
        { cause: e }
      );
    }

    return [{ pageContent: transcript, metadata }];
  }
}
|
||||
|
||||
module.exports.YoutubeLoader = YoutubeLoader;
|
@ -0,0 +1,115 @@
|
||||
const { parse } = require("node-html-parser");
|
||||
const RE_YOUTUBE =
|
||||
/(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
|
||||
const USER_AGENT =
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";
|
||||
|
||||
/**
 * Error raised by the in-house YouTube transcript fetcher.
 * The message is always prefixed with "[YoutubeTranscript] ".
 */
class YoutubeTranscriptError extends Error {
  /**
   * @param {string|Error} message Human readable message, or an underlying
   * Error. An Error is stringified into the message (unchanged from before)
   * and additionally kept as `cause` so its stack is not lost.
   */
  constructor(message) {
    super(
      `[YoutubeTranscript] ${message}`,
      message instanceof Error ? { cause: message } : undefined
    );
    // Make logs and stack traces identify this class instead of a generic "Error".
    this.name = "YoutubeTranscriptError";
  }
}
|
||||
|
||||
/**
|
||||
* Class to retrieve transcript if exist
|
||||
*/
|
||||
class YoutubeTranscript {
  /**
   * Fetch the transcript of a YouTube video as a single plain-text string.
   *
   * The watch page is downloaded, the caption track URL is read from the
   * embedded player data, and the caption document behind that URL is
   * flattened into text.
   *
   * @param {string} videoId Video url or video identifier.
   * @param {{lang?: string|null}} [config] Object with lang param (eg: en, es, hk, uk).
   * Defaults to "en". If no caption track matches the language, the first
   * available track is used instead.
   * @returns {Promise<string>} Caption segments joined by single spaces
   * (empty string when the caption document has no `text` elements).
   * @throws {YoutubeTranscriptError} When the id cannot be determined, a request
   * fails, or no caption track can be located.
   */
  static async fetchTranscript(videoId, config = {}) {
    // Referenced via the class name (not `this`) so the method still works
    // when it is detached from the class, e.g. passed as a callback.
    const identifier = YoutubeTranscript.retrieveVideoId(videoId);
    const lang = config?.lang ?? "en";

    try {
      const pageResponse = await fetch(
        `https://www.youtube.com/watch?v=${identifier}`,
        { headers: { "User-Agent": USER_AGENT } }
      );
      if (!pageResponse.ok)
        throw new Error(
          `Watch page request failed with status ${pageResponse.status}`
        );

      const transcriptUrl = YoutubeTranscript.#parseTranscriptEndpoint(
        parse(await pageResponse.text()),
        lang
      );
      if (!transcriptUrl)
        throw new Error("Failed to locate a transcript for this video!");

      // Result is hopefully some XML.
      const captionResponse = await fetch(transcriptUrl);
      if (!captionResponse.ok)
        throw new Error(
          `Transcript request failed with status ${captionResponse.status}`
        );
      const transcriptXML = parse(await captionResponse.text());

      // Each <text> element is one caption segment. Join them with a space so
      // the last word of a segment is not glued to the first word of the next,
      // then collapse any runs of whitespace.
      const segments = [];
      for (const chunk of transcriptXML.getElementsByTagName("text")) {
        segments.push(chunk.textContent);
      }
      return segments.join(" ").replace(/\s+/g, " ").trim();
    } catch (e) {
      throw new YoutubeTranscriptError(e);
    }
  }

  /**
   * Locate the caption track URL inside a parsed YouTube watch page.
   * @param document Parsed watch page exposing `getElementsByTagName`.
   * @param {string|null} [langCode=null] Preferred language code; a track whose
   * `languageCode` contains it wins, otherwise the first track is used.
   * @returns {string|null|undefined} The track `baseUrl`; `undefined` when the
   * page lists no caption tracks; `null` (after logging) when the player data
   * could not be found or parsed.
   */
  static #parseTranscriptEndpoint(document, langCode = null) {
    try {
      // Find the script tag that carries the player data.
      const playerScript = document
        .getElementsByTagName("script")
        .find((script) =>
          script.textContent.includes("var ytInitialPlayerResponse = {")
        );
      if (!playerScript)
        throw new Error("Could not find the player data script in the page.");

      // Take everything after the assignment, cut at the first "};" and put
      // back the closing brace that the cut removed.
      const [, afterAssignment = ""] = playerScript.textContent.split(
        "var ytInitialPlayerResponse = "
      );
      const dataString = `${afterAssignment.split("};")[0]}}`;

      const data = JSON.parse(dataString.trim()); // Attempt a JSON parse
      const availableCaptions =
        data?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];

      // If a language code was specified then search for its track, otherwise
      // (or when nothing matches) fall back to the first one.
      const preferredTrack = langCode
        ? availableCaptions.find((track) =>
            track.languageCode?.includes(langCode)
          )
        : undefined;

      return (preferredTrack ?? availableCaptions[0])?.baseUrl;
    } catch (e) {
      console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`);
      return null;
    }
  }

  /**
   * Retrieve video id from url or string
   * @param {string} videoId video url or video id. Any 11 character string is
   * returned as-is; anything else is matched against the YouTube url pattern.
   * @returns {string} The video id.
   * @throws {YoutubeTranscriptError} When the input is not a string or no id
   * can be extracted from it.
   */
  static retrieveVideoId(videoId) {
    if (typeof videoId === "string") {
      if (videoId.length === 11) return videoId;

      const matchedId = videoId.match(RE_YOUTUBE)?.[1];
      if (matchedId) return matchedId;
    }

    throw new YoutubeTranscriptError(
      "Impossible to retrieve Youtube video ID."
    );
  }
}
|
||||
|
||||
module.exports = {
|
||||
YoutubeTranscript,
|
||||
YoutubeTranscriptError,
|
||||
};
|
@ -1,17 +1,17 @@
|
||||
const { YoutubeLoader } = require("langchain/document_loaders/web/youtube");
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const { default: slugify } = require("slugify");
|
||||
const { v4 } = require("uuid");
|
||||
const { writeToServerDocuments } = require("../../files");
|
||||
const { tokenizeString } = require("../../tokenizer");
|
||||
const { YoutubeLoader } = require("./YoutubeLoader");
|
||||
|
||||
function validYoutubeVideoUrl(link) {
|
||||
const UrlPattern = require("url-pattern");
|
||||
const opts = new URL(link);
|
||||
const url = `${opts.protocol}//${opts.host}${
|
||||
opts.pathname
|
||||
}?v=${opts.searchParams.get("v")}`;
|
||||
const url = `${opts.protocol}//${opts.host}${opts.pathname}${
|
||||
opts.searchParams.has("v") ? `?v=${opts.searchParams.get("v")}` : ""
|
||||
}`;
|
||||
|
||||
const shortPatternMatch = new UrlPattern(
|
||||
"https\\://(www.)youtu.be/(:videoId)"
|
||||
@ -56,9 +56,7 @@ async function loadYouTubeTranscript({ url }) {
|
||||
}
|
||||
|
||||
const metadata = docs[0].metadata;
|
||||
let content = "";
|
||||
docs.forEach((doc) => (content = content.concat(doc.pageContent)));
|
||||
|
||||
const content = docs[0].pageContent;
|
||||
if (!content.length) {
|
||||
return {
|
||||
success: false,
|
||||
|
@ -503,6 +503,11 @@ body-parser@^1.20.2:
|
||||
type-is "~1.6.18"
|
||||
unpipe "1.0.0"
|
||||
|
||||
boolbase@^1.0.0:
|
||||
version "1.0.0"
|
||||
resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e"
|
||||
integrity sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==
|
||||
|
||||
brace-expansion@^1.1.7:
|
||||
version "1.1.11"
|
||||
resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd"
|
||||
@ -589,11 +594,6 @@ camelcase@6:
|
||||
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
|
||||
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==
|
||||
|
||||
centra@^2.6.0:
|
||||
version "2.6.0"
|
||||
resolved "https://registry.yarnpkg.com/centra/-/centra-2.6.0.tgz#79117998ee6908642258db263871381aa5d1204a"
|
||||
integrity sha512-dgh+YleemrT8u85QL11Z6tYhegAs3MMxsaWAq/oXeAmYJ7VxL3SI9TZtnfaEvNDMAPolj25FXIb3S+HCI4wQaQ==
|
||||
|
||||
chalk@^2.4.2:
|
||||
version "2.4.2"
|
||||
resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
|
||||
@ -796,6 +796,22 @@ crypt@0.0.2:
|
||||
resolved "https://registry.yarnpkg.com/crypt/-/crypt-0.0.2.tgz#88d7ff7ec0dfb86f713dc87bbb42d044d3e6c41b"
|
||||
integrity sha512-mCxBlsHFYh9C+HVpiEacem8FEBnMXgU9gy4zmNC+SXAZNB/1idgp/aulFJ4FgCi7GPEVbfyng092GqL2k2rmow==
|
||||
|
||||
css-select@^5.1.0:
|
||||
version "5.1.0"
|
||||
resolved "https://registry.yarnpkg.com/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6"
|
||||
integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==
|
||||
dependencies:
|
||||
boolbase "^1.0.0"
|
||||
css-what "^6.1.0"
|
||||
domhandler "^5.0.2"
|
||||
domutils "^3.0.1"
|
||||
nth-check "^2.0.1"
|
||||
|
||||
css-what@^6.1.0:
|
||||
version "6.1.0"
|
||||
resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4"
|
||||
integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==
|
||||
|
||||
data-uri-to-buffer@^6.0.0:
|
||||
version "6.0.1"
|
||||
resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-6.0.1.tgz#540bd4c8753a25ee129035aebdedf63b078703c7"
|
||||
@ -2244,6 +2260,14 @@ node-forge@^1.3.1:
|
||||
resolved "https://registry.yarnpkg.com/node-forge/-/node-forge-1.3.1.tgz#be8da2af243b2417d5f646a770663a92b7e9ded3"
|
||||
integrity sha512-dPEtOeMvF9VMcYV/1Wb8CPoVAXtp6MKMlcbAt4ddqmGqUJ6fQZFXkNZNkNlfevtNkGtaSoXf/vNNNSvgrdXwtA==
|
||||
|
||||
node-html-parser@^6.1.13:
|
||||
version "6.1.13"
|
||||
resolved "https://registry.yarnpkg.com/node-html-parser/-/node-html-parser-6.1.13.tgz#a1df799b83df5c6743fcd92740ba14682083b7e4"
|
||||
integrity sha512-qIsTMOY4C/dAa5Q5vsobRpOOvPfC4pB61UVW2uSwZNUp0QU/jCekTal1vMmbO0DgdHeLUJpv/ARmDqErVxA3Sg==
|
||||
dependencies:
|
||||
css-select "^5.1.0"
|
||||
he "1.2.0"
|
||||
|
||||
nodemailer@6.9.3:
|
||||
version "6.9.3"
|
||||
resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.3.tgz#e4425b85f05d83c43c5cd81bf84ab968f8ef5cbe"
|
||||
@ -2294,6 +2318,13 @@ npmlog@^5.0.1:
|
||||
gauge "^3.0.0"
|
||||
set-blocking "^2.0.0"
|
||||
|
||||
nth-check@^2.0.1:
|
||||
version "2.1.1"
|
||||
resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-2.1.1.tgz#c9eab428effce36cd6b92c924bdb000ef1f1ed1d"
|
||||
integrity sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==
|
||||
dependencies:
|
||||
boolbase "^1.0.0"
|
||||
|
||||
num-sort@^2.0.0:
|
||||
version "2.1.0"
|
||||
resolved "https://registry.yarnpkg.com/num-sort/-/num-sort-2.1.0.tgz#1cbb37aed071329fdf41151258bc011898577a9b"
|
||||
@ -2522,13 +2553,6 @@ pend@~1.2.0:
|
||||
resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
|
||||
integrity sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==
|
||||
|
||||
phin@^3.5.0:
|
||||
version "3.7.0"
|
||||
resolved "https://registry.yarnpkg.com/phin/-/phin-3.7.0.tgz#eeeff7660408515d8cf0c6252901012d4ab7153b"
|
||||
integrity sha512-DqnVNrpYhKGBZppNKprD+UJylMeEKOZxHgPB+ZP6mGzf3uA2uox4Ep9tUm+rUc8WLIdHT3HcAE4X8fhwQA9JKg==
|
||||
dependencies:
|
||||
centra "^2.6.0"
|
||||
|
||||
picomatch@^2.0.4, picomatch@^2.2.1:
|
||||
version "2.3.1"
|
||||
resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42"
|
||||
@ -3421,13 +3445,6 @@ yauzl@^2.10.0, yauzl@^2.4.2:
|
||||
buffer-crc32 "~0.2.3"
|
||||
fd-slicer "~1.1.0"
|
||||
|
||||
youtube-transcript@^1.0.6:
|
||||
version "1.0.6"
|
||||
resolved "https://registry.yarnpkg.com/youtube-transcript/-/youtube-transcript-1.0.6.tgz#8414c04380d3ef1102bd00ca3729e94c46ae7a14"
|
||||
integrity sha512-k/6uxB9voj/5astl6+q+VArX/aWHhnmle8BucvUCTYTQQEOSVlBiXkrI0KD3o8A0b44MV6q0bmVNiJFIpTlcZA==
|
||||
dependencies:
|
||||
phin "^3.5.0"
|
||||
|
||||
youtubei.js@^9.1.0:
|
||||
version "9.1.0"
|
||||
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.1.0.tgz#bcf154c9fa21d3c8c1d00a5e10360d0a065c660e"
|
||||
|
@ -74,10 +74,11 @@ export default function SettingsSidebar() {
|
||||
className={`z-99 fixed top-0 left-0 transition-all duration-500 w-[100vw] h-[100vh]`}
|
||||
>
|
||||
<div
|
||||
className={`${showBgOverlay
|
||||
className={`${
|
||||
showBgOverlay
|
||||
? "transition-all opacity-1"
|
||||
: "transition-none opacity-0"
|
||||
} duration-500 fixed top-0 left-0 ${USER_BACKGROUND_COLOR} bg-opacity-75 w-screen h-screen`}
|
||||
} duration-500 fixed top-0 left-0 ${USER_BACKGROUND_COLOR} bg-opacity-75 w-screen h-screen`}
|
||||
onClick={() => setShowSidebar(false)}
|
||||
/>
|
||||
<div
|
||||
@ -190,10 +191,11 @@ const Option = ({
|
||||
transition-all duration-[200ms]
|
||||
flex flex-grow w-[75%] gap-x-2 py-[6px] px-[12px] rounded-[4px] justify-start items-center
|
||||
hover:bg-workspace-item-selected-gradient hover:text-white hover:font-medium
|
||||
${isActive
|
||||
${
|
||||
isActive
|
||||
? "bg-menu-item-selected-gradient font-medium border-outline text-white"
|
||||
: "hover:bg-menu-item-selected-gradient text-zinc-200"
|
||||
}
|
||||
}
|
||||
`}
|
||||
>
|
||||
{React.cloneElement(icon, { weight: isActive ? "fill" : "regular" })}
|
||||
@ -204,8 +206,9 @@ const Option = ({
|
||||
</div>
|
||||
{!!subOptions && (isActive || hasActiveChild) && (
|
||||
<div
|
||||
className={`ml-4 ${hasActiveChild ? "" : "border-l-2 border-slate-400"
|
||||
} rounded-r-lg`}
|
||||
className={`ml-4 ${
|
||||
hasActiveChild ? "" : "border-l-2 border-slate-400"
|
||||
} rounded-r-lg`}
|
||||
>
|
||||
{subOptions}
|
||||
</div>
|
||||
|
Loading…
Reference in New Issue
Block a user