Remove YoutubeLoader dependency (#1050)

* WIP data connector redesign

* new UI for data connectors complete

* remove old data connector page/cleanup imports

* cleanup of UI and imports

* Remove Youtube Transcript dep and move in-house

* lang pref default to en

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
This commit is contained in:
Timothy Carambat 2024-04-05 16:33:01 -07:00 committed by GitHub
parent 004b1f8db5
commit 1f8ab0d245
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 257 additions and 34 deletions

View File

@ -34,6 +34,7 @@
"mime": "^3.0.0",
"moment": "^2.29.4",
"multer": "^1.4.5-lts.1",
"node-html-parser": "^6.1.13",
"officeparser": "^4.0.5",
"openai": "^3.2.1",
"pdf-parse": "^1.1.1",
@ -42,7 +43,6 @@
"url-pattern": "^1.0.3",
"uuid": "^9.0.0",
"wavefile": "^11.0.0",
"youtube-transcript": "^1.0.6",
"youtubei.js": "^9.1.0"
},
"devDependencies": {

View File

@ -0,0 +1,90 @@
/*
* This is a custom implementation of the Langchain JS YoutubeLoader class.
* The YoutubeTranscript dependency is quite fickle and it's a rat race to keep up with it,
* so instead of waiting for upstream patches we bring this simple script in-house, where we can
* at least patch it ourselves whenever it breaks. When we have more connectors we can remove this,
* because it will be a pain to maintain over time.
*/
class YoutubeLoader {
  #videoId;
  #language;
  #addVideoInfo;

  /**
   * @param {Object} [options]
   * @param {string|null} [options.videoId] Id of the video to load. Required.
   * @param {string|null} [options.language] Caption language code forwarded to the transcript fetcher.
   * @param {boolean} [options.addVideoInfo] When true, `load()` also collects title, description,
   * view count and author into the document metadata.
   * @throws {Error} When no videoId is supplied.
   */
  constructor({ videoId = null, language = null, addVideoInfo = false } = {}) {
    if (!videoId) throw new Error("Invalid video id!");
    this.#videoId = videoId;
    this.#language = language;
    this.#addVideoInfo = addVideoInfo;
  }

  /**
   * Extracts the videoId from a YouTube video URL.
   * @param {string} url The URL of the YouTube video.
   * @returns {string} The 11 character videoId of the YouTube video.
   * @throws {Error} When the url is not a string or no 11 character id can be found in it.
   */
  static getVideoID(url) {
    // The dot in "youtu.be" is escaped so that only the literal host matches.
    const match =
      typeof url === "string"
        ? url.match(
            /.*(?:youtu\.be\/|v\/|u\/\w\/|embed\/|watch\?v=)([^#&?]*).*/
          )
        : null;
    if (match !== null && match[1].length === 11) {
      return match[1];
    }
    throw new Error("Failed to get youtube video id from the url");
  }

  /**
   * Creates a new instance of the YoutubeLoader class from a YouTube video
   * URL.
   * @param {string} url The URL of the YouTube video.
   * @param {Object} [config] Optional configuration options for the YoutubeLoader instance, excluding the videoId.
   * @returns {YoutubeLoader} A new instance of the YoutubeLoader class.
   * @throws {Error} When the video id cannot be extracted from the url.
   */
  static createFromUrl(url, config = {}) {
    const videoId = YoutubeLoader.getVideoID(url);
    // videoId is spread last so it always wins over anything present in config.
    return new YoutubeLoader({ ...config, videoId });
  }

  /**
   * Loads the transcript and, optionally, the video metadata of the configured
   * YouTube video. The transcript comes from the local `./youtube-transcript`
   * module; metadata is read through the youtubei.js library only when
   * `addVideoInfo` was enabled.
   * @returns {Promise<Array<{pageContent: string, metadata: Object}>>} Langchain-like
   * document list holding exactly one element with `pageContent` (the transcript)
   * and `metadata` (always `source`, plus the video info when requested).
   * @throws {Error} When the transcript or the video info cannot be retrieved. The
   * original failure is attached as `cause`.
   */
  async load() {
    const metadata = {
      source: this.#videoId,
    };
    let transcript;
    try {
      // Required lazily so the modules are only loaded when a transcript is requested.
      const { YoutubeTranscript } = require("./youtube-transcript");
      transcript = await YoutubeTranscript.fetchTranscript(this.#videoId, {
        lang: this.#language,
      });
      if (!transcript) {
        throw new Error("Transcription not found");
      }
      if (this.#addVideoInfo) {
        const { Innertube } = require("youtubei.js");
        const youtube = await Innertube.create();
        const info = (await youtube.getBasicInfo(this.#videoId)).basic_info;
        metadata.description = info.short_description;
        metadata.title = info.title;
        metadata.view_count = info.view_count;
        metadata.author = info.author;
      }
    } catch (e) {
      throw new Error(
        `Failed to get YouTube video transcription: ${e?.message}`,
        { cause: e }
      );
    }
    return [
      {
        pageContent: transcript,
        metadata,
      },
    ];
  }
}
module.exports.YoutubeLoader = YoutubeLoader;

View File

@ -0,0 +1,115 @@
const { parse } = require("node-html-parser");
// Captures the 11 character video id (group 1) from youtube.com style URLs
// (v/, e/, embed/ paths or a `v=` query parameter) and from youtu.be short links.
const RE_YOUTUBE =
/(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
// User-Agent header value sent when requesting the YouTube watch page.
const USER_AGENT =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";
/**
 * Error raised by the YoutubeTranscript helpers. Every message is prefixed
 * with "[YoutubeTranscript]" so its origin is obvious in logs.
 */
class YoutubeTranscriptError extends Error {
  /**
   * @param {string} message Human readable description of the failure.
   * @param {{cause?: unknown}} [options] Standard Error options, e.g. `{ cause }`
   * to keep a reference to the underlying failure.
   */
  constructor(message, options) {
    super(`[YoutubeTranscript] ${message}`, options);
    // Without this the error reports itself as a plain "Error".
    this.name = "YoutubeTranscriptError";
  }
}
/**
 * Class to retrieve the caption transcript of a YouTube video, if one exists,
 * by reading the player data embedded in the video's watch page.
 */
class YoutubeTranscript {
  /**
   * Fetch the transcript of a YouTube video.
   * @param {string} videoId Video url or video identifier
   * @param {{lang?: (string|null)}} [config] Object with lang param (eg: en, es, hk, uk) format.
   * `lang` defaults to "en". A caption track for that language is preferred; when
   * none matches, the first available caption track is used instead.
   * @returns {Promise<string>} The caption text, with the individual caption chunks
   * separated by a single space.
   * @throws {YoutubeTranscriptError} When the video id cannot be determined, a request
   * fails, or no caption track can be located. The underlying error is kept on `cause`.
   */
  static async fetchTranscript(videoId, config = {}) {
    const identifier = this.retrieveVideoId(videoId);
    const lang = config?.lang ?? "en";
    try {
      const pageHtml = await YoutubeTranscript.#fetchText(
        `https://www.youtube.com/watch?v=${encodeURIComponent(identifier)}`,
        {
          headers: {
            "User-Agent": USER_AGENT,
          },
        }
      );
      const transcriptUrl = YoutubeTranscript.#parseTranscriptEndpoint(
        parse(pageHtml),
        lang
      );
      if (!transcriptUrl)
        throw new Error("Failed to locate a transcript for this video!");

      // Result is hopefully some XML.
      const transcriptXML = parse(
        await YoutubeTranscript.#fetchText(transcriptUrl)
      );
      const chunks = transcriptXML.getElementsByTagName("text");
      // Chunks are separate caption lines; joining them without a separator
      // would fuse the last word of one line with the first word of the next.
      return Array.from(chunks, (chunk) => (chunk.textContent ?? "").trim())
        .filter(Boolean)
        .join(" ");
    } catch (e) {
      // Use the message only - interpolating the Error object itself would
      // produce "[YoutubeTranscript] Error: ..." - and keep the original as cause.
      const reason = e instanceof Error ? e.message : String(e);
      const wrapped = new YoutubeTranscriptError(reason);
      wrapped.cause = e;
      throw wrapped;
    }
  }

  /**
   * Performs a GET request and resolves with the response body as text.
   * @param {string} url Address to request.
   * @param {Object} [options] Options forwarded to `fetch`.
   * @returns {Promise<string>}
   * @throws {Error} When the response status is not in the 2xx range.
   */
  static async #fetchText(url, options = undefined) {
    const response = await fetch(url, options);
    if (!response.ok)
      throw new Error(`Request failed with HTTP status ${response.status}`);
    return response.text();
  }

  /**
   * Returns the balanced `{...}` object text that starts at `startIndex`.
   * Braces inside double-quoted strings are ignored, so string values that
   * happen to contain "}" or "};" do not end the object early.
   * @param {string} source Text containing the object.
   * @param {number} startIndex Index of the opening "{".
   * @returns {string|null} The object text, or null when it is never closed.
   */
  static #extractObjectLiteral(source, startIndex) {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = startIndex; i < source.length; i++) {
      const char = source[i];
      if (inString) {
        if (escaped) escaped = false;
        else if (char === "\\") escaped = true;
        else if (char === '"') inString = false;
        continue;
      }
      if (char === '"') {
        inString = true;
      } else if (char === "{") {
        depth++;
      } else if (char === "}") {
        depth--;
        if (depth === 0) return source.slice(startIndex, i + 1);
      }
    }
    return null;
  }

  /**
   * Locates the caption track URL inside the parsed watch page.
   * @param {Object} document Parsed watch page exposing `getElementsByTagName`.
   * @param {string|null} langCode Preferred caption language code, if any.
   * @returns {string|null|undefined} The caption track `baseUrl`; undefined when the
   * video has no caption tracks, null when the player data could not be read
   * (the reason is logged).
   */
  static #parseTranscriptEndpoint(document, langCode = null) {
    try {
      const marker = "var ytInitialPlayerResponse = ";
      // Get all script tags on document page
      const scripts = document.getElementsByTagName("script");
      // find the player data script.
      const playerScript = scripts.find((script) =>
        script.textContent.includes(`${marker}{`)
      );
      if (!playerScript) throw new Error("Player response script not found.");

      const source = playerScript.textContent;
      const objectStart = source.indexOf(`${marker}{`) + marker.length;
      const dataString = YoutubeTranscript.#extractObjectLiteral(
        source,
        objectStart
      );
      if (!dataString) throw new Error("Player response object is incomplete.");
      const data = JSON.parse(dataString); // Attempt a JSON parse

      const captionTracks =
        data?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
      const availableCaptions = Array.isArray(captionTracks)
        ? captionTracks
        : [];

      // If a language code was specified then search for its track - exact code
      // first, then a regional variant (eg: en-US), then any code containing it -
      // otherwise (or when nothing matches) get the first.
      const codeOf = (track) => String(track?.languageCode ?? "");
      const preferred = langCode
        ? availableCaptions.find((track) => codeOf(track) === langCode) ??
          availableCaptions.find((track) =>
            codeOf(track).startsWith(`${langCode}-`)
          ) ??
          availableCaptions.find((track) => codeOf(track).includes(langCode))
        : undefined;
      return (preferred ?? availableCaptions[0])?.baseUrl;
    } catch (e) {
      console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`);
      return null;
    }
  }

  /**
   * Retrieve video id from url or string
   * @param {string} videoId video url or video id
   * @returns {string} The input itself when it is 11 characters long, otherwise
   * the id captured from the url.
   * @throws {YoutubeTranscriptError} When the input is not a string or no id can be found.
   */
  static retrieveVideoId(videoId) {
    if (typeof videoId === "string") {
      if (videoId.length === 11) {
        return videoId;
      }
      const matchId = videoId.match(RE_YOUTUBE);
      if (matchId?.[1]) {
        return matchId[1];
      }
    }
    throw new YoutubeTranscriptError(
      "Impossible to retrieve Youtube video ID."
    );
  }
}
module.exports = {
YoutubeTranscript,
YoutubeTranscriptError,
};

View File

@ -1,17 +1,17 @@
const { YoutubeLoader } = require("langchain/document_loaders/web/youtube");
const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const { YoutubeLoader } = require("./YoutubeLoader");
function validYoutubeVideoUrl(link) {
const UrlPattern = require("url-pattern");
const opts = new URL(link);
const url = `${opts.protocol}//${opts.host}${
opts.pathname
}?v=${opts.searchParams.get("v")}`;
const url = `${opts.protocol}//${opts.host}${opts.pathname}${
opts.searchParams.has("v") ? `?v=${opts.searchParams.get("v")}` : ""
}`;
const shortPatternMatch = new UrlPattern(
"https\\://(www.)youtu.be/(:videoId)"
@ -56,9 +56,7 @@ async function loadYouTubeTranscript({ url }) {
}
const metadata = docs[0].metadata;
let content = "";
docs.forEach((doc) => (content = content.concat(doc.pageContent)));
const content = docs[0].pageContent;
if (!content.length) {
return {
success: false,

View File

@ -503,6 +503,11 @@ body-parser@^1.20.2:
type-is "~1.6.18"
unpipe "1.0.0"
boolbase@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e"
integrity sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==
brace-expansion@^1.1.7:
version "1.1.11"
resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd"
@ -589,11 +594,6 @@ camelcase@6:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==
centra@^2.6.0:
version "2.6.0"
resolved "https://registry.yarnpkg.com/centra/-/centra-2.6.0.tgz#79117998ee6908642258db263871381aa5d1204a"
integrity sha512-dgh+YleemrT8u85QL11Z6tYhegAs3MMxsaWAq/oXeAmYJ7VxL3SI9TZtnfaEvNDMAPolj25FXIb3S+HCI4wQaQ==
chalk@^2.4.2:
version "2.4.2"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
@ -796,6 +796,22 @@ crypt@0.0.2:
resolved "https://registry.yarnpkg.com/crypt/-/crypt-0.0.2.tgz#88d7ff7ec0dfb86f713dc87bbb42d044d3e6c41b"
integrity sha512-mCxBlsHFYh9C+HVpiEacem8FEBnMXgU9gy4zmNC+SXAZNB/1idgp/aulFJ4FgCi7GPEVbfyng092GqL2k2rmow==
css-select@^5.1.0:
version "5.1.0"
resolved "https://registry.yarnpkg.com/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6"
integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==
dependencies:
boolbase "^1.0.0"
css-what "^6.1.0"
domhandler "^5.0.2"
domutils "^3.0.1"
nth-check "^2.0.1"
css-what@^6.1.0:
version "6.1.0"
resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4"
integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==
data-uri-to-buffer@^6.0.0:
version "6.0.1"
resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-6.0.1.tgz#540bd4c8753a25ee129035aebdedf63b078703c7"
@ -2244,6 +2260,14 @@ node-forge@^1.3.1:
resolved "https://registry.yarnpkg.com/node-forge/-/node-forge-1.3.1.tgz#be8da2af243b2417d5f646a770663a92b7e9ded3"
integrity sha512-dPEtOeMvF9VMcYV/1Wb8CPoVAXtp6MKMlcbAt4ddqmGqUJ6fQZFXkNZNkNlfevtNkGtaSoXf/vNNNSvgrdXwtA==
node-html-parser@^6.1.13:
version "6.1.13"
resolved "https://registry.yarnpkg.com/node-html-parser/-/node-html-parser-6.1.13.tgz#a1df799b83df5c6743fcd92740ba14682083b7e4"
integrity sha512-qIsTMOY4C/dAa5Q5vsobRpOOvPfC4pB61UVW2uSwZNUp0QU/jCekTal1vMmbO0DgdHeLUJpv/ARmDqErVxA3Sg==
dependencies:
css-select "^5.1.0"
he "1.2.0"
nodemailer@6.9.3:
version "6.9.3"
resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.3.tgz#e4425b85f05d83c43c5cd81bf84ab968f8ef5cbe"
@ -2294,6 +2318,13 @@ npmlog@^5.0.1:
gauge "^3.0.0"
set-blocking "^2.0.0"
nth-check@^2.0.1:
version "2.1.1"
resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-2.1.1.tgz#c9eab428effce36cd6b92c924bdb000ef1f1ed1d"
integrity sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==
dependencies:
boolbase "^1.0.0"
num-sort@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/num-sort/-/num-sort-2.1.0.tgz#1cbb37aed071329fdf41151258bc011898577a9b"
@ -2522,13 +2553,6 @@ pend@~1.2.0:
resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
integrity sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==
phin@^3.5.0:
version "3.7.0"
resolved "https://registry.yarnpkg.com/phin/-/phin-3.7.0.tgz#eeeff7660408515d8cf0c6252901012d4ab7153b"
integrity sha512-DqnVNrpYhKGBZppNKprD+UJylMeEKOZxHgPB+ZP6mGzf3uA2uox4Ep9tUm+rUc8WLIdHT3HcAE4X8fhwQA9JKg==
dependencies:
centra "^2.6.0"
picomatch@^2.0.4, picomatch@^2.2.1:
version "2.3.1"
resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42"
@ -3421,13 +3445,6 @@ yauzl@^2.10.0, yauzl@^2.4.2:
buffer-crc32 "~0.2.3"
fd-slicer "~1.1.0"
youtube-transcript@^1.0.6:
version "1.0.6"
resolved "https://registry.yarnpkg.com/youtube-transcript/-/youtube-transcript-1.0.6.tgz#8414c04380d3ef1102bd00ca3729e94c46ae7a14"
integrity sha512-k/6uxB9voj/5astl6+q+VArX/aWHhnmle8BucvUCTYTQQEOSVlBiXkrI0KD3o8A0b44MV6q0bmVNiJFIpTlcZA==
dependencies:
phin "^3.5.0"
youtubei.js@^9.1.0:
version "9.1.0"
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.1.0.tgz#bcf154c9fa21d3c8c1d00a5e10360d0a065c660e"

View File

@ -74,10 +74,11 @@ export default function SettingsSidebar() {
className={`z-99 fixed top-0 left-0 transition-all duration-500 w-[100vw] h-[100vh]`}
>
<div
className={`${showBgOverlay
className={`${
showBgOverlay
? "transition-all opacity-1"
: "transition-none opacity-0"
} duration-500 fixed top-0 left-0 ${USER_BACKGROUND_COLOR} bg-opacity-75 w-screen h-screen`}
} duration-500 fixed top-0 left-0 ${USER_BACKGROUND_COLOR} bg-opacity-75 w-screen h-screen`}
onClick={() => setShowSidebar(false)}
/>
<div
@ -190,10 +191,11 @@ const Option = ({
transition-all duration-[200ms]
flex flex-grow w-[75%] gap-x-2 py-[6px] px-[12px] rounded-[4px] justify-start items-center
hover:bg-workspace-item-selected-gradient hover:text-white hover:font-medium
${isActive
${
isActive
? "bg-menu-item-selected-gradient font-medium border-outline text-white"
: "hover:bg-menu-item-selected-gradient text-zinc-200"
}
}
`}
>
{React.cloneElement(icon, { weight: isActive ? "fill" : "regular" })}
@ -204,8 +206,9 @@ const Option = ({
</div>
{!!subOptions && (isActive || hasActiveChild) && (
<div
className={`ml-4 ${hasActiveChild ? "" : "border-l-2 border-slate-400"
} rounded-r-lg`}
className={`ml-4 ${
hasActiveChild ? "" : "border-l-2 border-slate-400"
} rounded-r-lg`}
>
{subOptions}
</div>