diff --git a/collector/package.json b/collector/package.json index 7c82014a..4a5a99ff 100644 --- a/collector/package.json +++ b/collector/package.json @@ -34,6 +34,7 @@ "mime": "^3.0.0", "moment": "^2.29.4", "multer": "^1.4.5-lts.1", + "node-html-parser": "^6.1.13", "officeparser": "^4.0.5", "openai": "^3.2.1", "pdf-parse": "^1.1.1", @@ -42,11 +43,10 @@ "url-pattern": "^1.0.3", "uuid": "^9.0.0", "wavefile": "^11.0.0", - "youtube-transcript": "^1.0.6", "youtubei.js": "^9.1.0" }, "devDependencies": { "nodemon": "^2.0.22", "prettier": "^2.4.1" } -} +} \ No newline at end of file diff --git a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js new file mode 100644 index 00000000..aac94eb4 --- /dev/null +++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js @@ -0,0 +1,90 @@ +/* + * This is just a custom implementation of the Langchain JS YouTubeLoader class + * as the dependency for YoutubeTranscript is quite fickle and its a rat race to keep it up + * and instead of waiting for patches we can just bring this simple script in-house and at least + * be able to patch it since its so flaky. When we have more connectors we can kill this because + * it will be a pain to maintain over time. + */ +class YoutubeLoader { + #videoId; + #language; + #addVideoInfo; + + constructor({ videoId = null, language = null, addVideoInfo = false } = {}) { + if (!videoId) throw new Error("Invalid video id!"); + this.#videoId = videoId; + this.#language = language; + this.#addVideoInfo = addVideoInfo; + } + + /** + * Extracts the videoId from a YouTube video URL. + * @param url The URL of the YouTube video. + * @returns The videoId of the YouTube video. 
+ */ + static getVideoID(url) { + const match = url.match( + /.*(?:youtu.be\/|v\/|u\/\w\/|embed\/|watch\?v=)([^#&?]*).*/ + ); + if (match !== null && match[1].length === 11) { + return match[1]; + } else { + throw new Error("Failed to get youtube video id from the url"); + } + } + + /** + * Creates a new instance of the YoutubeLoader class from a YouTube video + * URL. + * @param url The URL of the YouTube video. + * @param config Optional configuration options for the YoutubeLoader instance, excluding the videoId. + * @returns A new instance of the YoutubeLoader class. + */ + static createFromUrl(url, config = {}) { + const videoId = YoutubeLoader.getVideoID(url); + return new YoutubeLoader({ ...config, videoId }); + } + + /** + * Loads the transcript and video metadata from the specified YouTube + * video. It uses the youtube-transcript library to fetch the transcript + * and the youtubei.js library to fetch the video metadata. + * @returns Langchain like doc that is 1 element with PageContent and + */ + async load() { + let transcript; + const metadata = { + source: this.#videoId, + }; + try { + const { YoutubeTranscript } = require("./youtube-transcript"); + transcript = await YoutubeTranscript.fetchTranscript(this.#videoId, { + lang: this.#language, + }); + if (!transcript) { + throw new Error("Transcription not found"); + } + if (this.#addVideoInfo) { + const { Innertube } = require("youtubei.js"); + const youtube = await Innertube.create(); + const info = (await youtube.getBasicInfo(this.#videoId)).basic_info; + metadata.description = info.short_description; + metadata.title = info.title; + metadata.view_count = info.view_count; + metadata.author = info.author; + } + } catch (e) { + throw new Error( + `Failed to get YouTube video transcription: ${e?.message}` + ); + } + return [ + { + pageContent: transcript, + metadata, + }, + ]; + } +} + +module.exports.YoutubeLoader = YoutubeLoader; diff --git 
// Captures the 11 character video id from the common YouTube URL shapes.
const RE_YOUTUBE =
  /(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
// Sent with the watch-page request.
const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";

/**
 * Error type thrown by YoutubeTranscript; every message is prefixed with
 * "[YoutubeTranscript]".
 */
class YoutubeTranscriptError extends Error {
  /**
   * @param {string} message Human readable description of the failure.
   * @param {{cause?: unknown}} [options] Standard Error options, e.g. `cause`.
   */
  constructor(message, options) {
    super(`[YoutubeTranscript] ${message}`, options);
    this.name = "YoutubeTranscriptError";
  }
}

/**
 * Retrieves the caption transcript of a YouTube video, if one exists.
 * `parse` is the node-html-parser export required at the top of this module.
 */
class YoutubeTranscript {
  /**
   * Fetch the transcript of a YouTube video as one plain-text string.
   * @param {string} videoId Video url or video identifier.
   * @param {{lang?: string|null}} [config] `lang` is the preferred caption
   *   language code (eg: en, es, hk, uk); defaults to "en". If no track
   *   matches, the first available caption track is used instead.
   * @returns {Promise<string>} Caption chunks joined by single spaces.
   * @throws {YoutubeTranscriptError} When the id cannot be resolved, a request
   *   fails, or no caption track is found. The underlying error is attached
   *   as `cause`.
   */
  static async fetchTranscript(videoId, config = {}) {
    const identifier = YoutubeTranscript.retrieveVideoId(videoId);
    const lang = config?.lang ?? "en";
    try {
      const watchHtml = await YoutubeTranscript.#fetchText(
        `https://www.youtube.com/watch?v=${identifier}`,
        {
          headers: {
            "User-Agent": USER_AGENT,
          },
        }
      );
      const transcriptUrl = YoutubeTranscript.#parseTranscriptEndpoint(
        parse(watchHtml),
        lang
      );
      if (!transcriptUrl)
        throw new Error("Failed to locate a transcript for this video!");

      // Result is hopefully some XML made of <text> elements.
      const transcriptXML = parse(
        await YoutubeTranscript.#fetchText(transcriptUrl)
      );

      // Join chunks with a space: plain concatenation glues the last word of
      // one caption chunk to the first word of the next.
      return transcriptXML
        .getElementsByTagName("text")
        .map((chunk) => chunk.textContent.trim())
        .filter((text) => text.length > 0)
        .join(" ")
        .replace(/\s+/g, " ");
    } catch (e) {
      if (e instanceof YoutubeTranscriptError) throw e;
      // Pass the message (not the Error object) so the text does not become
      // "[YoutubeTranscript] Error: ..." and keep the original as `cause`.
      throw new YoutubeTranscriptError(e?.message ?? String(e), { cause: e });
    }
  }

  /**
   * GET `url` and return the response body as text.
   * @throws {Error} When the response status is not in the 2xx range.
   */
  static async #fetchText(url, init) {
    const res = await fetch(url, init);
    if (!res.ok)
      throw new Error(`YouTube responded with HTTP status ${res.status}`);
    return res.text();
  }

  /**
   * Return the substring of `source` holding the object literal that opens at
   * `startIndex`, found by counting braces while skipping string contents.
   * Cutting at the first "};" is not safe because that sequence can appear
   * inside a string value.
   * @throws {Error} When the object is never closed.
   */
  static #extractJsonObject(source, startIndex) {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = startIndex; i < source.length; i++) {
      const ch = source[i];
      if (inString) {
        if (escaped) escaped = false;
        else if (ch === "\\") escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }
      if (ch === '"') {
        inString = true;
      } else if (ch === "{") {
        depth += 1;
      } else if (ch === "}") {
        depth -= 1;
        if (depth === 0) return source.slice(startIndex, i + 1);
      }
    }
    throw new Error("Unterminated ytInitialPlayerResponse object");
  }

  /**
   * Locate the caption track URL inside a parsed watch page.
   * @param document Parsed watch page (node-html-parser root).
   * @param {string|null} langCode Preferred caption language code.
   * @returns The `baseUrl` of the chosen caption track, `undefined` when the
   *   video has no caption tracks, or `null` when the page cannot be parsed.
   */
  static #parseTranscriptEndpoint(document, langCode = null) {
    try {
      const marker = "var ytInitialPlayerResponse = ";

      // Find the script tag that declares the player data.
      const playerScript = document
        .getElementsByTagName("script")
        .find((script) => script.textContent.includes(`${marker}{`));
      if (!playerScript)
        throw new Error("Could not find the player response on the page");

      const scriptText = playerScript.textContent;
      const objectStart = scriptText.indexOf(`${marker}{`) + marker.length;
      const data = JSON.parse(
        YoutubeTranscript.#extractJsonObject(scriptText, objectStart)
      );
      const availableCaptions =
        data?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];

      // Prefer an exact language match, then a partial one (eg: "en" for
      // "en-GB"), and finally fall back to the first track.
      let captionTrack = availableCaptions[0];
      if (langCode)
        captionTrack =
          availableCaptions.find(
            (track) => track?.languageCode === langCode
          ) ??
          availableCaptions.find((track) =>
            track?.languageCode?.includes(langCode)
          ) ??
          availableCaptions[0];

      return captionTrack?.baseUrl;
    } catch (e) {
      console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`);
      return null;
    }
  }

  /**
   * Retrieve video id from url or string
   * @param {string} videoId video url or video id
   * @returns {string} The video id. Any 11 character string is returned
   *   unchanged.
   * @throws {YoutubeTranscriptError} When the input is not a string or no id
   *   can be found in it.
   */
  static retrieveVideoId(videoId) {
    if (typeof videoId === "string") {
      if (videoId.length === 11) {
        return videoId;
      }
      const matchId = videoId.match(RE_YOUTUBE);
      if (matchId && matchId.length) {
        return matchId[1];
      }
    }
    throw new YoutubeTranscriptError(
      "Impossible to retrieve Youtube video ID."
    );
  }
}
`?v=${opts.searchParams.get("v")}` : "" + }`; const shortPatternMatch = new UrlPattern( "https\\://(www.)youtu.be/(:videoId)" @@ -56,9 +56,7 @@ async function loadYouTubeTranscript({ url }) { } const metadata = docs[0].metadata; - let content = ""; - docs.forEach((doc) => (content = content.concat(doc.pageContent))); - + const content = docs[0].pageContent; if (!content.length) { return { success: false, diff --git a/collector/yarn.lock b/collector/yarn.lock index f7b7b696..0938c995 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -503,6 +503,11 @@ body-parser@^1.20.2: type-is "~1.6.18" unpipe "1.0.0" +boolbase@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e" + integrity sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww== + brace-expansion@^1.1.7: version "1.1.11" resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd" @@ -589,11 +594,6 @@ camelcase@6: resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a" integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA== -centra@^2.6.0: - version "2.6.0" - resolved "https://registry.yarnpkg.com/centra/-/centra-2.6.0.tgz#79117998ee6908642258db263871381aa5d1204a" - integrity sha512-dgh+YleemrT8u85QL11Z6tYhegAs3MMxsaWAq/oXeAmYJ7VxL3SI9TZtnfaEvNDMAPolj25FXIb3S+HCI4wQaQ== - chalk@^2.4.2: version "2.4.2" resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424" @@ -796,6 +796,22 @@ crypt@0.0.2: resolved "https://registry.yarnpkg.com/crypt/-/crypt-0.0.2.tgz#88d7ff7ec0dfb86f713dc87bbb42d044d3e6c41b" integrity sha512-mCxBlsHFYh9C+HVpiEacem8FEBnMXgU9gy4zmNC+SXAZNB/1idgp/aulFJ4FgCi7GPEVbfyng092GqL2k2rmow== +css-select@^5.1.0: + version "5.1.0" + resolved 
"https://registry.yarnpkg.com/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6" + integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg== + dependencies: + boolbase "^1.0.0" + css-what "^6.1.0" + domhandler "^5.0.2" + domutils "^3.0.1" + nth-check "^2.0.1" + +css-what@^6.1.0: + version "6.1.0" + resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4" + integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw== + data-uri-to-buffer@^6.0.0: version "6.0.1" resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-6.0.1.tgz#540bd4c8753a25ee129035aebdedf63b078703c7" @@ -2244,6 +2260,14 @@ node-forge@^1.3.1: resolved "https://registry.yarnpkg.com/node-forge/-/node-forge-1.3.1.tgz#be8da2af243b2417d5f646a770663a92b7e9ded3" integrity sha512-dPEtOeMvF9VMcYV/1Wb8CPoVAXtp6MKMlcbAt4ddqmGqUJ6fQZFXkNZNkNlfevtNkGtaSoXf/vNNNSvgrdXwtA== +node-html-parser@^6.1.13: + version "6.1.13" + resolved "https://registry.yarnpkg.com/node-html-parser/-/node-html-parser-6.1.13.tgz#a1df799b83df5c6743fcd92740ba14682083b7e4" + integrity sha512-qIsTMOY4C/dAa5Q5vsobRpOOvPfC4pB61UVW2uSwZNUp0QU/jCekTal1vMmbO0DgdHeLUJpv/ARmDqErVxA3Sg== + dependencies: + css-select "^5.1.0" + he "1.2.0" + nodemailer@6.9.3: version "6.9.3" resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.3.tgz#e4425b85f05d83c43c5cd81bf84ab968f8ef5cbe" @@ -2294,6 +2318,13 @@ npmlog@^5.0.1: gauge "^3.0.0" set-blocking "^2.0.0" +nth-check@^2.0.1: + version "2.1.1" + resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-2.1.1.tgz#c9eab428effce36cd6b92c924bdb000ef1f1ed1d" + integrity sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w== + dependencies: + boolbase "^1.0.0" + num-sort@^2.0.0: version "2.1.0" resolved 
"https://registry.yarnpkg.com/num-sort/-/num-sort-2.1.0.tgz#1cbb37aed071329fdf41151258bc011898577a9b" @@ -2522,13 +2553,6 @@ pend@~1.2.0: resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50" integrity sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg== -phin@^3.5.0: - version "3.7.0" - resolved "https://registry.yarnpkg.com/phin/-/phin-3.7.0.tgz#eeeff7660408515d8cf0c6252901012d4ab7153b" - integrity sha512-DqnVNrpYhKGBZppNKprD+UJylMeEKOZxHgPB+ZP6mGzf3uA2uox4Ep9tUm+rUc8WLIdHT3HcAE4X8fhwQA9JKg== - dependencies: - centra "^2.6.0" - picomatch@^2.0.4, picomatch@^2.2.1: version "2.3.1" resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42" @@ -3421,13 +3445,6 @@ yauzl@^2.10.0, yauzl@^2.4.2: buffer-crc32 "~0.2.3" fd-slicer "~1.1.0" -youtube-transcript@^1.0.6: - version "1.0.6" - resolved "https://registry.yarnpkg.com/youtube-transcript/-/youtube-transcript-1.0.6.tgz#8414c04380d3ef1102bd00ca3729e94c46ae7a14" - integrity sha512-k/6uxB9voj/5astl6+q+VArX/aWHhnmle8BucvUCTYTQQEOSVlBiXkrI0KD3o8A0b44MV6q0bmVNiJFIpTlcZA== - dependencies: - phin "^3.5.0" - youtubei.js@^9.1.0: version "9.1.0" resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.1.0.tgz#bcf154c9fa21d3c8c1d00a5e10360d0a065c660e" diff --git a/frontend/src/components/SettingsSidebar/index.jsx b/frontend/src/components/SettingsSidebar/index.jsx index 15e53aa5..40450d4e 100644 --- a/frontend/src/components/SettingsSidebar/index.jsx +++ b/frontend/src/components/SettingsSidebar/index.jsx @@ -74,10 +74,11 @@ export default function SettingsSidebar() { className={`z-99 fixed top-0 left-0 transition-all duration-500 w-[100vw] h-[100vh]`} >
setShowSidebar(false)} />
{React.cloneElement(icon, { weight: isActive ? "fill" : "regular" })} @@ -204,8 +206,9 @@ const Option = ({
{!!subOptions && (isActive || hasActiveChild) && (
{subOptions}