diff --git a/collector/package.json b/collector/package.json
index 7c82014a4..4a5a99fff 100644
--- a/collector/package.json
+++ b/collector/package.json
@@ -34,6 +34,7 @@
"mime": "^3.0.0",
"moment": "^2.29.4",
"multer": "^1.4.5-lts.1",
+ "node-html-parser": "^6.1.13",
"officeparser": "^4.0.5",
"openai": "^3.2.1",
"pdf-parse": "^1.1.1",
@@ -42,11 +43,10 @@
"url-pattern": "^1.0.3",
"uuid": "^9.0.0",
"wavefile": "^11.0.0",
- "youtube-transcript": "^1.0.6",
"youtubei.js": "^9.1.0"
},
"devDependencies": {
"nodemon": "^2.0.22",
"prettier": "^2.4.1"
}
-}
+}
\ No newline at end of file
diff --git a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js
new file mode 100644
index 000000000..aac94eb48
--- /dev/null
+++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js
@@ -0,0 +1,90 @@
+/*
+ * This is a custom implementation of the Langchain JS YoutubeLoader class.
+ * The youtube-transcript dependency is quite fickle and it's a rat race to keep it working,
+ * so instead of waiting for upstream patches we bring this simple script in-house and can
+ * at least patch it ourselves since it's so flaky. When we have more connectors we can kill
+ * this because it will be a pain to maintain over time.
+ */
+class YoutubeLoader {
+ #videoId;
+ #language;
+ #addVideoInfo;
+
+ constructor({ videoId = null, language = null, addVideoInfo = false } = {}) {
+ if (!videoId) throw new Error("Invalid video id!"); // videoId is the only required option
+ this.#videoId = videoId;
+ this.#language = language;
+ this.#addVideoInfo = addVideoInfo;
+ }
+
+ /**
+ * Extracts the videoId from a YouTube video URL.
+ * @param url The URL of the YouTube video.
+ * @returns The 11-character videoId; throws an Error when the URL yields no such id.
+ */
+ static getVideoID(url) {
+ const match = url.match(
+ /.*(?:youtu.be\/|v\/|u\/\w\/|embed\/|watch\?v=)([^#&?]*).*/
+ );
+ if (match !== null && match[1].length === 11) {
+ return match[1];
+ } else {
+ throw new Error("Failed to get youtube video id from the url");
+ }
+ }
+
+ /**
+ * Creates a new instance of the YoutubeLoader class from a YouTube video
+ * URL. Throws when getVideoID cannot extract an id from the URL.
+ * @param url The URL of the YouTube video.
+ * @param config Optional configuration options (language, addVideoInfo) for the YoutubeLoader instance, excluding the videoId.
+ * @returns A new instance of the YoutubeLoader class.
+ */
+ static createFromUrl(url, config = {}) {
+ const videoId = YoutubeLoader.getVideoID(url);
+ return new YoutubeLoader({ ...config, videoId });
+ }
+
+ /**
+ * Loads the transcript and, when addVideoInfo is set, the video metadata for the
+ * configured video. The transcript comes from the local ./youtube-transcript module
+ * and the metadata from the youtubei.js library. Any failure is rethrown as a single Error.
+ * @returns Langchain-like doc array with 1 element holding pageContent and metadata.
+ */
+ async load() {
+ let transcript;
+ const metadata = {
+ source: this.#videoId,
+ };
+ try {
+ const { YoutubeTranscript } = require("./youtube-transcript"); // required lazily, only when load() runs
+ transcript = await YoutubeTranscript.fetchTranscript(this.#videoId, {
+ lang: this.#language,
+ });
+ if (!transcript) { // an empty transcript string is also treated as missing
+ throw new Error("Transcription not found");
+ }
+ if (this.#addVideoInfo) {
+ const { Innertube } = require("youtubei.js");
+ const youtube = await Innertube.create();
+ const info = (await youtube.getBasicInfo(this.#videoId)).basic_info;
+ metadata.description = info.short_description;
+ metadata.title = info.title;
+ metadata.view_count = info.view_count;
+ metadata.author = info.author;
+ }
+ } catch (e) {
+ throw new Error(
+ `Failed to get YouTube video transcription: ${e?.message}`
+ );
+ }
+ return [
+ {
+ pageContent: transcript,
+ metadata,
+ },
+ ];
+ }
+}
+
+module.exports.YoutubeLoader = YoutubeLoader;
diff --git a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
new file mode 100644
index 000000000..c81c0ec56
--- /dev/null
+++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
@@ -0,0 +1,115 @@
+const { parse } = require("node-html-parser");
+const RE_YOUTUBE =
+ /(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
+const USER_AGENT =
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";
+
+class YoutubeTranscriptError extends Error { // Error type thrown by the transcript code in this file
+ constructor(message) {
+ super(`[YoutubeTranscript] ${message}`); // every message is prefixed with "[YoutubeTranscript] "
+ }
+}
+
+/**
+ * Class to retrieve a video's transcript, if one exists.
+ */
+class YoutubeTranscript {
+  /**
+   * Fetch the caption transcript of a YouTube video as a single plain-text string.
+   * @param videoId Video url or video identifier
+   * @param config Object with an optional lang code (eg: en, es, hk, uk); defaults to "en".
+   * Falls back to the first available caption track when no track matches the lang code.
+   */
+  static async fetchTranscript(videoId, config = {}) {
+    const identifier = this.retrieveVideoId(videoId);
+    const lang = config?.lang ?? "en";
+    try {
+      const transcriptUrl = await fetch(
+        `https://www.youtube.com/watch?v=${identifier}`,
+        {
+          headers: {
+            "User-Agent": USER_AGENT,
+          },
+        }
+      )
+        .then((res) => res.text())
+        .then((html) => parse(html))
+        .then((html) => this.#parseTranscriptEndpoint(html, lang));
+
+      if (!transcriptUrl)
+        throw new Error("Failed to locate a transcript for this video!");
+
+      // Result is hopefully some XML.
+      const transcriptXML = await fetch(transcriptUrl)
+        .then((res) => res.text())
+        .then((xml) => parse(xml));
+
+      // Join caption chunks with a space so words at chunk boundaries do not run together.
+      const chunks = transcriptXML.getElementsByTagName("text");
+      return chunks.map((chunk) => chunk.textContent).join(" ");
+    } catch (e) {
+      throw new YoutubeTranscriptError(e);
+    }
+  }
+
+  /**
+   * Locate the caption track URL inside the watch page's ytInitialPlayerResponse data.
+   * @returns The chosen track's baseUrl; undefined when no track exists, null when parsing fails.
+   */
+  static #parseTranscriptEndpoint(document, langCode = null) {
+    try {
+      // Get all script tags on document page
+      const scripts = document.getElementsByTagName("script");
+
+      // find the player data script.
+      const playerScript = scripts.find((script) =>
+        script.textContent.includes("var ytInitialPlayerResponse = {")
+      );
+
+      const dataString =
+        playerScript.textContent
+          ?.split("var ytInitialPlayerResponse = ")?.[1] //get the start of the object {....
+          ?.split("};")?.[0] + // chunk off any code after object closure.
+        "}"; // add back that curly brace we just cut.
+
+      const data = JSON.parse(dataString.trim()); // Attempt a JSON parse
+      const availableCaptions =
+        data?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
+
+      // If languageCode was specified then search for its code, otherwise get the first.
+      let captionTrack = availableCaptions?.[0];
+      if (langCode)
+        captionTrack =
+          availableCaptions.find((track) =>
+            track.languageCode.includes(langCode)
+          ) ?? availableCaptions?.[0];
+
+      return captionTrack?.baseUrl;
+    } catch (e) {
+      console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`);
+      return null;
+    }
+  }
+
+  /**
+   * Retrieve the 11-character video id from a url or id string; throws YoutubeTranscriptError if none is found.
+   * @param videoId video url or video id (any 11-character string is returned as-is)
+   */
+  static retrieveVideoId(videoId) {
+    if (videoId.length === 11) {
+      return videoId;
+    }
+    const matchId = videoId.match(RE_YOUTUBE);
+    if (matchId && matchId.length) {
+      return matchId[1];
+    }
+    throw new YoutubeTranscriptError(
+      "Impossible to retrieve Youtube video ID."
+    );
+  }
+}
+
+module.exports = {
+ YoutubeTranscript,
+ YoutubeTranscriptError,
+};
diff --git a/collector/utils/extensions/YoutubeTranscript/index.js b/collector/utils/extensions/YoutubeTranscript/index.js
index a44fe9b1e..b1622870c 100644
--- a/collector/utils/extensions/YoutubeTranscript/index.js
+++ b/collector/utils/extensions/YoutubeTranscript/index.js
@@ -1,17 +1,17 @@
-const { YoutubeLoader } = require("langchain/document_loaders/web/youtube");
const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments, documentsFolder } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
+const { YoutubeLoader } = require("./YoutubeLoader");
function validYoutubeVideoUrl(link) {
const UrlPattern = require("url-pattern");
const opts = new URL(link);
- const url = `${opts.protocol}//${opts.host}${
- opts.pathname
- }?v=${opts.searchParams.get("v")}`;
+ const url = `${opts.protocol}//${opts.host}${opts.pathname}${
+ opts.searchParams.has("v") ? `?v=${opts.searchParams.get("v")}` : ""
+ }`;
const shortPatternMatch = new UrlPattern(
"https\\://(www.)youtu.be/(:videoId)"
@@ -56,9 +56,7 @@ async function loadYouTubeTranscript({ url }) {
}
const metadata = docs[0].metadata;
- let content = "";
- docs.forEach((doc) => (content = content.concat(doc.pageContent)));
-
+ const content = docs[0].pageContent;
if (!content.length) {
return {
success: false,
diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js
index 4dcc8e39c..1263a59d0 100644
--- a/collector/utils/files/index.js
+++ b/collector/utils/files/index.js
@@ -84,7 +84,7 @@ async function wipeCollectorStorage() {
if (file === "__HOTDIR__.md") continue;
try {
fs.rmSync(path.join(directory, file));
- } catch { }
+ } catch {}
}
resolve();
});
@@ -99,7 +99,7 @@ async function wipeCollectorStorage() {
if (file === ".placeholder") continue;
try {
fs.rmSync(path.join(directory, file));
- } catch { }
+ } catch {}
}
resolve();
});
diff --git a/collector/yarn.lock b/collector/yarn.lock
index f7b7b696c..0938c995f 100644
--- a/collector/yarn.lock
+++ b/collector/yarn.lock
@@ -503,6 +503,11 @@ body-parser@^1.20.2:
type-is "~1.6.18"
unpipe "1.0.0"
+boolbase@^1.0.0:
+ version "1.0.0"
+ resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e"
+ integrity sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==
+
brace-expansion@^1.1.7:
version "1.1.11"
resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd"
@@ -589,11 +594,6 @@ camelcase@6:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==
-centra@^2.6.0:
- version "2.6.0"
- resolved "https://registry.yarnpkg.com/centra/-/centra-2.6.0.tgz#79117998ee6908642258db263871381aa5d1204a"
- integrity sha512-dgh+YleemrT8u85QL11Z6tYhegAs3MMxsaWAq/oXeAmYJ7VxL3SI9TZtnfaEvNDMAPolj25FXIb3S+HCI4wQaQ==
-
chalk@^2.4.2:
version "2.4.2"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
@@ -796,6 +796,22 @@ crypt@0.0.2:
resolved "https://registry.yarnpkg.com/crypt/-/crypt-0.0.2.tgz#88d7ff7ec0dfb86f713dc87bbb42d044d3e6c41b"
integrity sha512-mCxBlsHFYh9C+HVpiEacem8FEBnMXgU9gy4zmNC+SXAZNB/1idgp/aulFJ4FgCi7GPEVbfyng092GqL2k2rmow==
+css-select@^5.1.0:
+ version "5.1.0"
+ resolved "https://registry.yarnpkg.com/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6"
+ integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==
+ dependencies:
+ boolbase "^1.0.0"
+ css-what "^6.1.0"
+ domhandler "^5.0.2"
+ domutils "^3.0.1"
+ nth-check "^2.0.1"
+
+css-what@^6.1.0:
+ version "6.1.0"
+ resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4"
+ integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==
+
data-uri-to-buffer@^6.0.0:
version "6.0.1"
resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-6.0.1.tgz#540bd4c8753a25ee129035aebdedf63b078703c7"
@@ -2244,6 +2260,14 @@ node-forge@^1.3.1:
resolved "https://registry.yarnpkg.com/node-forge/-/node-forge-1.3.1.tgz#be8da2af243b2417d5f646a770663a92b7e9ded3"
integrity sha512-dPEtOeMvF9VMcYV/1Wb8CPoVAXtp6MKMlcbAt4ddqmGqUJ6fQZFXkNZNkNlfevtNkGtaSoXf/vNNNSvgrdXwtA==
+node-html-parser@^6.1.13:
+ version "6.1.13"
+ resolved "https://registry.yarnpkg.com/node-html-parser/-/node-html-parser-6.1.13.tgz#a1df799b83df5c6743fcd92740ba14682083b7e4"
+ integrity sha512-qIsTMOY4C/dAa5Q5vsobRpOOvPfC4pB61UVW2uSwZNUp0QU/jCekTal1vMmbO0DgdHeLUJpv/ARmDqErVxA3Sg==
+ dependencies:
+ css-select "^5.1.0"
+ he "1.2.0"
+
nodemailer@6.9.3:
version "6.9.3"
resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.3.tgz#e4425b85f05d83c43c5cd81bf84ab968f8ef5cbe"
@@ -2294,6 +2318,13 @@ npmlog@^5.0.1:
gauge "^3.0.0"
set-blocking "^2.0.0"
+nth-check@^2.0.1:
+ version "2.1.1"
+ resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-2.1.1.tgz#c9eab428effce36cd6b92c924bdb000ef1f1ed1d"
+ integrity sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==
+ dependencies:
+ boolbase "^1.0.0"
+
num-sort@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/num-sort/-/num-sort-2.1.0.tgz#1cbb37aed071329fdf41151258bc011898577a9b"
@@ -2522,13 +2553,6 @@ pend@~1.2.0:
resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
integrity sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==
-phin@^3.5.0:
- version "3.7.0"
- resolved "https://registry.yarnpkg.com/phin/-/phin-3.7.0.tgz#eeeff7660408515d8cf0c6252901012d4ab7153b"
- integrity sha512-DqnVNrpYhKGBZppNKprD+UJylMeEKOZxHgPB+ZP6mGzf3uA2uox4Ep9tUm+rUc8WLIdHT3HcAE4X8fhwQA9JKg==
- dependencies:
- centra "^2.6.0"
-
picomatch@^2.0.4, picomatch@^2.2.1:
version "2.3.1"
resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42"
@@ -3421,13 +3445,6 @@ yauzl@^2.10.0, yauzl@^2.4.2:
buffer-crc32 "~0.2.3"
fd-slicer "~1.1.0"
-youtube-transcript@^1.0.6:
- version "1.0.6"
- resolved "https://registry.yarnpkg.com/youtube-transcript/-/youtube-transcript-1.0.6.tgz#8414c04380d3ef1102bd00ca3729e94c46ae7a14"
- integrity sha512-k/6uxB9voj/5astl6+q+VArX/aWHhnmle8BucvUCTYTQQEOSVlBiXkrI0KD3o8A0b44MV6q0bmVNiJFIpTlcZA==
- dependencies:
- phin "^3.5.0"
-
youtubei.js@^9.1.0:
version "9.1.0"
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.1.0.tgz#bcf154c9fa21d3c8c1d00a5e10360d0a065c660e"
diff --git a/frontend/.gitignore b/frontend/.gitignore
index 196c8f691..787206034 100644
--- a/frontend/.gitignore
+++ b/frontend/.gitignore
@@ -12,6 +12,7 @@ dist
lib
dist-ssr
*.local
+!frontend/components/lib
# Editor directories and files
.vscode/*
diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx
index 9ef160e72..0a5ed65fc 100644
--- a/frontend/src/App.jsx
+++ b/frontend/src/App.jsx
@@ -35,16 +35,13 @@ const GeneralTranscriptionPreference = lazy(
const GeneralEmbeddingPreference = lazy(
() => import("@/pages/GeneralSettings/EmbeddingPreference")
);
+const EmbeddingTextSplitterPreference = lazy(
+ () => import("@/pages/GeneralSettings/EmbeddingTextSplitterPreference")
+);
const GeneralVectorDatabase = lazy(
() => import("@/pages/GeneralSettings/VectorDatabase")
);
const GeneralSecurity = lazy(() => import("@/pages/GeneralSettings/Security"));
-const DataConnectors = lazy(
- () => import("@/pages/GeneralSettings/DataConnectors")
-);
-const DataConnectorSetup = lazy(
- () => import("@/pages/GeneralSettings/DataConnectors/Connectors")
-);
const WorkspaceSettings = lazy(() => import("@/pages/WorkspaceSettings"));
const EmbedConfigSetup = lazy(
() => import("@/pages/GeneralSettings/EmbedConfigs")
@@ -92,6 +89,12 @@ export default function App() {
path="/settings/embedding-preference"
element={
- Anthropic as your LLM requires you to set an embedding service to - use. -
-+ Branch you wish to collect files from. +
++ Branch you wish to collect files from. +
+