Add ability to grab youtube transcripts via doc processor (#470)

* Add ability to grab youtube transcripts via doc processor

* dynamic imports
swap out Github for Youtube in placeholder text
This commit is contained in:
Timothy Carambat 2023-12-18 17:17:26 -08:00 committed by GitHub
parent 452582489e
commit ecf4295537
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 353 additions and 17 deletions

View File

@ -47,6 +47,25 @@ function extensions(app) {
}
return;
});
// Collector route: builds a transcript document for the YouTube video named
// in the request body and reports the outcome as { success, reason, data }.
app.post("/ext/youtube-transcript", async (request, response) => {
  try {
    // Loaded inside the handler rather than at module load time.
    const loadYouTubeTranscript = require("../utils/extensions/YoutubeTranscript");
    const outcome = await loadYouTubeTranscript(reqBody(request));
    const { success, reason, data } = outcome;
    response.status(200).json({ success, reason, data });
  } catch (e) {
    // Anything thrown by the loader is logged and reported as a failed
    // request with an empty data payload.
    console.error(e);
    response.status(400).json({
      success: false,
      reason: e.message,
      data: {
        title: null,
        author: null,
      },
    });
  }
});
}
module.exports = extensions;

View File

@ -38,7 +38,9 @@
"slugify": "^1.6.6",
"url-pattern": "^1.0.3",
"uuid": "^9.0.0",
"wavefile": "^11.0.0"
"wavefile": "^11.0.0",
"youtube-transcript": "^1.0.6",
"youtubei.js": "^8.0.0"
},
"devDependencies": {
"nodemon": "^2.0.22",

View File

@ -0,0 +1,95 @@
const { YoutubeLoader } = require("langchain/document_loaders/web/youtube");
const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
/**
 * Checks whether `url` is an https link to a single YouTube video.
 *
 * Accepted shapes:
 *   - https://youtu.be/<videoId>
 *   - https://youtube.com/watch?v=<videoId>
 *   - https://www.youtube.com/watch?v=<videoId>
 *
 * Extra query parameters (for example the `si` tracking parameter added by
 * the share button, or a `t` timestamp) and fragments are tolerated.
 *
 * @param {string} url - Candidate URL, typically user supplied.
 * @returns {boolean} true when a non-empty video id can be found in the URL.
 */
function validYoutubeVideoUrl(url) {
  if (typeof url !== "string") return false;

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    // Not parseable as an absolute URL at all.
    return false;
  }
  if (parsed.protocol !== "https:") return false;

  let videoId = null;
  if (parsed.host === "youtu.be") {
    // Short links carry the id as the only path segment.
    videoId = parsed.pathname.slice(1);
  } else if (
    (parsed.host === "youtube.com" || parsed.host === "www.youtube.com") &&
    parsed.pathname === "/watch"
  ) {
    videoId = parsed.searchParams.get("v");
  }

  // The id must be present and must not span several path segments.
  return /^[^/]+$/.test(videoId ?? "");
}
/**
 * Fetches the transcript of a YouTube video and stores it as a document.
 *
 * The transcript is loaded with langchain's YoutubeLoader (with video info
 * enabled), concatenated into one string, and handed to
 * writeToServerDocuments together with the video metadata. Documents are
 * grouped in a folder named after the video author.
 *
 * @param {{ url: string }} params - `url` is the YouTube video link.
 * @returns {Promise<{
 *   success: boolean,
 *   reason: string | null,
 *   data?: { title: string, author: string, destination: string }
 * }>} On failure `reason` explains why and `data` is omitted. On success
 *   `reason` is null and `data.destination` is the output folder name.
 */
async function loadYouTubeTranscript({ url }) {
  if (!validYoutubeVideoUrl(url)) {
    return {
      success: false,
      reason: "Invalid URL. Should be youtu.be or youtube.com/watch.",
    };
  }

  console.log(`-- Working YouTube ${url} --`);
  const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });
  const docs = await loader.load();

  if (!docs.length) {
    return {
      success: false,
      reason: "No transcript found for that YouTube video.",
    };
  }

  // Video info is read from the first returned document.
  const metadata = docs[0].metadata;
  const content = docs.map((doc) => doc.pageContent).join("");

  if (!content.length) {
    return {
      success: false,
      reason: "No transcript could be parsed for that YouTube video.",
    };
  }

  const outFolder = slugify(
    `${metadata.author} YouTube transcripts`
  ).toLowerCase();
  const outFolderPath = path.resolve(
    __dirname,
    `../../../../server/storage/documents/${outFolder}`
  );
  // recursive: also creates missing parent folders and is a no-op when the
  // folder already exists, so no separate existence check is needed.
  fs.mkdirSync(outFolderPath, { recursive: true });

  const data = {
    id: v4(),
    url: url + ".youtube",
    title: metadata.title || url,
    docAuthor: metadata.author,
    description: metadata.description,
    docSource: url,
    chunkSource: url,
    published: new Date().toLocaleString(),
    wordCount: content.split(" ").length,
    pageContent: content,
    token_count_estimate: tokenizeString(content).length,
  };

  console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`);
  writeToServerDocuments(
    data,
    `${slugify(metadata.title)}-${data.id}`,
    outFolderPath
  );

  return {
    success: true,
    reason: null,
    data: {
      title: metadata.title,
      author: metadata.author,
      // Surfaced so the UI can tell the user where the document was stored.
      destination: outFolder,
    },
  };
}
module.exports = loadYouTubeTranscript;

View File

@ -39,6 +39,11 @@
chalk "^2.4.2"
js-tokens "^4.0.0"
"@fastify/busboy@^2.0.0":
version "2.1.0"
resolved "https://registry.yarnpkg.com/@fastify/busboy/-/busboy-2.1.0.tgz#0709e9f4cb252351c609c6e6d8d6779a8d25edff"
integrity sha512-+KpH+QxZU7O4675t3mnkQKcZZg56u+K/Ct2K+N2AZYNVK8kyeo/bI18tI8aPm3tvNNRyTWfj6s5tnGNlcbQRsA==
"@googleapis/youtube@^9.0.0":
version "9.0.0"
resolved "https://registry.yarnpkg.com/@googleapis/youtube/-/youtube-9.0.0.tgz#e45f6f5f7eac198c6391782b94b3ca54bacf0b63"
@ -252,6 +257,11 @@ accepts@~1.3.8:
mime-types "~2.1.34"
negotiator "0.6.3"
acorn@^8.8.0:
version "8.11.2"
resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.11.2.tgz#ca0d78b51895be5390a5903c5b3bdcdaf78ae40b"
integrity sha512-nc0Axzp/0FILLEVsm4fNwLCwMttvhEI263QtVPQcbpfZZ3ts0hLsZGOpE6czNlid7CJ9MlyH8reXkpsf3YUY4w==
agent-base@6:
version "6.0.2"
resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-6.0.2.tgz#49fff58577cfee3f37176feab4c22e00f86d7f77"
@ -554,6 +564,11 @@ camelcase@6:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==
centra@^2.6.0:
version "2.6.0"
resolved "https://registry.yarnpkg.com/centra/-/centra-2.6.0.tgz#79117998ee6908642258db263871381aa5d1204a"
integrity sha512-dgh+YleemrT8u85QL11Z6tYhegAs3MMxsaWAq/oXeAmYJ7VxL3SI9TZtnfaEvNDMAPolj25FXIb3S+HCI4wQaQ==
chalk@^2.4.2:
version "2.4.2"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
@ -1655,6 +1670,13 @@ isexe@^2.0.0:
resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10"
integrity sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==
jintr@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/jintr/-/jintr-1.1.0.tgz#223a3b07f5e03d410cec6e715c537c8ad1e714c3"
integrity sha512-Tu9wk3BpN2v+kb8yT6YBtue+/nbjeLFv4vvVC4PJ7oCidHKbifWhvORrAbQfxVIQZG+67am/mDagpiGSVtvrZg==
dependencies:
acorn "^8.8.0"
js-tiktoken@^1.0.7:
version "1.0.7"
resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.7.tgz#56933fcd2093e8304060dfde3071bda91812e6f5"
@ -2431,6 +2453,13 @@ pend@~1.2.0:
resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
integrity sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==
phin@^3.5.0:
version "3.7.0"
resolved "https://registry.yarnpkg.com/phin/-/phin-3.7.0.tgz#eeeff7660408515d8cf0c6252901012d4ab7153b"
integrity sha512-DqnVNrpYhKGBZppNKprD+UJylMeEKOZxHgPB+ZP6mGzf3uA2uox4Ep9tUm+rUc8WLIdHT3HcAE4X8fhwQA9JKg==
dependencies:
centra "^2.6.0"
picomatch@^2.0.4, picomatch@^2.2.1:
version "2.3.1"
resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42"
@ -3069,7 +3098,7 @@ tr46@~0.0.3:
resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a"
integrity sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==
tslib@^2.0.1:
tslib@^2.0.1, tslib@^2.5.0:
version "2.6.2"
resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.6.2.tgz#703ac29425e7b37cd6fd456e92404d46d1f3e4ae"
integrity sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==
@ -3122,6 +3151,13 @@ undici-types@~5.26.4:
resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.26.5.tgz#bcd539893d00b56e964fd2657a4866b221a65617"
integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==
undici@^5.19.1:
version "5.28.2"
resolved "https://registry.yarnpkg.com/undici/-/undici-5.28.2.tgz#fea200eac65fc7ecaff80a023d1a0543423b4c91"
integrity sha512-wh1pHJHnUeQV5Xa8/kyQhO7WFa8M34l026L5P/+2TYiakvGy5Rdc8jWZVyG7ieht/0WgJLEd3kcU5gKx+6GC8w==
dependencies:
"@fastify/busboy" "^2.0.0"
universalify@^0.1.0:
version "0.1.2"
resolved "https://registry.yarnpkg.com/universalify/-/universalify-0.1.2.tgz#b646f69be3942dabcecc9d6639c80dc105efaa66"
@ -3279,6 +3315,22 @@ yauzl@^2.10.0, yauzl@^2.4.2:
buffer-crc32 "~0.2.3"
fd-slicer "~1.1.0"
youtube-transcript@^1.0.6:
version "1.0.6"
resolved "https://registry.yarnpkg.com/youtube-transcript/-/youtube-transcript-1.0.6.tgz#8414c04380d3ef1102bd00ca3729e94c46ae7a14"
integrity sha512-k/6uxB9voj/5astl6+q+VArX/aWHhnmle8BucvUCTYTQQEOSVlBiXkrI0KD3o8A0b44MV6q0bmVNiJFIpTlcZA==
dependencies:
phin "^3.5.0"
youtubei.js@^8.0.0:
version "8.0.0"
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-8.0.0.tgz#0fcbe332e263d9be6afe4e3d1917e9ddc1ffbed3"
integrity sha512-kUwHvqoB5vfaGaY1quAGcX5JPIyjr5fjj9Zj/ZwUDCrermz/r5uIkNiJ5cNHkmAJbZP9fdygzNMvGHd7fM445g==
dependencies:
jintr "^1.1.0"
tslib "^2.5.0"
undici "^5.19.1"
zod-to-json-schema@3.20.3:
version "3.20.3"
resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.20.3.tgz#8c95d8c20f20455ffa0b4b526c29703f35f6d787"

View File

@ -36,4 +36,12 @@ export const DATA_CONNECTORS = {
"Import an entire public or private Github repository in a single click.",
link: "https://github.com",
},
"youtube-transcript": {
name: "YouTube Transcript",
path: paths.settings.dataConnectors.youtubeTranscript(),
image: ConnectorImages.youtube,
description:
"Import the transcription of an entire YouTube video from a link.",
link: "https://youtube.com",
},
};

View File

@ -1,5 +1,9 @@
import Github from "./github.png";
import YouTube from "./youtube.png";
// Lookup table of the imported connector logo images, keyed by connector name.
const ConnectorImages = {
github: Github,
youtube: YouTube,
};
export default ConnectorImages;

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.3 KiB

View File

@ -60,16 +60,19 @@ export default function FileRow({
selected ? "bg-sky-500/20" : ""
} ${expanded ? "bg-sky-500/10" : ""}`}`}
>
<div className="pl-4 col-span-4 flex gap-x-[4px] items-center">
<div className="pl-2 col-span-6 flex gap-x-[4px] items-center">
<div
className="w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer"
className="shrink-0 w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer"
role="checkbox"
aria-checked={selected}
tabIndex={0}
>
{selected && <div className="w-2 h-2 bg-white rounded-[2px]" />}
</div>
<File className="text-base font-bold w-4 h-4 mr-[3px]" weight="fill" />
<File
className="shrink-0 text-base font-bold w-4 h-4 mr-[3px]"
weight="fill"
/>
<div
className="relative"
onMouseEnter={handleMouseEnter}
@ -88,7 +91,6 @@ export default function FileRow({
<p className="col-span-2 pl-3.5 whitespace-nowrap">
{formatDate(item?.published)}
</p>
<p className="col-span-2 pl-3">{item?.size || "---"}</p>
<p className="col-span-2 pl-2 uppercase">{getFileExtension(item.url)}</p>
<div className="col-span-2 flex justify-end items-center">
{item?.cached && (

View File

@ -53,7 +53,7 @@ export default function FolderRow({
selected ? "bg-sky-500/20" : ""
}`}
>
<div className="col-span-4 flex gap-x-[4px] items-center">
<div className="col-span-6 flex gap-x-[4px] items-center">
<div
className="shrink-0 w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer"
role="checkbox"
@ -79,7 +79,6 @@ export default function FolderRow({
</p>
</div>
<p className="col-span-2 pl-3.5" />
<p className="col-span-2 pl-3" />
<p className="col-span-2 pl-2" />
<div className="col-span-2 flex justify-end items-center">
{item.name !== "custom-documents" && (

View File

@ -71,9 +71,8 @@ export default function Directory({
<div className="relative w-[560px] h-[310px] bg-zinc-900 rounded-2xl">
<div className="rounded-t-2xl text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20 shadow-lg bg-zinc-900 sticky top-0 z-10">
<p className="col-span-4">Name</p>
<p className="col-span-6">Name</p>
<p className="col-span-2">Date</p>
<p className="col-span-2">Size</p>
<p className="col-span-2">Kind</p>
<p className="col-span-2">Cached</p>
</div>

View File

@ -54,7 +54,7 @@ export default function WorkspaceFileRow({
className={`items-center transition-all duration-200 text-white/80 text-xs grid grid-cols-12 py-2 pl-3.5 pr-8 border-b border-white/20 hover:bg-sky-500/20 cursor-pointer
${isMovedItem ? "bg-green-800/40" : ""}`}
>
<div className="col-span-4 flex gap-x-[4px] items-center">
<div className="col-span-6 flex gap-x-[4px] items-center">
<File
className="text-base font-bold w-4 h-4 ml-3 mr-[3px]"
weight="fill"
@ -77,7 +77,6 @@ export default function WorkspaceFileRow({
<p className="col-span-2 pl-3.5 whitespace-nowrap">
{formatDate(item?.published)}
</p>
<p className="col-span-2 pl-3">{item?.size || "---"}</p>
<p className="col-span-2 pl-2 uppercase">{getFileExtension(item.url)}</p>
<div className="col-span-2 flex justify-end items-center">
{item?.cached && (

View File

@ -26,9 +26,8 @@ export default function WorkspaceDirectory({
</div>
<div className="relative w-[560px] h-[445px] bg-zinc-900 rounded-2xl mt-5">
<div className="text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20">
<p className="col-span-4">Name</p>
<p className="col-span-6">Name</p>
<p className="col-span-2">Date</p>
<p className="col-span-2">Size</p>
<p className="col-span-2">Kind</p>
<p className="col-span-2">Cached</p>
</div>
@ -56,9 +55,8 @@ export default function WorkspaceDirectory({
}`}
>
<div className="text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20 bg-zinc-900 sticky top-0 z-10">
<p className="col-span-4">Name</p>
<p className="col-span-6">Name</p>
<p className="col-span-2">Date</p>
<p className="col-span-2">Size</p>
<p className="col-span-2">Kind</p>
<p className="col-span-2">Cached</p>
</div>

View File

@ -42,6 +42,24 @@ const DataConnector = {
});
},
},
youtube: {
transcribe: async ({ url }) => {
return await fetch(`${API_BASE}/ext/youtube/transcript`, {
method: "POST",
headers: baseHeaders(),
body: JSON.stringify({ url }),
})
.then((res) => res.json())
.then((res) => {
if (!res.success) throw new Error(res.reason);
return { data: res.data, error: null };
})
.catch((e) => {
console.error(e);
return { data: null, error: e.message };
});
},
},
};
export default DataConnector;

View File

@ -0,0 +1,114 @@
import React, { useState } from "react";
import Sidebar, { SidebarMobileHeader } from "@/components/SettingsSidebar";
import { isMobile } from "react-device-detect";
import { DATA_CONNECTORS } from "@/components/DataConnectorOption";
import System from "@/models/system";
import showToast from "@/utils/toast";
export default function YouTubeTranscriptConnectorSetup() {
const { image } = DATA_CONNECTORS["youtube-transcript"];
const [loading, setLoading] = useState(false);
const handleSubmit = async (e) => {
e.preventDefault();
const form = new FormData(e.target);
try {
setLoading(true);
showToast("Fetching transcript for YouTube video.", "info", {
clear: true,
autoClose: false,
});
const { data, error } = await System.dataConnectors.youtube.transcribe({
url: form.get("url"),
});
if (!!error) {
showToast(error, "error", { clear: true });
setLoading(false);
return;
}
showToast(
`${data.title} by ${data.author} transcription completed. Output folder is ${data.destination}.`,
"success",
{ clear: true }
);
e.target.reset();
setLoading(false);
return;
} catch (e) {
console.error(e);
showToast(e.message, "error", { clear: true });
setLoading(false);
}
};
return (
<div className="w-screen h-screen overflow-hidden bg-sidebar flex">
{!isMobile && <Sidebar />}
<div
style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[26px] bg-main-gradient w-full h-full overflow-y-scroll border-4 border-accent"
>
{isMobile && <SidebarMobileHeader />}
<div className="flex w-full">
<div className="flex flex-col w-full px-1 md:px-20 md:py-12 py-16">
<div className="flex w-full gap-x-4 items-center pb-6 border-white border-b-2 border-opacity-10">
<img src={image} alt="YouTube" className="rounded-lg h-16 w-16" />
<div className="w-full flex flex-col gap-y-1">
<div className="items-center flex gap-x-4">
<p className="text-2xl font-semibold text-white">
Import YouTube transcription
</p>
</div>
<p className="text-sm font-base text-white text-opacity-60">
From a youtube link, import the entire transcript of that
video for embedding.
</p>
</div>
</div>
<form className="w-full" onSubmit={handleSubmit}>
<div className="w-full flex flex-col py-2">
<div className="w-full flex items-center gap-4">
<div className="flex flex-col w-60">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-semibold block">
YouTube video URL
</label>
</div>
<input
type="url"
name="url"
className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5"
placeholder="https://youtube.com/watch?v=abc123"
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
</div>
</div>
<div className="flex flex-col gap-y-2 w-fit">
<button
type="submit"
disabled={loading}
className="mt-2 text-lg w-fit border border-slate-200 px-4 py-1 rounded-lg text-slate-200 items-center flex gap-x-2 hover:bg-slate-200 hover:text-slate-800 disabled:bg-slate-200 disabled:text-slate-800"
>
{loading ? "Collecting transcript..." : "Collect transcript"}
</button>
{loading && (
<p className="text-xs text-zinc-300">
Once complete, the transcription will be available for
embedding into workspaces in the document picker.
</p>
)}
</div>
</form>
</div>
</div>
</div>
</div>
);
}

View File

@ -2,9 +2,11 @@ import paths from "@/utils/paths";
import { lazy } from "react";
import { useParams } from "react-router-dom";
const Github = lazy(() => import("./Github"));
const YouTubeTranscript = lazy(() => import("./Youtube"));
const CONNECTORS = {
github: Github,
"youtube-transcript": YouTubeTranscript,
};
export default function DataConnectorSetup() {

View File

@ -29,6 +29,7 @@ export default function DataConnectors() {
</div>
<div className="py-4 w-full flex md:flex-wrap overflow-x-scroll gap-4 max-w-full">
<DataConnectorOption slug="github" />
<DataConnectorOption slug="youtube-transcript" />
</div>
</div>
</div>

View File

@ -13,7 +13,7 @@ export function getFileExtension(path) {
export function middleTruncate(str, n) {
const fileExtensionPattern = /([^.]*)$/;
const extensionMatch = str.match(fileExtensionPattern);
const extensionMatch = str.includes(".") && str.match(fileExtensionPattern);
if (str.length <= n) return str;

View File

@ -83,6 +83,9 @@ export default {
github: () => {
return "/settings/data-connectors/github";
},
youtubeTranscript: () => {
return "/settings/data-connectors/youtube-transcript";
},
},
},
};

View File

@ -48,6 +48,27 @@ function extensionEndpoints(app) {
}
}
);
// Server route: relays the request body to the "/ext/youtube-transcript"
// endpoint through forwardExtensionRequest, records a telemetry event and
// returns the relayed reply to the client.
app.post(
  "/ext/youtube/transcript",
  [validatedRequest, flexUserRoleValid],
  async (req, res) => {
    try {
      const processorReply = await forwardExtensionRequest({
        endpoint: "/ext/youtube-transcript",
        method: "POST",
        body: req.body,
      });
      await Telemetry.sendTelemetry("extension_invoked", {
        type: "youtube_transcript",
      });
      res.status(200).json(processorReply);
    } catch (error) {
      // Any failure while forwarding or reporting telemetry becomes a 500.
      console.error(error);
      res.sendStatus(500).end();
    }
  }
);
}
module.exports = { extensionEndpoints };