const fs = require("fs"); const path = require("path"); const { default: slugify } = require("slugify"); const { v4 } = require("uuid"); const { writeToServerDocuments } = require("../../files"); const { tokenizeString } = require("../../tokenizer"); const { YoutubeLoader } = require("./YoutubeLoader"); function validYoutubeVideoUrl(link) { const UrlPattern = require("url-pattern"); const opts = new URL(link); const url = `${opts.protocol}//${opts.host}${opts.pathname}${ opts.searchParams.has("v") ? `?v=${opts.searchParams.get("v")}` : "" }`; const shortPatternMatch = new UrlPattern( "https\\://(www.)youtu.be/(:videoId)" ).match(url); const fullPatternMatch = new UrlPattern( "https\\://(www.)youtube.com/watch?v=(:videoId)" ).match(url); const videoId = shortPatternMatch?.videoId || fullPatternMatch?.videoId || null; if (!!videoId) return true; return false; } async function fetchVideoTranscriptContent({ url }) { if (!validYoutubeVideoUrl(url)) { return { success: false, reason: "Invalid URL. Should be youtu.be or youtube.com/watch.", content: null, metadata: {}, }; } console.log(`-- Working YouTube ${url} --`); const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true }); const { docs, error } = await loader .load() .then((docs) => { return { docs, error: null }; }) .catch((e) => { return { docs: [], error: e.message?.split("Error:")?.[1] || e.message, }; }); if (!docs.length || !!error) { return { success: false, reason: error ?? "No transcript found for that YouTube video.", content: null, metadata: {}, }; } const metadata = docs[0].metadata; const content = docs[0].pageContent; if (!content.length) { return { success: false, reason: "No transcript could be parsed for that YouTube video.", content: null, metadata: {}, }; } return { success: true, reason: null, content, metadata, }; } async function loadYouTubeTranscript({ url }) { const transcriptResults = await fetchVideoTranscriptContent({ url }); if (!transcriptResults.success) { return { success: false, reason: transcriptResults.reason || "An unknown error occurred during transcription retrieval", }; } const { content, metadata } = transcriptResults; const outFolder = slugify( `${metadata.author} YouTube transcripts` ).toLowerCase(); const outFolderPath = process.env.NODE_ENV === "development" ? path.resolve( __dirname, `../../../../server/storage/documents/${outFolder}` ) : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`); if (!fs.existsSync(outFolderPath)) fs.mkdirSync(outFolderPath, { recursive: true }); const data = { id: v4(), url: url + ".youtube", title: metadata.title || url, docAuthor: metadata.author, description: metadata.description, docSource: url, chunkSource: `youtube://${url}`, published: new Date().toLocaleString(), wordCount: content.split(" ").length, pageContent: content, token_count_estimate: tokenizeString(content).length, }; console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`); writeToServerDocuments( data, `${slugify(metadata.title)}-${data.id}`, outFolderPath ); return { success: true, reason: "test", data: { title: metadata.title, author: metadata.author, destination: outFolder, }, }; } module.exports = { loadYouTubeTranscript, fetchVideoTranscriptContent, };