Improve error messages from YouTube scraping (#768)

Parse and normalize the URL before validation to allow multiple URL formats
This commit is contained in:
Timothy Carambat 2024-02-21 10:47:10 -08:00 committed by GitHub
parent 49fbd09af4
commit d89610586a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -6,11 +6,15 @@ const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
function validYoutubeVideoUrl(url) {
function validYoutubeVideoUrl(link) {
const UrlPattern = require("url-pattern");
const opts = new URL(link);
const url = `${opts.protocol}//${opts.host}${
opts.pathname
}?v=${opts.searchParams.get("v")}`;
const shortPatternMatch = new UrlPattern(
"https\\://youtu.be/(:videoId)"
"https\\://(www.)youtu.be/(:videoId)"
).match(url);
const fullPatternMatch = new UrlPattern(
"https\\://(www.)youtube.com/watch?v=(:videoId)"
@ -32,12 +36,22 @@ async function loadYouTubeTranscript({ url }) {
console.log(`-- Working YouTube ${url} --`);
const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });
const docs = await loader.load();
const { docs, error } = await loader
.load()
.then((docs) => {
return { docs, error: null };
})
.catch((e) => {
return {
docs: [],
error: e.message?.split("Error:")?.[1] || e.message,
};
});
if (!docs.length) {
if (!docs.length || !!error) {
return {
success: false,
reason: "No transcript found for that YouTube video.",
reason: error ?? "No transcript found for that YouTube video.",
};
}