mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-15 10:50:31 +01:00
dc4ad6b5a9
* wip bg workers for live document sync * Add ability to re-embed specific documents across many workspaces via background queue bg workers are gated behind experimental system setting flag that needs to be explicitly enabled UI for watching/unwatching documents that are embedded. TODO: UI to easily manage all bg tasks and see run results TODO: UI to enable this feature and background endpoints to manage it * create frontend views and paths Move elements to correct experimental scope * update migration to delete runs on removal of watched document * Add watch support to YouTube transcripts (#1716) * Add watch support to YouTube transcripts refactor how sync is done for supported types * Watch specific files in Confluence space (#1718) Add failure-prune check for runs * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * dual build update copy of alert modals * update job interval * Add support for live-sync of Github files * update copy for document sync feature * hide Experimental features from UI * update docs links * [FEAT] Implement new settings menu for experimental features (#1735) * implement new settings menu for experimental features * remove unused context save bar --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * dont run job on boot * unset workflow changes * Add persistent encryption service Relay key to collector so persistent encryption can be used Encrypt any private data in chunkSources used for replay during resync jobs * update jsDOC * Linting and organization * update modal copy for feature --------- Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
143 lines
3.6 KiB
JavaScript
143 lines
3.6 KiB
JavaScript
const fs = require("fs");
|
|
const path = require("path");
|
|
const { default: slugify } = require("slugify");
|
|
const { v4 } = require("uuid");
|
|
const { writeToServerDocuments } = require("../../files");
|
|
const { tokenizeString } = require("../../tokenizer");
|
|
const { YoutubeLoader } = require("./YoutubeLoader");
|
|
|
|
/**
 * Checks whether a link points at a single YouTube video.
 *
 * Two https forms are matched, each with an optional `www.` prefix:
 *   - https://youtu.be/<videoId>
 *   - https://youtube.com/watch?v=<videoId>
 *
 * @param {string} link - Candidate URL.
 * @returns {boolean} true when a video id can be extracted from the link;
 *   false otherwise, including when `link` is not a parseable absolute URL.
 */
function validYoutubeVideoUrl(link) {
  // `new URL` throws a TypeError on malformed/empty input. This function is
  // used as a boolean predicate, so an unparseable link is simply invalid.
  let opts;
  try {
    opts = new URL(link);
  } catch {
    return false;
  }

  const UrlPattern = require("url-pattern");

  // Rebuild the URL keeping only the `v` query parameter so that any other
  // query parameters are ignored when matching against the patterns below.
  const url = `${opts.protocol}//${opts.host}${opts.pathname}${
    opts.searchParams.has("v") ? `?v=${opts.searchParams.get("v")}` : ""
  }`;

  const shortPatternMatch = new UrlPattern(
    "https\\://(www.)youtu.be/(:videoId)"
  ).match(url);
  const fullPatternMatch = new UrlPattern(
    "https\\://(www.)youtube.com/watch?v=(:videoId)"
  ).match(url);

  // `(:videoId)` is optional in both patterns, so a pattern can match without
  // yielding an id; only a non-empty id counts as a valid video URL.
  const videoId =
    shortPatternMatch?.videoId || fullPatternMatch?.videoId || null;
  return Boolean(videoId);
}
|
|
|
|
/**
 * Retrieves the transcript text and video metadata for a YouTube URL.
 *
 * Never rejects for loader failures: errors raised while loading are folded
 * into the returned `reason` string.
 *
 * @param {{url: string}} params - `url` is the YouTube video link.
 * @returns {Promise<{success: boolean, reason: (string|null), content: (string|null), metadata: object}>}
 *   On success `content` is the first document's page content and `metadata`
 *   its metadata; on failure `content` is null and `metadata` is `{}`.
 */
async function fetchVideoTranscriptContent({ url }) {
  // Every failure path returns the same shape; only the reason differs.
  const failure = (reason) => ({
    success: false,
    reason,
    content: null,
    metadata: {},
  });

  if (!validYoutubeVideoUrl(url))
    return failure("Invalid URL. Should be youtu.be or youtube.com/watch.");

  console.log(`-- Working YouTube ${url} --`);
  const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });

  const onLoaded = (loaded) => ({ docs: loaded, error: null });
  // Prefer the text after an "Error:" prefix when the message has one,
  // otherwise report the raw message.
  const onFailed = (e) => ({
    docs: [],
    error: e.message?.split("Error:")?.[1] || e.message,
  });
  const { docs, error } = await loader.load().then(onLoaded).catch(onFailed);

  if (!docs.length || error)
    return failure(error ?? "No transcript found for that YouTube video.");

  // Only the first loaded document is used.
  const firstDoc = docs[0];
  const metadata = firstDoc.metadata;
  const content = firstDoc.pageContent;
  if (!content.length)
    return failure("No transcript could be parsed for that YouTube video.");

  return { success: true, reason: null, content, metadata };
}
|
|
|
|
/**
 * Fetches the transcript for a YouTube video and stores it as a document
 * under a per-author folder inside the documents storage directory.
 *
 * @param {{url: string}} params - `url` is the YouTube video link.
 * @returns {Promise<{success: boolean, reason: (string|null), data?: {title: string, author: string, destination: string}}>}
 *   On failure only `success` and `reason` are set. On success `reason` is
 *   null and `data.destination` is the slugified output folder name.
 */
async function loadYouTubeTranscript({ url }) {
  const transcriptResults = await fetchVideoTranscriptContent({ url });
  if (!transcriptResults.success) {
    return {
      success: false,
      reason:
        transcriptResults.reason ||
        "An unknown error occurred during transcription retrieval",
    };
  }

  const { content, metadata } = transcriptResults;
  const outFolder = slugify(
    `${metadata.author} YouTube transcripts`
  ).toLowerCase();

  // Development resolves relative to this file into the server's storage;
  // otherwise the location comes from the STORAGE_DIR environment variable.
  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../../server/storage/documents/${outFolder}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

  if (!fs.existsSync(outFolderPath))
    fs.mkdirSync(outFolderPath, { recursive: true });

  // The title may be missing from the loader metadata; fall back to the URL
  // for display so downstream consumers always get a non-empty title.
  const displayTitle = metadata.title || url;
  const data = {
    id: v4(),
    url: url + ".youtube",
    title: displayTitle,
    docAuthor: metadata.author,
    description: metadata.description,
    docSource: url,
    chunkSource: `youtube://${url}`,
    published: new Date().toLocaleString(),
    wordCount: content.split(" ").length,
    pageContent: content,
    token_count_estimate: tokenizeString(content).length,
  };

  // slugify throws on a non-string argument, so never hand it a missing
  // title. The document id keeps the file name unique either way.
  const fileSlug = slugify(metadata.title || "youtube-transcript");

  console.log(`[YouTube Loader]: Saving ${displayTitle} to ${outFolder}`);
  writeToServerDocuments(data, `${fileSlug}-${data.id}`, outFolderPath);

  return {
    success: true,
    // No failure reason on success (was a leftover "test" debug string).
    reason: null,
    data: {
      title: metadata.title,
      author: metadata.author,
      destination: outFolder,
    },
  };
}
|
|
|
|
// Public API of this module: the save-to-storage entry point and the
// lower-level transcript fetcher.
module.exports = { loadYouTubeTranscript, fetchVideoTranscriptContent };
|