anything-llm/collector/utils/extensions/GithubRepo/index.js

160 lines
4.3 KiB
JavaScript
Raw Normal View History

const RepoLoader = require("./RepoLoader");
const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
[BETA] Live document sync (#1719) * wip bg workers for live document sync * Add ability to re-embed specific documents across many workspaces via background queue bgworkser is gated behind expieremental system setting flag that needs to be explictly enabled UI for watching/unwatching docments that are embedded. TODO: UI to easily manage all bg tasks and see run results TODO: UI to enable this feature and background endpoints to manage it * create frontend views and paths Move elements to correct experimental scope * update migration to delete runs on removal of watched document * Add watch support to YouTube transcripts (#1716) * Add watch support to YouTube transcripts refactor how sync is done for supported types * Watch specific files in Confluence space (#1718) Add failure-prune check for runs * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * dual build update copy of alert modals * update job interval * Add support for live-sync of Github files * update copy for document sync feature * hide Experimental features from UI * update docs links * [FEAT] Implement new settings menu for experimental features (#1735) * implement new settings menu for experimental features * remove unused context save bar --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * dont run job on boot * unset workflow changes * Add persistent encryption service Relay key to collector so persistent encryption can be used Encrypt any private data in chunkSources used for replay during resync jobs * update jsDOC * Linting and organization * update modal copy for feature --------- Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
2024-06-21 22:38:50 +02:00
/**
* Load in a Github Repo recursively or just the top level if no PAT is provided
* @param {object} args - forwarded request body params
* @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
* @returns
*/
async function loadGithubRepo(args, response) {
const repo = new RepoLoader(args);
await repo.init();
if (!repo.ready)
return {
success: false,
reason: "Could not prepare Github repo for loading! Check URL",
};
console.log(
`-- Working Github ${repo.author}/${repo.project}:${repo.branch} --`
);
const docs = await repo.recursiveLoader();
if (!docs.length) {
return {
success: false,
reason: "No files were found for those settings.",
};
}
console.log(`[Github Loader]: Found ${docs.length} source files. Saving...`);
const outFolder = slugify(
`${repo.author}-${repo.project}-${repo.branch}-${v4().slice(0, 4)}`
).toLowerCase();
2024-05-02 23:03:10 +02:00
const outFolderPath =
process.env.NODE_ENV === "development"
? path.resolve(
__dirname,
`../../../../server/storage/documents/${outFolder}`
)
: path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
if (!fs.existsSync(outFolderPath))
fs.mkdirSync(outFolderPath, { recursive: true });
for (const doc of docs) {
if (!doc.pageContent) continue;
const data = {
id: v4(),
url: "github://" + doc.metadata.source,
title: doc.metadata.source,
docAuthor: repo.author,
description: "No description found.",
docSource: doc.metadata.source,
[BETA] Live document sync (#1719) * wip bg workers for live document sync * Add ability to re-embed specific documents across many workspaces via background queue bgworkser is gated behind expieremental system setting flag that needs to be explictly enabled UI for watching/unwatching docments that are embedded. TODO: UI to easily manage all bg tasks and see run results TODO: UI to enable this feature and background endpoints to manage it * create frontend views and paths Move elements to correct experimental scope * update migration to delete runs on removal of watched document * Add watch support to YouTube transcripts (#1716) * Add watch support to YouTube transcripts refactor how sync is done for supported types * Watch specific files in Confluence space (#1718) Add failure-prune check for runs * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * dual build update copy of alert modals * update job interval * Add support for live-sync of Github files * update copy for document sync feature * hide Experimental features from UI * update docs links * [FEAT] Implement new settings menu for experimental features (#1735) * implement new settings menu for experimental features * remove unused context save bar --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * dont run job on boot * unset workflow changes * Add persistent encryption service Relay key to collector so persistent encryption can be used Encrypt any private data in chunkSources used for replay during resync jobs * update jsDOC * Linting and organization * update modal copy for feature --------- Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
2024-06-21 22:38:50 +02:00
chunkSource: generateChunkSource(
repo,
doc,
response.locals.encryptionWorker
),
published: new Date().toLocaleString(),
wordCount: doc.pageContent.split(" ").length,
pageContent: doc.pageContent,
token_count_estimate: tokenizeString(doc.pageContent).length,
};
console.log(
`[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
);
writeToServerDocuments(
data,
`${slugify(doc.metadata.source)}-${data.id}`,
outFolderPath
);
}
return {
success: true,
reason: null,
data: {
author: repo.author,
repo: repo.project,
branch: repo.branch,
files: docs.length,
destination: outFolder,
},
};
}
[BETA] Live document sync (#1719) * wip bg workers for live document sync * Add ability to re-embed specific documents across many workspaces via background queue bgworkser is gated behind expieremental system setting flag that needs to be explictly enabled UI for watching/unwatching docments that are embedded. TODO: UI to easily manage all bg tasks and see run results TODO: UI to enable this feature and background endpoints to manage it * create frontend views and paths Move elements to correct experimental scope * update migration to delete runs on removal of watched document * Add watch support to YouTube transcripts (#1716) * Add watch support to YouTube transcripts refactor how sync is done for supported types * Watch specific files in Confluence space (#1718) Add failure-prune check for runs * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * create tmp workflow modifications for beta image * dual build update copy of alert modals * update job interval * Add support for live-sync of Github files * update copy for document sync feature * hide Experimental features from UI * update docs links * [FEAT] Implement new settings menu for experimental features (#1735) * implement new settings menu for experimental features * remove unused context save bar --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * dont run job on boot * unset workflow changes * Add persistent encryption service Relay key to collector so persistent encryption can be used Encrypt any private data in chunkSources used for replay during resync jobs * update jsDOC * Linting and organization * update modal copy for feature --------- Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
2024-06-21 22:38:50 +02:00
/**
* Gets the page content from a specific source file in a give Github Repo, not all items in a repo.
* @returns
*/
async function fetchGithubFile({
repoUrl,
branch,
accessToken = null,
sourceFilePath,
}) {
const repo = new RepoLoader({
repo: repoUrl,
branch,
accessToken,
});
await repo.init();
if (!repo.ready)
return {
success: false,
content: null,
reason: "Could not prepare Github repo for loading! Check URL or PAT.",
};
console.log(
`-- Working Github ${repo.author}/${repo.project}:${repo.branch} file:${sourceFilePath} --`
);
const fileContent = await repo.fetchSingleFile(sourceFilePath);
if (!fileContent) {
return {
success: false,
reason: "Target file returned a null content response.",
content: null,
};
}
return {
success: true,
reason: null,
content: fileContent,
};
}
/**
* Generate the full chunkSource for a specific file so that we can resync it later.
* This data is encrypted into a single `payload` query param so we can replay credentials later
* since this was encrypted with the systems persistent password and salt.
* @param {RepoLoader} repo
* @param {import("@langchain/core/documents").Document} doc
* @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
* @returns {string}
*/
function generateChunkSource(repo, doc, encryptionWorker) {
const payload = {
owner: repo.author,
project: repo.project,
branch: repo.branch,
path: doc.metadata.source,
pat: !!repo.accessToken ? repo.accessToken : null,
};
return `github://${repo.repo}?payload=${encryptionWorker.encrypt(
JSON.stringify(payload)
)}`;
}
module.exports = { loadGithubRepo, fetchGithubFile };