anything-llm/collector/utils/extensions/GithubRepo/index.js
Timothy Carambat dc4ad6b5a9
[BETA] Live document sync (#1719)
* wip bg workers for live document sync

* Add ability to re-embed specific documents across many workspaces via background queue
bg worker is gated behind an experimental system setting flag that needs to be explicitly enabled.
UI for watching/unwatching documents that are embedded.
TODO: UI to easily manage all bg tasks and see run results
TODO: UI to enable this feature and background endpoints to manage it

* create frontend views and paths
Move elements to correct experimental scope

* update migration to delete runs on removal of watched document

* Add watch support to YouTube transcripts (#1716)

* Add watch support to YouTube transcripts
refactor how sync is done for supported types

* Watch specific files in Confluence space (#1718)

Add failure-prune check for runs

* create tmp workflow modifications for beta image

* create tmp workflow modifications for beta image

* create tmp workflow modifications for beta image

* dual build
update copy of alert modals

* update job interval

* Add support for live-sync of Github files

* update copy for document sync feature

* hide Experimental features from UI

* update docs links

* [FEAT] Implement new settings menu for experimental features (#1735)

* implement new settings menu for experimental features

* remove unused context save bar

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>

* don't run job on boot

* unset workflow changes

* Add persistent encryption service
Relay key to collector so persistent encryption can be used
Encrypt any private data in chunkSources used for replay during resync jobs

* update JSDoc

* Linting and organization

* update modal copy for feature

---------

Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
2024-06-21 13:38:50 -07:00

const RepoLoader = require("./RepoLoader");
const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
/**
 * Load in a Github Repo recursively, or just the top level if no PAT is provided.
 * @param {object} args - forwarded request body params
 * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
 * @returns {Promise<{success: boolean, reason: string|null, data?: object}>}
 */
async function loadGithubRepo(args, response) {
  const repo = new RepoLoader(args);
  await repo.init();

  if (!repo.ready)
    return {
      success: false,
      reason: "Could not prepare Github repo for loading! Check URL",
    };

  console.log(
    `-- Working Github ${repo.author}/${repo.project}:${repo.branch} --`
  );
  const docs = await repo.recursiveLoader();
  if (!docs.length) {
    return {
      success: false,
      reason: "No files were found for those settings.",
    };
  }

  console.log(`[Github Loader]: Found ${docs.length} source files. Saving...`);

  // Unique output folder per run: author-project-branch plus a short uuid suffix.
  const outFolder = slugify(
    `${repo.author}-${repo.project}-${repo.branch}-${v4().slice(0, 4)}`
  ).toLowerCase();

  // In development we write straight into the server's storage tree; in
  // production the storage location comes from STORAGE_DIR.
  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../../server/storage/documents/${outFolder}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
  if (!fs.existsSync(outFolderPath))
    fs.mkdirSync(outFolderPath, { recursive: true });

  for (const doc of docs) {
    if (!doc.pageContent) continue; // skip empty files
    const data = {
      id: v4(),
      url: "github://" + doc.metadata.source,
      title: doc.metadata.source,
      docAuthor: repo.author,
      description: "No description found.",
      docSource: doc.metadata.source,
      // Encrypted so credentials can be replayed during resync jobs.
      chunkSource: generateChunkSource(
        repo,
        doc,
        response.locals.encryptionWorker
      ),
      published: new Date().toLocaleString(),
      wordCount: doc.pageContent.split(" ").length,
      pageContent: doc.pageContent,
      token_count_estimate: tokenizeString(doc.pageContent).length,
    };
    console.log(
      `[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
    );
    writeToServerDocuments(
      data,
      `${slugify(doc.metadata.source)}-${data.id}`,
      outFolderPath
    );
  }

  return {
    success: true,
    reason: null,
    data: {
      author: repo.author,
      repo: repo.project,
      branch: repo.branch,
      files: docs.length,
      destination: outFolder,
    },
  };
}
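
/*
 * Example usage (a sketch only — the route path and body shape below are
 * illustrative, not the app's real API). Assumes the handler is wired through
 * the setDataSigner middleware so `response.locals.encryptionWorker` exists:
 *
 *   app.post("/util/github-repo", async (request, response) => {
 *     const result = await loadGithubRepo(request.body, response);
 *     // => { success, reason, data: { author, repo, branch, files, destination } }
 *     response.status(result.success ? 200 : 412).json(result);
 *   });
 */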
/**
 * Gets the page content for a single source file in a given Github repo, not all items in the repo.
 * @param {{repoUrl: string, branch: string, accessToken?: string|null, sourceFilePath: string}} args
 * @returns {Promise<{success: boolean, reason: string|null, content: string|null}>}
 */
async function fetchGithubFile({
  repoUrl,
  branch,
  accessToken = null,
  sourceFilePath,
}) {
  const repo = new RepoLoader({
    repo: repoUrl,
    branch,
    accessToken,
  });
  await repo.init();

  if (!repo.ready)
    return {
      success: false,
      content: null,
      reason: "Could not prepare Github repo for loading! Check URL or PAT.",
    };

  console.log(
    `-- Working Github ${repo.author}/${repo.project}:${repo.branch} file:${sourceFilePath} --`
  );
  const fileContent = await repo.fetchSingleFile(sourceFilePath);
  if (!fileContent) {
    return {
      success: false,
      reason: "Target file returned a null content response.",
      content: null,
    };
  }

  return {
    success: true,
    reason: null,
    content: fileContent,
  };
}
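
/*
 * Example usage (sketch; all values are placeholders):
 *
 *   const { success, content, reason } = await fetchGithubFile({
 *     repoUrl: "https://github.com/Mintplex-Labs/anything-llm",
 *     branch: "master",
 *     accessToken: process.env.GITHUB_PAT ?? null, // optional PAT for private repos
 *     sourceFilePath: "README.md",
 *   });
 *   if (!success) throw new Error(reason);
 */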
/**
 * Generate the full chunkSource for a specific file so that we can resync it later.
 * The credentials are encrypted into a single `payload` query param with the system's
 * persistent password and salt, so they can be replayed during resync jobs.
 * @param {RepoLoader} repo
 * @param {import("@langchain/core/documents").Document} doc
 * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
 * @returns {string}
 */
function generateChunkSource(repo, doc, encryptionWorker) {
  const payload = {
    owner: repo.author,
    project: repo.project,
    branch: repo.branch,
    path: doc.metadata.source,
    pat: !!repo.accessToken ? repo.accessToken : null,
  };
  return `github://${repo.repo}?payload=${encryptionWorker.encrypt(
    JSON.stringify(payload)
  )}`;
}
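
/*
 * The string built above becomes a document's `chunkSource`, which a later
 * resync job unpacks to re-fetch the file. A sketch of that inverse step,
 * assuming EncryptionWorker exposes a matching `decrypt` method (only
 * `encrypt` is used in this file):
 *
 *   const url = new URL(chunkSource); // the `github://...?payload=...` string from above
 *   const { owner, project, branch, path, pat } = JSON.parse(
 *     encryptionWorker.decrypt(url.searchParams.get("payload"))
 *   );
 */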
module.exports = { loadGithubRepo, fetchGithubFile };