// Source: anything-llm/collector/utils/extensions/GithubRepo/RepoLoader/index.js
// (Captured from a web file view: 186 lines, 5.4 KiB, JavaScript.)

/**
 * Loads documents from a GitHub repository.
 *
 * Typical usage: construct with a repo URL, `await init()` to validate the URL,
 * branch and access token, then call `recursiveLoader()` to collect documents
 * or `fetchSingleFile()` to read one file through the GitHub contents API.
 */
class RepoLoader {
  /**
   * @param {Object} [args]
   * @param {string} [args.repo] - Repository URL; matched against `https://github.com/<author>/<project>`.
   * @param {string} [args.branch] - Branch to load; auto-assigned during `init()` when missing or unknown.
   * @param {string|null} [args.accessToken] - GitHub access token; dropped (set to null) if validation fails.
   * @param {string[]} [args.ignorePaths] - Passed through to the LangChain loader as `ignorePaths`.
   */
  constructor(args = {}) {
    this.ready = false;
    this.repo = args?.repo;
    this.branch = args?.branch;
    this.accessToken = args?.accessToken || null;
    this.ignorePaths = args?.ignorePaths || [];
    this.author = null;
    this.project = null;
    this.branches = [];
  }

  /**
   * Matches `this.repo` against the expected GitHub URL shape and, on success,
   * stores the captured `author` and `project` on the instance.
   * @returns {boolean} true when the URL matched.
   */
  #validGithubUrl() {
    const UrlPattern = require("url-pattern");
    const pattern = new UrlPattern(
      "https\\://github.com/(:author)/(:project(*))",
      {
        // fixes project names with special characters (.github)
        segmentValueCharset: "a-zA-Z0-9-._~%/+",
      }
    );
    const match = pattern.match(this.repo);
    if (!match) return false;

    this.author = match.author;
    this.project = match.project;
    return true;
  }

  /**
   * Ensures `this.branch` names a branch that exists in the repo.
   * When the branch is unset or unknown it is auto-assigned, preferring
   * "main", then "master", then the first branch returned by the API.
   * If no branches could be listed at all, "master" is used.
   * @returns {Promise<void>}
   */
  async #validBranch() {
    await this.getRepoBranches();
    if (!!this.branch && this.branches.includes(this.branch)) return;

    if (this.branch) {
      console.log(
        `[Github Loader]: Branch "${this.branch}" was not found! Auto-assigning to a default branch.`
      );
    } else {
      console.log(
        "[Github Loader]: Branch not set! Auto-assigning to a default branch."
      );
    }

    const preferred = ["main", "master"].find((name) =>
      this.branches.includes(name)
    );
    // Falling back to a branch that is known to exist avoids assigning a
    // non-existent "master" on repos that use neither conventional name.
    this.branch = preferred ?? this.branches[0] ?? "master";
    console.log(`[Github Loader]: Branch auto-assigned to ${this.branch}.`);
  }

  /**
   * Checks the access token with an authenticated request to the GitHub API.
   * An invalid token (or a failed request) is logged and the token is cleared
   * so later requests are sent unauthenticated. No-op when no token is set.
   * @returns {Promise<void>}
   */
  async #validateAccessToken() {
    if (!this.accessToken) return;

    let valid = false;
    try {
      const res = await fetch("https://api.github.com/octocat", {
        method: "GET",
        headers: {
          Authorization: `Bearer ${this.accessToken}`,
          "X-GitHub-Api-Version": "2022-11-28",
        },
      });
      if (!res.ok) throw new Error(res.statusText);
      valid = true;
    } catch (e) {
      console.error(
        "Invalid Github Access Token provided! Access token will not be used",
        e.message
      );
    }

    if (!valid) this.accessToken = null;
  }

  /**
   * Validates the repo URL, branch and access token, then marks the loader ready.
   * @returns {Promise<RepoLoader|undefined>} this instance, or undefined when
   *   the repo URL is not a valid GitHub URL (the loader stays not-ready).
   */
  async init() {
    if (!this.#validGithubUrl()) return;
    await this.#validBranch();
    await this.#validateAccessToken();
    this.ready = true;
    return this;
  }

  /**
   * Streams every document from the LangChain GitHub loader into an array.
   * @returns {Promise<Object[]>} documents yielded by the loader.
   * @throws {Error} when `init()` has not completed successfully.
   */
  async recursiveLoader() {
    if (!this.ready) throw new Error("[Github Loader]: not in ready state!");

    const {
      GithubRepoLoader: LCGithubLoader,
    } = require("langchain/document_loaders/web/github");

    if (this.accessToken)
      console.log(
        `[Github Loader]: Access token set! Recursive loading enabled!`
      );

    const loader = new LCGithubLoader(this.repo, {
      accessToken: this.accessToken,
      branch: this.branch,
      recursive: !!this.accessToken, // Recursive will hit rate limits.
      maxConcurrency: 5,
      unknown: "ignore",
      ignorePaths: this.ignorePaths,
    });

    const docs = [];
    for await (const doc of loader.loadAsStream()) docs.push(doc);
    return docs;
  }

  /**
   * Sort branches to always show either main or master at the top of the result.
   * @param {string[]} [branches]
   * @returns {string[]} a new array; the input is not mutated.
   */
  #branchPrefSort(branches = []) {
    const preferredSort = ["main", "master"];
    return branches.reduce((acc, branch) => {
      if (preferredSort.includes(branch)) return [branch, ...acc];
      return [...acc, branch];
    }, []);
  }

  /**
   * Get all branches for a given repo.
   *
   * Pages through the GitHub branches endpoint, stores the de-duplicated names
   * on `this.branches`, and returns them with main/master moved to the front.
   * A failed request is logged and ends paging; names collected so far are kept.
   * @returns {Promise<string[]>} branch names, or [] when the repo URL is invalid.
   */
  async getRepoBranches() {
    if (!this.#validGithubUrl() || !this.author || !this.project) return [];
    await this.#validateAccessToken(); // Ensure API access token is valid for pre-flight

    const PER_PAGE = 100;
    const branches = [];
    // GitHub pagination is 1-based; page=0 is served as page 1, so starting
    // at 0 would fetch the first page twice.
    let page = 1;
    let polling = true;

    while (polling) {
      console.log(`Fetching page ${page} of branches for ${this.project}`);
      try {
        const res = await fetch(
          `https://api.github.com/repos/${this.author}/${this.project}/branches?per_page=${PER_PAGE}&page=${page}`,
          {
            method: "GET",
            headers: {
              ...(this.accessToken
                ? { Authorization: `Bearer ${this.accessToken}` }
                : {}),
              "X-GitHub-Api-Version": "2022-11-28",
            },
          }
        );
        if (!res.ok)
          throw new Error(`Invalid request to Github API: ${res.statusText}`);

        const branchObjects = await res.json();
        if (!Array.isArray(branchObjects))
          throw new Error("Unexpected response shape from Github API");

        branches.push(...branchObjects.map((branch) => branch.name));
        // A short page means there is nothing left to fetch.
        polling = branchObjects.length === PER_PAGE;
        page++;
      } catch (err) {
        polling = false;
        console.log(`RepoLoader.branches`, err);
      }
    }

    this.branches = [...new Set(branches)];
    return this.#branchPrefSort(this.branches);
  }

  /**
   * Fetches one file from the repo via the GitHub contents API and returns its
   * text. Uses `this.author`, `this.project`, `this.branch` and, when set,
   * `this.accessToken`.
   * @param {string} sourceFilePath - Path of the file inside the repository.
   * @returns {Promise<string|null>} UTF-8 decoded file content, or null when the
   *   request fails or the response carries no file content (error is logged).
   */
  async fetchSingleFile(sourceFilePath) {
    try {
      // Encode each path segment (keeping "/" separators) and the ref so that
      // characters like "#", "?" or spaces cannot alter the request URL.
      const encodedPath = String(sourceFilePath)
        .split("/")
        .map(encodeURIComponent)
        .join("/");
      const encodedRef = encodeURIComponent(this.branch);

      // Awaiting here (rather than returning the promise) is what lets the
      // catch below handle network and HTTP failures.
      const res = await fetch(
        `https://api.github.com/repos/${this.author}/${this.project}/contents/${encodedPath}?ref=${encodedRef}`,
        {
          method: "GET",
          headers: {
            Accept: "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
            ...(this.accessToken
              ? { Authorization: `Bearer ${this.accessToken}` }
              : {}),
          },
        }
      );
      if (!res.ok)
        throw new Error(`Failed to fetch from Github API: ${res.statusText}`);

      const json = await res.json();
      if (Object.hasOwn(json, "status") || !Object.hasOwn(json, "content"))
        throw new Error(json?.message || "missing content");

      // Decode base64 as UTF-8; atob() would yield a Latin-1 binary string
      // and corrupt any non-ASCII characters in the file.
      return Buffer.from(json.content, "base64").toString("utf8");
    } catch (e) {
      console.error(`RepoLoader.fetchSingleFile`, e);
      return null;
    }
  }
}
// Expose the RepoLoader class as this CommonJS module's export.
module.exports = RepoLoader;