// Source file: anything-llm/collector/utils/extensions/GithubRepo/RepoLoader/index.js
// (JavaScript, 150 lines, 4.3 KiB)

/**
 * Validates a GitHub repository reference (URL, branch, access token) and
 * loads its files as documents through LangChain's GithubRepoLoader.
 *
 * Typical usage: `const loader = await new RepoLoader(args).init();` and, when
 * `loader?.ready` is true, `await loader.recursiveLoader()`.
 */
class RepoLoader {
  /**
   * @param {Object} [args]
   * @param {string} [args.repo] - Repository URL: https://github.com/{author}/{project}
   * @param {string} [args.branch] - Branch to load; auto-assigned by init() when missing or unknown.
   * @param {string|null} [args.accessToken] - GitHub access token (optional).
   * @param {string[]} [args.ignorePaths] - Paths handed to the LangChain loader as `ignorePaths`.
   */
  constructor(args = {}) {
    this.ready = false;
    this.repo = args?.repo;
    this.branch = args?.branch;
    this.accessToken = args?.accessToken || null;
    this.ignorePaths = args?.ignorePaths || [];
    this.author = null;
    this.project = null;
    this.branches = [];
  }

  /**
   * Checks that `this.repo` is an https://github.com/{author}/{project} URL and,
   * when it is, stores the parsed `author` and `project` on the instance.
   *
   * Uses the built-in URL parser so that project names containing characters
   * such as "." (e.g. "next.js") are accepted, and so that a URL missing the
   * author or project segment is rejected instead of half-matching.
   *
   * @returns {boolean} true when the URL is a usable GitHub repository URL.
   */
  #validGithubUrl() {
    let url;
    try {
      url = new URL(this.repo);
    } catch {
      // Not a parseable absolute URL (includes undefined/null repo).
      return false;
    }
    if (url.protocol !== "https:" || url.hostname !== "github.com") return false;

    // filter(Boolean) drops the leading empty segment and tolerates a trailing slash.
    const [author, project, ...rest] = url.pathname.split("/").filter(Boolean);
    if (!author || !project || rest.length > 0) return false;

    this.author = author;
    this.project = project;
    return true;
  }

  /**
   * Ensure the branch provided actually exists and, if it does not or has not
   * been set, auto-assign one: "main", then "master", then the first branch
   * reported by GitHub. Falls back to "master" when no branches could be listed.
   *
   * @returns {Promise<void>}
   */
  async #validBranch() {
    await this.getRepoBranches();
    if (!!this.branch && this.branches.includes(this.branch)) return;

    console.log(
      this.branch
        ? `[Github Loader]: Branch ${this.branch} not found! Auto-assigning to a default branch.`
        : "[Github Loader]: Branch not set! Auto-assigning to a default branch."
    );
    const preferred = ["main", "master"].find((name) =>
      this.branches.includes(name)
    );
    // Prefer a branch that is known to exist over a blind "master" guess.
    this.branch = preferred ?? this.branches[0] ?? "master";
    console.log(`[Github Loader]: Branch auto-assigned to ${this.branch}.`);
  }

  /**
   * Performs an authenticated request against the GitHub API to check the
   * access token. On any failure (non-OK response or request error) the token
   * is discarded (`this.accessToken = null`) so later calls run unauthenticated.
   *
   * @returns {Promise<void>}
   */
  async #validateAccessToken() {
    if (!this.accessToken) return;
    try {
      const res = await fetch("https://api.github.com/octocat", {
        method: "GET",
        headers: {
          Authorization: `Bearer ${this.accessToken}`,
          "X-GitHub-Api-Version": "2022-11-28",
        },
      });
      if (!res.ok) throw new Error(res.statusText);
    } catch (e) {
      console.error(
        "Invalid Github Access Token provided! Access token will not be used",
        e.message
      );
      this.accessToken = null;
    }
  }

  /**
   * Validates the repo URL, branch and access token, then marks the loader ready.
   *
   * @returns {Promise<RepoLoader|undefined>} this instance, or undefined when
   *   the repo URL is not a valid GitHub repository URL (`ready` stays false).
   */
  async init() {
    if (!this.#validGithubUrl()) return;
    await this.#validBranch();
    await this.#validateAccessToken();
    this.ready = true;
    return this;
  }

  /**
   * Streams every document from the LangChain GitHub loader into an array.
   * Recursive directory traversal is only enabled when an access token is
   * available, because unauthenticated recursion will hit rate limits.
   *
   * @returns {Promise<Object[]>} documents yielded by the LangChain loader.
   * @throws {Error} when init() has not successfully completed.
   */
  async recursiveLoader() {
    if (!this.ready) throw new Error("[Github Loader]: not in ready state!");
    const {
      GithubRepoLoader: LCGithubLoader,
    } = require("langchain/document_loaders/web/github");

    if (this.accessToken)
      console.log(
        `[Github Loader]: Access token set! Recursive loading enabled!`
      );

    const loader = new LCGithubLoader(this.repo, {
      accessToken: this.accessToken,
      branch: this.branch,
      recursive: !!this.accessToken, // Recursive will hit rate limits.
      maxConcurrency: 5,
      unknown: "ignore",
      ignorePaths: this.ignorePaths,
    });

    const docs = [];
    for await (const doc of loader.loadAsStream()) docs.push(doc);
    return docs;
  }

  /**
   * Sort branches to always show main, then master, at the top of the result.
   * All other branches keep their incoming relative order.
   *
   * @param {string[]} [branches]
   * @returns {string[]} a new array; the input is not mutated.
   */
  #branchPrefSort(branches = []) {
    const preferredSort = ["main", "master"];
    const preferred = preferredSort.filter((name) => branches.includes(name));
    const others = branches.filter((name) => !preferredSort.includes(name));
    return [...preferred, ...others];
  }

  /**
   * Get all branches for a given repo.
   *
   * Pages through the GitHub "list branches" endpoint, stores the unique
   * branch names on `this.branches`, and returns them with main/master first.
   * A failed request stops paging; branches collected so far are kept.
   *
   * @returns {Promise<string[]>} branch names, or [] when the repo URL is invalid.
   */
  async getRepoBranches() {
    if (!this.#validGithubUrl() || !this.author || !this.project) return [];
    await this.#validateAccessToken(); // Ensure API access token is valid for pre-flight

    const perPage = 100;
    const branches = [];
    let page = 1; // GitHub pagination is 1-based; page 0 just repeats page 1.
    let polling = true;

    while (polling) {
      console.log(`Fetching page ${page} of branches for ${this.project}`);
      try {
        const res = await fetch(
          `https://api.github.com/repos/${this.author}/${this.project}/branches?per_page=${perPage}&page=${page}`,
          {
            method: "GET",
            headers: {
              ...(this.accessToken
                ? { Authorization: `Bearer ${this.accessToken}` }
                : {}),
              "X-GitHub-Api-Version": "2022-11-28",
            },
          }
        );
        if (!res.ok)
          throw new Error(`Invalid request to Github API: ${res.statusText}`);

        const branchObjects = await res.json();
        if (!Array.isArray(branchObjects))
          throw new Error(
            "Invalid request to Github API: unexpected response body"
          );

        branches.push(...branchObjects.map((branch) => branch.name));
        // A short page is the last page, so no extra empty request is needed.
        polling = branchObjects.length === perPage;
        page++;
      } catch (err) {
        polling = false;
        console.log(`RepoLoader.branches`, err);
      }
    }

    this.branches = [...new Set(branches)];
    return this.#branchPrefSort(this.branches);
  }
}
// CommonJS export: the RepoLoader class is this module's sole export.
module.exports = RepoLoader;