anything-llm/collector/utils/extensions/RepoLoader/GithubRepo/index.js
Timothy Carambat 42235fcd8a
GitLab Hosted and Local Connector (#1932)
* Add support for GitLab repo collection as well as Github Repo collection
* Refactor for repo collectors to be more compact

---------

Co-authored-by: Emil Rofors <emirof@gmail.com>
2024-07-23 12:23:51 -07:00

160 lines
4.4 KiB
JavaScript

const RepoLoader = require("./RepoLoader");
const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../../files");
const { tokenizeString } = require("../../../tokenizer");
/**
* Load in a Github Repo recursively or just the top level if no PAT is provided
* @param {object} args - forwarded request body params
* @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
* @returns
*/
async function loadGithubRepo(args, response) {
const repo = new RepoLoader(args);
await repo.init();
if (!repo.ready)
return {
success: false,
reason: "Could not prepare Github repo for loading! Check URL",
};
console.log(
`-- Working Github ${repo.author}/${repo.project}:${repo.branch} --`
);
const docs = await repo.recursiveLoader();
if (!docs.length) {
return {
success: false,
reason: "No files were found for those settings.",
};
}
console.log(`[Github Loader]: Found ${docs.length} source files. Saving...`);
const outFolder = slugify(
`${repo.author}-${repo.project}-${repo.branch}-${v4().slice(0, 4)}`
).toLowerCase();
const outFolderPath =
process.env.NODE_ENV === "development"
? path.resolve(
__dirname,
`../../../../../server/storage/documents/${outFolder}`
)
: path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
if (!fs.existsSync(outFolderPath))
fs.mkdirSync(outFolderPath, { recursive: true });
for (const doc of docs) {
if (!doc.pageContent) continue;
const data = {
id: v4(),
url: "github://" + doc.metadata.source,
title: doc.metadata.source,
docAuthor: repo.author,
description: "No description found.",
docSource: doc.metadata.source,
chunkSource: generateChunkSource(
repo,
doc,
response.locals.encryptionWorker
),
published: new Date().toLocaleString(),
wordCount: doc.pageContent.split(" ").length,
pageContent: doc.pageContent,
token_count_estimate: tokenizeString(doc.pageContent).length,
};
console.log(
`[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
);
writeToServerDocuments(
data,
`${slugify(doc.metadata.source)}-${data.id}`,
outFolderPath
);
}
return {
success: true,
reason: null,
data: {
author: repo.author,
repo: repo.project,
branch: repo.branch,
files: docs.length,
destination: outFolder,
},
};
}
/**
* Gets the page content from a specific source file in a give Github Repo, not all items in a repo.
* @returns
*/
async function fetchGithubFile({
repoUrl,
branch,
accessToken = null,
sourceFilePath,
}) {
const repo = new RepoLoader({
repo: repoUrl,
branch,
accessToken,
});
await repo.init();
if (!repo.ready)
return {
success: false,
content: null,
reason: "Could not prepare Github repo for loading! Check URL or PAT.",
};
console.log(
`-- Working Github ${repo.author}/${repo.project}:${repo.branch} file:${sourceFilePath} --`
);
const fileContent = await repo.fetchSingleFile(sourceFilePath);
if (!fileContent) {
return {
success: false,
reason: "Target file returned a null content response.",
content: null,
};
}
return {
success: true,
reason: null,
content: fileContent,
};
}
/**
* Generate the full chunkSource for a specific file so that we can resync it later.
* This data is encrypted into a single `payload` query param so we can replay credentials later
* since this was encrypted with the systems persistent password and salt.
* @param {RepoLoader} repo
* @param {import("@langchain/core/documents").Document} doc
* @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
* @returns {string}
*/
function generateChunkSource(repo, doc, encryptionWorker) {
const payload = {
owner: repo.author,
project: repo.project,
branch: repo.branch,
path: doc.metadata.source,
pat: !!repo.accessToken ? repo.accessToken : null,
};
return `github://${repo.repo}?payload=${encryptionWorker.encrypt(
JSON.stringify(payload)
)}`;
}
module.exports = { loadGithubRepo, fetchGithubFile };