const RepoLoader = require("./RepoLoader");
const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../../files");
const { tokenizeString } = require("../../../tokenizer");

/**
 * Load in a Github Repo recursively, or just the top level if no PAT is provided.
 * @param {object} args - forwarded request body params
 * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
 * @returns {Promise<{success: boolean, reason: string|null, data?: object}>}
 */
async function loadGithubRepo(args, response) {
  const repo = new RepoLoader(args);
  await repo.init();

  if (!repo.ready)
    return {
      success: false,
      reason: "Could not prepare Github repo for loading! Check URL",
    };

  console.log(
    `-- Working Github ${repo.author}/${repo.project}:${repo.branch} --`
  );
  const docs = await repo.recursiveLoader();
  if (!docs.length) {
    return {
      success: false,
      reason: "No files were found for those settings.",
    };
  }

  console.log(`[Github Loader]: Found ${docs.length} source files. Saving...`);
  const outFolder = slugify(
    `${repo.author}-${repo.project}-${repo.branch}-${v4().slice(0, 4)}`
  ).toLowerCase();

  // In development, documents are written beside the monorepo's server;
  // otherwise they go to the configured STORAGE_DIR.
  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../../../server/storage/documents/${outFolder}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

  if (!fs.existsSync(outFolderPath))
    fs.mkdirSync(outFolderPath, { recursive: true });

  // Write each non-empty source file to the server's documents folder in
  // the standard document metadata shape.
  for (const doc of docs) {
    if (!doc.pageContent) continue;
    const data = {
      id: v4(),
      url: "github://" + doc.metadata.source,
      title: doc.metadata.source,
      docAuthor: repo.author,
      description: "No description found.",
      docSource: doc.metadata.source,
      chunkSource: generateChunkSource(
        repo,
        doc,
        response.locals.encryptionWorker
      ),
      published: new Date().toLocaleString(),
      wordCount: doc.pageContent.split(" ").length,
      pageContent: doc.pageContent,
      token_count_estimate: tokenizeString(doc.pageContent).length,
    };
    console.log(
      `[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
    );
    writeToServerDocuments(
      data,
      `${slugify(doc.metadata.source)}-${data.id}`,
      outFolderPath
    );
  }

  return {
    success: true,
    reason: null,
    data: {
      author: repo.author,
      repo: repo.project,
      branch: repo.branch,
      files: docs.length,
      destination: outFolder,
    },
  };
}
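
// Usage sketch (illustrative only, not part of this module): a collector
// endpoint wired through the setDataSigner middleware could invoke
// loadGithubRepo like below. The route path and request body shape are
// assumptions for illustration.
//
//   app.post("/ext/github-repo", async (request, response) => {
//     const result = await loadGithubRepo(request.body, response);
//     response.status(200).json(result);
//   });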

/**
 * Gets the page content from a specific source file in a given Github Repo, not all items in a repo.
 * @returns {Promise<{success: boolean, reason: string|null, content: string|null}>}
 */
async function fetchGithubFile({
  repoUrl,
  branch,
  accessToken = null,
  sourceFilePath,
}) {
  const repo = new RepoLoader({
    repo: repoUrl,
    branch,
    accessToken,
  });
  await repo.init();

  if (!repo.ready)
    return {
      success: false,
      content: null,
      reason: "Could not prepare Github repo for loading! Check URL or PAT.",
    };

  console.log(
    `-- Working Github ${repo.author}/${repo.project}:${repo.branch} file:${sourceFilePath} --`
  );
  const fileContent = await repo.fetchSingleFile(sourceFilePath);
  if (!fileContent) {
    return {
      success: false,
      reason: "Target file returned a null content response.",
      content: null,
    };
  }

  return {
    success: true,
    reason: null,
    content: fileContent,
  };
}
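
// Usage sketch (illustrative only): fetching a single file's raw content,
// e.g. when resyncing one document. The repo URL, branch, and file path
// below are placeholder values.
//
//   const { success, content, reason } = await fetchGithubFile({
//     repoUrl: "https://github.com/Mintplex-Labs/anything-llm",
//     branch: "master",
//     accessToken: null, // or a PAT for private repositories
//     sourceFilePath: "README.md",
//   });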

/**
 * Generate the full chunkSource for a specific file so that we can resync it later.
 * This data is encrypted into a single `payload` query param so we can replay
 * credentials later, since it was encrypted with the system's persistent password and salt.
 * @param {RepoLoader} repo
 * @param {import("@langchain/core/documents").Document} doc
 * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
 * @returns {string}
 */
function generateChunkSource(repo, doc, encryptionWorker) {
  const payload = {
    owner: repo.author,
    project: repo.project,
    branch: repo.branch,
    path: doc.metadata.source,
    pat: !!repo.accessToken ? repo.accessToken : null,
  };
  return `github://${repo.repo}?payload=${encryptionWorker.encrypt(
    JSON.stringify(payload)
  )}`;
}
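
// Resync sketch (illustrative only): a consumer of chunkSource would parse
// the `payload` query param and decrypt it to recover the repo coordinates
// and PAT. This assumes EncryptionWorker exposes a decrypt method that is
// the inverse of the encrypt call used above.
//
//   const { searchParams } = new URL(chunkSource); // github:// parses as a valid URL in Node
//   const payload = JSON.parse(encryptionWorker.decrypt(searchParams.get("payload")));
//   // payload -> { owner, project, branch, path, pat }
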
module.exports = { loadGithubRepo, fetchGithubFile };