anything-llm/collector/utils/extensions/GithubRepo/index.js
2024-05-02 14:03:10 -07:00

85 lines
2.4 KiB
JavaScript

const RepoLoader = require("./RepoLoader");
const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
async function loadGithubRepo(args) {
const repo = new RepoLoader(args);
await repo.init();
if (!repo.ready)
return {
success: false,
reason: "Could not prepare Github repo for loading! Check URL",
};
console.log(
`-- Working Github ${repo.author}/${repo.project}:${repo.branch} --`
);
const docs = await repo.recursiveLoader();
if (!docs.length) {
return {
success: false,
reason: "No files were found for those settings.",
};
}
console.log(`[Github Loader]: Found ${docs.length} source files. Saving...`);
const outFolder = slugify(
`${repo.author}-${repo.project}-${repo.branch}-${v4().slice(0, 4)}`
).toLowerCase();
const outFolderPath =
process.env.NODE_ENV === "development"
? path.resolve(
__dirname,
`../../../../server/storage/documents/${outFolder}`
)
: path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
if (!fs.existsSync(outFolderPath))
fs.mkdirSync(outFolderPath, { recursive: true });
for (const doc of docs) {
if (!doc.pageContent) continue;
const data = {
id: v4(),
url: "github://" + doc.metadata.source,
title: doc.metadata.source,
docAuthor: repo.author,
description: "No description found.",
docSource: doc.metadata.source,
chunkSource: `link://${doc.metadata.repository}/blob/${doc.metadata.branch}/${doc.metadata.source}`,
published: new Date().toLocaleString(),
wordCount: doc.pageContent.split(" ").length,
pageContent: doc.pageContent,
token_count_estimate: tokenizeString(doc.pageContent).length,
};
console.log(
`[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
);
writeToServerDocuments(
data,
`${slugify(doc.metadata.source)}-${data.id}`,
outFolderPath
);
}
return {
success: true,
reason: null,
data: {
author: repo.author,
repo: repo.project,
branch: repo.branch,
files: docs.length,
destination: outFolder,
},
};
}
module.exports = loadGithubRepo;