mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-19 04:30:10 +01:00
Added an option to fetch issues from gitlab. Made the file fetching a… (#2335)
* Added an option to fetch issues from gitlab. Made the file fetching asynchronous to improve performance. #2334 * Fixed a typo in loadGitlabRepo. * Convert issues to markdown. * Fixed an issue with time estimate field names in issueToMarkdown. * handle rate limits more gracefully + update checkbox to toggle switch * lint --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com> Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
This commit is contained in:
parent
961b567541
commit
b2123b13b0
@ -6,6 +6,7 @@ const minimatch = require("minimatch");
|
||||
* @property {string} [branch] - The branch to load from (optional).
|
||||
* @property {string} [accessToken] - GitLab access token for authentication (optional).
|
||||
* @property {string[]} [ignorePaths] - Array of paths to ignore when loading (optional).
|
||||
* @property {boolean} [fetchIssues] - Should issues be fetched (optional).
|
||||
*/
|
||||
|
||||
/**
|
||||
@ -33,6 +34,7 @@ class GitLabRepoLoader {
|
||||
this.branch = args?.branch;
|
||||
this.accessToken = args?.accessToken || null;
|
||||
this.ignorePaths = args?.ignorePaths || [];
|
||||
this.withIssues = args?.fetchIssues || false;
|
||||
|
||||
this.projectId = null;
|
||||
this.apiBase = "https://gitlab.com";
|
||||
@ -123,22 +125,44 @@ class GitLabRepoLoader {
|
||||
|
||||
if (this.accessToken)
|
||||
console.log(
|
||||
`[Gitlab Loader]: Access token set! Recursive loading enabled!`
|
||||
`[Gitlab Loader]: Access token set! Recursive loading enabled for ${this.repo}!`
|
||||
);
|
||||
|
||||
const files = await this.fetchFilesRecursive();
|
||||
const docs = [];
|
||||
|
||||
console.log(`[Gitlab Loader]: Fetching files.`);
|
||||
|
||||
const files = await this.fetchFilesRecursive();
|
||||
|
||||
console.log(`[Gitlab Loader]: Fetched ${files.length} files.`);
|
||||
|
||||
for (const file of files) {
|
||||
if (this.ignorePaths.some((path) => file.path.includes(path))) continue;
|
||||
|
||||
const content = await this.fetchSingleFileContents(file.path);
|
||||
if (content) {
|
||||
docs.push({
|
||||
pageContent: content,
|
||||
metadata: { source: file.path },
|
||||
pageContent: file.content,
|
||||
metadata: {
|
||||
source: file.path,
|
||||
url: `${this.repo}/-/blob/${this.branch}/${file.path}`,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
if (this.withIssues) {
|
||||
console.log(`[Gitlab Loader]: Fetching issues.`);
|
||||
const issues = await this.fetchIssues();
|
||||
console.log(
|
||||
`[Gitlab Loader]: Fetched ${issues.length} issues with discussions.`
|
||||
);
|
||||
docs.push(
|
||||
...issues.map((issue) => ({
|
||||
issue,
|
||||
metadata: {
|
||||
source: `issue-${this.repo}-${issue.iid}`,
|
||||
url: issue.web_url,
|
||||
},
|
||||
}))
|
||||
);
|
||||
}
|
||||
|
||||
return docs;
|
||||
@ -160,51 +184,14 @@ class GitLabRepoLoader {
|
||||
if (!this.#validGitlabUrl() || !this.projectId) return [];
|
||||
await this.#validateAccessToken();
|
||||
this.branches = [];
|
||||
let fetching = true;
|
||||
let page = 1;
|
||||
let perPage = 50;
|
||||
|
||||
while (fetching) {
|
||||
try {
|
||||
const params = new URLSearchParams({
|
||||
per_page: perPage,
|
||||
page,
|
||||
});
|
||||
const response = await fetch(
|
||||
`${this.apiBase}/api/v4/projects/${
|
||||
this.projectId
|
||||
}/repository/branches?${params.toString()}`,
|
||||
{
|
||||
method: "GET",
|
||||
headers: {
|
||||
Accepts: "application/json",
|
||||
...(this.accessToken
|
||||
? { "PRIVATE-TOKEN": this.accessToken }
|
||||
: {}),
|
||||
},
|
||||
}
|
||||
)
|
||||
.then((res) => res.json())
|
||||
.then((branches) => {
|
||||
if (!Array.isArray(branches) || branches.length === 0) {
|
||||
fetching = false;
|
||||
return [];
|
||||
}
|
||||
return branches.map((b) => b.name);
|
||||
})
|
||||
.catch((e) => {
|
||||
console.error(e);
|
||||
fetching = false;
|
||||
return [];
|
||||
});
|
||||
const branchesRequestData = {
|
||||
endpoint: `/api/v4/projects/${this.projectId}/repository/branches`,
|
||||
};
|
||||
|
||||
this.branches.push(...response);
|
||||
page++;
|
||||
} catch (err) {
|
||||
console.log(`RepoLoader.getRepoBranches`, err);
|
||||
fetching = false;
|
||||
return [];
|
||||
}
|
||||
let branchesPage = [];
|
||||
while ((branchesPage = await this.fetchNextPage(branchesRequestData))) {
|
||||
this.branches.push(...branchesPage.map((branch) => branch.name));
|
||||
}
|
||||
return this.#branchPrefSort(this.branches);
|
||||
}
|
||||
@ -215,62 +202,96 @@ class GitLabRepoLoader {
|
||||
*/
|
||||
async fetchFilesRecursive() {
|
||||
const files = [];
|
||||
let perPage = 100;
|
||||
let fetching = true;
|
||||
let page = 1;
|
||||
|
||||
while (fetching) {
|
||||
try {
|
||||
const params = new URLSearchParams({
|
||||
const filesRequestData = {
|
||||
endpoint: `/api/v4/projects/${this.projectId}/repository/tree`,
|
||||
queryParams: {
|
||||
ref: this.branch,
|
||||
recursive: true,
|
||||
per_page: perPage,
|
||||
page,
|
||||
});
|
||||
const queryUrl = `${this.apiBase}/api/v4/projects/${
|
||||
this.projectId
|
||||
}/repository/tree?${params.toString()}`;
|
||||
const response = await fetch(queryUrl, {
|
||||
method: "GET",
|
||||
headers: this.accessToken
|
||||
? { "PRIVATE-TOKEN": this.accessToken }
|
||||
: {},
|
||||
});
|
||||
const totalPages = Number(response.headers.get("x-total-pages"));
|
||||
const nextPage = Number(response.headers.get("x-next-page"));
|
||||
const data = await response.json();
|
||||
},
|
||||
};
|
||||
|
||||
/** @type {FileTreeObject[]} */
|
||||
const objects = Array.isArray(data)
|
||||
? data.filter((item) => item.type === "blob")
|
||||
: []; // only get files, not paths or submodules
|
||||
|
||||
// Apply ignore path rules to found objects. If any rules match it is an invalid file path.
|
||||
console.log(
|
||||
`Found ${objects.length} blobs from repo from pg ${page}/${totalPages}`
|
||||
);
|
||||
for (const file of objects) {
|
||||
let filesPage = null;
|
||||
let pagePromises = [];
|
||||
while ((filesPage = await this.fetchNextPage(filesRequestData))) {
|
||||
// Fetch all the files that are not ignored in parallel.
|
||||
pagePromises = filesPage
|
||||
.filter((file) => {
|
||||
if (file.type !== "blob") {
|
||||
return false;
|
||||
}
|
||||
const isIgnored = this.ignorePaths.some((ignorePattern) =>
|
||||
minimatch(file.path, ignorePattern, { matchBase: true })
|
||||
);
|
||||
if (!isIgnored) files.push(file);
|
||||
}
|
||||
return !isIgnored;
|
||||
})
|
||||
.map(async (file) => {
|
||||
const content = await this.fetchSingleFileContents(file.path);
|
||||
if (!content) return null;
|
||||
return {
|
||||
path: file.path,
|
||||
content,
|
||||
};
|
||||
});
|
||||
|
||||
if (page === totalPages) {
|
||||
fetching = false;
|
||||
break;
|
||||
}
|
||||
const pageFiles = await Promise.all(pagePromises);
|
||||
|
||||
page = Number(nextPage);
|
||||
} catch (e) {
|
||||
console.error(`RepoLoader.getRepositoryTree`, e);
|
||||
fetching = false;
|
||||
break;
|
||||
}
|
||||
files.push(...pageFiles.filter((item) => item !== null));
|
||||
console.log(`Fetched ${files.length} files.`);
|
||||
}
|
||||
console.log(`Total files fetched: ${files.length}`);
|
||||
return files;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches all issues from the repository.
|
||||
* @returns {Promise<Issue[]>} An array of issue objects.
|
||||
*/
|
||||
async fetchIssues() {
|
||||
const issues = [];
|
||||
const issuesRequestData = {
|
||||
endpoint: `/api/v4/projects/${this.projectId}/issues`,
|
||||
};
|
||||
|
||||
let issuesPage = null;
|
||||
let pagePromises = [];
|
||||
while ((issuesPage = await this.fetchNextPage(issuesRequestData))) {
|
||||
// Fetch all the issues in parallel.
|
||||
pagePromises = issuesPage.map(async (issue) => {
|
||||
const discussionsRequestData = {
|
||||
endpoint: `/api/v4/projects/${this.projectId}/issues/${issue.iid}/discussions`,
|
||||
};
|
||||
let discussionPage = null;
|
||||
const discussions = [];
|
||||
|
||||
while (
|
||||
(discussionPage = await this.fetchNextPage(discussionsRequestData))
|
||||
) {
|
||||
discussions.push(
|
||||
...discussionPage.map(({ notes }) =>
|
||||
notes.map(
|
||||
({ body, author, created_at }) =>
|
||||
`${author.username} at ${created_at}:
|
||||
${body}`
|
||||
)
|
||||
)
|
||||
);
|
||||
}
|
||||
const result = {
|
||||
...issue,
|
||||
discussions,
|
||||
};
|
||||
return result;
|
||||
});
|
||||
|
||||
const pageIssues = await Promise.all(pagePromises);
|
||||
|
||||
issues.push(...pageIssues);
|
||||
console.log(`Fetched ${issues.length} issues.`);
|
||||
}
|
||||
console.log(`Total issues fetched: ${issues.length}`);
|
||||
return issues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches the content of a single file from the repository.
|
||||
* @param {string} sourceFilePath - The path to the file in the repository.
|
||||
@ -301,6 +322,59 @@ class GitLabRepoLoader {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches the next page of data from the API.
|
||||
* @param {Object} requestData - The request data.
|
||||
* @returns {Promise<Array<Object>|null>} The next page of data, or null if no more pages.
|
||||
*/
|
||||
async fetchNextPage(requestData) {
|
||||
try {
|
||||
if (requestData.page === -1) return null;
|
||||
if (!requestData.page) requestData.page = 1;
|
||||
|
||||
const { endpoint, perPage = 100, queryParams = {} } = requestData;
|
||||
const params = new URLSearchParams({
|
||||
...queryParams,
|
||||
per_page: perPage,
|
||||
page: requestData.page,
|
||||
});
|
||||
const url = `${this.apiBase}${endpoint}?${params.toString()}`;
|
||||
|
||||
const response = await fetch(url, {
|
||||
method: "GET",
|
||||
headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {},
|
||||
});
|
||||
|
||||
// Rate limits get hit very often if no PAT is provided
|
||||
if (response.status === 401) {
|
||||
console.warn(`Rate limit hit for ${endpoint}. Skipping.`);
|
||||
return null;
|
||||
}
|
||||
|
||||
const totalPages = Number(response.headers.get("x-total-pages"));
|
||||
const data = await response.json();
|
||||
if (!Array.isArray(data)) {
|
||||
console.warn(`Unexpected response format for ${endpoint}:`, data);
|
||||
return [];
|
||||
}
|
||||
|
||||
console.log(
|
||||
`Gitlab RepoLoader: fetched ${endpoint} page ${requestData.page}/${totalPages} with ${data.length} records.`
|
||||
);
|
||||
|
||||
if (totalPages === requestData.page) {
|
||||
requestData.page = -1;
|
||||
} else {
|
||||
requestData.page = Number(response.headers.get("x-next-page"));
|
||||
}
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
console.error(`RepoLoader.fetchNextPage`, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = GitLabRepoLoader;
|
||||
|
@ -50,13 +50,12 @@ async function loadGitlabRepo(args, response) {
|
||||
fs.mkdirSync(outFolderPath, { recursive: true });
|
||||
|
||||
for (const doc of docs) {
|
||||
if (!doc.pageContent) continue;
|
||||
if (!doc.metadata || (!doc.pageContent && !doc.issue)) continue;
|
||||
let pageContent = null;
|
||||
|
||||
const data = {
|
||||
id: v4(),
|
||||
url: "gitlab://" + doc.metadata.source,
|
||||
title: doc.metadata.source,
|
||||
docAuthor: repo.author,
|
||||
description: "No description found.",
|
||||
docSource: doc.metadata.source,
|
||||
chunkSource: generateChunkSource(
|
||||
repo,
|
||||
@ -64,13 +63,32 @@ async function loadGitlabRepo(args, response) {
|
||||
response.locals.encryptionWorker
|
||||
),
|
||||
published: new Date().toLocaleString(),
|
||||
wordCount: doc.pageContent.split(" ").length,
|
||||
pageContent: doc.pageContent,
|
||||
token_count_estimate: tokenizeString(doc.pageContent).length,
|
||||
};
|
||||
|
||||
if (doc.pageContent) {
|
||||
pageContent = doc.pageContent;
|
||||
|
||||
data.title = doc.metadata.source;
|
||||
data.docAuthor = repo.author;
|
||||
data.description = "No description found.";
|
||||
} else if (doc.issue) {
|
||||
pageContent = issueToMarkdown(doc.issue);
|
||||
|
||||
data.title = `Issue ${doc.issue.iid}: ${doc.issue.title}`;
|
||||
data.docAuthor = doc.issue.author.username;
|
||||
data.description = doc.issue.description;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
data.wordCount = pageContent.split(" ").length;
|
||||
data.token_count_estimate = tokenizeString(pageContent).length;
|
||||
data.pageContent = pageContent;
|
||||
|
||||
console.log(
|
||||
`[GitLab Loader]: Saving ${doc.metadata.source} to ${outFolder}`
|
||||
);
|
||||
|
||||
writeToServerDocuments(
|
||||
data,
|
||||
`${slugify(doc.metadata.source)}-${data.id}`,
|
||||
@ -142,4 +160,93 @@ function generateChunkSource(repo, doc, encryptionWorker) {
|
||||
)}`;
|
||||
}
|
||||
|
||||
/**
 * Renders a GitLab issue object as a Markdown document.
 *
 * The document consists of a title heading (`# <title> (<iid>)`), the issue
 * description, a "Metadata" bullet list and, when the issue has discussion
 * notes, an "Activity" section.
 *
 * Metadata entries whose value is falsy or an empty array/string are omitted.
 * Array values are rendered as a nested bullet list.
 *
 * @param {Object} issue - Issue object as returned by the GitLab issues API.
 * @param {Array<string|string[]>} [issue.discussions] - Formatted discussion
 *   notes; both a flat list of strings and one array of strings per discussion
 *   are accepted.
 * @returns {string} The Markdown representation of the issue.
 */
function issueToMarkdown(issue) {
  const metadata = {};

  // User-like fields are reduced to usernames (single user or list of users).
  const userFields = ["author", "assignees", "closed_by"];
  const userToUsername = ({ username }) => username;
  for (const userField of userFields) {
    const value = issue[userField];
    if (!value) continue;
    metadata[userField] = Array.isArray(value)
      ? value.map(userToUsername)
      : userToUsername(value);
  }

  const singleValueFields = [
    "web_url",
    "state",
    "created_at",
    "updated_at",
    "closed_at",
    "due_date",
    "type",
    "merge_request_count",
    "upvotes",
    "downvotes",
    "labels",
    "has_tasks",
    "task_status",
    "confidential",
    "severity",
  ];
  for (const singleValueField of singleValueFields) {
    metadata[singleValueField] = issue[singleValueField];
  }

  if (issue.milestone) {
    metadata.milestone = `${issue.milestone.title} (${issue.milestone.id})`;
  }

  if (issue.time_stats) {
    // The human-readable variants live under `human_<field>` in time_stats.
    for (const timeField of ["time_estimate", "total_time_spent"]) {
      const humanValue = issue.time_stats[`human_${timeField}`];
      if (humanValue) metadata[timeField] = humanValue;
    }
  }

  const metadataString = Object.entries(metadata)
    .filter(([, value]) => value && !(value.length < 1))
    .map(([name, value]) => {
      // Replace every underscore, not just the first one
      // ("merge_request_count" -> "merge request count").
      const label = `- ${name.replace(/_/g, " ")}:`;
      return Array.isArray(value)
        ? `${label}\n${value.map((s) => `  - ${s}`).join("\n")}`
        : `${label} ${value}`;
    })
    .join("\n");

  let markdown = `# ${issue.title} (${issue.iid})

${issue.description ?? ""}

## Metadata

${metadataString}`;

  // Discussions may be missing, or may hold one array of notes per thread;
  // flatten so that every note becomes its own paragraph instead of inner
  // arrays being stringified with commas.
  const activity = Array.isArray(issue.discussions)
    ? issue.discussions.flat()
    : [];

  if (activity.length > 0) {
    markdown += `

## Activity

${activity.join("\n\n")}
`;
  }

  return markdown;
}
|
||||
|
||||
module.exports = { loadGitlabRepo, fetchGitlabFile };
|
||||
|
@ -34,6 +34,7 @@ export default function GitlabOptions() {
|
||||
accessToken: form.get("accessToken"),
|
||||
branch: form.get("branch"),
|
||||
ignorePaths: ignores,
|
||||
fetchIssues: form.get("fetchIssues"),
|
||||
});
|
||||
|
||||
if (!!error) {
|
||||
@ -112,6 +113,30 @@ export default function GitlabOptions() {
|
||||
onBlur={() => setSettings({ ...settings, accessToken })}
|
||||
/>
|
||||
</div>
|
||||
<div className="flex flex-col pr-10">
|
||||
<div className="flex flex-col gap-y-1 mb-4">
|
||||
<label className="text-white font-bold text-sm flex gap-x-2 items-center">
|
||||
<p className="font-bold text-white">Settings</p>{" "}
|
||||
</label>
|
||||
<p className="text-xs font-normal text-white/50">
|
||||
Select additional entities to fetch from the GitLab API.
|
||||
</p>
|
||||
</div>
|
||||
<div className="flex items-center gap-x-2">
|
||||
<label className="relative inline-flex cursor-pointer items-center">
|
||||
<input
|
||||
type="checkbox"
|
||||
name="fetchIssues"
|
||||
value={true}
|
||||
className="peer sr-only"
|
||||
/>
|
||||
<div className="pointer-events-none peer h-6 w-11 rounded-full bg-stone-400 after:absolute after:left-[2px] after:top-[2px] after:h-5 after:w-5 after:rounded-full after:shadow-xl after:border after:border-gray-600 after:bg-white after:box-shadow-md after:transition-all after:content-[''] peer-checked:bg-lime-300 peer-checked:after:translate-x-full peer-checked:after:border-white peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-blue-800"></div>
|
||||
<span className="ml-3 text-sm font-medium text-white">
|
||||
Fetch Issues as Documents
|
||||
</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<GitLabBranchSelection
|
||||
repo={settings.repo}
|
||||
accessToken={settings.accessToken}
|
||||
|
@ -64,11 +64,23 @@ const DataConnector = {
|
||||
return { branches: [], error: e.message };
|
||||
});
|
||||
},
|
||||
collect: async function ({ repo, accessToken, branch, ignorePaths = [] }) {
|
||||
collect: async function ({
|
||||
repo,
|
||||
accessToken,
|
||||
branch,
|
||||
ignorePaths = [],
|
||||
fetchIssues = false,
|
||||
}) {
|
||||
return await fetch(`${API_BASE}/ext/gitlab/repo`, {
|
||||
method: "POST",
|
||||
headers: baseHeaders(),
|
||||
body: JSON.stringify({ repo, accessToken, branch, ignorePaths }),
|
||||
body: JSON.stringify({
|
||||
repo,
|
||||
accessToken,
|
||||
branch,
|
||||
ignorePaths,
|
||||
fetchIssues,
|
||||
}),
|
||||
})
|
||||
.then((res) => res.json())
|
||||
.then((res) => {
|
||||
|
Loading…
Reference in New Issue
Block a user