Added an option to fetch issues from gitlab. Made the file fetching a… (#2335)

* Added an option to fetch issues from gitlab. Made the file fetching asynchronous to improve performance. #2334

* Fixed a typo in loadGitlabRepo.

* Convert issues to markdown.

* Fixed an issue with time estimate field names in issueToMarkdown.

* handle rate limits more gracefully + update checkbox to toggle switch

* lint

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
This commit is contained in:
Blazej Owczarczyk 2024-09-26 20:45:18 +02:00 committed by GitHub
parent 961b567541
commit b2123b13b0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 325 additions and 107 deletions

View File

@ -6,6 +6,7 @@ const minimatch = require("minimatch");
* @property {string} [branch] - The branch to load from (optional).
* @property {string} [accessToken] - GitLab access token for authentication (optional).
* @property {string[]} [ignorePaths] - Array of paths to ignore when loading (optional).
* @property {boolean} [fetchIssues] - Should issues be fetched (optional).
*/
/**
@ -33,6 +34,7 @@ class GitLabRepoLoader {
this.branch = args?.branch;
this.accessToken = args?.accessToken || null;
this.ignorePaths = args?.ignorePaths || [];
this.withIssues = args?.fetchIssues || false;
this.projectId = null;
this.apiBase = "https://gitlab.com";
@ -123,22 +125,44 @@ class GitLabRepoLoader {
if (this.accessToken)
console.log(
`[Gitlab Loader]: Access token set! Recursive loading enabled!`
`[Gitlab Loader]: Access token set! Recursive loading enabled for ${this.repo}!`
);
const files = await this.fetchFilesRecursive();
const docs = [];
console.log(`[Gitlab Loader]: Fetching files.`);
const files = await this.fetchFilesRecursive();
console.log(`[Gitlab Loader]: Fetched ${files.length} files.`);
for (const file of files) {
if (this.ignorePaths.some((path) => file.path.includes(path))) continue;
const content = await this.fetchSingleFileContents(file.path);
if (content) {
docs.push({
pageContent: content,
metadata: { source: file.path },
});
}
docs.push({
pageContent: file.content,
metadata: {
source: file.path,
url: `${this.repo}/-/blob/${this.branch}/${file.path}`,
},
});
}
if (this.withIssues) {
console.log(`[Gitlab Loader]: Fetching issues.`);
const issues = await this.fetchIssues();
console.log(
`[Gitlab Loader]: Fetched ${issues.length} issues with discussions.`
);
docs.push(
...issues.map((issue) => ({
issue,
metadata: {
source: `issue-${this.repo}-${issue.iid}`,
url: issue.web_url,
},
}))
);
}
return docs;
@ -160,51 +184,14 @@ class GitLabRepoLoader {
if (!this.#validGitlabUrl() || !this.projectId) return [];
await this.#validateAccessToken();
this.branches = [];
let fetching = true;
let page = 1;
let perPage = 50;
while (fetching) {
try {
const params = new URLSearchParams({
per_page: perPage,
page,
});
const response = await fetch(
`${this.apiBase}/api/v4/projects/${
this.projectId
}/repository/branches?${params.toString()}`,
{
method: "GET",
headers: {
Accepts: "application/json",
...(this.accessToken
? { "PRIVATE-TOKEN": this.accessToken }
: {}),
},
}
)
.then((res) => res.json())
.then((branches) => {
if (!Array.isArray(branches) || branches.length === 0) {
fetching = false;
return [];
}
return branches.map((b) => b.name);
})
.catch((e) => {
console.error(e);
fetching = false;
return [];
});
const branchesRequestData = {
endpoint: `/api/v4/projects/${this.projectId}/repository/branches`,
};
this.branches.push(...response);
page++;
} catch (err) {
console.log(`RepoLoader.getRepoBranches`, err);
fetching = false;
return [];
}
let branchesPage = [];
while ((branchesPage = await this.fetchNextPage(branchesRequestData))) {
this.branches.push(...branchesPage.map((branch) => branch.name));
}
return this.#branchPrefSort(this.branches);
}
@ -215,62 +202,96 @@ class GitLabRepoLoader {
*/
async fetchFilesRecursive() {
const files = [];
let perPage = 100;
let fetching = true;
let page = 1;
const filesRequestData = {
endpoint: `/api/v4/projects/${this.projectId}/repository/tree`,
queryParams: {
ref: this.branch,
recursive: true,
},
};
while (fetching) {
try {
const params = new URLSearchParams({
ref: this.branch,
recursive: true,
per_page: perPage,
page,
});
const queryUrl = `${this.apiBase}/api/v4/projects/${
this.projectId
}/repository/tree?${params.toString()}`;
const response = await fetch(queryUrl, {
method: "GET",
headers: this.accessToken
? { "PRIVATE-TOKEN": this.accessToken }
: {},
});
const totalPages = Number(response.headers.get("x-total-pages"));
const nextPage = Number(response.headers.get("x-next-page"));
const data = await response.json();
/** @type {FileTreeObject[]} */
const objects = Array.isArray(data)
? data.filter((item) => item.type === "blob")
: []; // only get files, not paths or submodules
// Apply ignore path rules to found objects. If any rules match it is an invalid file path.
console.log(
`Found ${objects.length} blobs from repo from pg ${page}/${totalPages}`
);
for (const file of objects) {
let filesPage = null;
let pagePromises = [];
while ((filesPage = await this.fetchNextPage(filesRequestData))) {
// Fetch all the files that are not ignored in parallel.
pagePromises = filesPage
.filter((file) => {
if (file.type !== "blob") {
return false;
}
const isIgnored = this.ignorePaths.some((ignorePattern) =>
minimatch(file.path, ignorePattern, { matchBase: true })
);
if (!isIgnored) files.push(file);
}
return !isIgnored;
})
.map(async (file) => {
const content = await this.fetchSingleFileContents(file.path);
if (!content) return null;
return {
path: file.path,
content,
};
});
if (page === totalPages) {
fetching = false;
break;
}
const pageFiles = await Promise.all(pagePromises);
page = Number(nextPage);
} catch (e) {
console.error(`RepoLoader.getRepositoryTree`, e);
fetching = false;
break;
}
files.push(...pageFiles.filter((item) => item !== null));
console.log(`Fetched ${files.length} files.`);
}
console.log(`Total files fetched: ${files.length}`);
return files;
}
/**
* Fetches all issues from the repository.
* @returns {Promise<Issue[]>} An array of issue objects.
*/
async fetchIssues() {
const issues = [];
const issuesRequestData = {
endpoint: `/api/v4/projects/${this.projectId}/issues`,
};
let issuesPage = null;
let pagePromises = [];
while ((issuesPage = await this.fetchNextPage(issuesRequestData))) {
// Fetch all the issues in parallel.
pagePromises = issuesPage.map(async (issue) => {
const discussionsRequestData = {
endpoint: `/api/v4/projects/${this.projectId}/issues/${issue.iid}/discussions`,
};
let discussionPage = null;
const discussions = [];
while (
(discussionPage = await this.fetchNextPage(discussionsRequestData))
) {
discussions.push(
...discussionPage.map(({ notes }) =>
notes.map(
({ body, author, created_at }) =>
`${author.username} at ${created_at}:
${body}`
)
)
);
}
const result = {
...issue,
discussions,
};
return result;
});
const pageIssues = await Promise.all(pagePromises);
issues.push(...pageIssues);
console.log(`Fetched ${issues.length} issues.`);
}
console.log(`Total issues fetched: ${issues.length}`);
return issues;
}
/**
* Fetches the content of a single file from the repository.
* @param {string} sourceFilePath - The path to the file in the repository.
@ -301,6 +322,59 @@ class GitLabRepoLoader {
return null;
}
}
/**
* Fetches the next page of data from the API.
* @param {Object} requestData - The request data.
* @returns {Promise<Array<Object>|null>} The next page of data, or null if no more pages.
*/
async fetchNextPage(requestData) {
try {
if (requestData.page === -1) return null;
if (!requestData.page) requestData.page = 1;
const { endpoint, perPage = 100, queryParams = {} } = requestData;
const params = new URLSearchParams({
...queryParams,
per_page: perPage,
page: requestData.page,
});
const url = `${this.apiBase}${endpoint}?${params.toString()}`;
const response = await fetch(url, {
method: "GET",
headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {},
});
// Rate limits get hit very often if no PAT is provided
if (response.status === 401) {
console.warn(`Rate limit hit for ${endpoint}. Skipping.`);
return null;
}
const totalPages = Number(response.headers.get("x-total-pages"));
const data = await response.json();
if (!Array.isArray(data)) {
console.warn(`Unexpected response format for ${endpoint}:`, data);
return [];
}
console.log(
`Gitlab RepoLoader: fetched ${endpoint} page ${requestData.page}/${totalPages} with ${data.length} records.`
);
if (totalPages === requestData.page) {
requestData.page = -1;
} else {
requestData.page = Number(response.headers.get("x-next-page"));
}
return data;
} catch (e) {
console.error(`RepoLoader.fetchNextPage`, e);
return null;
}
}
}
module.exports = GitLabRepoLoader;

View File

@ -50,13 +50,12 @@ async function loadGitlabRepo(args, response) {
fs.mkdirSync(outFolderPath, { recursive: true });
for (const doc of docs) {
if (!doc.pageContent) continue;
if (!doc.metadata || (!doc.pageContent && !doc.issue)) continue;
let pageContent = null;
const data = {
id: v4(),
url: "gitlab://" + doc.metadata.source,
title: doc.metadata.source,
docAuthor: repo.author,
description: "No description found.",
docSource: doc.metadata.source,
chunkSource: generateChunkSource(
repo,
@ -64,13 +63,32 @@ async function loadGitlabRepo(args, response) {
response.locals.encryptionWorker
),
published: new Date().toLocaleString(),
wordCount: doc.pageContent.split(" ").length,
pageContent: doc.pageContent,
token_count_estimate: tokenizeString(doc.pageContent).length,
};
if (doc.pageContent) {
pageContent = doc.pageContent;
data.title = doc.metadata.source;
data.docAuthor = repo.author;
data.description = "No description found.";
} else if (doc.issue) {
pageContent = issueToMarkdown(doc.issue);
data.title = `Issue ${doc.issue.iid}: ${doc.issue.title}`;
data.docAuthor = doc.issue.author.username;
data.description = doc.issue.description;
} else {
continue;
}
data.wordCount = pageContent.split(" ").length;
data.token_count_estimate = tokenizeString(pageContent).length;
data.pageContent = pageContent;
console.log(
`[GitLab Loader]: Saving ${doc.metadata.source} to ${outFolder}`
);
writeToServerDocuments(
data,
`${slugify(doc.metadata.source)}-${data.id}`,
@ -142,4 +160,93 @@ function generateChunkSource(repo, doc, encryptionWorker) {
)}`;
}
/**
 * Renders a GitLab issue object (as returned by fetchIssues) as a markdown
 * document: title, description, a metadata bullet list, and an optional
 * Activity section built from the issue's discussion strings.
 * @param {Object} issue - Issue from the GitLab API with a `discussions` array attached.
 * @returns {string} The issue formatted as markdown.
 */
function issueToMarkdown(issue) {
  const metadata = {};
  // User-valued fields are reduced to username(s).
  const userFields = ["author", "assignees", "closed_by"];
  const userToUsername = ({ username }) => username;
  for (const userField of userFields) {
    if (issue[userField]) {
      if (Array.isArray(issue[userField])) {
        metadata[userField] = issue[userField].map(userToUsername);
      } else {
        metadata[userField] = userToUsername(issue[userField]);
      }
    }
  }
  // Scalar fields copied through verbatim (empty ones are filtered below).
  const singleValueFields = [
    "web_url",
    "state",
    "created_at",
    "updated_at",
    "closed_at",
    "due_date",
    "type",
    "merge_request_count",
    "upvotes",
    "downvotes",
    "labels",
    "has_tasks",
    "task_status",
    "confidential",
    "severity",
  ];
  for (const singleValueField of singleValueFields) {
    metadata[singleValueField] = issue[singleValueField];
  }
  if (issue.milestone) {
    metadata.milestone = `${issue.milestone.title} (${issue.milestone.id})`;
  }
  // Prefer the human-readable time strings (e.g. "1h 30m") over raw seconds.
  if (issue.time_stats) {
    const timeFields = ["time_estimate", "total_time_spent"];
    for (const timeField of timeFields) {
      const fieldName = `human_${timeField}`;
      if (issue?.time_stats[fieldName]) {
        metadata[timeField] = issue.time_stats[fieldName];
      }
    }
  }
  const metadataString = Object.entries(metadata)
    .map(([name, value]) => {
      // Drop empty/unset values (null, undefined, "", empty arrays).
      if (!value || value?.length < 1) {
        return null;
      }
      // replaceAll: field names like "total_time_spent" contain several
      // underscores; .replace("_", " ") would only convert the first one.
      let result = `- ${name.replaceAll("_", " ")}:`;
      if (!Array.isArray(value)) {
        result += ` ${value}`;
      } else {
        result += "\n" + value.map((s) => `  - ${s}`).join("\n");
      }
      return result;
    })
    .filter((item) => item != null)
    .join("\n");
  let markdown = `# ${issue.title} (${issue.iid})
${issue.description}
## Metadata
${metadataString}`;
  if (issue.discussions.length > 0) {
    // .flat() is a no-op on a flat string array but guards against nested
    // per-discussion arrays, which plain .join() would comma-join.
    markdown += `
## Activity
${issue.discussions.flat().join("\n\n")}
`;
  }
  return markdown;
}
module.exports = { loadGitlabRepo, fetchGitlabFile };

View File

@ -34,6 +34,7 @@ export default function GitlabOptions() {
accessToken: form.get("accessToken"),
branch: form.get("branch"),
ignorePaths: ignores,
fetchIssues: form.get("fetchIssues"),
});
if (!!error) {
@ -112,6 +113,30 @@ export default function GitlabOptions() {
onBlur={() => setSettings({ ...settings, accessToken })}
/>
</div>
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white font-bold text-sm flex gap-x-2 items-center">
<p className="font-bold text-white">Settings</p>{" "}
</label>
<p className="text-xs font-normal text-white/50">
Select additional entities to fetch from the GitLab API.
</p>
</div>
<div className="flex items-center gap-x-2">
<label className="relative inline-flex cursor-pointer items-center">
<input
type="checkbox"
name="fetchIssues"
value={true}
className="peer sr-only"
/>
<div className="pointer-events-none peer h-6 w-11 rounded-full bg-stone-400 after:absolute after:left-[2px] after:top-[2px] after:h-5 after:w-5 after:rounded-full after:shadow-xl after:border after:border-gray-600 after:bg-white after:box-shadow-md after:transition-all after:content-[''] peer-checked:bg-lime-300 peer-checked:after:translate-x-full peer-checked:after:border-white peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-blue-800"></div>
<span className="ml-3 text-sm font-medium text-white">
Fetch Issues as Documents
</span>
</label>
</div>
</div>
<GitLabBranchSelection
repo={settings.repo}
accessToken={settings.accessToken}

View File

@ -64,11 +64,23 @@ const DataConnector = {
return { branches: [], error: e.message };
});
},
collect: async function ({ repo, accessToken, branch, ignorePaths = [] }) {
collect: async function ({
repo,
accessToken,
branch,
ignorePaths = [],
fetchIssues = false,
}) {
return await fetch(`${API_BASE}/ext/gitlab/repo`, {
method: "POST",
headers: baseHeaders(),
body: JSON.stringify({ repo, accessToken, branch, ignorePaths }),
body: JSON.stringify({
repo,
accessToken,
branch,
ignorePaths,
fetchIssues,
}),
})
.then((res) => res.json())
.then((res) => {