From b2123b13b0038320e2aea40f9edc732df61fe25d Mon Sep 17 00:00:00 2001 From: Blazej Owczarczyk Date: Thu, 26 Sep 2024 20:45:18 +0200 Subject: [PATCH] =?UTF-8?q?Added=20an=20option=20to=20fetch=20issues=20fro?= =?UTF-8?q?m=20gitlab.=20Made=20the=20file=20fetching=20a=E2=80=A6=20(#233?= =?UTF-8?q?5)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added an option to fetch issues from gitlab. Made the file fetching asynchornous to improve performance. #2334 * Fixed a typo in loadGitlabRepo. * Convert issues to markdown. * Fixed an issue with time estimate field names in issueToMarkdown. * handle rate limits more gracefully + update checkbox to toggle switch * lint --------- Co-authored-by: Timothy Carambat Co-authored-by: shatfield4 --- .../RepoLoader/GitlabRepo/RepoLoader/index.js | 270 +++++++++++------- .../extensions/RepoLoader/GitlabRepo/index.js | 121 +++++++- .../Connectors/Gitlab/index.jsx | 25 ++ frontend/src/models/dataConnector.js | 16 +- 4 files changed, 325 insertions(+), 107 deletions(-) diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js index 755832298..9ebc3c0db 100644 --- a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js +++ b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js @@ -6,6 +6,7 @@ const minimatch = require("minimatch"); * @property {string} [branch] - The branch to load from (optional). * @property {string} [accessToken] - GitLab access token for authentication (optional). * @property {string[]} [ignorePaths] - Array of paths to ignore when loading (optional). + * @property {boolean} [fetchIssues] - Should issues be fetched (optional). */ /** @@ -33,6 +34,7 @@ class GitLabRepoLoader { this.branch = args?.branch; this.accessToken = args?.accessToken || null; this.ignorePaths = args?.ignorePaths || []; + this.withIssues = args?.fetchIssues || false; this.projectId = null; this.apiBase = "https://gitlab.com"; @@ -123,22 +125,44 @@ class GitLabRepoLoader { if (this.accessToken) console.log( - `[Gitlab Loader]: Access token set! Recursive loading enabled!` + `[Gitlab Loader]: Access token set! Recursive loading enabled for ${this.repo}!` ); - const files = await this.fetchFilesRecursive(); const docs = []; + console.log(`[Gitlab Loader]: Fetching files.`); + + const files = await this.fetchFilesRecursive(); + + console.log(`[Gitlab Loader]: Fetched ${files.length} files.`); + for (const file of files) { if (this.ignorePaths.some((path) => file.path.includes(path))) continue; - const content = await this.fetchSingleFileContents(file.path); - if (content) { - docs.push({ - pageContent: content, - metadata: { source: file.path }, - }); - } + docs.push({ + pageContent: file.content, + metadata: { + source: file.path, + url: `${this.repo}/-/blob/${this.branch}/${file.path}`, + }, + }); + } + + if (this.withIssues) { + console.log(`[Gitlab Loader]: Fetching issues.`); + const issues = await this.fetchIssues(); + console.log( + `[Gitlab Loader]: Fetched ${issues.length} issues with discussions.` + ); + docs.push( + ...issues.map((issue) => ({ + issue, + metadata: { + source: `issue-${this.repo}-${issue.iid}`, + url: issue.web_url, + }, + })) + ); } return docs; @@ -160,51 +184,14 @@ class GitLabRepoLoader { if (!this.#validGitlabUrl() || !this.projectId) return []; await this.#validateAccessToken(); this.branches = []; - let fetching = true; - let page = 1; - let perPage = 50; - while (fetching) { - try { - const params = new URLSearchParams({ - per_page: perPage, - page, - }); - const response = await fetch( - `${this.apiBase}/api/v4/projects/${ - this.projectId - }/repository/branches?${params.toString()}`, - { - method: "GET", - headers: { - Accepts: "application/json", - ...(this.accessToken - ? { "PRIVATE-TOKEN": this.accessToken } - : {}), - }, - } - ) - .then((res) => res.json()) - .then((branches) => { - if (!Array.isArray(branches) || branches.length === 0) { - fetching = false; - return []; - } - return branches.map((b) => b.name); - }) - .catch((e) => { - console.error(e); - fetching = false; - return []; - }); + const branchesRequestData = { + endpoint: `/api/v4/projects/${this.projectId}/repository/branches`, + }; - this.branches.push(...response); - page++; - } catch (err) { - console.log(`RepoLoader.getRepoBranches`, err); - fetching = false; - return []; - } + let branchesPage = []; + while ((branchesPage = await this.fetchNextPage(branchesRequestData))) { + this.branches.push(...branchesPage.map((branch) => branch.name)); } return this.#branchPrefSort(this.branches); } @@ -215,62 +202,96 @@ class GitLabRepoLoader { */ async fetchFilesRecursive() { const files = []; - let perPage = 100; - let fetching = true; - let page = 1; + const filesRequestData = { + endpoint: `/api/v4/projects/${this.projectId}/repository/tree`, + queryParams: { + ref: this.branch, + recursive: true, + }, + }; - while (fetching) { - try { - const params = new URLSearchParams({ - ref: this.branch, - recursive: true, - per_page: perPage, - page, - }); - const queryUrl = `${this.apiBase}/api/v4/projects/${ - this.projectId - }/repository/tree?${params.toString()}`; - const response = await fetch(queryUrl, { - method: "GET", - headers: this.accessToken - ? { "PRIVATE-TOKEN": this.accessToken } - : {}, - }); - const totalPages = Number(response.headers.get("x-total-pages")); - const nextPage = Number(response.headers.get("x-next-page")); - const data = await response.json(); - - /** @type {FileTreeObject[]} */ - const objects = Array.isArray(data) - ? data.filter((item) => item.type === "blob") - : []; // only get files, not paths or submodules - - // Apply ignore path rules to found objects. If any rules match it is an invalid file path. - console.log( - `Found ${objects.length} blobs from repo from pg ${page}/${totalPages}` - ); - for (const file of objects) { + let filesPage = null; + let pagePromises = []; + while ((filesPage = await this.fetchNextPage(filesRequestData))) { + // Fetch all the files that are not ignored in parallel. + pagePromises = filesPage + .filter((file) => { + if (file.type !== "blob") { + return false; + } const isIgnored = this.ignorePaths.some((ignorePattern) => minimatch(file.path, ignorePattern, { matchBase: true }) ); - if (!isIgnored) files.push(file); - } + return !isIgnored; + }) + .map(async (file) => { + const content = await this.fetchSingleFileContents(file.path); + if (!content) return null; + return { + path: file.path, + content, + }; + }); - if (page === totalPages) { - fetching = false; - break; - } + const pageFiles = await Promise.all(pagePromises); - page = Number(nextPage); - } catch (e) { - console.error(`RepoLoader.getRepositoryTree`, e); - fetching = false; - break; - } + files.push(...pageFiles.filter((item) => item !== null)); + console.log(`Fetched ${files.length} files.`); } + console.log(`Total files fetched: ${files.length}`); return files; } + /** + * Fetches all issues from the repository. + * @returns {Promise} An array of issue objects. + */ + async fetchIssues() { + const issues = []; + const issuesRequestData = { + endpoint: `/api/v4/projects/${this.projectId}/issues`, + }; + + let issuesPage = null; + let pagePromises = []; + while ((issuesPage = await this.fetchNextPage(issuesRequestData))) { + // Fetch all the issues in parallel. + pagePromises = issuesPage.map(async (issue) => { + const discussionsRequestData = { + endpoint: `/api/v4/projects/${this.projectId}/issues/${issue.iid}/discussions`, + }; + let discussionPage = null; + const discussions = []; + + while ( + (discussionPage = await this.fetchNextPage(discussionsRequestData)) + ) { + discussions.push( + ...discussionPage.map(({ notes }) => + notes.map( + ({ body, author, created_at }) => + `${author.username} at ${created_at}: +${body}` + ) + ) + ); + } + const result = { + ...issue, + discussions, + }; + return result; + }); + + const pageIssues = await Promise.all(pagePromises); + + issues.push(...pageIssues); + console.log(`Fetched ${issues.length} issues.`); + } + console.log(`Total issues fetched: ${issues.length}`); + return issues; + } + /** * Fetches the content of a single file from the repository. * @param {string} sourceFilePath - The path to the file in the repository. @@ -301,6 +322,59 @@ class GitLabRepoLoader { return null; } } + + /** + * Fetches the next page of data from the API. + * @param {Object} requestData - The request data. + * @returns {Promise|null>} The next page of data, or null if no more pages. + */ + async fetchNextPage(requestData) { + try { + if (requestData.page === -1) return null; + if (!requestData.page) requestData.page = 1; + + const { endpoint, perPage = 100, queryParams = {} } = requestData; + const params = new URLSearchParams({ + ...queryParams, + per_page: perPage, + page: requestData.page, + }); + const url = `${this.apiBase}${endpoint}?${params.toString()}`; + + const response = await fetch(url, { + method: "GET", + headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {}, + }); + + // Rate limits get hit very often if no PAT is provided + if (response.status === 401) { + console.warn(`Rate limit hit for ${endpoint}. Skipping.`); + return null; + } + + const totalPages = Number(response.headers.get("x-total-pages")); + const data = await response.json(); + if (!Array.isArray(data)) { + console.warn(`Unexpected response format for ${endpoint}:`, data); + return []; + } + + console.log( + `Gitlab RepoLoader: fetched ${endpoint} page ${requestData.page}/${totalPages} with ${data.length} records.` + ); + + if (totalPages === requestData.page) { + requestData.page = -1; + } else { + requestData.page = Number(response.headers.get("x-next-page")); + } + + return data; + } catch (e) { + console.error(`RepoLoader.fetchNextPage`, e); + return null; + } + } } module.exports = GitLabRepoLoader; diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js index e756463c7..f1c528f1d 100644 --- a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js +++ b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js @@ -50,13 +50,12 @@ async function loadGitlabRepo(args, response) { fs.mkdirSync(outFolderPath, { recursive: true }); for (const doc of docs) { - if (!doc.pageContent) continue; + if (!doc.metadata || (!doc.pageContent && !doc.issue)) continue; + let pageContent = null; + const data = { id: v4(), url: "gitlab://" + doc.metadata.source, - title: doc.metadata.source, - docAuthor: repo.author, - description: "No description found.", docSource: doc.metadata.source, chunkSource: generateChunkSource( repo, @@ -64,13 +63,32 @@ async function loadGitlabRepo(args, response) { response.locals.encryptionWorker ), published: new Date().toLocaleString(), - wordCount: doc.pageContent.split(" ").length, - pageContent: doc.pageContent, - token_count_estimate: tokenizeString(doc.pageContent).length, }; + + if (doc.pageContent) { + pageContent = doc.pageContent; + + data.title = doc.metadata.source; + data.docAuthor = repo.author; + data.description = "No description found."; + } else if (doc.issue) { + pageContent = issueToMarkdown(doc.issue); + + data.title = `Issue ${doc.issue.iid}: ${doc.issue.title}`; + data.docAuthor = doc.issue.author.username; + data.description = doc.issue.description; + } else { + continue; + } + + data.wordCount = pageContent.split(" ").length; + data.token_count_estimate = tokenizeString(pageContent).length; + data.pageContent = pageContent; + console.log( `[GitLab Loader]: Saving ${doc.metadata.source} to ${outFolder}` ); + writeToServerDocuments( data, `${slugify(doc.metadata.source)}-${data.id}`, @@ -142,4 +160,93 @@ function generateChunkSource(repo, doc, encryptionWorker) { )}`; } +function issueToMarkdown(issue) { + const metadata = {}; + + const userFields = ["author", "assignees", "closed_by"]; + const userToUsername = ({ username }) => username; + for (const userField of userFields) { + if (issue[userField]) { + if (Array.isArray(issue[userField])) { + metadata[userField] = issue[userField].map(userToUsername); + } else { + metadata[userField] = userToUsername(issue[userField]); + } + } + } + + const singleValueFields = [ + "web_url", + "state", + "created_at", + "updated_at", + "closed_at", + "due_date", + "type", + "merge_request_count", + "upvotes", + "downvotes", + "labels", + "has_tasks", + "task_status", + "confidential", + "severity", + ]; + + for (const singleValueField of singleValueFields) { + metadata[singleValueField] = issue[singleValueField]; + } + + if (issue.milestone) { + metadata.milestone = `${issue.milestone.title} (${issue.milestone.id})`; + } + + if (issue.time_stats) { + const timeFields = ["time_estimate", "total_time_spent"]; + for (const timeField of timeFields) { + const fieldName = `human_${timeField}`; + if (issue?.time_stats[fieldName]) { + metadata[timeField] = issue.time_stats[fieldName]; + } + } + } + + const metadataString = Object.entries(metadata) + .map(([name, value]) => { + if (!value || value?.length < 1) { + return null; + } + let result = `- ${name.replace("_", " ")}:`; + + if (!Array.isArray(value)) { + result += ` ${value}`; + } else { + result += "\n" + value.map((s) => ` - ${s}`).join("\n"); + } + + return result; + }) + .filter((item) => item != null) + .join("\n"); + + let markdown = `# ${issue.title} (${issue.iid}) + +${issue.description} + +## Metadata + +${metadataString}`; + + if (issue.discussions.length > 0) { + markdown += ` + +## Activity + +${issue.discussions.join("\n\n")} +`; + } + + return markdown; +} + module.exports = { loadGitlabRepo, fetchGitlabFile }; diff --git a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx index f3c34dc8a..265f2fe4b 100644 --- a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx +++ b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx @@ -34,6 +34,7 @@ export default function GitlabOptions() { accessToken: form.get("accessToken"), branch: form.get("branch"), ignorePaths: ignores, + fetchIssues: form.get("fetchIssues"), }); if (!!error) { @@ -112,6 +113,30 @@ export default function GitlabOptions() { onBlur={() => setSettings({ ...settings, accessToken })} /> +
+
+ +

+ Select additional entities to fetch from the GitLab API. +

+
+
+ +
+
res.json()) .then((res) => {