From 30645831a16d7cb308ac462dfb91aa5b70b5e01f Mon Sep 17 00:00:00 2001 From: Timothy Carambat Date: Thu, 26 Sep 2024 12:50:35 -0700 Subject: [PATCH] 1959 filetype filters (#2378) * Updated the `GitHubRepoLoader` class to use the new import syntax and adjust the `recursiveLoader` method accordingly. * add @langchain/community to collector package.json * fix: Improve handling of complex ignore patterns in GitLabRepoLoader * refactor: use ignore package for simplified ignore logic * run yarn lint * add @langchain/community@^0.2.23 * remove unused dep lint --------- Co-authored-by: Emil Rofors (aider) --- collector/package.json | 2 +- .../RepoLoader/GithubRepo/RepoLoader/index.js | 9 +- .../RepoLoader/GitlabRepo/RepoLoader/index.js | 14 +- collector/yarn.lock | 123 ++++++++++++++++-- 4 files changed, 119 insertions(+), 29 deletions(-) diff --git a/collector/package.json b/collector/package.json index 217c52857..4ce85e68e 100644 --- a/collector/package.json +++ b/collector/package.json @@ -15,6 +15,7 @@ "lint": "yarn prettier --ignore-path ../.prettierignore --write ./processSingleFile ./processLink ./utils index.js" }, "dependencies": { + "@langchain/community": "^0.2.23", "@xenova/transformers": "^2.11.0", "bcrypt": "^5.1.0", "body-parser": "^1.20.2", @@ -30,7 +31,6 @@ "mammoth": "^1.6.0", "mbox-parser": "^1.0.1", "mime": "^3.0.0", - "minimatch": "5.1.0", "moment": "^2.29.4", "node-html-parser": "^6.1.13", "officeparser": "^4.0.5", diff --git a/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js index 935cab197..61f208742 100644 --- a/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js +++ b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js @@ -105,7 +105,7 @@ class GitHubRepoLoader { if (!this.ready) throw new Error("[Github Loader]: not in ready state!"); const { GithubRepoLoader: LCGithubLoader, - } = require("langchain/document_loaders/web/github"); + } = require("@langchain/community/document_loaders/web/github"); if (this.accessToken) console.log( @@ -113,17 +113,16 @@ class GitHubRepoLoader { ); const loader = new LCGithubLoader(this.repo, { - accessToken: this.accessToken, branch: this.branch, recursive: !!this.accessToken, // Recursive will hit rate limits. maxConcurrency: 5, - unknown: "ignore", + unknown: "warn", + accessToken: this.accessToken, ignorePaths: this.ignorePaths, verbose: true, }); - const docs = []; - for await (const doc of loader.loadAsStream()) docs.push(doc); + const docs = await loader.load(); return docs; } diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js index 9ebc3c0db..79afd5a96 100644 --- a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js +++ b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js @@ -1,4 +1,4 @@ -const minimatch = require("minimatch"); +const ignore = require("ignore"); /** * @typedef {Object} RepoLoaderArgs @@ -34,6 +34,7 @@ class GitLabRepoLoader { this.branch = args?.branch; this.accessToken = args?.accessToken || null; this.ignorePaths = args?.ignorePaths || []; + this.ignoreFilter = ignore().add(this.ignorePaths); this.withIssues = args?.fetchIssues || false; this.projectId = null; @@ -137,7 +138,7 @@ class GitLabRepoLoader { console.log(`[Gitlab Loader]: Fetched ${files.length} files.`); for (const file of files) { - if (this.ignorePaths.some((path) => file.path.includes(path))) continue; + if (this.ignoreFilter.ignores(file.path)) continue; docs.push({ pageContent: file.content, @@ -216,13 +217,8 @@ class GitLabRepoLoader { // Fetch all the files that are not ignored in parallel. pagePromises = filesPage .filter((file) => { - if (file.type !== "blob") { - return false; - } - const isIgnored = this.ignorePaths.some((ignorePattern) => - minimatch(file.path, ignorePattern, { matchBase: true }) - ); - return !isIgnored; + if (file.type !== "blob") return false; + return !this.ignoreFilter.ignores(file.path); }) .map(async (file) => { const content = await this.fetchSingleFileContents(file.path); diff --git a/collector/yarn.lock b/collector/yarn.lock index 6f8b49c74..2786692e0 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -64,6 +64,23 @@ resolved "https://registry.yarnpkg.com/@huggingface/jinja/-/jinja-0.2.2.tgz#faeb205a9d6995089bef52655ddd8245d3190627" integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA== +"@langchain/community@^0.2.23": + version "0.2.23" + resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.2.23.tgz#20560e107bcc8432c42e499f1b9292d41a3732f2" + integrity sha512-p1n/zZ1F+O5l51RzeoUeJyhpzq6Wp11tkqKOj8oThKOQJgLhO7q6iFIvmKThzL7mZCNNuPM5r1OPnU4wO6iF/A== + dependencies: + "@langchain/core" ">=0.2.16 <0.3.0" + "@langchain/openai" ">=0.1.0 <0.3.0" + binary-extensions "^2.2.0" + expr-eval "^2.0.2" + flat "^5.0.2" + js-yaml "^4.1.0" + langchain "~0.2.3" + langsmith "~0.1.30" + uuid "^10.0.0" + zod "^3.22.3" + zod-to-json-schema "^3.22.5" + "@langchain/community@~0.0.47": version "0.0.53" resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.0.53.tgz#a9aaedffa0ed2977e8d302d74e9f90a49a6da037" @@ -78,6 +95,23 @@ zod "^3.22.3" zod-to-json-schema "^3.22.5" +"@langchain/core@>=0.2.11 <0.3.0", "@langchain/core@>=0.2.16 <0.3.0": + version "0.2.20" + resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.2.20.tgz#5115781b0a86db3ce4b697e473405892c09621ca" + integrity sha512-WPBjrzOj79/yqjloDUIw1GDhuRQfHis07TyyDj+qS81nHh0svSasetKcqAZ3L5JoPcBmEL7rRBtM+OcyC3mLVg== + dependencies: + ansi-styles "^5.0.0" + camelcase "6" + decamelize "1.2.0" + js-tiktoken "^1.0.12" + langsmith "~0.1.39" + mustache "^4.2.0" + p-queue "^6.6.2" + p-retry "4" + uuid "^10.0.0" + zod "^3.22.4" + zod-to-json-schema "^3.22.3" + "@langchain/core@~0.1", "@langchain/core@~0.1.56", "@langchain/core@~0.1.60": version "0.1.61" resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.1.61.tgz#9313363e04f1c6981a938b2909c44ce6fceb2736" @@ -96,6 +130,17 @@ zod "^3.22.4" zod-to-json-schema "^3.22.3" +"@langchain/openai@>=0.1.0 <0.3.0": + version "0.2.5" + resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.2.5.tgz#e85b983986a7415ea743d4c854bb0674134334d4" + integrity sha512-gQXS5VBFyAco0jgSnUVan6fYVSIxlffmDaeDGpXrAmz2nQPgiN/h24KYOt2NOZ1zRheRzRuO/CfRagMhyVUaFA== + dependencies: + "@langchain/core" ">=0.2.16 <0.3.0" + js-tiktoken "^1.0.12" + openai "^4.49.1" + zod "^3.22.4" + zod-to-json-schema "^3.22.3" + "@langchain/openai@~0.0.28": version "0.0.28" resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.0.28.tgz#afaeec61b44816935db9ae937496c964c81ab571" @@ -559,13 +604,6 @@ brace-expansion@^1.1.7: balanced-match "^1.0.0" concat-map "0.0.1" -brace-expansion@^2.0.1: - version "2.0.1" - resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-2.0.1.tgz#1edc459e0f0c548486ecf9fc99f2221364b9a0ae" - integrity sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA== - dependencies: - balanced-match "^1.0.0" - braces@~3.0.2: version "3.0.2" resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107" @@ -1769,6 +1807,13 @@ js-tiktoken@^1.0.11, js-tiktoken@^1.0.7, js-tiktoken@^1.0.8: dependencies: base64-js "^1.5.1" +js-tiktoken@^1.0.12: + version "1.0.12" + resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.12.tgz#af0f5cf58e5e7318240d050c8413234019424211" + integrity sha512-L7wURW1fH9Qaext0VzaUDpFGVQgjkdE3Dgsy9/+yXyGEpBKnylTd0mU0bfbNkKDlXRb6TEsZkwuflu1B8uQbJQ== + dependencies: + base64-js "^1.5.1" + js-tokens@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499" @@ -1844,6 +1889,28 @@ langchain@0.1.36: zod "^3.22.4" zod-to-json-schema "^3.22.3" +langchain@~0.2.3: + version "0.2.12" + resolved "https://registry.yarnpkg.com/langchain/-/langchain-0.2.12.tgz#3fac0b9519a070689b6dd679d5854abc57824dcf" + integrity sha512-ZHtJrHUpridZ7IQu7N/wAQ6iMAAO7VLzkupHqKP79S6p+alrPbn1BjRnh+PeGm92YiY5DafTCuvchmujxx7bCQ== + dependencies: + "@langchain/core" ">=0.2.11 <0.3.0" + "@langchain/openai" ">=0.1.0 <0.3.0" + "@langchain/textsplitters" "~0.0.0" + binary-extensions "^2.2.0" + js-tiktoken "^1.0.12" + js-yaml "^4.1.0" + jsonpointer "^5.0.1" + langchainhub "~0.0.8" + langsmith "~0.1.30" + ml-distance "^4.0.0" + openapi-types "^12.1.3" + p-retry "4" + uuid "^10.0.0" + yaml "^2.2.1" + zod "^3.22.4" + zod-to-json-schema "^3.22.3" + langchainhub@~0.0.8: version "0.0.8" resolved "https://registry.yarnpkg.com/langchainhub/-/langchainhub-0.0.8.tgz#fd4b96dc795e22e36c1a20bad31b61b0c33d3110" @@ -1860,6 +1927,18 @@ langsmith@~0.1.1, langsmith@~0.1.7: p-retry "4" uuid "^9.0.0" +langsmith@~0.1.30, langsmith@~0.1.39: + version "0.1.40" + resolved "https://registry.yarnpkg.com/langsmith/-/langsmith-0.1.40.tgz#9708889386a5b9d0eb43dd3a9eba93513b57101d" + integrity sha512-11E2WLbh/+41+Qc0w8fJJTC/iz91BA+zXRMX/Wz0KSstnfzIPBoiWa++Kp2X8yCIDNywWWLJhy/B8gYzm7VKig== + dependencies: + "@types/uuid" "^9.0.1" + commander "^10.0.1" + p-queue "^6.6.2" + p-retry "4" + semver "^7.6.3" + uuid "^9.0.0" + leac@^0.6.0: version "0.6.0" resolved "https://registry.yarnpkg.com/leac/-/leac-0.6.0.tgz#dcf136e382e666bd2475f44a1096061b70dc0912" @@ -2082,13 +2161,6 @@ mimic-response@^3.1.0: resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9" integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ== -minimatch@5.1.0: - version "5.1.0" - resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-5.1.0.tgz#1717b464f4971b144f6aabe8f2d0b8e4511e09c7" - integrity sha512-9TPBGGak4nHfGZsPBohm9AWg6NoT7QTCehS3BIJABslyZbzxfV78QM2Y6+i741OPZIafFAaiiEMh5OyIrJPgtg== - dependencies: - brace-expansion "^2.0.1" - minimatch@^3.1.1, minimatch@^3.1.2: version "3.1.2" resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b" @@ -2417,6 +2489,19 @@ openai@^4.32.1: node-fetch "^2.6.7" web-streams-polyfill "^3.2.1" +openai@^4.49.1: + version "4.54.0" + resolved "https://registry.yarnpkg.com/openai/-/openai-4.54.0.tgz#eeb209c6892b997e524181b6ddb7e27bf4d09389" + integrity sha512-e/12BdtTtj+tXs7iHm+Dm7H7WjEWnw7O52B2wSfCQ6lD5F6cvjzo7cANXy5TJ1Q3/qc8YRPT5wBTTFtP5sBp1g== + dependencies: + "@types/node" "^18.11.18" + "@types/node-fetch" "^2.6.4" + abort-controller "^3.0.0" + agentkeepalive "^4.2.1" + form-data-encoder "1.7.2" + formdata-node "^4.3.2" + node-fetch "^2.6.7" + openapi-types@^12.1.3: version "12.1.3" resolved "https://registry.yarnpkg.com/openapi-types/-/openapi-types-12.1.3.tgz#471995eb26c4b97b7bd356aacf7b91b73e777dd3" @@ -2863,6 +2948,11 @@ semver@^7.3.5, semver@^7.5.4: dependencies: lru-cache "^6.0.0" +semver@^7.6.3: + version "7.6.3" + resolved "https://registry.yarnpkg.com/semver/-/semver-7.6.3.tgz#980f7b5550bc175fb4dc09403085627f9eb33143" + integrity sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A== + semver@~7.0.0: version "7.0.0" resolved "https://registry.yarnpkg.com/semver/-/semver-7.0.0.tgz#5f3ca35761e47e05b206c6daff2cf814f0316b8e" @@ -3336,6 +3426,11 @@ utils-merge@1.0.1: resolved "https://registry.yarnpkg.com/utils-merge/-/utils-merge-1.0.1.tgz#9f95710f50a267947b2ccc124741c1028427e713" integrity sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA== +uuid@^10.0.0: + version "10.0.0" + resolved "https://registry.yarnpkg.com/uuid/-/uuid-10.0.0.tgz#5a95aa454e6e002725c79055fd42aaba30ca6294" + integrity sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ== + uuid@^9.0.0: version "9.0.1" resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.1.tgz#e188d4c8853cc722220392c424cd637f32293f30"