1959 filetype filters (#2378)

* Updated the `GitHubRepoLoader` class to use the new import syntax and adjust the `recursiveLoader` method accordingly.

* add @langchain/community to collector package.json

* fix: Improve handling of complex ignore patterns in GitLabRepoLoader

* refactor: use ignore package for simplified ignore logic

* run yarn lint

* add @langchain/community@^0.2.23

* remove unused dep
lint

---------

Co-authored-by: Emil Rofors (aider) <emirof@gmail.com>
This commit is contained in:
Timothy Carambat 2024-09-26 12:50:35 -07:00 committed by GitHub
parent b2123b13b0
commit 30645831a1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 119 additions and 29 deletions

View File

@ -15,6 +15,7 @@
"lint": "yarn prettier --ignore-path ../.prettierignore --write ./processSingleFile ./processLink ./utils index.js" "lint": "yarn prettier --ignore-path ../.prettierignore --write ./processSingleFile ./processLink ./utils index.js"
}, },
"dependencies": { "dependencies": {
"@langchain/community": "^0.2.23",
"@xenova/transformers": "^2.11.0", "@xenova/transformers": "^2.11.0",
"bcrypt": "^5.1.0", "bcrypt": "^5.1.0",
"body-parser": "^1.20.2", "body-parser": "^1.20.2",
@ -30,7 +31,6 @@
"mammoth": "^1.6.0", "mammoth": "^1.6.0",
"mbox-parser": "^1.0.1", "mbox-parser": "^1.0.1",
"mime": "^3.0.0", "mime": "^3.0.0",
"minimatch": "5.1.0",
"moment": "^2.29.4", "moment": "^2.29.4",
"node-html-parser": "^6.1.13", "node-html-parser": "^6.1.13",
"officeparser": "^4.0.5", "officeparser": "^4.0.5",

View File

@ -105,7 +105,7 @@ class GitHubRepoLoader {
if (!this.ready) throw new Error("[Github Loader]: not in ready state!"); if (!this.ready) throw new Error("[Github Loader]: not in ready state!");
const { const {
GithubRepoLoader: LCGithubLoader, GithubRepoLoader: LCGithubLoader,
} = require("langchain/document_loaders/web/github"); } = require("@langchain/community/document_loaders/web/github");
if (this.accessToken) if (this.accessToken)
console.log( console.log(
@ -113,17 +113,16 @@ class GitHubRepoLoader {
); );
const loader = new LCGithubLoader(this.repo, { const loader = new LCGithubLoader(this.repo, {
accessToken: this.accessToken,
branch: this.branch, branch: this.branch,
recursive: !!this.accessToken, // Recursive will hit rate limits. recursive: !!this.accessToken, // Recursive will hit rate limits.
maxConcurrency: 5, maxConcurrency: 5,
unknown: "ignore", unknown: "warn",
accessToken: this.accessToken,
ignorePaths: this.ignorePaths, ignorePaths: this.ignorePaths,
verbose: true, verbose: true,
}); });
const docs = []; const docs = await loader.load();
for await (const doc of loader.loadAsStream()) docs.push(doc);
return docs; return docs;
} }

View File

@ -1,4 +1,4 @@
const minimatch = require("minimatch"); const ignore = require("ignore");
/** /**
* @typedef {Object} RepoLoaderArgs * @typedef {Object} RepoLoaderArgs
@ -34,6 +34,7 @@ class GitLabRepoLoader {
this.branch = args?.branch; this.branch = args?.branch;
this.accessToken = args?.accessToken || null; this.accessToken = args?.accessToken || null;
this.ignorePaths = args?.ignorePaths || []; this.ignorePaths = args?.ignorePaths || [];
this.ignoreFilter = ignore().add(this.ignorePaths);
this.withIssues = args?.fetchIssues || false; this.withIssues = args?.fetchIssues || false;
this.projectId = null; this.projectId = null;
@ -137,7 +138,7 @@ class GitLabRepoLoader {
console.log(`[Gitlab Loader]: Fetched ${files.length} files.`); console.log(`[Gitlab Loader]: Fetched ${files.length} files.`);
for (const file of files) { for (const file of files) {
if (this.ignorePaths.some((path) => file.path.includes(path))) continue; if (this.ignoreFilter.ignores(file.path)) continue;
docs.push({ docs.push({
pageContent: file.content, pageContent: file.content,
@ -216,13 +217,8 @@ class GitLabRepoLoader {
// Fetch all the files that are not ignored in parallel. // Fetch all the files that are not ignored in parallel.
pagePromises = filesPage pagePromises = filesPage
.filter((file) => { .filter((file) => {
if (file.type !== "blob") { if (file.type !== "blob") return false;
return false; return !this.ignoreFilter.ignores(file.path);
}
const isIgnored = this.ignorePaths.some((ignorePattern) =>
minimatch(file.path, ignorePattern, { matchBase: true })
);
return !isIgnored;
}) })
.map(async (file) => { .map(async (file) => {
const content = await this.fetchSingleFileContents(file.path); const content = await this.fetchSingleFileContents(file.path);

View File

@ -64,6 +64,23 @@
resolved "https://registry.yarnpkg.com/@huggingface/jinja/-/jinja-0.2.2.tgz#faeb205a9d6995089bef52655ddd8245d3190627" resolved "https://registry.yarnpkg.com/@huggingface/jinja/-/jinja-0.2.2.tgz#faeb205a9d6995089bef52655ddd8245d3190627"
integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA== integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==
"@langchain/community@^0.2.23":
version "0.2.23"
resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.2.23.tgz#20560e107bcc8432c42e499f1b9292d41a3732f2"
integrity sha512-p1n/zZ1F+O5l51RzeoUeJyhpzq6Wp11tkqKOj8oThKOQJgLhO7q6iFIvmKThzL7mZCNNuPM5r1OPnU4wO6iF/A==
dependencies:
"@langchain/core" ">=0.2.16 <0.3.0"
"@langchain/openai" ">=0.1.0 <0.3.0"
binary-extensions "^2.2.0"
expr-eval "^2.0.2"
flat "^5.0.2"
js-yaml "^4.1.0"
langchain "~0.2.3"
langsmith "~0.1.30"
uuid "^10.0.0"
zod "^3.22.3"
zod-to-json-schema "^3.22.5"
"@langchain/community@~0.0.47": "@langchain/community@~0.0.47":
version "0.0.53" version "0.0.53"
resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.0.53.tgz#a9aaedffa0ed2977e8d302d74e9f90a49a6da037" resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.0.53.tgz#a9aaedffa0ed2977e8d302d74e9f90a49a6da037"
@ -78,6 +95,23 @@
zod "^3.22.3" zod "^3.22.3"
zod-to-json-schema "^3.22.5" zod-to-json-schema "^3.22.5"
"@langchain/core@>=0.2.11 <0.3.0", "@langchain/core@>=0.2.16 <0.3.0":
version "0.2.20"
resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.2.20.tgz#5115781b0a86db3ce4b697e473405892c09621ca"
integrity sha512-WPBjrzOj79/yqjloDUIw1GDhuRQfHis07TyyDj+qS81nHh0svSasetKcqAZ3L5JoPcBmEL7rRBtM+OcyC3mLVg==
dependencies:
ansi-styles "^5.0.0"
camelcase "6"
decamelize "1.2.0"
js-tiktoken "^1.0.12"
langsmith "~0.1.39"
mustache "^4.2.0"
p-queue "^6.6.2"
p-retry "4"
uuid "^10.0.0"
zod "^3.22.4"
zod-to-json-schema "^3.22.3"
"@langchain/core@~0.1", "@langchain/core@~0.1.56", "@langchain/core@~0.1.60": "@langchain/core@~0.1", "@langchain/core@~0.1.56", "@langchain/core@~0.1.60":
version "0.1.61" version "0.1.61"
resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.1.61.tgz#9313363e04f1c6981a938b2909c44ce6fceb2736" resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.1.61.tgz#9313363e04f1c6981a938b2909c44ce6fceb2736"
@ -96,6 +130,17 @@
zod "^3.22.4" zod "^3.22.4"
zod-to-json-schema "^3.22.3" zod-to-json-schema "^3.22.3"
"@langchain/openai@>=0.1.0 <0.3.0":
version "0.2.5"
resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.2.5.tgz#e85b983986a7415ea743d4c854bb0674134334d4"
integrity sha512-gQXS5VBFyAco0jgSnUVan6fYVSIxlffmDaeDGpXrAmz2nQPgiN/h24KYOt2NOZ1zRheRzRuO/CfRagMhyVUaFA==
dependencies:
"@langchain/core" ">=0.2.16 <0.3.0"
js-tiktoken "^1.0.12"
openai "^4.49.1"
zod "^3.22.4"
zod-to-json-schema "^3.22.3"
"@langchain/openai@~0.0.28": "@langchain/openai@~0.0.28":
version "0.0.28" version "0.0.28"
resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.0.28.tgz#afaeec61b44816935db9ae937496c964c81ab571" resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.0.28.tgz#afaeec61b44816935db9ae937496c964c81ab571"
@ -559,13 +604,6 @@ brace-expansion@^1.1.7:
balanced-match "^1.0.0" balanced-match "^1.0.0"
concat-map "0.0.1" concat-map "0.0.1"
brace-expansion@^2.0.1:
version "2.0.1"
resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-2.0.1.tgz#1edc459e0f0c548486ecf9fc99f2221364b9a0ae"
integrity sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==
dependencies:
balanced-match "^1.0.0"
braces@~3.0.2: braces@~3.0.2:
version "3.0.2" version "3.0.2"
resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107" resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107"
@ -1769,6 +1807,13 @@ js-tiktoken@^1.0.11, js-tiktoken@^1.0.7, js-tiktoken@^1.0.8:
dependencies: dependencies:
base64-js "^1.5.1" base64-js "^1.5.1"
js-tiktoken@^1.0.12:
version "1.0.12"
resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.12.tgz#af0f5cf58e5e7318240d050c8413234019424211"
integrity sha512-L7wURW1fH9Qaext0VzaUDpFGVQgjkdE3Dgsy9/+yXyGEpBKnylTd0mU0bfbNkKDlXRb6TEsZkwuflu1B8uQbJQ==
dependencies:
base64-js "^1.5.1"
js-tokens@^4.0.0: js-tokens@^4.0.0:
version "4.0.0" version "4.0.0"
resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499" resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499"
@ -1844,6 +1889,28 @@ langchain@0.1.36:
zod "^3.22.4" zod "^3.22.4"
zod-to-json-schema "^3.22.3" zod-to-json-schema "^3.22.3"
langchain@~0.2.3:
version "0.2.12"
resolved "https://registry.yarnpkg.com/langchain/-/langchain-0.2.12.tgz#3fac0b9519a070689b6dd679d5854abc57824dcf"
integrity sha512-ZHtJrHUpridZ7IQu7N/wAQ6iMAAO7VLzkupHqKP79S6p+alrPbn1BjRnh+PeGm92YiY5DafTCuvchmujxx7bCQ==
dependencies:
"@langchain/core" ">=0.2.11 <0.3.0"
"@langchain/openai" ">=0.1.0 <0.3.0"
"@langchain/textsplitters" "~0.0.0"
binary-extensions "^2.2.0"
js-tiktoken "^1.0.12"
js-yaml "^4.1.0"
jsonpointer "^5.0.1"
langchainhub "~0.0.8"
langsmith "~0.1.30"
ml-distance "^4.0.0"
openapi-types "^12.1.3"
p-retry "4"
uuid "^10.0.0"
yaml "^2.2.1"
zod "^3.22.4"
zod-to-json-schema "^3.22.3"
langchainhub@~0.0.8: langchainhub@~0.0.8:
version "0.0.8" version "0.0.8"
resolved "https://registry.yarnpkg.com/langchainhub/-/langchainhub-0.0.8.tgz#fd4b96dc795e22e36c1a20bad31b61b0c33d3110" resolved "https://registry.yarnpkg.com/langchainhub/-/langchainhub-0.0.8.tgz#fd4b96dc795e22e36c1a20bad31b61b0c33d3110"
@ -1860,6 +1927,18 @@ langsmith@~0.1.1, langsmith@~0.1.7:
p-retry "4" p-retry "4"
uuid "^9.0.0" uuid "^9.0.0"
langsmith@~0.1.30, langsmith@~0.1.39:
version "0.1.40"
resolved "https://registry.yarnpkg.com/langsmith/-/langsmith-0.1.40.tgz#9708889386a5b9d0eb43dd3a9eba93513b57101d"
integrity sha512-11E2WLbh/+41+Qc0w8fJJTC/iz91BA+zXRMX/Wz0KSstnfzIPBoiWa++Kp2X8yCIDNywWWLJhy/B8gYzm7VKig==
dependencies:
"@types/uuid" "^9.0.1"
commander "^10.0.1"
p-queue "^6.6.2"
p-retry "4"
semver "^7.6.3"
uuid "^9.0.0"
leac@^0.6.0: leac@^0.6.0:
version "0.6.0" version "0.6.0"
resolved "https://registry.yarnpkg.com/leac/-/leac-0.6.0.tgz#dcf136e382e666bd2475f44a1096061b70dc0912" resolved "https://registry.yarnpkg.com/leac/-/leac-0.6.0.tgz#dcf136e382e666bd2475f44a1096061b70dc0912"
@ -2082,13 +2161,6 @@ mimic-response@^3.1.0:
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9" resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ== integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==
minimatch@5.1.0:
version "5.1.0"
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-5.1.0.tgz#1717b464f4971b144f6aabe8f2d0b8e4511e09c7"
integrity sha512-9TPBGGak4nHfGZsPBohm9AWg6NoT7QTCehS3BIJABslyZbzxfV78QM2Y6+i741OPZIafFAaiiEMh5OyIrJPgtg==
dependencies:
brace-expansion "^2.0.1"
minimatch@^3.1.1, minimatch@^3.1.2: minimatch@^3.1.1, minimatch@^3.1.2:
version "3.1.2" version "3.1.2"
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b" resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b"
@ -2417,6 +2489,19 @@ openai@^4.32.1:
node-fetch "^2.6.7" node-fetch "^2.6.7"
web-streams-polyfill "^3.2.1" web-streams-polyfill "^3.2.1"
openai@^4.49.1:
version "4.54.0"
resolved "https://registry.yarnpkg.com/openai/-/openai-4.54.0.tgz#eeb209c6892b997e524181b6ddb7e27bf4d09389"
integrity sha512-e/12BdtTtj+tXs7iHm+Dm7H7WjEWnw7O52B2wSfCQ6lD5F6cvjzo7cANXy5TJ1Q3/qc8YRPT5wBTTFtP5sBp1g==
dependencies:
"@types/node" "^18.11.18"
"@types/node-fetch" "^2.6.4"
abort-controller "^3.0.0"
agentkeepalive "^4.2.1"
form-data-encoder "1.7.2"
formdata-node "^4.3.2"
node-fetch "^2.6.7"
openapi-types@^12.1.3: openapi-types@^12.1.3:
version "12.1.3" version "12.1.3"
resolved "https://registry.yarnpkg.com/openapi-types/-/openapi-types-12.1.3.tgz#471995eb26c4b97b7bd356aacf7b91b73e777dd3" resolved "https://registry.yarnpkg.com/openapi-types/-/openapi-types-12.1.3.tgz#471995eb26c4b97b7bd356aacf7b91b73e777dd3"
@ -2863,6 +2948,11 @@ semver@^7.3.5, semver@^7.5.4:
dependencies: dependencies:
lru-cache "^6.0.0" lru-cache "^6.0.0"
semver@^7.6.3:
version "7.6.3"
resolved "https://registry.yarnpkg.com/semver/-/semver-7.6.3.tgz#980f7b5550bc175fb4dc09403085627f9eb33143"
integrity sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==
semver@~7.0.0: semver@~7.0.0:
version "7.0.0" version "7.0.0"
resolved "https://registry.yarnpkg.com/semver/-/semver-7.0.0.tgz#5f3ca35761e47e05b206c6daff2cf814f0316b8e" resolved "https://registry.yarnpkg.com/semver/-/semver-7.0.0.tgz#5f3ca35761e47e05b206c6daff2cf814f0316b8e"
@ -3336,6 +3426,11 @@ utils-merge@1.0.1:
resolved "https://registry.yarnpkg.com/utils-merge/-/utils-merge-1.0.1.tgz#9f95710f50a267947b2ccc124741c1028427e713" resolved "https://registry.yarnpkg.com/utils-merge/-/utils-merge-1.0.1.tgz#9f95710f50a267947b2ccc124741c1028427e713"
integrity sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA== integrity sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==
uuid@^10.0.0:
version "10.0.0"
resolved "https://registry.yarnpkg.com/uuid/-/uuid-10.0.0.tgz#5a95aa454e6e002725c79055fd42aaba30ca6294"
integrity sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==
uuid@^9.0.0: uuid@^9.0.0:
version "9.0.1" version "9.0.1"
resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.1.tgz#e188d4c8853cc722220392c424cd637f32293f30" resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.1.tgz#e188d4c8853cc722220392c424cd637f32293f30"