mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-19 04:30:10 +01:00
1959 filetype filters (#2378)
* Updated the `GitHubRepoLoader` class to use the new import syntax and adjust the `recursiveLoader` method accordingly. * add @langchain/community to collector package.json * fix: Improve handling of complex ignore patterns in GitLabRepoLoader * refactor: use ignore package for simplified ignore logic * run yarn lint * add @langchain/community@^0.2.23 * remove unused dep lint --------- Co-authored-by: Emil Rofors (aider) <emirof@gmail.com>
This commit is contained in:
parent
b2123b13b0
commit
30645831a1
@ -15,6 +15,7 @@
|
||||
"lint": "yarn prettier --ignore-path ../.prettierignore --write ./processSingleFile ./processLink ./utils index.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"@langchain/community": "^0.2.23",
|
||||
"@xenova/transformers": "^2.11.0",
|
||||
"bcrypt": "^5.1.0",
|
||||
"body-parser": "^1.20.2",
|
||||
@ -30,7 +31,6 @@
|
||||
"mammoth": "^1.6.0",
|
||||
"mbox-parser": "^1.0.1",
|
||||
"mime": "^3.0.0",
|
||||
"minimatch": "5.1.0",
|
||||
"moment": "^2.29.4",
|
||||
"node-html-parser": "^6.1.13",
|
||||
"officeparser": "^4.0.5",
|
||||
|
@ -105,7 +105,7 @@ class GitHubRepoLoader {
|
||||
if (!this.ready) throw new Error("[Github Loader]: not in ready state!");
|
||||
const {
|
||||
GithubRepoLoader: LCGithubLoader,
|
||||
} = require("langchain/document_loaders/web/github");
|
||||
} = require("@langchain/community/document_loaders/web/github");
|
||||
|
||||
if (this.accessToken)
|
||||
console.log(
|
||||
@ -113,17 +113,16 @@ class GitHubRepoLoader {
|
||||
);
|
||||
|
||||
const loader = new LCGithubLoader(this.repo, {
|
||||
accessToken: this.accessToken,
|
||||
branch: this.branch,
|
||||
recursive: !!this.accessToken, // Recursive will hit rate limits.
|
||||
maxConcurrency: 5,
|
||||
unknown: "ignore",
|
||||
unknown: "warn",
|
||||
accessToken: this.accessToken,
|
||||
ignorePaths: this.ignorePaths,
|
||||
verbose: true,
|
||||
});
|
||||
|
||||
const docs = [];
|
||||
for await (const doc of loader.loadAsStream()) docs.push(doc);
|
||||
const docs = await loader.load();
|
||||
return docs;
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
const minimatch = require("minimatch");
|
||||
const ignore = require("ignore");
|
||||
|
||||
/**
|
||||
* @typedef {Object} RepoLoaderArgs
|
||||
@ -34,6 +34,7 @@ class GitLabRepoLoader {
|
||||
this.branch = args?.branch;
|
||||
this.accessToken = args?.accessToken || null;
|
||||
this.ignorePaths = args?.ignorePaths || [];
|
||||
this.ignoreFilter = ignore().add(this.ignorePaths);
|
||||
this.withIssues = args?.fetchIssues || false;
|
||||
|
||||
this.projectId = null;
|
||||
@ -137,7 +138,7 @@ class GitLabRepoLoader {
|
||||
console.log(`[Gitlab Loader]: Fetched ${files.length} files.`);
|
||||
|
||||
for (const file of files) {
|
||||
if (this.ignorePaths.some((path) => file.path.includes(path))) continue;
|
||||
if (this.ignoreFilter.ignores(file.path)) continue;
|
||||
|
||||
docs.push({
|
||||
pageContent: file.content,
|
||||
@ -216,13 +217,8 @@ class GitLabRepoLoader {
|
||||
// Fetch all the files that are not ignored in parallel.
|
||||
pagePromises = filesPage
|
||||
.filter((file) => {
|
||||
if (file.type !== "blob") {
|
||||
return false;
|
||||
}
|
||||
const isIgnored = this.ignorePaths.some((ignorePattern) =>
|
||||
minimatch(file.path, ignorePattern, { matchBase: true })
|
||||
);
|
||||
return !isIgnored;
|
||||
if (file.type !== "blob") return false;
|
||||
return !this.ignoreFilter.ignores(file.path);
|
||||
})
|
||||
.map(async (file) => {
|
||||
const content = await this.fetchSingleFileContents(file.path);
|
||||
|
@ -64,6 +64,23 @@
|
||||
resolved "https://registry.yarnpkg.com/@huggingface/jinja/-/jinja-0.2.2.tgz#faeb205a9d6995089bef52655ddd8245d3190627"
|
||||
integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==
|
||||
|
||||
"@langchain/community@^0.2.23":
|
||||
version "0.2.23"
|
||||
resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.2.23.tgz#20560e107bcc8432c42e499f1b9292d41a3732f2"
|
||||
integrity sha512-p1n/zZ1F+O5l51RzeoUeJyhpzq6Wp11tkqKOj8oThKOQJgLhO7q6iFIvmKThzL7mZCNNuPM5r1OPnU4wO6iF/A==
|
||||
dependencies:
|
||||
"@langchain/core" ">=0.2.16 <0.3.0"
|
||||
"@langchain/openai" ">=0.1.0 <0.3.0"
|
||||
binary-extensions "^2.2.0"
|
||||
expr-eval "^2.0.2"
|
||||
flat "^5.0.2"
|
||||
js-yaml "^4.1.0"
|
||||
langchain "~0.2.3"
|
||||
langsmith "~0.1.30"
|
||||
uuid "^10.0.0"
|
||||
zod "^3.22.3"
|
||||
zod-to-json-schema "^3.22.5"
|
||||
|
||||
"@langchain/community@~0.0.47":
|
||||
version "0.0.53"
|
||||
resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.0.53.tgz#a9aaedffa0ed2977e8d302d74e9f90a49a6da037"
|
||||
@ -78,6 +95,23 @@
|
||||
zod "^3.22.3"
|
||||
zod-to-json-schema "^3.22.5"
|
||||
|
||||
"@langchain/core@>=0.2.11 <0.3.0", "@langchain/core@>=0.2.16 <0.3.0":
|
||||
version "0.2.20"
|
||||
resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.2.20.tgz#5115781b0a86db3ce4b697e473405892c09621ca"
|
||||
integrity sha512-WPBjrzOj79/yqjloDUIw1GDhuRQfHis07TyyDj+qS81nHh0svSasetKcqAZ3L5JoPcBmEL7rRBtM+OcyC3mLVg==
|
||||
dependencies:
|
||||
ansi-styles "^5.0.0"
|
||||
camelcase "6"
|
||||
decamelize "1.2.0"
|
||||
js-tiktoken "^1.0.12"
|
||||
langsmith "~0.1.39"
|
||||
mustache "^4.2.0"
|
||||
p-queue "^6.6.2"
|
||||
p-retry "4"
|
||||
uuid "^10.0.0"
|
||||
zod "^3.22.4"
|
||||
zod-to-json-schema "^3.22.3"
|
||||
|
||||
"@langchain/core@~0.1", "@langchain/core@~0.1.56", "@langchain/core@~0.1.60":
|
||||
version "0.1.61"
|
||||
resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.1.61.tgz#9313363e04f1c6981a938b2909c44ce6fceb2736"
|
||||
@ -96,6 +130,17 @@
|
||||
zod "^3.22.4"
|
||||
zod-to-json-schema "^3.22.3"
|
||||
|
||||
"@langchain/openai@>=0.1.0 <0.3.0":
|
||||
version "0.2.5"
|
||||
resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.2.5.tgz#e85b983986a7415ea743d4c854bb0674134334d4"
|
||||
integrity sha512-gQXS5VBFyAco0jgSnUVan6fYVSIxlffmDaeDGpXrAmz2nQPgiN/h24KYOt2NOZ1zRheRzRuO/CfRagMhyVUaFA==
|
||||
dependencies:
|
||||
"@langchain/core" ">=0.2.16 <0.3.0"
|
||||
js-tiktoken "^1.0.12"
|
||||
openai "^4.49.1"
|
||||
zod "^3.22.4"
|
||||
zod-to-json-schema "^3.22.3"
|
||||
|
||||
"@langchain/openai@~0.0.28":
|
||||
version "0.0.28"
|
||||
resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.0.28.tgz#afaeec61b44816935db9ae937496c964c81ab571"
|
||||
@ -559,13 +604,6 @@ brace-expansion@^1.1.7:
|
||||
balanced-match "^1.0.0"
|
||||
concat-map "0.0.1"
|
||||
|
||||
brace-expansion@^2.0.1:
|
||||
version "2.0.1"
|
||||
resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-2.0.1.tgz#1edc459e0f0c548486ecf9fc99f2221364b9a0ae"
|
||||
integrity sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==
|
||||
dependencies:
|
||||
balanced-match "^1.0.0"
|
||||
|
||||
braces@~3.0.2:
|
||||
version "3.0.2"
|
||||
resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107"
|
||||
@ -1769,6 +1807,13 @@ js-tiktoken@^1.0.11, js-tiktoken@^1.0.7, js-tiktoken@^1.0.8:
|
||||
dependencies:
|
||||
base64-js "^1.5.1"
|
||||
|
||||
js-tiktoken@^1.0.12:
|
||||
version "1.0.12"
|
||||
resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.12.tgz#af0f5cf58e5e7318240d050c8413234019424211"
|
||||
integrity sha512-L7wURW1fH9Qaext0VzaUDpFGVQgjkdE3Dgsy9/+yXyGEpBKnylTd0mU0bfbNkKDlXRb6TEsZkwuflu1B8uQbJQ==
|
||||
dependencies:
|
||||
base64-js "^1.5.1"
|
||||
|
||||
js-tokens@^4.0.0:
|
||||
version "4.0.0"
|
||||
resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499"
|
||||
@ -1844,6 +1889,28 @@ langchain@0.1.36:
|
||||
zod "^3.22.4"
|
||||
zod-to-json-schema "^3.22.3"
|
||||
|
||||
langchain@~0.2.3:
|
||||
version "0.2.12"
|
||||
resolved "https://registry.yarnpkg.com/langchain/-/langchain-0.2.12.tgz#3fac0b9519a070689b6dd679d5854abc57824dcf"
|
||||
integrity sha512-ZHtJrHUpridZ7IQu7N/wAQ6iMAAO7VLzkupHqKP79S6p+alrPbn1BjRnh+PeGm92YiY5DafTCuvchmujxx7bCQ==
|
||||
dependencies:
|
||||
"@langchain/core" ">=0.2.11 <0.3.0"
|
||||
"@langchain/openai" ">=0.1.0 <0.3.0"
|
||||
"@langchain/textsplitters" "~0.0.0"
|
||||
binary-extensions "^2.2.0"
|
||||
js-tiktoken "^1.0.12"
|
||||
js-yaml "^4.1.0"
|
||||
jsonpointer "^5.0.1"
|
||||
langchainhub "~0.0.8"
|
||||
langsmith "~0.1.30"
|
||||
ml-distance "^4.0.0"
|
||||
openapi-types "^12.1.3"
|
||||
p-retry "4"
|
||||
uuid "^10.0.0"
|
||||
yaml "^2.2.1"
|
||||
zod "^3.22.4"
|
||||
zod-to-json-schema "^3.22.3"
|
||||
|
||||
langchainhub@~0.0.8:
|
||||
version "0.0.8"
|
||||
resolved "https://registry.yarnpkg.com/langchainhub/-/langchainhub-0.0.8.tgz#fd4b96dc795e22e36c1a20bad31b61b0c33d3110"
|
||||
@ -1860,6 +1927,18 @@ langsmith@~0.1.1, langsmith@~0.1.7:
|
||||
p-retry "4"
|
||||
uuid "^9.0.0"
|
||||
|
||||
langsmith@~0.1.30, langsmith@~0.1.39:
|
||||
version "0.1.40"
|
||||
resolved "https://registry.yarnpkg.com/langsmith/-/langsmith-0.1.40.tgz#9708889386a5b9d0eb43dd3a9eba93513b57101d"
|
||||
integrity sha512-11E2WLbh/+41+Qc0w8fJJTC/iz91BA+zXRMX/Wz0KSstnfzIPBoiWa++Kp2X8yCIDNywWWLJhy/B8gYzm7VKig==
|
||||
dependencies:
|
||||
"@types/uuid" "^9.0.1"
|
||||
commander "^10.0.1"
|
||||
p-queue "^6.6.2"
|
||||
p-retry "4"
|
||||
semver "^7.6.3"
|
||||
uuid "^9.0.0"
|
||||
|
||||
leac@^0.6.0:
|
||||
version "0.6.0"
|
||||
resolved "https://registry.yarnpkg.com/leac/-/leac-0.6.0.tgz#dcf136e382e666bd2475f44a1096061b70dc0912"
|
||||
@ -2082,13 +2161,6 @@ mimic-response@^3.1.0:
|
||||
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
|
||||
integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==
|
||||
|
||||
minimatch@5.1.0:
|
||||
version "5.1.0"
|
||||
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-5.1.0.tgz#1717b464f4971b144f6aabe8f2d0b8e4511e09c7"
|
||||
integrity sha512-9TPBGGak4nHfGZsPBohm9AWg6NoT7QTCehS3BIJABslyZbzxfV78QM2Y6+i741OPZIafFAaiiEMh5OyIrJPgtg==
|
||||
dependencies:
|
||||
brace-expansion "^2.0.1"
|
||||
|
||||
minimatch@^3.1.1, minimatch@^3.1.2:
|
||||
version "3.1.2"
|
||||
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b"
|
||||
@ -2417,6 +2489,19 @@ openai@^4.32.1:
|
||||
node-fetch "^2.6.7"
|
||||
web-streams-polyfill "^3.2.1"
|
||||
|
||||
openai@^4.49.1:
|
||||
version "4.54.0"
|
||||
resolved "https://registry.yarnpkg.com/openai/-/openai-4.54.0.tgz#eeb209c6892b997e524181b6ddb7e27bf4d09389"
|
||||
integrity sha512-e/12BdtTtj+tXs7iHm+Dm7H7WjEWnw7O52B2wSfCQ6lD5F6cvjzo7cANXy5TJ1Q3/qc8YRPT5wBTTFtP5sBp1g==
|
||||
dependencies:
|
||||
"@types/node" "^18.11.18"
|
||||
"@types/node-fetch" "^2.6.4"
|
||||
abort-controller "^3.0.0"
|
||||
agentkeepalive "^4.2.1"
|
||||
form-data-encoder "1.7.2"
|
||||
formdata-node "^4.3.2"
|
||||
node-fetch "^2.6.7"
|
||||
|
||||
openapi-types@^12.1.3:
|
||||
version "12.1.3"
|
||||
resolved "https://registry.yarnpkg.com/openapi-types/-/openapi-types-12.1.3.tgz#471995eb26c4b97b7bd356aacf7b91b73e777dd3"
|
||||
@ -2863,6 +2948,11 @@ semver@^7.3.5, semver@^7.5.4:
|
||||
dependencies:
|
||||
lru-cache "^6.0.0"
|
||||
|
||||
semver@^7.6.3:
|
||||
version "7.6.3"
|
||||
resolved "https://registry.yarnpkg.com/semver/-/semver-7.6.3.tgz#980f7b5550bc175fb4dc09403085627f9eb33143"
|
||||
integrity sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==
|
||||
|
||||
semver@~7.0.0:
|
||||
version "7.0.0"
|
||||
resolved "https://registry.yarnpkg.com/semver/-/semver-7.0.0.tgz#5f3ca35761e47e05b206c6daff2cf814f0316b8e"
|
||||
@ -3336,6 +3426,11 @@ utils-merge@1.0.1:
|
||||
resolved "https://registry.yarnpkg.com/utils-merge/-/utils-merge-1.0.1.tgz#9f95710f50a267947b2ccc124741c1028427e713"
|
||||
integrity sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==
|
||||
|
||||
uuid@^10.0.0:
|
||||
version "10.0.0"
|
||||
resolved "https://registry.yarnpkg.com/uuid/-/uuid-10.0.0.tgz#5a95aa454e6e002725c79055fd42aaba30ca6294"
|
||||
integrity sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==
|
||||
|
||||
uuid@^9.0.0:
|
||||
version "9.0.1"
|
||||
resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.1.tgz#e188d4c8853cc722220392c424cd637f32293f30"
|
||||
|
Loading…
Reference in New Issue
Block a user