diff --git a/.vscode/settings.json b/.vscode/settings.json index 450dd779..c8c7ea99 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,6 @@ { "cSpell.words": [ - "openai" + "openai", + "Weaviate" ] } \ No newline at end of file diff --git a/docker/.env.example b/docker/.env.example index 6b9791eb..77550b6f 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -32,6 +32,11 @@ PINECONE_INDEX= # Enable all below if you are using vector database: LanceDB. # VECTOR_DB="lancedb" +# Enable all below if you are using vector database: Weaviate. +# VECTOR_DB="weaviate" +# WEAVIATE_ENDPOINT="http://localhost:8080" +# WEAVIATE_API_KEY= + # CLOUD DEPLOYMENT VARIRABLES ONLY # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting. # NO_DEBUG="true" diff --git a/frontend/src/components/Modals/Settings/ExportImport/index.jsx b/frontend/src/components/Modals/Settings/ExportImport/index.jsx index 4099e8c0..e2245d53 100644 --- a/frontend/src/components/Modals/Settings/ExportImport/index.jsx +++ b/frontend/src/components/Modals/Settings/ExportImport/index.jsx @@ -7,7 +7,7 @@ import paths from "../../../../utils/paths"; const noop = () => false; export default function ExportOrImportData({ hideModal = noop }) { return ( -
+

diff --git a/frontend/src/components/Modals/Settings/LLMSelection/index.jsx b/frontend/src/components/Modals/Settings/LLMSelection/index.jsx index 94b75ace..2cf01435 100644 --- a/frontend/src/components/Modals/Settings/LLMSelection/index.jsx +++ b/frontend/src/components/Modals/Settings/LLMSelection/index.jsx @@ -37,7 +37,7 @@ export default function LLMSelection({ setHasChanges(!!error ? true : false); }; return ( -

+

@@ -59,7 +59,7 @@ export default function LLMSelection({

LLM providers

-
+
+

diff --git a/frontend/src/components/Modals/Settings/PasswordProtection/index.jsx b/frontend/src/components/Modals/Settings/PasswordProtection/index.jsx index 387c44bc..5e626912 100644 --- a/frontend/src/components/Modals/Settings/PasswordProtection/index.jsx +++ b/frontend/src/components/Modals/Settings/PasswordProtection/index.jsx @@ -41,7 +41,7 @@ export default function PasswordProtection({ }; return ( -

+

diff --git a/frontend/src/components/Modals/Settings/VectorDbs/index.jsx b/frontend/src/components/Modals/Settings/VectorDbs/index.jsx index c4ad0aec..b1a5a97b 100644 --- a/frontend/src/components/Modals/Settings/VectorDbs/index.jsx +++ b/frontend/src/components/Modals/Settings/VectorDbs/index.jsx @@ -3,6 +3,7 @@ import System from "../../../../models/system"; import ChromaLogo from "../../../../media/vectordbs/chroma.png"; import PineconeLogo from "../../../../media/vectordbs/pinecone.png"; import LanceDbLogo from "../../../../media/vectordbs/lancedb.png"; +import WeaviateLogo from "../../../../media/vectordbs/weaviate.png"; const noop = () => false; export default function VectorDBSelection({ @@ -37,7 +38,7 @@ export default function VectorDBSelection({ setHasChanges(!!error ? true : false); }; return ( -

+

@@ -59,7 +60,7 @@ export default function VectorDBSelection({

Vector database providers

-
+
+
)} + {vectorDB === "weaviate" && ( + <> +
+ + +
+
+ + +
+ + )}
diff --git a/frontend/src/components/Modals/Settings/index.jsx b/frontend/src/components/Modals/Settings/index.jsx index bdf8e6e5..f644c5e1 100644 --- a/frontend/src/components/Modals/Settings/index.jsx +++ b/frontend/src/components/Modals/Settings/index.jsx @@ -46,7 +46,7 @@ export default function SystemSettingsModal({ hideModal = noop }) { className="flex fixed top-0 left-0 right-0 w-full h-full" onClick={hideModal} /> -
+
diff --git a/frontend/src/media/vectordbs/weaviate.png b/frontend/src/media/vectordbs/weaviate.png new file mode 100644 index 00000000..d7980bf6 Binary files /dev/null and b/frontend/src/media/vectordbs/weaviate.png differ diff --git a/server/.env.example b/server/.env.example index e06c0f7e..606dd898 100644 --- a/server/.env.example +++ b/server/.env.example @@ -31,6 +31,12 @@ PINECONE_INDEX= # Enable all below if you are using vector database: LanceDB. # VECTOR_DB="lancedb" +# Enable all below if you are using vector database: Weaviate. +# VECTOR_DB="weaviate" +# WEAVIATE_ENDPOINT="http://localhost:8080" +# WEAVIATE_API_KEY= + + # CLOUD DEPLOYMENT VARIRABLES ONLY # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting. # STORAGE_DIR= # absolute filesystem path with no trailing slash diff --git a/server/endpoints/system.js b/server/endpoints/system.js index 98354f4a..01a367c7 100644 --- a/server/endpoints/system.js +++ b/server/endpoints/system.js @@ -60,6 +60,12 @@ function systemEndpoints(app) { ChromaEndpoint: process.env.CHROMA_ENDPOINT, } : {}), + ...(vectorDB === "weaviate" + ? { + WeaviateEndpoint: process.env.WEAVIATE_ENDPOINT, + WeaviateApiKey: process.env.WEAVIATE_API_KEY, + } + : {}), LLMProvider: llmProvider, ...(llmProvider === "openai" ? 
{ diff --git a/server/package.json b/server/package.json index 3d8ec2b8..e35eb69d 100644 --- a/server/package.json +++ b/server/package.json @@ -26,6 +26,7 @@ "dotenv": "^16.0.3", "express": "^4.18.2", "extract-zip": "^2.0.1", + "graphql": "^16.7.1", "jsonwebtoken": "^8.5.1", "langchain": "^0.0.90", "moment": "^2.29.4", @@ -38,7 +39,8 @@ "sqlite3": "^5.1.6", "uuid": "^9.0.0", "uuid-apikey": "^1.5.3", - "vectordb": "0.1.12" + "vectordb": "0.1.12", + "weaviate-ts-client": "^1.4.0" }, "devDependencies": { "nodemon": "^2.0.22", diff --git a/server/utils/helpers/camelcase.js b/server/utils/helpers/camelcase.js new file mode 100644 index 00000000..4a8e1b28 --- /dev/null +++ b/server/utils/helpers/camelcase.js @@ -0,0 +1,143 @@ +const UPPERCASE = /[\p{Lu}]/u; +const LOWERCASE = /[\p{Ll}]/u; +const LEADING_CAPITAL = /^[\p{Lu}](?![\p{Lu}])/gu; +const IDENTIFIER = /([\p{Alpha}\p{N}_]|$)/u; +const SEPARATORS = /[_.\- ]+/; + +const LEADING_SEPARATORS = new RegExp("^" + SEPARATORS.source); +const SEPARATORS_AND_IDENTIFIER = new RegExp( + SEPARATORS.source + IDENTIFIER.source, + "gu" +); +const NUMBERS_AND_IDENTIFIER = new RegExp("\\d+" + IDENTIFIER.source, "gu"); + +const preserveCamelCase = ( + string, + toLowerCase, + toUpperCase, + preserveConsecutiveUppercase +) => { + let isLastCharLower = false; + let isLastCharUpper = false; + let isLastLastCharUpper = false; + let isLastLastCharPreserved = false; + + for (let index = 0; index < string.length; index++) { + const character = string[index]; + isLastLastCharPreserved = index > 2 ? 
string[index - 3] === "-" : true; + + if (isLastCharLower && UPPERCASE.test(character)) { + string = string.slice(0, index) + "-" + string.slice(index); + isLastCharLower = false; + isLastLastCharUpper = isLastCharUpper; + isLastCharUpper = true; + index++; + } else if ( + isLastCharUpper && + isLastLastCharUpper && + LOWERCASE.test(character) && + (!isLastLastCharPreserved || preserveConsecutiveUppercase) + ) { + string = string.slice(0, index - 1) + "-" + string.slice(index - 1); + isLastLastCharUpper = isLastCharUpper; + isLastCharUpper = false; + isLastCharLower = true; + } else { + isLastCharLower = + toLowerCase(character) === character && + toUpperCase(character) !== character; + isLastLastCharUpper = isLastCharUpper; + isLastCharUpper = + toUpperCase(character) === character && + toLowerCase(character) !== character; + } + } + + return string; +}; + +const preserveConsecutiveUppercase = (input, toLowerCase) => { + LEADING_CAPITAL.lastIndex = 0; + + return input.replace(LEADING_CAPITAL, (m1) => toLowerCase(m1)); +}; + +const postProcess = (input, toUpperCase) => { + SEPARATORS_AND_IDENTIFIER.lastIndex = 0; + NUMBERS_AND_IDENTIFIER.lastIndex = 0; + + return input + .replace(SEPARATORS_AND_IDENTIFIER, (_, identifier) => + toUpperCase(identifier) + ) + .replace(NUMBERS_AND_IDENTIFIER, (m) => toUpperCase(m)); +}; + +function camelCase(input, options) { + if (!(typeof input === "string" || Array.isArray(input))) { + throw new TypeError("Expected the input to be `string | string[]`"); + } + + options = { + pascalCase: true, + preserveConsecutiveUppercase: false, + ...options, + }; + + if (Array.isArray(input)) { + input = input + .map((x) => x.trim()) + .filter((x) => x.length) + .join("-"); + } else { + input = input.trim(); + } + + if (input.length === 0) { + return ""; + } + + const toLowerCase = + options.locale === false + ? 
(string) => string.toLowerCase() + : (string) => string.toLocaleLowerCase(options.locale); + + const toUpperCase = + options.locale === false + ? (string) => string.toUpperCase() + : (string) => string.toLocaleUpperCase(options.locale); + + if (input.length === 1) { + if (SEPARATORS.test(input)) { + return ""; + } + + return options.pascalCase ? toUpperCase(input) : toLowerCase(input); + } + + const hasUpperCase = input !== toLowerCase(input); + + if (hasUpperCase) { + input = preserveCamelCase( + input, + toLowerCase, + toUpperCase, + options.preserveConsecutiveUppercase + ); + } + + input = input.replace(LEADING_SEPARATORS, ""); + input = options.preserveConsecutiveUppercase + ? preserveConsecutiveUppercase(input, toLowerCase) + : toLowerCase(input); + + if (options.pascalCase) { + input = toUpperCase(input.charAt(0)) + input.slice(1); + } + + return postProcess(input, toUpperCase); +} + +module.exports = { + camelCase, +}; diff --git a/server/utils/helpers/index.js b/server/utils/helpers/index.js index 5be56507..b7fb5ae0 100644 --- a/server/utils/helpers/index.js +++ b/server/utils/helpers/index.js @@ -10,6 +10,9 @@ function getVectorDbClass() { case "lancedb": const { LanceDb } = require("../vectorDbProviders/lance"); return LanceDb; + case "weaviate": + const { Weaviate } = require("../vectorDbProviders/weaviate"); + return Weaviate; default: throw new Error("ENV: No VECTOR_DB value found in environment!"); } diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index 64c91988..9f00ec42 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -39,6 +39,15 @@ const KEY_MAPPING = { envKey: "CHROMA_ENDPOINT", checks: [isValidURL, validChromaURL], }, + WeaviateEndpoint: { + envKey: "WEAVIATE_ENDPOINT", + checks: [isValidURL], + }, + WeaviateApiKey: { + envKey: "WEAVIATE_API_KEY", + checks: [], + }, + PineConeEnvironment: { envKey: "PINECONE_ENVIRONMENT", checks: [], @@ -103,7 +112,7 @@ function 
validOpenAIModel(input = "") { } function supportedVectorDB(input = "") { - const supported = ["chroma", "pinecone", "lancedb"]; + const supported = ["chroma", "pinecone", "lancedb", "weaviate"]; return supported.includes(input) ? null : `Invalid VectorDB type. Must be one of ${supported.join(", ")}.`; diff --git a/server/utils/vectorDbProviders/weaviate/WEAVIATE_SETUP.md b/server/utils/vectorDbProviders/weaviate/WEAVIATE_SETUP.md new file mode 100644 index 00000000..fc0acaec --- /dev/null +++ b/server/utils/vectorDbProviders/weaviate/WEAVIATE_SETUP.md @@ -0,0 +1,17 @@ +# How to set up a local (or cloud) Weaviate Vector Database + +[Get a Weaviate Cloud instance](https://weaviate.io/developers/weaviate/quickstart#create-an-instance). +[Set up Weaviate locally on Docker](https://weaviate.io/developers/weaviate/installation/docker-compose). + +Fill out the variables in the "Vector Database" tab of settings. Select Weaviate as your provider and fill out the appropriate fields +with the information from either of the above steps. + +### How to get started _Development mode only_ + +After setting up either the Weaviate cloud or local dockerized instance, you just need to set these variables in `.env.development` or define them at runtime via the UI. 
+ +``` +VECTOR_DB="weaviate" +WEAVIATE_ENDPOINT='http://localhost:8080' +WEAVIATE_API_KEY= # Optional +``` diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js new file mode 100644 index 00000000..884c08e0 --- /dev/null +++ b/server/utils/vectorDbProviders/weaviate/index.js @@ -0,0 +1,503 @@ +const { default: weaviate } = require("weaviate-ts-client"); +const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { storeVectorResult, cachedVectorInformation } = require("../../files"); +const { v4: uuidv4 } = require("uuid"); +const { toChunks, getLLMProvider } = require("../../helpers"); +const { chatPrompt } = require("../../chats"); +const { camelCase } = require("../../helpers/camelcase"); + +const Weaviate = { + name: "Weaviate", + connect: async function () { + if (process.env.VECTOR_DB !== "weaviate") + throw new Error("Weaviate::Invalid ENV settings"); + + const weaviateUrl = new URL(process.env.WEAVIATE_ENDPOINT); + const options = { + scheme: weaviateUrl.protocol?.replace(":", "") || "http", + host: weaviateUrl?.host, + ...(process.env?.WEAVIATE_API_KEY?.length > 0 + ? { apiKey: new weaviate.ApiKey(process.env?.WEAVIATE_API_KEY) } + : {}), + }; + const client = weaviate.client(options); + const isAlive = await await client.misc.liveChecker().do(); + if (!isAlive) + throw new Error( + "Weaviate::Invalid Alive signal received - is the service online?" 
+ ); + return { client }; + }, + heartbeat: async function () { + await this.connect(); + return { heartbeat: Number(new Date()) }; + }, + totalIndicies: async function () { + const { client } = await this.connect(); + const collectionNames = await this.allNamespaces(client); + var totalVectors = 0; + for (const name of collectionNames) { + totalVectors += await this.namespaceCountWithClient(client, name); + } + return totalVectors; + }, + namespaceCountWithClient: async function (client, namespace) { + try { + const response = await client.graphql + .aggregate() + .withClassName(camelCase(namespace)) + .withFields("meta { count }") + .do(); + return ( + response?.data?.Aggregate?.[camelCase(namespace)]?.[0]?.meta?.count || 0 + ); + } catch (e) { + console.error(`Weaviate:namespaceCountWithClient`, e.message); + return 0; + } + }, + namespaceCount: async function (namespace = null) { + try { + const { client } = await this.connect(); + const response = await client.graphql + .aggregate() + .withClassName(camelCase(namespace)) + .withFields("meta { count }") + .do(); + + return ( + response?.data?.Aggregate?.[camelCase(namespace)]?.[0]?.meta?.count || 0 + ); + } catch (e) { + console.error(`Weaviate:namespaceCountWithClient`, e.message); + return 0; + } + }, + similarityResponse: async function (client, namespace, queryVector) { + const result = { + contextTexts: [], + sourceDocuments: [], + }; + + const weaviateClass = await this.namespace(client, namespace); + const fields = weaviateClass.properties.map((prop) => prop.name).join(" "); + const queryResponse = await client.graphql + .get() + .withClassName(camelCase(namespace)) + .withFields(`${fields} _additional { id }`) + .withNearVector({ vector: queryVector }) + .withLimit(4) + .do(); + + const responses = queryResponse?.data?.Get?.[camelCase(namespace)]; + responses.forEach((response) => { + // In Weaviate we have to pluck id from _additional and spread it into the rest + // of the properties. 
+ const { + _additional: { id }, + ...rest + } = response; + result.contextTexts.push(rest.text); + result.sourceDocuments.push({ ...rest, id }); + }); + + return result; + }, + allNamespaces: async function (client) { + try { + const { classes = [] } = await client.schema.getter().do(); + return classes.map((classObj) => classObj.class); + } catch (e) { + console.error("Weaviate::AllNamespace", e); + return []; + } + }, + namespace: async function (client, namespace = null) { + if (!namespace) throw new Error("No namespace value provided."); + if (!(await this.namespaceExists(client, namespace))) return null; + + const weaviateClass = await client.schema + .classGetter() + .withClassName(camelCase(namespace)) + .do(); + + return { + ...weaviateClass, + vectorCount: await this.namespaceCount(namespace), + }; + }, + addVectors: async function (client, vectors = []) { + const response = { success: true, errors: new Set([]) }; + const results = await client.batch + .objectsBatcher() + .withObjects(...vectors) + .do(); + + results.forEach((res) => { + const { status, errors = [] } = res.result; + if (status === "SUCCESS" || errors.length === 0) return; + response.success = false; + response.errors.add(errors.error?.[0]?.message || null); + }); + + response.errors = [...response.errors]; + return response; + }, + hasNamespace: async function (namespace = null) { + if (!namespace) return false; + const { client } = await this.connect(); + const weaviateClasses = await this.allNamespaces(client); + return weaviateClasses.includes(camelCase(namespace)); + }, + namespaceExists: async function (client, namespace = null) { + if (!namespace) throw new Error("No namespace value provided."); + const weaviateClasses = await this.allNamespaces(client); + return weaviateClasses.includes(camelCase(namespace)); + }, + deleteVectorsInNamespace: async function (client, namespace = null) { + await client.schema.classDeleter().withClassName(camelCase(namespace)).do(); + return true; + }, 
+ addDocumentToNamespace: async function ( + namespace, + documentData = {}, + fullFilePath = null + ) { + const { DocumentVectors } = require("../../../models/vectors"); + try { + const { + pageContent, + docId, + id: _id, // Weaviate will abort if `id` is present in properties + ...metadata + } = documentData; + if (!pageContent || pageContent.length == 0) return false; + + console.log("Adding new vectorized document into namespace", namespace); + const cacheResult = await cachedVectorInformation(fullFilePath); + if (cacheResult.exists) { + const { client } = await this.connect(); + const weaviateClassExits = await this.hasNamespace(namespace); + if (!weaviateClassExits) { + await client.schema + .classCreator() + .withClass({ + class: camelCase(namespace), + description: `Class created by AnythingLLM named ${camelCase( + namespace + )}`, + vectorizer: "none", + }) + .do(); + } + + const { chunks } = cacheResult; + const documentVectors = []; + const vectors = []; + + for (const chunk of chunks) { + // Before sending to Weaviate and saving the records to our db + // we need to assign the id of each chunk that is stored in the cached file. + chunk.forEach((chunk) => { + const id = uuidv4(); + const flattenedMetadata = this.flattenObjectForWeaviate( + chunk.properties + ); + documentVectors.push({ docId, vectorId: id }); + const vectorRecord = { + id, + class: camelCase(namespace), + vector: chunk.vector || chunk.values || [], + properties: { ...flattenedMetadata }, + }; + vectors.push(vectorRecord); + }); + + const { success: additionResult, errors = [] } = + await this.addVectors(client, vectors); + if (!additionResult) { + console.error("Weaviate::addVectors failed to insert", errors); + throw new Error("Error embedding into Weaviate"); + } + } + + await DocumentVectors.bulkInsert(documentVectors); + return true; + } + + // If we are here then we are going to embed and store a novel document. 
+ // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments` + // because we then cannot atomically control our namespace to granularly find/remove documents + // from vectordb. + const textSplitter = new RecursiveCharacterTextSplitter({ + chunkSize: 1000, + chunkOverlap: 20, + }); + const textChunks = await textSplitter.splitText(pageContent); + + console.log("Chunks created from document:", textChunks.length); + const LLMConnector = getLLMProvider(); + const documentVectors = []; + const vectors = []; + const vectorValues = await LLMConnector.embedChunks(textChunks); + const submission = { + ids: [], + vectors: [], + properties: [], + }; + + if (!!vectorValues && vectorValues.length > 0) { + for (const [i, vector] of vectorValues.entries()) { + const flattenedMetadata = this.flattenObjectForWeaviate(metadata); + const vectorRecord = { + class: camelCase(namespace), + id: uuidv4(), + vector: vector, + // [DO NOT REMOVE] + // LangChain will be unable to find your text if you embed manually and dont include the `text` key. + // https://github.com/hwchase17/langchainjs/blob/5485c4af50c063e257ad54f4393fa79e0aff6462/langchain/src/vectorstores/weaviate.ts#L133 + properties: { ...flattenedMetadata, text: textChunks[i] }, + }; + + submission.ids.push(vectorRecord.id); + submission.vectors.push(vectorRecord.values); + submission.properties.push(metadata); + + vectors.push(vectorRecord); + documentVectors.push({ docId, vectorId: vectorRecord.id }); + } + } else { + console.error( + "Could not use OpenAI to embed document chunks! This document will not be recorded." 
+ ); + } + + const { client } = await this.connect(); + const weaviateClassExits = await this.hasNamespace(namespace); + if (!weaviateClassExits) { + await client.schema + .classCreator() + .withClass({ + class: camelCase(namespace), + description: `Class created by AnythingLLM named ${camelCase( + namespace + )}`, + vectorizer: "none", + }) + .do(); + } + + if (vectors.length > 0) { + const chunks = []; + for (const chunk of toChunks(vectors, 500)) chunks.push(chunk); + + console.log("Inserting vectorized chunks into Weaviate collection."); + const { success: additionResult, errors = [] } = await this.addVectors( + client, + vectors + ); + if (!additionResult) { + console.error("Weaviate::addVectors failed to insert", errors); + throw new Error("Error embedding into Weaviate"); + } + await storeVectorResult(chunks, fullFilePath); + } + + await DocumentVectors.bulkInsert(documentVectors); + return true; + } catch (e) { + console.error(e); + console.error("addDocumentToNamespace", e.message); + return false; + } + }, + deleteDocumentFromNamespace: async function (namespace, docId) { + const { DocumentVectors } = require("../../../models/vectors"); + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) return; + + const knownDocuments = await DocumentVectors.where(`docId = '${docId}'`); + if (knownDocuments.length === 0) return; + + for (const doc of knownDocuments) { + await client.data + .deleter() + .withClassName(camelCase(namespace)) + .withId(doc.vectorId) + .do(); + } + + const indexes = knownDocuments.map((doc) => doc.id); + await DocumentVectors.deleteIds(indexes); + return true; + }, + query: async function (reqBody = {}) { + const { namespace = null, input, workspace = {} } = reqBody; + if (!namespace || !input) throw new Error("Invalid request body"); + + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) { + return { + response: null, + sources: [], + message: 
"Invalid query - no documents found for workspace!", + }; + } + + const LLMConnector = getLLMProvider(); + const queryVector = await LLMConnector.embedTextInput(input); + const { contextTexts, sourceDocuments } = await this.similarityResponse( + client, + namespace, + queryVector + ); + + const prompt = { + role: "system", + content: `${chatPrompt(workspace)} + Context: + ${contextTexts + .map((text, i) => { + return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`; + }) + .join("")}`, + }; + const memory = [prompt, { role: "user", content: input }]; + const responseText = await LLMConnector.getChatCompletion(memory, { + temperature: workspace?.openAiTemp ?? 0.7, + }); + + return { + response: responseText, + sources: this.curateSources(sourceDocuments), + message: false, + }; + }, + // This implementation of chat uses the chat history and modifies the system prompt at execution + // this is improved over the regular langchain implementation so that chats do not directly modify embeddings + // because then multi-user support will have all conversations mutating the base vector collection to which then + // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs + chat: async function (reqBody = {}) { + const { + namespace = null, + input, + workspace = {}, + chatHistory = [], + } = reqBody; + if (!namespace || !input) throw new Error("Invalid request body"); + + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) { + return { + response: null, + sources: [], + message: "Invalid query - no documents found for workspace!", + }; + } + + const LLMConnector = getLLMProvider(); + const queryVector = await LLMConnector.embedTextInput(input); + const { contextTexts, sourceDocuments } = await this.similarityResponse( + client, + namespace, + queryVector + ); + const prompt = { + role: "system", + content: `${chatPrompt(workspace)} + Context: + ${contextTexts + 
.map((text, i) => { + return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`; + }) + .join("")}`, + }; + const memory = [prompt, ...chatHistory, { role: "user", content: input }]; + const responseText = await LLMConnector.getChatCompletion(memory, { + temperature: workspace?.openAiTemp ?? 0.7, + }); + + return { + response: responseText, + sources: this.curateSources(sourceDocuments), + message: false, + }; + }, + "namespace-stats": async function (reqBody = {}) { + const { namespace = null } = reqBody; + if (!namespace) throw new Error("namespace required"); + const { client } = await this.connect(); + const stats = await this.namespace(client, namespace); + return stats + ? stats + : { message: "No stats were able to be fetched from DB for namespace" }; + }, + "delete-namespace": async function (reqBody = {}) { + const { namespace = null } = reqBody; + const { client } = await this.connect(); + const details = await this.namespace(client, namespace); + await this.deleteVectorsInNamespace(client, namespace); + return { + message: `Namespace ${camelCase(namespace)} was deleted along with ${details?.vectorCount + } vectors.`, + }; + }, + reset: async function () { + const { client } = await this.connect(); + const weaviateClasses = await this.allNamespaces(client); + for (const weaviateClass of weaviateClasses) { + await client.schema.classDeleter().withClassName(weaviateClass).do(); + } + return { reset: true }; + }, + curateSources: function (sources = []) { + const documents = []; + for (const source of sources) { + if (Object.keys(source).length > 0) { + documents.push(source); + } + } + + return documents; + }, + flattenObjectForWeaviate: function (obj = {}) { + // Note this function is not generic, it is designed specifically for Weaviate + // https://weaviate.io/developers/weaviate/config-refs/datatypes#introduction + // Credit to LangchainJS + // 
https://github.com/hwchase17/langchainjs/blob/5485c4af50c063e257ad54f4393fa79e0aff6462/langchain/src/vectorstores/weaviate.ts#L11C1-L50C3 + const flattenedObject = {}; + + for (const key in obj) { + if (!Object.hasOwn(obj, key)) { + continue; + } + const value = obj[key]; + if (typeof obj[key] === "object" && !Array.isArray(value)) { + const recursiveResult = this.flattenObjectForWeaviate(value); + + for (const deepKey in recursiveResult) { + if (Object.hasOwn(obj, key)) { + flattenedObject[`${key}_${deepKey}`] = recursiveResult[deepKey]; + } + } + } else if (Array.isArray(value)) { + if ( + value.length > 0 && + typeof value[0] !== "object" && + // eslint-disable-next-line @typescript-eslint/no-explicit-any + value.every((el) => typeof el === typeof value[0]) + ) { + // Weaviate only supports arrays of primitive types, + // where all elements are of the same type + flattenedObject[key] = value; + } + } else { + flattenedObject[key] = value; + } + } + + return flattenedObject; + }, +}; + +module.exports.Weaviate = Weaviate; diff --git a/server/yarn.lock b/server/yarn.lock index cd1514e7..3b1caaa8 100644 --- a/server/yarn.lock +++ b/server/yarn.lock @@ -130,6 +130,11 @@ dependencies: googleapis-common "^6.0.3" +"@graphql-typed-document-node/core@^3.1.1": + version "3.2.0" + resolved "https://registry.yarnpkg.com/@graphql-typed-document-node/core/-/core-3.2.0.tgz#5f3d96ec6b2354ad6d8a28bf216a1d97b5426861" + integrity sha512-mB9oAsNCm9aM3/SOv4YtBMqZbYj10R7dkq8byBqxGY/ncFwhf2oQzMV+LCRlWoDSEBJ3COiR1yeDvMtsoOsuFQ== + "@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.10": version "1.0.11" resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa" @@ -916,6 +921,11 @@ extend@^3.0.2: resolved "https://registry.yarnpkg.com/extend/-/extend-3.0.2.tgz#f8b1136b4071fbd8eb140aff858b1019ec2915fa" integrity sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g== 
+extract-files@^9.0.0: + version "9.0.0" + resolved "https://registry.yarnpkg.com/extract-files/-/extract-files-9.0.0.tgz#8a7744f2437f81f5ed3250ed9f1550de902fe54a" + integrity sha512-CvdFfHkC95B4bBBk36hcEmvdR2awOdhhVUYH6S/zrVj3477zven/fJMYg7121h4T1xHZC+tetUpubpAhxwI7hQ== + extract-zip@^2.0.1: version "2.0.1" resolved "https://registry.yarnpkg.com/extract-zip/-/extract-zip-2.0.1.tgz#663dca56fe46df890d5f131ef4a06d22bb8ba13a" @@ -981,6 +991,15 @@ follow-redirects@^1.14.8: resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.15.2.tgz#b460864144ba63f2681096f274c4e57026da2c13" integrity sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA== +form-data@^3.0.0: + version "3.0.1" + resolved "https://registry.yarnpkg.com/form-data/-/form-data-3.0.1.tgz#ebd53791b78356a99af9a300d4282c4d5eb9755f" + integrity sha512-RHkBKtLWUVwd7SqRIvCZMEvAMoGUp0XU+seQiZejj0COz3RI3hWP4sCv3gZWWLjJTd7rGwcsF5eKZGii0r/hbg== + dependencies: + asynckit "^0.4.0" + combined-stream "^1.0.8" + mime-types "^2.1.12" + form-data@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/form-data/-/form-data-4.0.0.tgz#93919daeaf361ee529584b9b31664dc12c9fa452" @@ -1149,6 +1168,21 @@ graceful-fs@^4.2.0, graceful-fs@^4.2.6: resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.11.tgz#4183e4e8bf08bb6e05bbb2f7d2e0c8f712ca40e3" integrity sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ== +graphql-request@^5.1.0: + version "5.2.0" + resolved "https://registry.yarnpkg.com/graphql-request/-/graphql-request-5.2.0.tgz#a05fb54a517d91bb2d7aefa17ade4523dc5ebdca" + integrity sha512-pLhKIvnMyBERL0dtFI3medKqWOz/RhHdcgbZ+hMMIb32mEPa5MJSzS4AuXxfI4sRAu6JVVk5tvXuGfCWl9JYWQ== + dependencies: + "@graphql-typed-document-node/core" "^3.1.1" + cross-fetch "^3.1.5" + extract-files "^9.0.0" + form-data "^3.0.0" + +graphql@^16.7.1: + version "16.7.1" + resolved 
"https://registry.yarnpkg.com/graphql/-/graphql-16.7.1.tgz#11475b74a7bff2aefd4691df52a0eca0abd9b642" + integrity sha512-DRYR9tf+UGU0KOsMcKAlXeFfX89UiiIZ0dRU3mR0yJfu6OjZqUcp68NnFLnqQU5RexygFoDy1EW+ccOYcPfmHg== + gtoken@^6.1.0: version "6.1.2" resolved "https://registry.yarnpkg.com/gtoken/-/gtoken-6.1.2.tgz#aeb7bdb019ff4c3ba3ac100bbe7b6e74dce0e8bc" @@ -2507,6 +2541,15 @@ vectordb@0.1.12: "@apache-arrow/ts" "^12.0.0" apache-arrow "^12.0.0" +weaviate-ts-client@^1.4.0: + version "1.4.0" + resolved "https://registry.yarnpkg.com/weaviate-ts-client/-/weaviate-ts-client-1.4.0.tgz#e1adb670f2c1930a82601efb915b0131f6988b7e" + integrity sha512-G2V/IWMHXDjoJeATUYKkZXzAs7iRj4GE8B3AX59XDqMRW12X7VUkRgo4xWcHH1bjpLIHUYTzD5qZXcB8P9Hdmw== + dependencies: + graphql-request "^5.1.0" + isomorphic-fetch "^3.0.0" + uuid "^9.0.0" + webidl-conversions@^3.0.0: version "3.0.1" resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-3.0.1.tgz#24534275e2a7bc6be7bc86611cc16ae0a5654871"