Implement Chroma Support (#1)

This commit is contained in:
Timothy Carambat 2023-06-07 21:31:35 -07:00 committed by GitHub
parent 6b48e812c5
commit 6d01970df2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 1116 additions and 509 deletions

View File

@ -2,7 +2,7 @@
[![Twitter](https://img.shields.io/twitter/url/https/twitter.com/tim.svg?style=social&label=Follow%20%40Timothy%20Carambat)](https://twitter.com/tcarambat) [![](https://dcbadge.vercel.app/api/server/6UyHPeGZAC?compact=true&style=flat)](https://discord.gg/6UyHPeGZAC)
A full-stack application and tool suite that enables you to turn any document, resource, or piece of content into a piece of data that any LLM can use as reference during chatting. This application runs with very minimal overhead as by default the LLM and vectorDB are hosted remotely, but can be swapped for local instances. Currently this project supports Pinecone and OpenAI.
A full-stack application and tool suite that enables you to turn any document, resource, or piece of content into a piece of data that any LLM can use as reference during chatting. This application runs with minimal overhead because, by default, the LLM and vector database are hosted remotely, but both can be swapped for local instances. Currently this project supports Pinecone & ChromaDB for vector storage and OpenAI for chatting.
![Chatting](/images/screenshots/chat.png)
[view more screenshots](/images/screenshots/SCREENSHOTS.md)
@ -38,7 +38,7 @@ This monorepo consists of three main sections:
- `yarn` and `node` on your machine
- `python` 3.8+ for running scripts in `collector/`.
- access to an LLM like `GPT-3.5`, `GPT-4`*.
- a [Pinecone.io](https://pinecone.io) free account*.
- a [Pinecone.io](https://pinecone.io) free account* **or** a locally running Chroma instance.
*you can use drop in replacements for these. This is just the easiest to get up and running fast.
### How to get started

View File

@ -16,9 +16,8 @@ export default function DefaultChatContainer() {
const MESSAGES = [
<React.Fragment>
<div
className={`flex w-full mt-2 justify-start ${
popMsg ? "chat__message" : ""
}`}
className={`flex w-full mt-2 justify-start ${popMsg ? "chat__message" : ""
}`}
>
<div className="p-4 max-w-[75%] bg-orange-100 dark:bg-stone-700 rounded-b-2xl rounded-tr-2xl rounded-tl-sm">
<p className="text-slate-800 dark:text-slate-200 font-semibold">
@ -34,9 +33,8 @@ export default function DefaultChatContainer() {
<React.Fragment>
<div
className={`flex w-full mt-2 justify-start ${
popMsg ? "chat__message" : ""
}`}
className={`flex w-full mt-2 justify-start ${popMsg ? "chat__message" : ""
}`}
>
<div className="p-4 max-w-[75%] bg-orange-100 dark:bg-stone-700 rounded-b-2xl rounded-tr-2xl rounded-tl-sm">
<p className="text-slate-800 dark:text-slate-200 font-semibold">
@ -51,17 +49,16 @@ export default function DefaultChatContainer() {
<React.Fragment>
<div
className={`flex w-full mt-2 justify-start ${
popMsg ? "chat__message" : ""
}`}
className={`flex w-full mt-2 justify-start ${popMsg ? "chat__message" : ""
}`}
>
<div className="p-4 max-w-[75%] bg-orange-100 dark:bg-stone-700 rounded-b-2xl rounded-tr-2xl rounded-tl-sm">
<p className="text-slate-800 dark:text-slate-200 font-semibold">
AnythingLLM can run totally locally on your machine with little
overhead you wont even notice it's there! No GPU needed. Cloud and
on-premises installtion is available as well.
on-premises installation is available as well.
<br />
The AI tooling ecosytem gets more powerful everyday. AnythingLLM
The AI tooling ecosystem gets more powerful everyday. AnythingLLM
makes it easy to use.
</p>
<a
@ -79,9 +76,8 @@ export default function DefaultChatContainer() {
<React.Fragment>
<div
className={`flex w-full mt-2 justify-end ${
popMsg ? "chat__message" : ""
}`}
className={`flex w-full mt-2 justify-end ${popMsg ? "chat__message" : ""
}`}
>
<div className="p-4 max-w-[75%] bg-slate-200 dark:bg-amber-800 rounded-b-2xl rounded-tl-2xl rounded-tr-sm">
<p className="text-slate-800 dark:text-slate-200 font-semibold">
@ -93,9 +89,8 @@ export default function DefaultChatContainer() {
<React.Fragment>
<div
className={`flex w-full mt-2 justify-start ${
popMsg ? "chat__message" : ""
}`}
className={`flex w-full mt-2 justify-start ${popMsg ? "chat__message" : ""
}`}
>
<div className="p-4 max-w-[75%] bg-orange-100 dark:bg-stone-700 rounded-b-2xl rounded-tr-2xl rounded-tl-sm">
<p className="text-slate-800 dark:text-slate-200 font-semibold">
@ -122,14 +117,13 @@ export default function DefaultChatContainer() {
<React.Fragment>
<div
className={`flex w-full mt-2 justify-end ${
popMsg ? "chat__message" : ""
}`}
className={`flex w-full mt-2 justify-end ${popMsg ? "chat__message" : ""
}`}
>
<div className="p-4 max-w-[75%] bg-slate-200 dark:bg-amber-800 rounded-b-2xl rounded-tl-2xl rounded-tr-sm">
<p className="text-slate-800 dark:text-slate-200 font-semibold">
Is this like an AI dropbox or something? What about chatting? It is
a chatbot isnt it?
a chatbot isn't it?
</p>
</div>
</div>
@ -137,9 +131,8 @@ export default function DefaultChatContainer() {
<React.Fragment>
<div
className={`flex w-full mt-2 justify-start ${
popMsg ? "chat__message" : ""
}`}
className={`flex w-full mt-2 justify-start ${popMsg ? "chat__message" : ""
}`}
>
<div className="p-4 max-w-[75%] bg-orange-100 dark:bg-stone-700 rounded-b-2xl rounded-tr-2xl rounded-tl-sm">
<p className="text-slate-800 dark:text-slate-200 font-semibold">
@ -168,9 +161,8 @@ export default function DefaultChatContainer() {
<React.Fragment>
<div
className={`flex w-full mt-2 justify-end ${
popMsg ? "chat__message" : ""
}`}
className={`flex w-full mt-2 justify-end ${popMsg ? "chat__message" : ""
}`}
>
<div className="p-4 max-w-[75%] bg-slate-200 dark:bg-amber-800 rounded-b-2xl rounded-tl-2xl rounded-tr-sm">
<p className="text-slate-800 dark:text-slate-200 font-semibold">
@ -182,9 +174,8 @@ export default function DefaultChatContainer() {
<React.Fragment>
<div
className={`flex w-full mt-2 justify-start ${
popMsg ? "chat__message" : ""
}`}
className={`flex w-full mt-2 justify-start ${popMsg ? "chat__message" : ""
}`}
>
<div className="p-4 max-w-[75%] bg-orange-100 dark:bg-stone-700 rounded-b-2xl rounded-tr-2xl rounded-tl-sm">
<p className="text-slate-800 dark:text-slate-200 font-semibold">

View File

@ -74,20 +74,38 @@ export default function KeysModal({ hideModal = noop }) {
/>
<div className="h-[2px] w-full bg-gray-200 dark:bg-stone-600" />
<ShowKey
name="Pinecone DB API Key"
value={settings?.PineConeKey ? "*".repeat(20) : ""}
valid={!!settings?.PineConeKey}
/>
<ShowKey
name="Pinecone DB Environment"
value={settings?.PineConeEnvironment}
valid={!!settings?.PineConeEnvironment}
/>
<ShowKey
name="Pinecone DB Index"
value={settings?.PinceConeIndex}
valid={!!settings?.PinceConeIndex}
name="Vector DB Choice"
value={settings?.VectorDB}
valid={!!settings?.VectorDB}
/>
{settings?.VectorDB === "pinecone" && (
<>
<ShowKey
name="Pinecone DB API Key"
value={settings?.PineConeKey ? "*".repeat(20) : ""}
valid={!!settings?.PineConeKey}
/>
<ShowKey
name="Pinecone DB Environment"
value={settings?.PineConeEnvironment}
valid={!!settings?.PineConeEnvironment}
/>
<ShowKey
name="Pinecone DB Index"
value={settings?.PineConeIndex}
valid={!!settings?.PineConeIndex}
/>
</>
)}
{settings?.VectorDB === "chroma" && (
<>
<ShowKey
name="Chroma Endpoint"
value={settings?.ChromaEndpoint}
valid={!!settings?.ChromaEndpoint}
/>
</>
)}
</div>
)}
</div>

View File

@ -41,7 +41,7 @@ export default function ManageWorkspace({ hideModal = noop, workspace }) {
const deleteWorkspace = async () => {
if (
!window.confirm(
`You are about to delete your entire ${workspace.name} workspace. This will remove all vector embeddings on your vector database.\n\nThe original source files will remiain untouched. This action is irreversible.`
`You are about to delete your entire ${workspace.name} workspace. This will remove all vector embeddings on your vector database.\n\nThe original source files will remain untouched. This action is irreversible.`
)
)
return false;

View File

@ -2,6 +2,7 @@ import { memo, useEffect, useRef, useState } from "react";
import { AlertTriangle } from "react-feather";
import Jazzicon from "../../../../UserIcon";
import { decode as HTMLDecode } from "he";
import { v4 } from "uuid";
function PromptReply({
uuid,

View File

@ -6,6 +6,7 @@
"author": "Timothy Carambat (Mintplex Labs)",
"license": "MIT",
"scripts": {
"lint": "cd server && yarn lint && cd .. && cd frontend && yarn lint",
"setup": "cd server && yarn && cd .. && yarn setup:envs && echo \"Please run yarn dev:server and yarn dev:frontend in separate terminal tabs.\"",
"setup:envs": "cd server && cp -n .env.example .env.development && cd ../collector && cp -n .env.example .env && cd ..",
"dev:server": "cd server && yarn dev",

View File

@ -1,8 +1,15 @@
SERVER_PORT=5000
OPEN_AI_KEY=
OPEN_MODEL_PREF='gpt-3.5-turbo'
# AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
CACHE_VECTORS="true"
# Enable all below if you are using vector database: Chroma.
# VECTOR_DB="chroma"
# CHROMA_ENDPOINT='http://localhost:8000'
# Enable all below if you are using vector database: Pinecone.
VECTOR_DB="pinecone"
PINECONE_ENVIRONMENT=
PINECONE_API_KEY=
PINECONE_INDEX=
AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
CACHE_VECTORS="true"
PINECONE_INDEX=

View File

@ -1 +1 @@
v18.12.1
v18.13.0

View File

@ -1,13 +1,13 @@
const { reqBody } = require('../utils/http');
const { Workspace } = require('../models/workspace');
const { chatWithWorkspace } = require('../utils/chats');
const { reqBody } = require("../utils/http");
const { Workspace } = require("../models/workspace");
const { chatWithWorkspace } = require("../utils/chats");
function chatEndpoints(app) {
if (!app) return;
app.post('/workspace/:slug/chat', async (request, response) => {
const { slug } = request.params
const { message, mode = 'query' } = reqBody(request)
app.post("/workspace/:slug/chat", async (request, response) => {
const { slug } = request.params;
const { message, mode = "query" } = reqBody(request);
const workspace = await Workspace.get(`slug = '${slug}'`);
if (!workspace) {
response.sendStatus(400).end();
@ -16,8 +16,7 @@ function chatEndpoints(app) {
const result = await chatWithWorkspace(workspace, message, mode);
response.status(200).json({ ...result });
})
});
}
module.exports = { chatEndpoints }
module.exports = { chatEndpoints };

View File

@ -1,34 +1,46 @@
require('dotenv').config({ path: `.env.${process.env.NODE_ENV}` })
const { Pinecone } = require('../utils/pinecone');
const { viewLocalFiles } = require('../utils/files');
require("dotenv").config({ path: `.env.${process.env.NODE_ENV}` });
const { viewLocalFiles } = require("../utils/files");
const { getVectorDbClass } = require("../utils/helpers");
function systemEndpoints(app) {
if (!app) return;
app.get('/ping', (_, response) => {
app.get("/ping", (_, response) => {
response.sendStatus(200);
})
});
app.get('/setup-complete', (_, response) => {
app.get("/setup-complete", (_, response) => {
const vectorDB = process.env.VECTOR_DB || "pinecone";
const results = {
VectorDB: vectorDB,
OpenAiKey: !!process.env.OPEN_AI_KEY,
OpenAiModelPref: process.env.OPEN_MODEL_PREF || 'gpt-3.5-turbo',
PineConeEnvironment: process.env.PINECONE_ENVIRONMENT,
PineConeKey: !!process.env.PINECONE_API_KEY,
PinceConeIndex: process.env.PINECONE_INDEX,
}
response.status(200).json({ results })
})
OpenAiModelPref: process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo",
...(vectorDB === "pinecone"
? {
PineConeEnvironment: process.env.PINECONE_ENVIRONMENT,
PineConeKey: !!process.env.PINECONE_API_KEY,
PineConeIndex: process.env.PINECONE_INDEX,
}
: {}),
...(vectorDB === "chroma"
? {
ChromaEndpoint: process.env.CHROMA_ENDPOINT,
}
: {}),
};
response.status(200).json({ results });
});
app.get('/system-vectors', async (_, response) => {
const vectorCount = await Pinecone.totalIndicies();
response.status(200).json({ vectorCount })
})
app.get("/system-vectors", async (_, response) => {
const VectorDb = getVectorDbClass();
const vectorCount = await VectorDb.totalIndicies();
response.status(200).json({ vectorCount });
});
app.get('/local-files', async (_, response) => {
const localFiles = await viewLocalFiles()
response.status(200).json({ localFiles })
})
app.get("/local-files", async (_, response) => {
const localFiles = await viewLocalFiles();
response.status(200).json({ localFiles });
});
}
module.exports = { systemEndpoints }
module.exports = { systemEndpoints };

View File

@ -1,21 +1,21 @@
const { Pinecone } = require('../utils/pinecone');
const { reqBody } = require('../utils/http');
const { Workspace } = require('../models/workspace');
const { Document } = require('../models/documents');
const { DocumentVectors } = require('../models/vectors');
const { WorkspaceChats } = require('../models/workspaceChats');
const { convertToChatHistory } = require('../utils/chats');
const { reqBody } = require("../utils/http");
const { Workspace } = require("../models/workspace");
const { Document } = require("../models/documents");
const { DocumentVectors } = require("../models/vectors");
const { WorkspaceChats } = require("../models/workspaceChats");
const { convertToChatHistory } = require("../utils/chats");
const { getVectorDbClass } = require("../utils/helpers");
function workspaceEndpoints(app) {
if (!app) return;
app.post('/workspace/new', async (request, response) => {
app.post("/workspace/new", async (request, response) => {
const { name = null } = reqBody(request);
const { workspace, message } = await Workspace.new(name);
response.status(200).json({ workspace, message })
})
response.status(200).json({ workspace, message });
});
app.post('/workspace/:slug/update-embeddings', async (request, response) => {
app.post("/workspace/:slug/update-embeddings", async (request, response) => {
const { slug = null } = request.params;
const { adds = [], deletes = [] } = reqBody(request);
const currWorkspace = await Workspace.get(`slug = '${slug}'`);
@ -28,11 +28,12 @@ function workspaceEndpoints(app) {
await Document.removeDocuments(currWorkspace, deletes);
await Document.addDocuments(currWorkspace, adds);
const updatedWorkspace = await Workspace.get(`slug = '${slug}'`);
response.status(200).json({ workspace: updatedWorkspace })
})
response.status(200).json({ workspace: updatedWorkspace });
});
app.delete('/workspace/:slug', async (request, response) => {
const { slug = '' } = request.params
app.delete("/workspace/:slug", async (request, response) => {
const VectorDb = getVectorDbClass();
const { slug = "" } = request.params;
const workspace = await Workspace.get(`slug = '${slug}'`);
if (!workspace) {
@ -42,34 +43,38 @@ function workspaceEndpoints(app) {
await Workspace.delete(`slug = '${slug.toLowerCase()}'`);
await DocumentVectors.deleteForWorkspace(workspace.id);
await Document.delete(`workspaceId = ${Number(workspace.id)}`)
await WorkspaceChats.delete(`workspaceId = ${Number(workspace.id)}`)
try { await Pinecone['delete-namespace']({ namespace: slug }) } catch (e) { console.error(e.message) }
response.sendStatus(200).end()
})
await Document.delete(`workspaceId = ${Number(workspace.id)}`);
await WorkspaceChats.delete(`workspaceId = ${Number(workspace.id)}`);
try {
await VectorDb["delete-namespace"]({ namespace: slug });
} catch (e) {
console.error(e.message);
}
response.sendStatus(200).end();
});
app.get('/workspaces', async (_, response) => {
app.get("/workspaces", async (_, response) => {
const workspaces = await Workspace.where();
response.status(200).json({ workspaces })
})
response.status(200).json({ workspaces });
});
app.get('/workspace/:slug', async (request, response) => {
const { slug } = request.params
app.get("/workspace/:slug", async (request, response) => {
const { slug } = request.params;
const workspace = await Workspace.get(`slug = '${slug}'`);
response.status(200).json({ workspace })
})
response.status(200).json({ workspace });
});
app.get('/workspace/:slug/chats', async (request, response) => {
const { slug } = request.params
app.get("/workspace/:slug/chats", async (request, response) => {
const { slug } = request.params;
const workspace = await Workspace.get(`slug = '${slug}'`);
if (!workspace) {
response.sendStatus(400).end()
response.sendStatus(400).end();
return;
}
const history = await WorkspaceChats.forWorkspace(workspace.id)
response.status(200).json({ history: convertToChatHistory(history) })
})
const history = await WorkspaceChats.forWorkspace(workspace.id);
response.status(200).json({ history: convertToChatHistory(history) });
});
}
module.exports = { workspaceEndpoints }
module.exports = { workspaceEndpoints };

View File

@ -1,54 +1,62 @@
require('dotenv').config({ path: `.env.${process.env.NODE_ENV}` })
const express = require('express')
const bodyParser = require('body-parser')
const cors = require('cors');
const { validatedRequest } = require('./utils/middleware/validatedRequest');
const { Pinecone } = require('./utils/pinecone');
const { reqBody } = require('./utils/http');
const { systemEndpoints } = require('./endpoints/system');
const { workspaceEndpoints } = require('./endpoints/workspaces');
const { chatEndpoints } = require('./endpoints/chat');
require("dotenv").config({ path: `.env.${process.env.NODE_ENV}` });
const express = require("express");
const bodyParser = require("body-parser");
const cors = require("cors");
const { validatedRequest } = require("./utils/middleware/validatedRequest");
const { reqBody } = require("./utils/http");
const { systemEndpoints } = require("./endpoints/system");
const { workspaceEndpoints } = require("./endpoints/workspaces");
const { chatEndpoints } = require("./endpoints/chat");
const { getVectorDbClass } = require("./utils/helpers");
const app = express();
app.use(cors({ origin: true }));
app.use(validatedRequest);
app.use(bodyParser.text());
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({
extended: true
}));
app.use(
bodyParser.urlencoded({
extended: true,
})
);
systemEndpoints(app);
workspaceEndpoints(app);
chatEndpoints(app);
app.post('/v/:command', async (request, response) => {
const { command } = request.params
if (!Object.getOwnPropertyNames(Pinecone).includes(command)) {
response.status(500).json({ message: 'invalid interface command', commands: Object.getOwnPropertyNames(Pinecone.prototype) });
return
app.post("/v/:command", async (request, response) => {
const VectorDb = getVectorDbClass();
const { command } = request.params;
if (!Object.getOwnPropertyNames(VectorDb).includes(command)) {
response.status(500).json({
message: "invalid interface command",
commands: Object.getOwnPropertyNames(VectorDb),
});
return;
}
try {
const body = reqBody(request);
const resBody = await Pinecone[command](body)
const resBody = await VectorDb[command](body);
response.status(200).json({ ...resBody });
} catch (e) {
// console.error(e)
console.error(JSON.stringify(e))
console.error(JSON.stringify(e));
response.status(500).json({ error: e.message });
}
return;
})
});
app.all('*', function (_, response) {
app.all("*", function (_, response) {
response.sendStatus(404);
});
app.listen(process.env.SERVER_PORT || 5000, () => {
console.log(`Example app listening on port ${process.env.SERVER_PORT || 5000}`)
})
app
.listen(process.env.SERVER_PORT || 5000, () => {
console.log(
`Example app listening on port ${process.env.SERVER_PORT || 5000}`
);
})
.on("error", function (err) {
process.once("SIGUSR2", function () {
process.kill(process.pid, "SIGUSR2");
@ -56,4 +64,4 @@ app.listen(process.env.SERVER_PORT || 5000, () => {
process.on("SIGINT", function () {
process.kill(process.pid, "SIGINT");
});
});
});

View File

@ -1,8 +1,9 @@
const { fileData } = require('../utils/files');
const { v4: uuidv4 } = require('uuid');
const { fileData } = require("../utils/files");
const { v4: uuidv4 } = require("uuid");
const { getVectorDbClass } = require("../utils/helpers");
const Document = {
tablename: 'workspace_documents',
tablename: "workspace_documents",
colsInit: `
id INTEGER PRIMARY KEY AUTOINCREMENT,
docId TEXT NOT NULL UNIQUE,
@ -14,64 +15,82 @@ const Document = {
lastUpdatedAt TEXT DEFAULT CURRENT_TIMESTAMP
`,
db: async function () {
const sqlite3 = require('sqlite3').verbose();
const { open } = require('sqlite');
const sqlite3 = require("sqlite3").verbose();
const { open } = require("sqlite");
const db = await open({
filename: 'anythingllm.db',
driver: sqlite3.Database
})
filename: "anythingllm.db",
driver: sqlite3.Database,
});
await db.exec(`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`);
db.on('trace', (sql) => console.log(sql))
return db
await db.exec(
`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`
);
db.on("trace", (sql) => console.log(sql));
return db;
},
forWorkspace: async function (workspaceId = null) {
if (!workspaceId) return [];
return await this.where(`workspaceId = ${workspaceId}`);
},
delete: async function (clause = '') {
const db = await this.db()
await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`)
db.close()
return true
delete: async function (clause = "") {
const db = await this.db();
await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`);
db.close();
return true;
},
where: async function (clause = '', limit = null) {
const db = await this.db()
const results = await db.all(`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ''} ${!!limit ? `LIMIT ${limit}` : ''}`)
where: async function (clause = "", limit = null) {
const db = await this.db();
const results = await db.all(
`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ""} ${
!!limit ? `LIMIT ${limit}` : ""
}`
);
db.close()
return results
db.close();
return results;
},
firstWhere: async function (clause = '') {
firstWhere: async function (clause = "") {
const results = await this.where(clause);
return results.length > 0 ? results[0] : null
return results.length > 0 ? results[0] : null;
},
addDocuments: async function (workspace, additions = []) {
const { Pinecone } = require('../utils/pinecone');
const VectorDb = getVectorDbClass();
if (additions.length === 0) return;
const db = await this.db()
const stmt = await db.prepare(`INSERT INTO ${this.tablename} (docId, filename, docpath, workspaceId, metadata) VALUES (?,?,?,?,?)`)
const db = await this.db();
const stmt = await db.prepare(
`INSERT INTO ${this.tablename} (docId, filename, docpath, workspaceId, metadata) VALUES (?,?,?,?,?)`
);
for (const path of additions) {
const data = await fileData(path);
if (!data) continue;
const docId = uuidv4();
const { pageContent, ...metadata } = data
const { pageContent, ...metadata } = data;
const newDoc = {
docId,
filename: path.split('/')[1],
filename: path.split("/")[1],
docpath: path,
workspaceId: Number(workspace.id),
metadata: JSON.stringify(metadata)
}
const vectorized = await Pinecone.addDocumentToNamespace(workspace.slug, { ...data, docId }, path);
metadata: JSON.stringify(metadata),
};
const vectorized = await VectorDb.addDocumentToNamespace(
workspace.slug,
{ ...data, docId },
path
);
if (!vectorized) {
console.error('Failed to vectorize', path)
console.error("Failed to vectorize", path);
continue;
}
stmt.run([docId, newDoc.filename, newDoc.docpath, newDoc.workspaceId, newDoc.metadata])
stmt.run([
docId,
newDoc.filename,
newDoc.docpath,
newDoc.workspaceId,
newDoc.metadata,
]);
}
stmt.finalize();
db.close();
@ -79,21 +98,28 @@ const Document = {
return;
},
removeDocuments: async function (workspace, removals = []) {
const { Pinecone } = require('../utils/pinecone');
const VectorDb = getVectorDbClass();
if (removals.length === 0) return;
const db = await this.db()
const stmt = await db.prepare(`DELETE FROM ${this.tablename} WHERE docpath = ? AND workspaceId = ?`);
const db = await this.db();
const stmt = await db.prepare(
`DELETE FROM ${this.tablename} WHERE docpath = ? AND workspaceId = ?`
);
for (const path of removals) {
const document = await this.firstWhere(`docPath = '${path}' AND workspaceId = ${workspace.id}`)
const document = await this.firstWhere(
`docPath = '${path}' AND workspaceId = ${workspace.id}`
);
if (!document) continue;
await Pinecone.deleteDocumentFromNamespace(workspace.slug, document.docId);
stmt.run([path, workspace.id])
await VectorDb.deleteDocumentFromNamespace(
workspace.slug,
document.docId
);
stmt.run([path, workspace.id]);
}
stmt.finalize();
db.close();
return true;
}
}
},
};
module.exports = { Document }
module.exports = { Document };

View File

@ -1,10 +1,10 @@
const { Document } = require('./documents');
const { Document } = require("./documents");
// TODO: Do we want to store entire vectorized chunks in here
// so that we can easily spin up temp-namespace clones for threading
//
const DocumentVectors = {
tablename: 'document_vectors',
tablename: "document_vectors",
colsInit: `
id INTEGER PRIMARY KEY AUTOINCREMENT,
docId TEXT NOT NULL,
@ -13,51 +13,63 @@ const DocumentVectors = {
lastUpdatedAt TEXT DEFAULT CURRENT_TIMESTAMP
`,
db: async function () {
const sqlite3 = require('sqlite3').verbose();
const { open } = require('sqlite');
const sqlite3 = require("sqlite3").verbose();
const { open } = require("sqlite");
const db = await open({
filename: 'anythingllm.db',
driver: sqlite3.Database
})
filename: "anythingllm.db",
driver: sqlite3.Database,
});
await db.exec(`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`);
db.on('trace', (sql) => console.log(sql))
return db
await db.exec(
`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`
);
db.on("trace", (sql) => console.log(sql));
return db;
},
bulkInsert: async function (vectorRecords = []) {
if (vectorRecords.length === 0) return;
const db = await this.db();
const stmt = await db.prepare(`INSERT INTO ${this.tablename} (docId, vectorId) VALUES (?, ?)`);
const stmt = await db.prepare(
`INSERT INTO ${this.tablename} (docId, vectorId) VALUES (?, ?)`
);
for (const record of vectorRecords) {
const { docId, vectorId } = record
stmt.run([docId, vectorId])
const { docId, vectorId } = record;
stmt.run([docId, vectorId]);
}
stmt.finalize()
db.close()
stmt.finalize();
db.close();
return { documentsInserted: vectorRecords.length };
},
deleteForWorkspace: async function (workspaceId) {
const documents = await Document.forWorkspace(workspaceId);
const docIds = [...(new Set(documents.map((doc) => doc.docId)))];
const ids = (await this.where(`docId IN (${docIds.map((id) => `'${id}'`).join(',')})`)).map((doc) => doc.id)
await this.deleteIds(ids)
const docIds = [...new Set(documents.map((doc) => doc.docId))];
const ids = (
await this.where(`docId IN (${docIds.map((id) => `'${id}'`).join(",")})`)
).map((doc) => doc.id);
await this.deleteIds(ids);
return true;
},
where: async function (clause = '', limit = null) {
const db = await this.db()
const results = await db.all(`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ''} ${!!limit ? `LIMIT ${limit}` : ''}`)
where: async function (clause = "", limit = null) {
const db = await this.db();
const results = await db.all(
`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ""} ${
!!limit ? `LIMIT ${limit}` : ""
}`
);
db.close()
return results
db.close();
return results;
},
deleteIds: async function (ids = []) {
const db = await this.db()
await db.get(`DELETE FROM ${this.tablename} WHERE id IN (${ids.join(', ')}) `)
db.close()
return true
}
}
const db = await this.db();
await db.get(
`DELETE FROM ${this.tablename} WHERE id IN (${ids.join(", ")}) `
);
db.close();
return true;
},
};
module.exports = { DocumentVectors }
module.exports = { DocumentVectors };

View File

@ -1,8 +1,8 @@
const slugify = require('slugify');
const { Document } = require('./documents');
const slugify = require("slugify");
const { Document } = require("./documents");
const Workspace = {
tablename: 'workspaces',
tablename: "workspaces",
colsInit: `
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
@ -12,52 +12,66 @@ const Workspace = {
lastUpdatedAt TEXT DEFAULT CURRENT_TIMESTAMP
`,
db: async function () {
const sqlite3 = require('sqlite3').verbose();
const { open } = require('sqlite');
const sqlite3 = require("sqlite3").verbose();
const { open } = require("sqlite");
const db = await open({
filename: 'anythingllm.db',
driver: sqlite3.Database
})
filename: "anythingllm.db",
driver: sqlite3.Database,
});
await db.exec(`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`);
db.on('trace', (sql) => console.log(sql))
return db
await db.exec(
`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`
);
db.on("trace", (sql) => console.log(sql));
return db;
},
new: async function (name = null) {
if (!name) return { result: null, message: 'name cannot be null' };
if (!name) return { result: null, message: "name cannot be null" };
const db = await this.db()
const { id, success, message } = await db.run(`INSERT INTO ${this.tablename} (name, slug) VALUES (?, ?)`, [name, slugify(name, { lower: true })])
const db = await this.db();
const { id, success, message } = await db
.run(`INSERT INTO ${this.tablename} (name, slug) VALUES (?, ?)`, [
name,
slugify(name, { lower: true }),
])
.then((res) => {
return { id: res.lastID, success: true, message: null }
return { id: res.lastID, success: true, message: null };
})
.catch((error) => {
return { id: null, success: false, message: error.message }
})
if (!success) return { workspace: null, message }
return { id: null, success: false, message: error.message };
});
if (!success) return { workspace: null, message };
const workspace = await db.get(`SELECT * FROM ${this.tablename} WHERE id = ${id}`)
return { workspace, message: null }
const workspace = await db.get(
`SELECT * FROM ${this.tablename} WHERE id = ${id}`
);
return { workspace, message: null };
},
get: async function (clause = '') {
const db = await this.db()
const result = await db.get(`SELECT * FROM ${this.tablename} WHERE ${clause}`).then((res) => res || null)
get: async function (clause = "") {
const db = await this.db();
const result = await db
.get(`SELECT * FROM ${this.tablename} WHERE ${clause}`)
.then((res) => res || null);
if (!result) return null;
const documents = await Document.forWorkspace(result.id);
return { ...result, documents }
return { ...result, documents };
},
delete: async function (clause = '') {
const db = await this.db()
await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`)
return true
delete: async function (clause = "") {
const db = await this.db();
await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`);
return true;
},
where: async function (clause = '', limit = null) {
const db = await this.db()
const results = await db.all(`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ''} ${!!limit ? `LIMIT ${limit}` : ''}`)
return results
where: async function (clause = "", limit = null) {
const db = await this.db();
const results = await db.all(
`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ""} ${
!!limit ? `LIMIT ${limit}` : ""
}`
);
return results;
},
}
};
module.exports = { Workspace }
module.exports = { Workspace };

View File

@ -1,6 +1,5 @@
const WorkspaceChats = {
tablename: 'workspace_chats',
tablename: "workspace_chats",
colsInit: `
id INTEGER PRIMARY KEY AUTOINCREMENT,
workspaceId INTEGER NOT NULL,
@ -11,58 +10,79 @@ const WorkspaceChats = {
lastUpdatedAt TEXT DEFAULT CURRENT_TIMESTAMP
`,
db: async function () {
const sqlite3 = require('sqlite3').verbose();
const { open } = require('sqlite');
const sqlite3 = require("sqlite3").verbose();
const { open } = require("sqlite");
const db = await open({
filename: 'anythingllm.db',
driver: sqlite3.Database
})
filename: "anythingllm.db",
driver: sqlite3.Database,
});
await db.exec(`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`);
db.on('trace', (sql) => console.log(sql))
return db
await db.exec(
`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`
);
db.on("trace", (sql) => console.log(sql));
return db;
},
new: async function ({ workspaceId, prompt, response = {} }) {
const db = await this.db()
const { id, success, message } = await db.run(`INSERT INTO ${this.tablename} (workspaceId, prompt, response) VALUES (?, ?, ?)`, [workspaceId, prompt, JSON.stringify(response)])
const db = await this.db();
const { id, success, message } = await db
.run(
`INSERT INTO ${this.tablename} (workspaceId, prompt, response) VALUES (?, ?, ?)`,
[workspaceId, prompt, JSON.stringify(response)]
)
.then((res) => {
return { id: res.lastID, success: true, message: null }
return { id: res.lastID, success: true, message: null };
})
.catch((error) => {
return { id: null, success: false, message: error.message }
})
if (!success) return { chat: null, message }
return { id: null, success: false, message: error.message };
});
if (!success) return { chat: null, message };
const chat = await db.get(`SELECT * FROM ${this.tablename} WHERE id = ${id}`)
return { chat, message: null }
const chat = await db.get(
`SELECT * FROM ${this.tablename} WHERE id = ${id}`
);
return { chat, message: null };
},
forWorkspace: async function (workspaceId = null) {
if (!workspaceId) return [];
return await this.where(`workspaceId = ${workspaceId} AND include = true`, null, 'ORDER BY id ASC')
return await this.where(
`workspaceId = ${workspaceId} AND include = true`,
null,
"ORDER BY id ASC"
);
},
markHistoryInvalid: async function (workspaceId = null) {
if (!workspaceId) return;
const db = await this.db()
await db.run(`UPDATE ${this.tablename} SET include = false WHERE workspaceId = ?`, [workspaceId]);
const db = await this.db();
await db.run(
`UPDATE ${this.tablename} SET include = false WHERE workspaceId = ?`,
[workspaceId]
);
return;
},
get: async function (clause = '') {
const db = await this.db()
const result = await db.get(`SELECT * FROM ${this.tablename} WHERE ${clause}`).then((res) => res || null)
get: async function (clause = "") {
const db = await this.db();
const result = await db
.get(`SELECT * FROM ${this.tablename} WHERE ${clause}`)
.then((res) => res || null);
if (!result) return null;
return result
return result;
},
delete: async function (clause = '') {
const db = await this.db()
await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`)
return true
delete: async function (clause = "") {
const db = await this.db();
await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`);
return true;
},
where: async function (clause = '', limit = null, order = null) {
const db = await this.db()
const results = await db.all(`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ''} ${!!limit ? `LIMIT ${limit}` : ''} ${!!order ? order : ''}`)
return results
where: async function (clause = "", limit = null, order = null) {
const db = await this.db();
const results = await db.all(
`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ""} ${
!!limit ? `LIMIT ${limit}` : ""
} ${!!order ? order : ""}`
);
return results;
},
}
};
module.exports = { WorkspaceChats }
module.exports = { WorkspaceChats };

View File

@ -10,17 +10,19 @@
"node": ">=18.12.1"
},
"scripts": {
"dev": "NODE_ENV=development nodemon --ignore documents index.js",
"start": "NODE_ENV=production node index.js"
"dev": "NODE_ENV=development nodemon --ignore documents --ignore vector-cache --trace-warnings index.js",
"start": "NODE_ENV=production node index.js",
"lint": "yarn prettier --write ./endpoints ./models ./utils index.js"
},
"dependencies": {
"@googleapis/youtube": "^9.0.0",
"@pinecone-database/pinecone": "^0.1.6",
"body-parser": "^1.20.2",
"chromadb": "^1.5.2",
"cors": "^2.8.5",
"dotenv": "^16.0.3",
"express": "^4.18.2",
"langchain": "^0.0.81",
"langchain": "^0.0.90",
"moment": "^2.29.4",
"openai": "^3.2.1",
"pinecone-client": "^1.1.0",
@ -30,6 +32,7 @@
"uuid": "^9.0.0"
},
"devDependencies": {
"nodemon": "^2.0.22"
"nodemon": "^2.0.22",
"prettier": "^2.4.1"
}
}

View File

@ -4,8 +4,8 @@ async function resetMemory(workspace, _message, msgUUID) {
await WorkspaceChats.markHistoryInvalid(workspace.id);
return {
uuid: msgUUID,
type: 'textResponse',
textResponse: 'Workspace chat memory was reset!',
type: "textResponse",
textResponse: "Workspace chat memory was reset!",
sources: [],
close: true,
error: false,
@ -13,5 +13,5 @@ async function resetMemory(workspace, _message, msgUUID) {
}
module.exports = {
resetMemory
}
resetMemory,
};

View File

@ -1,50 +1,49 @@
const { v4: uuidv4 } = require('uuid');
const { OpenAi } = require('../openAi');
const { Pinecone } = require('../pinecone');
const { WorkspaceChats } = require('../../models/workspaceChats');
const { v4: uuidv4 } = require("uuid");
const { OpenAi } = require("../openAi");
const { WorkspaceChats } = require("../../models/workspaceChats");
const { resetMemory } = require("./commands/reset");
const moment = require('moment')
const moment = require("moment");
const { getVectorDbClass } = require("../helpers");
function convertToChatHistory(history = []) {
const formattedHistory = []
const formattedHistory = [];
history.forEach((history) => {
const { prompt, response, createdAt } = history
const { prompt, response, createdAt } = history;
const data = JSON.parse(response);
formattedHistory.push([
{
role: 'user',
role: "user",
content: prompt,
sentAt: moment(createdAt).unix(),
},
{
role: 'assistant',
role: "assistant",
content: data.text,
sources: data.sources || [],
sentAt: moment(createdAt).unix(),
},
])
})
]);
});
return formattedHistory.flat()
return formattedHistory.flat();
}
function convertToPromptHistory(history = []) {
const formattedHistory = []
const formattedHistory = [];
history.forEach((history) => {
const { prompt, response } = history
const { prompt, response } = history;
const data = JSON.parse(response);
formattedHistory.push([
{ role: 'user', content: prompt },
{ role: 'assistant', content: data.text },
])
})
return formattedHistory.flat()
{ role: "user", content: prompt },
{ role: "assistant", content: data.text },
]);
});
return formattedHistory.flat();
}
const VALID_COMMANDS = {
'/reset': resetMemory,
}
"/reset": resetMemory,
};
function grepCommand(message) {
const availableCommands = Object.keys(VALID_COMMANDS);
@ -57,52 +56,63 @@ function grepCommand(message) {
}
}
return null
return null;
}
async function chatWithWorkspace(workspace, message, chatMode = 'query') {
async function chatWithWorkspace(workspace, message, chatMode = "query") {
const uuid = uuidv4();
const openai = new OpenAi();
const VectorDb = getVectorDbClass();
const command = grepCommand(message);
const command = grepCommand(message)
if (!!command && Object.keys(VALID_COMMANDS).includes(command)) {
return await VALID_COMMANDS[command](workspace, message, uuid);
}
const { safe, reasons = [] } = await openai.isSafe(message)
const { safe, reasons = [] } = await openai.isSafe(message);
if (!safe) {
return {
id: uuid,
type: 'abort',
type: "abort",
textResponse: null,
sources: [],
close: true,
error: `This message was moderated and will not be allowed. Violations for ${reasons.join(', ')} found.`
error: `This message was moderated and will not be allowed. Violations for ${reasons.join(
", "
)} found.`,
};
}
const hasVectorizedSpace = await Pinecone.hasNamespace(workspace.slug);
const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
if (!hasVectorizedSpace) {
const rawHistory = await WorkspaceChats.forWorkspace(workspace.id)
const rawHistory = await WorkspaceChats.forWorkspace(workspace.id);
const chatHistory = convertToPromptHistory(rawHistory);
const response = await openai.sendChat(chatHistory, message);
const data = { text: response, sources: [], type: 'chat' }
const data = { text: response, sources: [], type: "chat" };
await WorkspaceChats.new({ workspaceId: workspace.id, prompt: message, response: data })
await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: message,
response: data,
});
return {
id: uuid,
type: 'textResponse',
type: "textResponse",
textResponse: response,
sources: [],
close: true,
error: null,
};
} else {
const { response, sources, message: error } = await Pinecone[chatMode]({ namespace: workspace.slug, input: message });
const {
response,
sources,
message: error,
} = await VectorDb[chatMode]({ namespace: workspace.slug, input: message });
if (!response) {
return {
id: uuid,
type: 'abort',
type: "abort",
textResponse: null,
sources: [],
close: true,
@ -110,11 +120,15 @@ async function chatWithWorkspace(workspace, message, chatMode = 'query') {
};
}
const data = { text: response, sources, type: chatMode }
await WorkspaceChats.new({ workspaceId: workspace.id, prompt: message, response: data })
const data = { text: response, sources, type: chatMode };
await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: message,
response: data,
});
return {
id: uuid,
type: 'textResponse',
type: "textResponse",
textResponse: response,
sources,
close: true,
@ -124,5 +138,5 @@ async function chatWithWorkspace(workspace, message, chatMode = 'query') {
}
module.exports = {
convertToChatHistory,
chatWithWorkspace
}
chatWithWorkspace,
};

View File

@ -0,0 +1,24 @@
# How to set up a local (or remote) Chroma Vector Database
[Official Chroma Docs](https://docs.trychroma.com/usage-guide#running-chroma-in-clientserver-mode) for reference.
### How to get started
**Requirements**
- Docker
- `git` available in your CLI/terminal
**Instructions**
- `git clone git@github.com:chroma-core/chroma.git` somewhere on your computer.
- `cd chroma`
- `docker-compose up -d --build`
- set the `CHROMA_ENDPOINT=` .env variable in `server` and also set `VECTOR_DB=` to `chroma`.
eg: `server/.env.development`
```
VECTOR_DB="chroma"
CHROMA_ENDPOINT='http://localhost:8000'
```

View File

@ -0,0 +1,361 @@
const { ChromaClient, OpenAIEmbeddingFunction } = require("chromadb");
const { Chroma: ChromaStore } = require("langchain/vectorstores/chroma");
const { OpenAI } = require("langchain/llms/openai");
const { ChatOpenAI } = require("langchain/chat_models/openai");
const {
VectorDBQAChain,
LLMChain,
RetrievalQAChain,
ConversationalRetrievalQAChain,
} = require("langchain/chains");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
// const { VectorStoreRetrieverMemory, BufferMemory } = require("langchain/memory");
// const { PromptTemplate } = require("langchain/prompts");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { storeVectorResult, cachedVectorInformation } = require("../files");
const { Configuration, OpenAIApi } = require("openai");
const { v4: uuidv4 } = require("uuid");
/**
 * Splits `arr` into consecutive slices of at most `size` elements.
 * The final slice holds whatever remains and may be shorter.
 *
 * @param {Array} arr - Items to partition (not mutated).
 * @param {number} size - Maximum number of items per slice.
 * @returns {Array<Array>} The slices, in original order.
 */
const toChunks = (arr, size) => {
  const chunkCount = Math.ceil(arr.length / size);
  const sliceAt = (_unused, index) => {
    const start = index * size;
    return arr.slice(start, start + size);
  };
  return Array.from({ length: chunkCount }, sliceAt);
};
/**
 * Reduces a list of source documents to one metadata record per title.
 * Sources whose metadata is missing or empty are dropped; for each distinct
 * `metadata.title` only the first occurrence is kept.
 *
 * @param {Iterable<{metadata?: object}>} sources - Source documents to curate.
 * @returns {object[]} Shallow copies of the retained metadata objects, in input order.
 */
function curateSources(sources = []) {
  const seenTitles = new Set();
  const curated = [];

  for (const { metadata = {} } of sources) {
    if (Object.keys(metadata).length === 0) continue;
    if (seenTitles.has(metadata.title)) continue;

    seenTitles.add(metadata.title);
    curated.push({ ...metadata });
  }

  return curated;
}
/**
 * Chroma vector database provider.
 *
 * A "namespace" in this provider maps to a Chroma collection of the same name.
 * The string-keyed methods ("namespace-stats", "delete-namespace") and `query`
 * accept a request-body style object so they can be dispatched by name.
 */
const Chroma = {
  name: "Chroma",

  /**
   * Creates a Chroma client and verifies the instance answers a heartbeat.
   *
   * @returns {Promise<{client: object}>} The connected client.
   * @throws {Error} When the heartbeat result is falsy.
   */
  connect: async function () {
    const client = new ChromaClient({
      path: process.env.CHROMA_ENDPOINT, // if not set will fallback to localhost:8000
    });

    const isAlive = await client.heartbeat();
    if (!isAlive)
      throw new Error(
        "ChromaDB::Invalid Heartbeat received - is the instance online?"
      );
    return { client };
  },

  /**
   * Connects and returns the raw heartbeat value reported by the instance.
   *
   * @returns {Promise<{heartbeat: *}>}
   */
  heartbeat: async function () {
    const { client } = await this.connect();
    return { heartbeat: await client.heartbeat() };
  },

  /**
   * Sums the vector counts of every collection on the instance.
   * Collections that cannot be fetched are skipped rather than failing the total.
   *
   * @returns {Promise<number>} Total number of vectors across collections.
   */
  totalIndicies: async function () {
    const { client } = await this.connect();
    const collections = await client.listCollections();
    let totalVectors = 0;
    for (const collectionObj of collections) {
      const collection = await client
        .getCollection({ name: collectionObj.name })
        .catch(() => null);
      if (!collection) continue;
      totalVectors += await collection.count();
    }
    return totalVectors;
  },

  /**
   * @returns {object} Chroma's OpenAI embedding function configured with OPEN_AI_KEY.
   */
  embeddingFunc: function () {
    return new OpenAIEmbeddingFunction({
      openai_api_key: process.env.OPEN_AI_KEY,
    });
  },

  /**
   * @returns {object} LangChain OpenAIEmbeddings instance configured with OPEN_AI_KEY.
   */
  embedder: function () {
    return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY });
  },

  /**
   * @returns {object} OpenAI SDK client configured with OPEN_AI_KEY.
   */
  openai: function () {
    const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY });
    return new OpenAIApi(config);
  },

  /**
   * @returns {object} LangChain completion LLM using OPEN_MODEL_PREF
   *   (default "gpt-3.5-turbo") at temperature 0.7.
   */
  llm: function () {
    const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
    return new OpenAI({
      openAIApiKey: process.env.OPEN_AI_KEY,
      temperature: 0.7,
      modelName: model,
    });
  },

  /**
   * @returns {object} LangChain chat LLM using OPEN_MODEL_PREF
   *   (default "gpt-3.5-turbo") at temperature 0.7.
   */
  chatLLM: function () {
    const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
    return new ChatOpenAI({
      openAIApiKey: process.env.OPEN_AI_KEY,
      temperature: 0.7,
      modelName: model,
    });
  },

  /**
   * Embeds a single text chunk with the "text-embedding-ada-002" model.
   *
   * @param {object} openai - Client exposing `createEmbedding`.
   * @param {string} textChunk - Text to embed.
   * @returns {Promise<number[]|null>} The embedding of the first result, or
   *   null when the response holds no result with an `embedding` key.
   */
  embedChunk: async function (openai, textChunk) {
    const {
      data: { data },
    } = await openai.createEmbedding({
      model: "text-embedding-ada-002",
      input: textChunk,
    });
    return data.length > 0 && data[0].hasOwnProperty("embedding")
      ? data[0].embedding
      : null;
  },

  /**
   * Fetches a collection and augments it with its vector count.
   *
   * @param {object} client - Connected Chroma client.
   * @param {string|null} namespace - Collection name.
   * @returns {Promise<object|null>} The collection's own properties plus
   *   `vectorCount`, or null when the collection cannot be fetched.
   * @throws {Error} When no namespace is provided.
   */
  namespace: async function (client, namespace = null) {
    if (!namespace) throw new Error("No namespace value provided.");
    const collection = await client
      .getCollection({ name: namespace })
      .catch(() => null);
    if (!collection) return null;

    return {
      ...collection,
      vectorCount: await collection.count(),
    };
  },

  /**
   * Connects and reports whether a collection exists for `namespace`.
   *
   * @param {string|null} namespace - Collection name.
   * @returns {Promise<boolean>} false when no namespace is given.
   */
  hasNamespace: async function (namespace = null) {
    if (!namespace) return false;
    const { client } = await this.connect();
    return await this.namespaceExists(client, namespace);
  },

  /**
   * Reports whether a collection can be fetched for `namespace`.
   * A failed lookup is logged and treated as "does not exist".
   *
   * @param {object} client - Connected Chroma client.
   * @param {string|null} namespace - Collection name.
   * @returns {Promise<boolean>}
   * @throws {Error} When no namespace is provided.
   */
  namespaceExists: async function (client, namespace = null) {
    if (!namespace) throw new Error("No namespace value provided.");
    const collection = await client
      .getCollection({ name: namespace })
      .catch((e) => {
        console.error("ChromaDB::namespaceExists", e.message);
        return null;
      });
    return !!collection;
  },

  /**
   * Deletes the whole collection backing `namespace`.
   *
   * @param {object} client - Connected Chroma client.
   * @param {string|null} namespace - Collection name.
   * @returns {Promise<true>}
   * @throws {Error} When no namespace is provided.
   */
  deleteVectorsInNamespace: async function (client, namespace = null) {
    // Guard like `namespace`/`namespaceExists` do instead of sending `name: null` to Chroma.
    if (!namespace) throw new Error("No namespace value provided.");
    await client.deleteCollection({ name: namespace });
    return true;
  },

  /**
   * Embeds a document and stores its vectors in the namespace's collection,
   * recording each vector id against `docId` via DocumentVectors.
   * When cached vectors exist for `fullFilePath` they are inserted directly
   * instead of re-embedding.
   *
   * @param {string} namespace - Collection name (created if missing).
   * @param {object} documentData - Must carry `pageContent` and `docId`; all
   *   remaining keys are stored as chunk metadata.
   * @param {string|null} fullFilePath - Key used to look up / store the vector cache.
   * @returns {Promise<boolean>} true on success; false when `pageContent` is
   *   empty or when any step throws (the error message is logged).
   */
  addDocumentToNamespace: async function (
    namespace,
    documentData = {},
    fullFilePath = null
  ) {
    const { DocumentVectors } = require("../../models/vectors");
    try {
      const { pageContent, docId, ...metadata } = documentData;
      if (!pageContent || pageContent.length == 0) return false;

      console.log("Adding new vectorized document into namespace", namespace);
      const cacheResult = await cachedVectorInformation(fullFilePath);
      if (cacheResult.exists) {
        const { client } = await this.connect();
        const collection = await client.getOrCreateCollection({
          name: namespace,
          metadata: { "hnsw:space": "cosine" },
          embeddingFunction: this.embeddingFunc(),
        });
        const { chunks } = cacheResult;
        const documentVectors = [];

        for (const chunk of chunks) {
          const submission = {
            ids: [],
            embeddings: [],
            metadatas: [],
            documents: [],
          };

          // Before sending to Chroma and saving the records to our db
          // we need to assign the id of each chunk that is stored in the cached file.
          chunk.forEach((chunk) => {
            const id = uuidv4();
            const { id: _id, ...metadata } = chunk.metadata;
            documentVectors.push({ docId, vectorId: id });
            submission.ids.push(id);
            submission.embeddings.push(chunk.values);
            submission.metadatas.push(metadata);
            submission.documents.push(metadata.text);
          });

          const additionResult = await collection.add(submission);
          if (!additionResult)
            throw new Error("Error embedding into ChromaDB");
        }

        await DocumentVectors.bulkInsert(documentVectors);
        return true;
      }

      // If we are here then we are going to embed and store a novel document.
      // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments`
      // because we then cannot atomically control our namespace to granularly find/remove documents
      // from vectordb.
      const textSplitter = new RecursiveCharacterTextSplitter({
        chunkSize: 1000,
        chunkOverlap: 20,
      });
      const textChunks = await textSplitter.splitText(pageContent);

      console.log("Chunks created from document:", textChunks.length);
      const documentVectors = [];
      const vectors = [];
      const openai = this.openai();

      const submission = {
        ids: [],
        embeddings: [],
        metadatas: [],
        documents: [],
      };

      for (const textChunk of textChunks) {
        const vectorValues = await this.embedChunk(openai, textChunk);

        if (!!vectorValues) {
          const vectorRecord = {
            id: uuidv4(),
            values: vectorValues,
            // [DO NOT REMOVE]
            // LangChain will be unable to find your text if you embed manually and dont include the `text` key.
            // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64
            metadata: { ...metadata, text: textChunk },
          };

          submission.ids.push(vectorRecord.id);
          submission.embeddings.push(vectorRecord.values);
          // NOTE(review): this stores metadata WITHOUT `text`, while the cached path above
          // stores metadata that includes `text` - confirm which shape is intended.
          submission.metadatas.push(metadata);
          submission.documents.push(textChunk);

          vectors.push(vectorRecord);
          documentVectors.push({ docId, vectorId: vectorRecord.id });
        } else {
          console.error(
            "Could not use OpenAI to embed document chunk! This document will not be recorded."
          );
        }
      }

      const { client } = await this.connect();
      const collection = await client.getOrCreateCollection({
        name: namespace,
        metadata: { "hnsw:space": "cosine" },
        embeddingFunction: this.embeddingFunc(),
      });

      if (vectors.length > 0) {
        console.log("Inserting vectorized chunks into Chroma collection.");
        // Only the cache file is chunked (groups of 500); the collection receives one submission.
        const chunks = toChunks(vectors, 500);

        const additionResult = await collection.add(submission);
        if (!additionResult)
          throw new Error("Error embedding into ChromaDB");

        await storeVectorResult(chunks, fullFilePath);
      }

      await DocumentVectors.bulkInsert(documentVectors);
      return true;
    } catch (e) {
      console.error("addDocumentToNamespace", e.message);
      return false;
    }
  },

  /**
   * Removes every vector recorded for `docId` from the namespace's collection
   * and deletes the matching DocumentVectors rows.
   *
   * @param {string} namespace - Collection name.
   * @param {string} docId - Document identifier used when the vectors were stored.
   * @returns {Promise<true|undefined>} true after a deletion; undefined when
   *   the namespace does not exist or no vectors are recorded for the document.
   */
  deleteDocumentFromNamespace: async function (namespace, docId) {
    const { DocumentVectors } = require("../../models/vectors");
    const { client } = await this.connect();
    if (!(await this.namespaceExists(client, namespace))) return;
    const collection = await client.getCollection({
      name: namespace,
      embeddingFunction: this.embeddingFunc(),
    });

    // NOTE(review): `docId` is interpolated into a SQL clause - verify it can never
    // carry user-controlled text, or give DocumentVectors.where a parameterized form.
    const knownDocuments = await DocumentVectors.where(`docId = '${docId}'`);
    if (knownDocuments.length === 0) return;

    const vectorIds = knownDocuments.map((doc) => doc.vectorId);
    await collection.delete({ ids: vectorIds });

    const indexes = knownDocuments.map((doc) => doc.id);
    await DocumentVectors.deleteIds(indexes);
    return true;
  },

  /**
   * Answers `input` against the namespace's collection with a LangChain
   * VectorDBQAChain (k = 5, source documents returned).
   *
   * @param {{namespace?: string|null, input?: string}} reqBody
   * @returns {Promise<{response: string|null, sources: object[], message: string|false}>}
   *   `response` is null and `message` explains why when the namespace does not exist.
   * @throws {Error} When `namespace` or `input` is missing.
   */
  query: async function (reqBody = {}) {
    const { namespace = null, input } = reqBody;
    if (!namespace || !input) throw new Error("Invalid request body");

    const { client } = await this.connect();
    if (!(await this.namespaceExists(client, namespace))) {
      return {
        response: null,
        sources: [],
        message: "Invalid query - no documents found for workspace!",
      };
    }

    const vectorStore = await ChromaStore.fromExistingCollection(
      this.embedder(),
      { collectionName: namespace, url: process.env.CHROMA_ENDPOINT }
    );
    const model = this.llm();
    const chain = VectorDBQAChain.fromLLM(model, vectorStore, {
      k: 5,
      returnSourceDocuments: true,
    });
    const response = await chain.call({ query: input });
    return {
      response: response.text,
      sources: curateSources(response.sourceDocuments),
      message: false,
    };
  },

  /**
   * Returns the stats object produced by `namespace()` for a collection.
   *
   * @param {{namespace?: string|null}} reqBody
   * @returns {Promise<object>} Collection stats, or a `{ message }` object
   *   when stats could not be fetched.
   * @throws {Error} When `namespace` is missing or no such collection exists.
   */
  "namespace-stats": async function (reqBody = {}) {
    const { namespace = null } = reqBody;
    if (!namespace) throw new Error("namespace required");
    const { client } = await this.connect();
    if (!(await this.namespaceExists(client, namespace)))
      throw new Error("Namespace by that name does not exist.");
    const stats = await this.namespace(client, namespace);
    return stats
      ? stats
      : { message: "No stats were able to be fetched from DB for namespace" };
  },

  /**
   * Deletes a namespace's collection and reports how many vectors it held.
   *
   * @param {{namespace?: string|null}} reqBody
   * @returns {Promise<{message: string}>}
   * @throws {Error} When `namespace` is missing or no such collection exists.
   */
  "delete-namespace": async function (reqBody = {}) {
    const { namespace = null } = reqBody;
    const { client } = await this.connect();
    if (!(await this.namespaceExists(client, namespace)))
      throw new Error("Namespace by that name does not exist.");

    // Capture the count before deleting so it can be reported back.
    const details = await this.namespace(client, namespace);
    await this.deleteVectorsInNamespace(client, namespace);
    return {
      message: `Namespace ${namespace} was deleted along with ${details?.vectorCount} vectors.`,
    };
  },

  /**
   * Calls the client's `reset()` on the connected instance.
   *
   * @returns {Promise<{reset: true}>}
   */
  reset: async function () {
    const { client } = await this.connect();
    await client.reset();
    return { reset: true };
  },
};
module.exports = {
Chroma,
};

View File

@ -1,21 +1,24 @@
const fs = require("fs")
const path = require('path');
const { v5: uuidv5 } = require('uuid');
const fs = require("fs");
const path = require("path");
const { v5: uuidv5 } = require("uuid");
async function collectDocumentData(folderName = null) {
if (!folderName) throw new Error('No docPath provided in request');
const folder = path.resolve(__dirname, `../../documents/${folderName}`)
if (!folderName) throw new Error("No docPath provided in request");
const folder = path.resolve(__dirname, `../../documents/${folderName}`);
const dirExists = fs.existsSync(folder);
if (!dirExists) throw new Error(`No documents folder for ${folderName} - did you run collector/main.py for this element?`);
if (!dirExists)
throw new Error(
`No documents folder for ${folderName} - did you run collector/main.py for this element?`
);
const files = fs.readdirSync(folder);
const fileData = [];
files.forEach(file => {
if (path.extname(file) === '.json') {
files.forEach((file) => {
if (path.extname(file) === ".json") {
const filePath = path.join(folder, file);
const data = fs.readFileSync(filePath, 'utf8');
const data = fs.readFileSync(filePath, "utf8");
console.log(`Parsing document: ${file}`);
fileData.push(JSON.parse(data))
fileData.push(JSON.parse(data));
}
});
return fileData;
@ -24,75 +27,78 @@ async function collectDocumentData(folderName = null) {
// Should take in a folder that is a subfolder of documents
// eg: youtube-subject/video-123.json
async function fileData(filePath = null) {
if (!filePath) throw new Error('No docPath provided in request');
const fullPath = path.resolve(__dirname, `../../documents/${filePath}`)
if (!filePath) throw new Error("No docPath provided in request");
const fullPath = path.resolve(__dirname, `../../documents/${filePath}`);
const fileExists = fs.existsSync(fullPath);
if (!fileExists) return null;
const data = fs.readFileSync(fullPath, 'utf8');
return JSON.parse(data)
const data = fs.readFileSync(fullPath, "utf8");
return JSON.parse(data);
}
async function viewLocalFiles() {
const folder = path.resolve(__dirname, `../../documents`)
const folder = path.resolve(__dirname, `../../documents`);
const dirExists = fs.existsSync(folder);
if (!dirExists) return {}
if (!dirExists) return {};
const directory = {
name: "documents",
type: "folder",
items: [],
}
};
for (const file of fs.readdirSync(folder)) {
if (path.extname(file) === '.md') continue;
const folderPath = path.resolve(__dirname, `../../documents/${file}`)
const isFolder = fs.lstatSync(folderPath).isDirectory()
if (path.extname(file) === ".md") continue;
const folderPath = path.resolve(__dirname, `../../documents/${file}`);
const isFolder = fs.lstatSync(folderPath).isDirectory();
if (isFolder) {
const subdocs = {
name: file,
type: "folder",
items: [],
}
};
const subfiles = fs.readdirSync(folderPath);
for (const subfile of subfiles) {
if (path.extname(subfile) !== '.json') continue;
if (path.extname(subfile) !== ".json") continue;
const filePath = path.join(folderPath, subfile);
const rawData = fs.readFileSync(filePath, 'utf8');
const cachefilename = `${file}/${subfile}`
const { pageContent, ...metadata } = JSON.parse(rawData)
const rawData = fs.readFileSync(filePath, "utf8");
const cachefilename = `${file}/${subfile}`;
const { pageContent, ...metadata } = JSON.parse(rawData);
subdocs.items.push({
name: subfile,
type: "file",
...metadata,
cached: await cachedVectorInformation(cachefilename, true)
})
cached: await cachedVectorInformation(cachefilename, true),
});
}
directory.items.push(subdocs)
directory.items.push(subdocs);
}
};
}
return directory
return directory;
}
// Searches the vector-cache folder for existing information so we dont have to re-embed a
// document and can instead push directly to vector db.
async function cachedVectorInformation(filename = null, checkOnly = false) {
if (!process.env.CACHE_VECTORS) return checkOnly ? false : { exists: false, chunks: [] };
if (!process.env.CACHE_VECTORS)
return checkOnly ? false : { exists: false, chunks: [] };
if (!filename) return checkOnly ? false : { exists: false, chunks: [] };
const digest = uuidv5(filename, uuidv5.URL);
const file = path.resolve(__dirname, `../../vector-cache/${digest}.json`);
const exists = fs.existsSync(file);
if (checkOnly) return exists
if (!exists) return { exists, chunks: [] }
if (checkOnly) return exists;
if (!exists) return { exists, chunks: [] };
console.log(`Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`)
const rawData = fs.readFileSync(file, 'utf8');
return { exists: true, chunks: JSON.parse(rawData) }
console.log(
`Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
);
const rawData = fs.readFileSync(file, "utf8");
return { exists: true, chunks: JSON.parse(rawData) };
}
// vectorData: pre-chunked vectorized data for a given file that includes the proper metadata and chunk-size limit so it can be iterated and dumped into Pinecone, etc
@ -100,14 +106,16 @@ async function cachedVectorInformation(filename = null, checkOnly = false) {
async function storeVectorResult(vectorData = [], filename = null) {
if (!process.env.CACHE_VECTORS) return;
if (!filename) return;
console.log(`Caching vectorized results of ${filename} to prevent duplicated embedding.`)
console.log(
`Caching vectorized results of ${filename} to prevent duplicated embedding.`
);
const folder = path.resolve(__dirname, `../../vector-cache`);
if (!fs.existsSync(folder)) fs.mkdirSync(folder);
const digest = uuidv5(filename, uuidv5.URL);
const writeTo = path.resolve(folder, `${digest}.json`);
fs.writeFileSync(writeTo, JSON.stringify(vectorData), 'utf8');
fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
return;
}
@ -116,5 +124,5 @@ module.exports = {
collectDocumentData,
viewLocalFiles,
storeVectorResult,
fileData
}
fileData,
};

View File

@ -0,0 +1,18 @@
const { Pinecone } = require("../pinecone");
const { Chroma } = require("../chroma");
/**
 * Picks the vector database provider named by the VECTOR_DB environment variable.
 * The value is matched case-insensitively and with surrounding whitespace
 * ignored, so `VECTOR_DB="Chroma"` selects Chroma rather than silently
 * falling back to the default.
 *
 * @returns {object} `Chroma` when VECTOR_DB is "chroma"; `Pinecone` when it is
 *   "pinecone", unset, empty, or any unrecognized value.
 */
function getVectorDbClass() {
  const vectorSelection = (process.env.VECTOR_DB || "pinecone")
    .trim()
    .toLowerCase();

  switch (vectorSelection) {
    case "chroma":
      return Chroma;
    case "pinecone":
    default:
      return Pinecone;
  }
}
module.exports = {
getVectorDbClass,
};

View File

@ -1,5 +1,5 @@
function reqBody(request) {
return typeof request.body === 'string'
return typeof request.body === "string"
? JSON.parse(request.body)
: request.body;
}

View File

@ -1,30 +1,30 @@
function validatedRequest(request, response, next) {
// When in development passthrough auth token for ease of development.
if (process.env.NODE_ENV === 'development' || !process.env.AUTH_TOKEN) {
if (process.env.NODE_ENV === "development" || !process.env.AUTH_TOKEN) {
next();
return;
}
if (!process.env.AUTH_TOKEN) {
response.status(403).json({
error: "You need to set an AUTH_TOKEN environment variable."
error: "You need to set an AUTH_TOKEN environment variable.",
});
return;
}
const auth = request.header('Authorization');
const token = auth ? auth.split(' ')[1] : null;
const auth = request.header("Authorization");
const token = auth ? auth.split(" ")[1] : null;
if (!token) {
response.status(403).json({
error: "No auth token found."
error: "No auth token found.",
});
return;
}
if (token !== process.env.AUTH_TOKEN) {
response.status(403).json({
error: "Invalid auth token found."
error: "Invalid auth token found.",
});
return;
}
@ -34,4 +34,4 @@ function validatedRequest(request, response, next) {
module.exports = {
validatedRequest,
};
};

View File

@ -1,64 +1,76 @@
const { Configuration, OpenAIApi } = require('openai')
const { Configuration, OpenAIApi } = require("openai");
class OpenAi {
constructor() {
const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY, organization: 'org-amIuvAIIcdUmN5YCiwRayVfb' })
const config = new Configuration({
apiKey: process.env.OPEN_AI_KEY,
organization: "org-amIuvAIIcdUmN5YCiwRayVfb",
});
const openai = new OpenAIApi(config);
this.openai = openai
this.openai = openai;
}
isValidChatModel(modelName = '') {
const validModels = ['gpt-4', 'gpt-3.5-turbo']
return validModels.includes(modelName)
isValidChatModel(modelName = "") {
const validModels = ["gpt-4", "gpt-3.5-turbo"];
return validModels.includes(modelName);
}
async isSafe(input = '') {
const { flagged = false, categories = {} } = await this.openai.createModeration({ input })
async isSafe(input = "") {
const { flagged = false, categories = {} } = await this.openai
.createModeration({ input })
.then((json) => {
const res = json.data;
if (!res.hasOwnProperty('results')) throw new Error('OpenAI moderation: No results!');
if (res.results.length === 0) throw new Error('OpenAI moderation: No results length!');
return res.results[0]
})
if (!res.hasOwnProperty("results"))
throw new Error("OpenAI moderation: No results!");
if (res.results.length === 0)
throw new Error("OpenAI moderation: No results length!");
return res.results[0];
});
if (!flagged) return { safe: true, reasons: [] };
const reasons = Object.keys(categories).map((category) => {
const value = categories[category]
if (value === true) {
return category.replace('/', ' or ');
} else {
return null;
}
}).filter((reason) => !!reason)
const reasons = Object.keys(categories)
.map((category) => {
const value = categories[category];
if (value === true) {
return category.replace("/", " or ");
} else {
return null;
}
})
.filter((reason) => !!reason);
return { safe: false, reasons }
return { safe: false, reasons };
}
async sendChat(chatHistory = [], prompt) {
const model = process.env.OPEN_MODEL_PREF
if (!this.isValidChatModel(model)) throw new Error(`OpenAI chat: ${model} is not valid for chat completion!`);
const model = process.env.OPEN_MODEL_PREF;
if (!this.isValidChatModel(model))
throw new Error(
`OpenAI chat: ${model} is not valid for chat completion!`
);
const textResponse = await this.openai.createChatCompletion({
model,
temperature: 0.7,
n: 1,
messages: [
{ role: 'system', content: '' },
...chatHistory,
{ role: 'user', content: prompt },
]
})
.then((json) => {
const res = json.data
if (!res.hasOwnProperty('choices')) throw new Error('OpenAI chat: No results!');
if (res.choices.length === 0) throw new Error('OpenAI chat: No results length!');
return res.choices[0].message.content
const textResponse = await this.openai
.createChatCompletion({
model,
temperature: 0.7,
n: 1,
messages: [
{ role: "system", content: "" },
...chatHistory,
{ role: "user", content: prompt },
],
})
.then((json) => {
const res = json.data;
if (!res.hasOwnProperty("choices"))
throw new Error("OpenAI chat: No results!");
if (res.choices.length === 0)
throw new Error("OpenAI chat: No results length!");
return res.choices[0].message.content;
});
return textResponse
return textResponse;
}
}
module.exports = {
OpenAi,
};

View File

@ -1,30 +1,41 @@
const { PineconeClient } = require("@pinecone-database/pinecone");
const { PineconeStore } = require("langchain/vectorstores/pinecone");
const { OpenAI } = require("langchain/llms/openai");
const { ChatOpenAI } = require('langchain/chat_models/openai');
const { VectorDBQAChain, LLMChain, RetrievalQAChain, ConversationalRetrievalQAChain } = require("langchain/chains");
const { ChatOpenAI } = require("langchain/chat_models/openai");
const {
VectorDBQAChain,
LLMChain,
RetrievalQAChain,
ConversationalRetrievalQAChain,
} = require("langchain/chains");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
const { VectorStoreRetrieverMemory, BufferMemory } = require("langchain/memory");
const {
VectorStoreRetrieverMemory,
BufferMemory,
} = require("langchain/memory");
const { PromptTemplate } = require("langchain/prompts");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { storeVectorResult, cachedVectorInformation } = require('../files');
const { Configuration, OpenAIApi } = require('openai')
const { v4: uuidv4 } = require('uuid');
const { storeVectorResult, cachedVectorInformation } = require("../files");
const { Configuration, OpenAIApi } = require("openai");
const { v4: uuidv4 } = require("uuid");
/**
 * Split an array into consecutive slices of at most `size` elements.
 *
 * @param {Array} arr - Items to partition; not mutated.
 * @param {number} size - Maximum slice length; must be a positive integer.
 * @returns {Array<Array>} Slices in original order; the last may be shorter.
 * @throws {RangeError} When `size` is not a positive integer.
 */
const toChunks = (arr, size) => {
  // A zero, negative or fractional size would otherwise produce an opaque
  // "Invalid array length" error or silently wrong slices.
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`toChunks: size must be a positive integer, got ${size}`);
  }
  const chunkCount = Math.ceil(arr.length / size);
  return Array.from({ length: chunkCount }, (_v, i) =>
    arr.slice(i * size, i * size + size)
  );
};
function curateSources(sources = []) {
const knownDocs = [];
const documents = []
const documents = [];
for (const source of sources) {
const { metadata = {} } = source
if (Object.keys(metadata).length > 0 && !knownDocs.includes(metadata.title)) {
documents.push({ ...metadata })
knownDocs.push(metadata.title)
const { metadata = {} } = source;
if (
Object.keys(metadata).length > 0 &&
!knownDocs.includes(metadata.title)
) {
documents.push({ ...metadata });
knownDocs.push(metadata.title);
}
}
@ -32,6 +43,7 @@ function curateSources(sources = []) {
}
const Pinecone = {
name: 'Pinecone',
connect: async function () {
const client = new PineconeClient();
await client.init({
@ -39,91 +51,112 @@ const Pinecone = {
environment: process.env.PINECONE_ENVIRONMENT,
});
const pineconeIndex = client.Index(process.env.PINECONE_INDEX);
const { status } = await client.describeIndex({ indexName: process.env.PINECONE_INDEX });
const { status } = await client.describeIndex({
indexName: process.env.PINECONE_INDEX,
});
if (!status.ready) throw new Error("Pinecode::Index not ready.")
if (!status.ready) throw new Error("Pinecode::Index not ready.");
return { client, pineconeIndex, indexName: process.env.PINECONE_INDEX };
},
embedder: function () {
return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY });
},
openai: function () {
const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY })
const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY });
const openai = new OpenAIApi(config);
return openai
return openai;
},
embedChunk: async function (openai, textChunk) {
const { data: { data } } = await openai.createEmbedding({
model: 'text-embedding-ada-002',
input: textChunk
})
return data.length > 0 && data[0].hasOwnProperty('embedding') ? data[0].embedding : null
const {
data: { data },
} = await openai.createEmbedding({
model: "text-embedding-ada-002",
input: textChunk,
});
return data.length > 0 && data[0].hasOwnProperty("embedding")
? data[0].embedding
: null;
},
llm: function () {
const model = process.env.OPEN_MODEL_PREF || 'gpt-3.5-turbo'
return new OpenAI({ openAIApiKey: process.env.OPEN_AI_KEY, temperature: 0.7, modelName: model });
const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
return new OpenAI({
openAIApiKey: process.env.OPEN_AI_KEY,
temperature: 0.7,
modelName: model,
});
},
chatLLM: function () {
const model = process.env.OPEN_MODEL_PREF || 'gpt-3.5-turbo'
return new ChatOpenAI({ openAIApiKey: process.env.OPEN_AI_KEY, temperature: 0.7, modelName: model });
const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
return new ChatOpenAI({
openAIApiKey: process.env.OPEN_AI_KEY,
temperature: 0.7,
modelName: model,
});
},
totalIndicies: async function () {
const { pineconeIndex } = await this.connect();
const { namespaces } = await pineconeIndex.describeIndexStats1();
return Object.values(namespaces).reduce((a, b) => a + (b?.vectorCount || 0), 0)
return Object.values(namespaces).reduce(
(a, b) => a + (b?.vectorCount || 0),
0
);
},
namespace: async function (index, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const { namespaces } = await index.describeIndexStats1();
return namespaces.hasOwnProperty(namespace) ? namespaces[namespace] : null
return namespaces.hasOwnProperty(namespace) ? namespaces[namespace] : null;
},
hasNamespace: async function (namespace = null) {
if (!namespace) return false;
const { pineconeIndex } = await this.connect();
return await this.namespaceExists(pineconeIndex, namespace)
return await this.namespaceExists(pineconeIndex, namespace);
},
namespaceExists: async function (index, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const { namespaces } = await index.describeIndexStats1();
return namespaces.hasOwnProperty(namespace)
return namespaces.hasOwnProperty(namespace);
},
deleteVectorsInNamespace: async function (index, namespace = null) {
await index.delete1({ namespace, deleteAll: true })
return true
await index.delete1({ namespace, deleteAll: true });
return true;
},
addDocumentToNamespace: async function (namespace, documentData = {}, fullFilePath = null) {
addDocumentToNamespace: async function (
namespace,
documentData = {},
fullFilePath = null
) {
const { DocumentVectors } = require("../../models/vectors");
try {
const { pageContent, docId, ...metadata } = documentData
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;
console.log("Adding new vectorized document into namespace", namespace);
const cacheResult = await cachedVectorInformation(fullFilePath)
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
const { pineconeIndex } = await this.connect();
const { chunks } = cacheResult
const documentVectors = []
const { chunks } = cacheResult;
const documentVectors = [];
for (const chunk of chunks) {
// Before sending to Pinecone and saving the records to our db
// we need to assign the id of each chunk that is stored in the cached file.
const newChunks = chunk.map((chunk) => {
const id = uuidv4()
const id = uuidv4();
documentVectors.push({ docId, vectorId: id });
return { ...chunk, id }
})
return { ...chunk, id };
});
// Push chunks with new ids to pinecone.
await pineconeIndex.upsert({
upsertRequest: {
vectors: [...newChunks],
namespace,
}
})
},
});
}
await DocumentVectors.bulkInsert(documentVectors)
return true
await DocumentVectors.bulkInsert(documentVectors);
return true;
}
// If we are here then we are going to embed and store a novel document.
@ -131,13 +164,16 @@ const Pinecone = {
// because we then cannot atomically control our namespace to granularly find/remove documents
// from vectordb.
// https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167
const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 20 });
const textChunks = await textSplitter.splitText(pageContent)
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize: 1000,
chunkOverlap: 20,
});
const textChunks = await textSplitter.splitText(pageContent);
console.log('Chunks created from document:', textChunks.length)
const documentVectors = []
const vectors = []
const openai = this.openai()
console.log("Chunks created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const openai = this.openai();
for (const textChunk of textChunks) {
const vectorValues = await this.embedChunk(openai, textChunk);
@ -149,87 +185,97 @@ const Pinecone = {
// LangChain will be unable to find your text if you embed manually and dont include the `text` key.
// https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64
metadata: { ...metadata, text: textChunk },
}
};
vectors.push(vectorRecord);
documentVectors.push({ docId, vectorId: vectorRecord.id });
} else {
console.error('Could not use OpenAI to embed document chunk! This document will not be recorded.')
console.error(
"Could not use OpenAI to embed document chunk! This document will not be recorded."
);
}
}
if (vectors.length > 0) {
const chunks = []
const chunks = [];
const { pineconeIndex } = await this.connect();
console.log('Inserting vectorized chunks into Pinecone.')
console.log("Inserting vectorized chunks into Pinecone.");
for (const chunk of toChunks(vectors, 100)) {
chunks.push(chunk)
chunks.push(chunk);
await pineconeIndex.upsert({
upsertRequest: {
vectors: [...chunk],
namespace,
}
})
},
});
}
await storeVectorResult(chunks, fullFilePath)
await storeVectorResult(chunks, fullFilePath);
}
await DocumentVectors.bulkInsert(documentVectors)
await DocumentVectors.bulkInsert(documentVectors);
return true;
} catch (e) {
console.error('addDocumentToNamespace', e.message)
console.error("addDocumentToNamespace", e.message);
return false;
}
},
deleteDocumentFromNamespace: async function (namespace, docId) {
const { DocumentVectors } = require("../../models/vectors");
const { pineconeIndex } = await this.connect();
if (!await this.namespaceExists(pineconeIndex, namespace)) return;
if (!(await this.namespaceExists(pineconeIndex, namespace))) return;
const knownDocuments = await DocumentVectors.where(`docId = '${docId}'`)
const knownDocuments = await DocumentVectors.where(`docId = '${docId}'`);
if (knownDocuments.length === 0) return;
const vectorIds = knownDocuments.map((doc) => doc.vectorId);
await pineconeIndex.delete1({
ids: vectorIds,
namespace,
})
});
const indexes = knownDocuments.map((doc) => doc.id);
await DocumentVectors.deleteIds(indexes)
await DocumentVectors.deleteIds(indexes);
return true;
},
'namespace-stats': async function (reqBody = {}) {
const { namespace = null } = reqBody
"namespace-stats": async function (reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
const { pineconeIndex } = await this.connect();
if (!await this.namespaceExists(pineconeIndex, namespace)) throw new Error('Namespace by that name does not exist.');
const stats = await this.namespace(pineconeIndex, namespace)
return stats ? stats : { message: 'No stats were able to be fetched from DB' }
if (!(await this.namespaceExists(pineconeIndex, namespace)))
throw new Error("Namespace by that name does not exist.");
const stats = await this.namespace(pineconeIndex, namespace);
return stats
? stats
: { message: "No stats were able to be fetched from DB" };
},
'delete-namespace': async function (reqBody = {}) {
const { namespace = null } = reqBody
"delete-namespace": async function (reqBody = {}) {
const { namespace = null } = reqBody;
const { pineconeIndex } = await this.connect();
if (!await this.namespaceExists(pineconeIndex, namespace)) throw new Error('Namespace by that name does not exist.');
if (!(await this.namespaceExists(pineconeIndex, namespace)))
throw new Error("Namespace by that name does not exist.");
const details = await this.namespace(pineconeIndex, namespace);
await this.deleteVectorsInNamespace(pineconeIndex, namespace);
return { message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.` }
return {
message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.`,
};
},
query: async function (reqBody = {}) {
const { namespace = null, input } = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
const { pineconeIndex } = await this.connect();
if (!await this.namespaceExists(pineconeIndex, namespace)) {
if (!(await this.namespaceExists(pineconeIndex, namespace))) {
return {
response: null, sources: [], message: 'Invalid query - no documents found for workspace!'
}
response: null,
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const vectorStore = await PineconeStore.fromExistingIndex(
this.embedder(),
{ pineconeIndex, namespace }
);
const vectorStore = await PineconeStore.fromExistingIndex(this.embedder(), {
pineconeIndex,
namespace,
});
const model = this.llm();
const chain = VectorDBQAChain.fromLLM(model, vectorStore, {
@ -237,7 +283,11 @@ const Pinecone = {
returnSourceDocuments: true,
});
const response = await chain.call({ query: input });
return { response: response.text, sources: curateSources(response.sourceDocuments), message: false }
return {
response: response.text,
sources: curateSources(response.sourceDocuments),
message: false,
};
},
// This implementation of chat also expands the memory of the chat itself
// and adds more tokens to the PineconeDB instance namespace
@ -246,12 +296,15 @@ const Pinecone = {
if (!namespace || !input) throw new Error("Invalid request body");
const { pineconeIndex } = await this.connect();
if (!await this.namespaceExists(pineconeIndex, namespace)) throw new Error("Invalid namespace - has it been collected and seeded yet?");
if (!(await this.namespaceExists(pineconeIndex, namespace)))
throw new Error(
"Invalid namespace - has it been collected and seeded yet?"
);
const vectorStore = await PineconeStore.fromExistingIndex(
this.embedder(),
{ pineconeIndex, namespace }
);
const vectorStore = await PineconeStore.fromExistingIndex(this.embedder(), {
pineconeIndex,
namespace,
});
const memory = new VectorStoreRetrieverMemory({
vectorStoreRetriever: vectorStore.asRetriever(1),
@ -270,10 +323,10 @@ const Pinecone = {
const chain = new LLMChain({ llm: model, prompt, memory });
const response = await chain.call({ input });
return { response: response.text, sources: [], message: false }
return { response: response.text, sources: [], message: false };
},
}
};
module.exports = {
Pinecone
}
Pinecone,
};