Update all vector dbs to filter duplicate source documents that may be pinned (#1122)

* Update all vector dbs to filter duplicate parents

* cleanup
This commit is contained in:
Timothy Carambat 2024-04-17 18:04:39 -07:00 committed by GitHub
parent 41978765bc
commit 9655880cf0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 131 additions and 21 deletions

View File

@ -1,6 +1,6 @@
const { v4: uuidv4 } = require("uuid"); const { v4: uuidv4 } = require("uuid");
const { getVectorDbClass, getLLMProvider } = require("../helpers"); const { getVectorDbClass, getLLMProvider } = require("../helpers");
const { chatPrompt } = require("./index"); const { chatPrompt, sourceIdentifier } = require("./index");
const { EmbedChats } = require("../../models/embedChats"); const { EmbedChats } = require("../../models/embedChats");
const { const {
convertToPromptHistory, convertToPromptHistory,
@ -69,6 +69,7 @@ async function streamChatWithForEmbed(
let completeText; let completeText;
let contextTexts = []; let contextTexts = [];
let sources = []; let sources = [];
let pinnedDocIdentifiers = [];
const { rawHistory, chatHistory } = await recentEmbedChatHistory( const { rawHistory, chatHistory } = await recentEmbedChatHistory(
sessionId, sessionId,
embed, embed,
@ -86,6 +87,7 @@ async function streamChatWithForEmbed(
.then((pinnedDocs) => { .then((pinnedDocs) => {
pinnedDocs.forEach((doc) => { pinnedDocs.forEach((doc) => {
const { pageContent, ...metadata } = doc; const { pageContent, ...metadata } = doc;
pinnedDocIdentifiers.push(sourceIdentifier(doc));
contextTexts.push(doc.pageContent); contextTexts.push(doc.pageContent);
sources.push({ sources.push({
text: text:
@ -104,6 +106,7 @@ async function streamChatWithForEmbed(
LLMConnector, LLMConnector,
similarityThreshold: embed.workspace?.similarityThreshold, similarityThreshold: embed.workspace?.similarityThreshold,
topN: embed.workspace?.topN, topN: embed.workspace?.topN,
filterIdentifiers: pinnedDocIdentifiers,
}) })
: { : {
contextTexts: [], contextTexts: [],

View File

@ -79,6 +79,7 @@ async function chatWithWorkspace(
// 2. Chatting in "query" mode and has at least 1 embedding // 2. Chatting in "query" mode and has at least 1 embedding
let contextTexts = []; let contextTexts = [];
let sources = []; let sources = [];
let pinnedDocIdentifiers = [];
const { rawHistory, chatHistory } = await recentChatHistory({ const { rawHistory, chatHistory } = await recentChatHistory({
user, user,
workspace, workspace,
@ -97,6 +98,7 @@ async function chatWithWorkspace(
.then((pinnedDocs) => { .then((pinnedDocs) => {
pinnedDocs.forEach((doc) => { pinnedDocs.forEach((doc) => {
const { pageContent, ...metadata } = doc; const { pageContent, ...metadata } = doc;
pinnedDocIdentifiers.push(sourceIdentifier(doc));
contextTexts.push(doc.pageContent); contextTexts.push(doc.pageContent);
sources.push({ sources.push({
text: text:
@ -115,6 +117,7 @@ async function chatWithWorkspace(
LLMConnector, LLMConnector,
similarityThreshold: workspace?.similarityThreshold, similarityThreshold: workspace?.similarityThreshold,
topN: workspace?.topN, topN: workspace?.topN,
filterIdentifiers: pinnedDocIdentifiers,
}) })
: { : {
contextTexts: [], contextTexts: [],
@ -227,7 +230,18 @@ function chatPrompt(workspace) {
); );
} }
/**
 * Builds the identifier used to deduplicate similarity-search sources against
 * documents that are already pinned.
 *
 * Why: when a document (e.g. a CSV) is pinned, its full text is already in the
 * context. If similarity search then returns chunks of that same document, the
 * same data points show up twice, which degrades results even when the LLM
 * would not otherwise have hallucinated.
 *
 * @param {object} sourceDocument - Document or chunk metadata; only `title`
 *   and `published` are read.
 * @returns {string} `title:<title>-timestamp:<published>` when both fields are
 *   truthy; otherwise the value of `uuidv4()`, so a document missing either
 *   field is not expected to match any pinned identifier.
 */
function sourceIdentifier(sourceDocument) {
  const { title, published } = sourceDocument ?? {};
  if (title && published) {
    return `title:${title}-timestamp:${published}`;
  }
  // Without both fields there is nothing stable to compare against.
  return uuidv4();
}
module.exports = { module.exports = {
sourceIdentifier,
recentChatHistory, recentChatHistory,
chatWithWorkspace, chatWithWorkspace,
chatPrompt, chatPrompt,

View File

@ -9,6 +9,7 @@ const {
VALID_COMMANDS, VALID_COMMANDS,
chatPrompt, chatPrompt,
recentChatHistory, recentChatHistory,
sourceIdentifier,
} = require("./index"); } = require("./index");
const VALID_CHAT_MODE = ["chat", "query"]; const VALID_CHAT_MODE = ["chat", "query"];
@ -92,6 +93,7 @@ async function streamChatWithWorkspace(
let completeText; let completeText;
let contextTexts = []; let contextTexts = [];
let sources = []; let sources = [];
let pinnedDocIdentifiers = [];
const { rawHistory, chatHistory } = await recentChatHistory({ const { rawHistory, chatHistory } = await recentChatHistory({
user, user,
workspace, workspace,
@ -110,6 +112,7 @@ async function streamChatWithWorkspace(
.then((pinnedDocs) => { .then((pinnedDocs) => {
pinnedDocs.forEach((doc) => { pinnedDocs.forEach((doc) => {
const { pageContent, ...metadata } = doc; const { pageContent, ...metadata } = doc;
pinnedDocIdentifiers.push(sourceIdentifier(doc));
contextTexts.push(doc.pageContent); contextTexts.push(doc.pageContent);
sources.push({ sources.push({
text: text:
@ -128,6 +131,7 @@ async function streamChatWithWorkspace(
LLMConnector, LLMConnector,
similarityThreshold: workspace?.similarityThreshold, similarityThreshold: workspace?.similarityThreshold,
topN: workspace?.topN, topN: workspace?.topN,
filterIdentifiers: pinnedDocIdentifiers,
}) })
: { : {
contextTexts: [], contextTexts: [],

View File

@ -8,6 +8,7 @@ const {
getLLMProvider, getLLMProvider,
getEmbeddingEngineSelection, getEmbeddingEngineSelection,
} = require("../../helpers"); } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
const AstraDB = { const AstraDB = {
name: "AstraDB", name: "AstraDB",
@ -252,6 +253,7 @@ const AstraDB = {
LLMConnector = null, LLMConnector = null,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4, topN = 4,
filterIdentifiers = [],
}) { }) {
if (!namespace || !input || !LLMConnector) if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch."); throw new Error("Invalid request to performSimilaritySearch.");
@ -272,7 +274,8 @@ const AstraDB = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold, similarityThreshold,
topN topN,
filterIdentifiers
); );
const sources = sourceDocuments.map((metadata, i) => { const sources = sourceDocuments.map((metadata, i) => {
@ -289,7 +292,8 @@ const AstraDB = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4 topN = 4,
filterIdentifiers = []
) { ) {
const result = { const result = {
contextTexts: [], contextTexts: [],
@ -311,6 +315,12 @@ const AstraDB = {
responses.forEach((response) => { responses.forEach((response) => {
if (response.$similarity < similarityThreshold) return; if (response.$similarity < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(response.metadata))) {
console.log(
"AstraDB: A source was filtered from context as it's parent document is pinned."
);
return;
}
result.contextTexts.push(response.metadata.text); result.contextTexts.push(response.metadata.text);
result.sourceDocuments.push(response); result.sourceDocuments.push(response);
result.scores.push(response.$similarity); result.scores.push(response.$similarity);

View File

@ -9,6 +9,7 @@ const {
getEmbeddingEngineSelection, getEmbeddingEngineSelection,
} = require("../../helpers"); } = require("../../helpers");
const { parseAuthHeader } = require("../../http"); const { parseAuthHeader } = require("../../http");
const { sourceIdentifier } = require("../../chats");
const Chroma = { const Chroma = {
name: "Chroma", name: "Chroma",
@ -70,7 +71,8 @@ const Chroma = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4 topN = 4,
filterIdentifiers = []
) { ) {
const collection = await client.getCollection({ name: namespace }); const collection = await client.getCollection({ name: namespace });
const result = { const result = {
@ -89,6 +91,15 @@ const Chroma = {
similarityThreshold similarityThreshold
) )
return; return;
if (
filterIdentifiers.includes(sourceIdentifier(response.metadatas[0][i]))
) {
console.log(
"Chroma: A source was filtered from context as it's parent document is pinned."
);
return;
}
result.contextTexts.push(response.documents[0][i]); result.contextTexts.push(response.documents[0][i]);
result.sourceDocuments.push(response.metadatas[0][i]); result.sourceDocuments.push(response.metadatas[0][i]);
result.scores.push(this.distanceToSimilarity(response.distances[0][i])); result.scores.push(this.distanceToSimilarity(response.distances[0][i]));
@ -282,6 +293,7 @@ const Chroma = {
LLMConnector = null, LLMConnector = null,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4, topN = 4,
filterIdentifiers = [],
}) { }) {
if (!namespace || !input || !LLMConnector) if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch."); throw new Error("Invalid request to performSimilaritySearch.");
@ -301,7 +313,8 @@ const Chroma = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold, similarityThreshold,
topN topN,
filterIdentifiers
); );
const sources = sourceDocuments.map((metadata, i) => { const sources = sourceDocuments.map((metadata, i) => {

View File

@ -9,6 +9,7 @@ const { TextSplitter } = require("../../TextSplitter");
const { SystemSettings } = require("../../../models/systemSettings"); const { SystemSettings } = require("../../../models/systemSettings");
const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid"); const { v4: uuidv4 } = require("uuid");
const { sourceIdentifier } = require("../../chats");
const LanceDb = { const LanceDb = {
uri: `${ uri: `${
@ -64,7 +65,8 @@ const LanceDb = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4 topN = 4,
filterIdentifiers = []
) { ) {
const collection = await client.openTable(namespace); const collection = await client.openTable(namespace);
const result = { const result = {
@ -82,6 +84,13 @@ const LanceDb = {
response.forEach((item) => { response.forEach((item) => {
if (this.distanceToSimilarity(item.score) < similarityThreshold) return; if (this.distanceToSimilarity(item.score) < similarityThreshold) return;
const { vector: _, ...rest } = item; const { vector: _, ...rest } = item;
if (filterIdentifiers.includes(sourceIdentifier(rest))) {
console.log(
"LanceDB: A source was filtered from context as it's parent document is pinned."
);
return;
}
result.contextTexts.push(rest.text); result.contextTexts.push(rest.text);
result.sourceDocuments.push(rest); result.sourceDocuments.push(rest);
result.scores.push(this.distanceToSimilarity(item.score)); result.scores.push(this.distanceToSimilarity(item.score));
@ -250,6 +259,7 @@ const LanceDb = {
LLMConnector = null, LLMConnector = null,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4, topN = 4,
filterIdentifiers = [],
}) { }) {
if (!namespace || !input || !LLMConnector) if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch."); throw new Error("Invalid request to performSimilaritySearch.");
@ -269,7 +279,8 @@ const LanceDb = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold, similarityThreshold,
topN topN,
filterIdentifiers
); );
const sources = sourceDocuments.map((metadata, i) => { const sources = sourceDocuments.map((metadata, i) => {

View File

@ -13,6 +13,7 @@ const {
getLLMProvider, getLLMProvider,
getEmbeddingEngineSelection, getEmbeddingEngineSelection,
} = require("../../helpers"); } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
const Milvus = { const Milvus = {
name: "Milvus", name: "Milvus",
@ -288,6 +289,7 @@ const Milvus = {
LLMConnector = null, LLMConnector = null,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4, topN = 4,
filterIdentifiers = [],
}) { }) {
if (!namespace || !input || !LLMConnector) if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch."); throw new Error("Invalid request to performSimilaritySearch.");
@ -307,7 +309,8 @@ const Milvus = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold, similarityThreshold,
topN topN,
filterIdentifiers
); );
const sources = sourceDocuments.map((metadata, i) => { const sources = sourceDocuments.map((metadata, i) => {
@ -324,7 +327,8 @@ const Milvus = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4 topN = 4,
filterIdentifiers = []
) { ) {
const result = { const result = {
contextTexts: [], contextTexts: [],
@ -338,6 +342,13 @@ const Milvus = {
}); });
response.results.forEach((match) => { response.results.forEach((match) => {
if (match.score < similarityThreshold) return; if (match.score < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(match.metadata))) {
console.log(
"Milvus: A source was filtered from context as it's parent document is pinned."
);
return;
}
result.contextTexts.push(match.metadata.text); result.contextTexts.push(match.metadata.text);
result.sourceDocuments.push(match); result.sourceDocuments.push(match);
result.scores.push(match.score); result.scores.push(match.score);

View File

@ -8,6 +8,7 @@ const {
getLLMProvider, getLLMProvider,
getEmbeddingEngineSelection, getEmbeddingEngineSelection,
} = require("../../helpers"); } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
const PineconeDB = { const PineconeDB = {
name: "Pinecone", name: "Pinecone",
@ -44,7 +45,8 @@ const PineconeDB = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4 topN = 4,
filterIdentifiers = []
) { ) {
const result = { const result = {
contextTexts: [], contextTexts: [],
@ -61,6 +63,13 @@ const PineconeDB = {
response.matches.forEach((match) => { response.matches.forEach((match) => {
if (match.score < similarityThreshold) return; if (match.score < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(match.metadata))) {
console.log(
"Pinecone: A source was filtered from context as it's parent document is pinned."
);
return;
}
result.contextTexts.push(match.metadata.text); result.contextTexts.push(match.metadata.text);
result.sourceDocuments.push(match); result.sourceDocuments.push(match);
result.scores.push(match.score); result.scores.push(match.score);
@ -233,6 +242,7 @@ const PineconeDB = {
LLMConnector = null, LLMConnector = null,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4, topN = 4,
filterIdentifiers = [],
}) { }) {
if (!namespace || !input || !LLMConnector) if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch."); throw new Error("Invalid request to performSimilaritySearch.");
@ -249,7 +259,8 @@ const PineconeDB = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold, similarityThreshold,
topN topN,
filterIdentifiers
); );
const sources = sourceDocuments.map((metadata, i) => { const sources = sourceDocuments.map((metadata, i) => {

View File

@ -8,6 +8,7 @@ const {
getLLMProvider, getLLMProvider,
getEmbeddingEngineSelection, getEmbeddingEngineSelection,
} = require("../../helpers"); } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
const QDrant = { const QDrant = {
name: "QDrant", name: "QDrant",
@ -55,7 +56,8 @@ const QDrant = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4 topN = 4,
filterIdentifiers = []
) { ) {
const { client } = await this.connect(); const { client } = await this.connect();
const result = { const result = {
@ -72,6 +74,13 @@ const QDrant = {
responses.forEach((response) => { responses.forEach((response) => {
if (response.score < similarityThreshold) return; if (response.score < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(response?.payload))) {
console.log(
"QDrant: A source was filtered from context as it's parent document is pinned."
);
return;
}
result.contextTexts.push(response?.payload?.text || ""); result.contextTexts.push(response?.payload?.text || "");
result.sourceDocuments.push({ result.sourceDocuments.push({
...(response?.payload || {}), ...(response?.payload || {}),
@ -146,7 +155,8 @@ const QDrant = {
const { client } = await this.connect(); const { client } = await this.connect();
const { chunks } = cacheResult; const { chunks } = cacheResult;
const documentVectors = []; const documentVectors = [];
vectorDimension = chunks[0][0].vector.length || null; vectorDimension =
chunks[0][0]?.vector?.length ?? chunks[0][0]?.values?.length ?? null;
const collection = await this.getOrCreateCollection( const collection = await this.getOrCreateCollection(
client, client,
@ -311,6 +321,7 @@ const QDrant = {
LLMConnector = null, LLMConnector = null,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4, topN = 4,
filterIdentifiers = [],
}) { }) {
if (!namespace || !input || !LLMConnector) if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch."); throw new Error("Invalid request to performSimilaritySearch.");
@ -330,7 +341,8 @@ const QDrant = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold, similarityThreshold,
topN topN,
filterIdentifiers
); );
const sources = sourceDocuments.map((metadata, i) => { const sources = sourceDocuments.map((metadata, i) => {

View File

@ -9,6 +9,7 @@ const {
getEmbeddingEngineSelection, getEmbeddingEngineSelection,
} = require("../../helpers"); } = require("../../helpers");
const { camelCase } = require("../../helpers/camelcase"); const { camelCase } = require("../../helpers/camelcase");
const { sourceIdentifier } = require("../../chats");
const Weaviate = { const Weaviate = {
name: "Weaviate", name: "Weaviate",
@ -82,7 +83,8 @@ const Weaviate = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4 topN = 4,
filterIdentifiers = []
) { ) {
const result = { const result = {
contextTexts: [], contextTexts: [],
@ -91,7 +93,8 @@ const Weaviate = {
}; };
const weaviateClass = await this.namespace(client, namespace); const weaviateClass = await this.namespace(client, namespace);
const fields = weaviateClass.properties.map((prop) => prop.name).join(" "); const fields =
weaviateClass.properties?.map((prop) => prop.name)?.join(" ") ?? "";
const queryResponse = await client.graphql const queryResponse = await client.graphql
.get() .get()
.withClassName(camelCase(namespace)) .withClassName(camelCase(namespace))
@ -109,6 +112,12 @@ const Weaviate = {
...rest ...rest
} = response; } = response;
if (certainty < similarityThreshold) return; if (certainty < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(rest))) {
console.log(
"Weaviate: A source was filtered from context as it's parent document is pinned."
);
return;
}
result.contextTexts.push(rest.text); result.contextTexts.push(rest.text);
result.sourceDocuments.push({ ...rest, id }); result.sourceDocuments.push({ ...rest, id });
result.scores.push(certainty); result.scores.push(certainty);
@ -214,7 +223,7 @@ const Weaviate = {
chunk.forEach((chunk) => { chunk.forEach((chunk) => {
const id = uuidv4(); const id = uuidv4();
const flattenedMetadata = this.flattenObjectForWeaviate( const flattenedMetadata = this.flattenObjectForWeaviate(
chunk.properties chunk.properties ?? chunk.metadata
); );
documentVectors.push({ docId, vectorId: id }); documentVectors.push({ docId, vectorId: id });
const vectorRecord = { const vectorRecord = {
@ -357,6 +366,7 @@ const Weaviate = {
LLMConnector = null, LLMConnector = null,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4, topN = 4,
filterIdentifiers = [],
}) { }) {
if (!namespace || !input || !LLMConnector) if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch."); throw new Error("Invalid request to performSimilaritySearch.");
@ -376,7 +386,8 @@ const Weaviate = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold, similarityThreshold,
topN topN,
filterIdentifiers
); );
const sources = sourceDocuments.map((metadata, i) => { const sources = sourceDocuments.map((metadata, i) => {
@ -437,7 +448,7 @@ const Weaviate = {
const flattenedObject = {}; const flattenedObject = {};
for (const key in obj) { for (const key in obj) {
if (!Object.hasOwn(obj, key)) { if (!Object.hasOwn(obj, key) || key === "id") {
continue; continue;
} }
const value = obj[key]; const value = obj[key];

View File

@ -13,6 +13,7 @@ const {
getLLMProvider, getLLMProvider,
getEmbeddingEngineSelection, getEmbeddingEngineSelection,
} = require("../../helpers"); } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
// Zilliz is basically a copy of Milvus DB class with a different constructor // Zilliz is basically a copy of Milvus DB class with a different constructor
// to connect to the cloud // to connect to the cloud
@ -289,6 +290,7 @@ const Zilliz = {
LLMConnector = null, LLMConnector = null,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4, topN = 4,
filterIdentifiers = [],
}) { }) {
if (!namespace || !input || !LLMConnector) if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch."); throw new Error("Invalid request to performSimilaritySearch.");
@ -308,7 +310,8 @@ const Zilliz = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold, similarityThreshold,
topN topN,
filterIdentifiers
); );
const sources = sourceDocuments.map((metadata, i) => { const sources = sourceDocuments.map((metadata, i) => {
@ -325,7 +328,8 @@ const Zilliz = {
namespace, namespace,
queryVector, queryVector,
similarityThreshold = 0.25, similarityThreshold = 0.25,
topN = 4 topN = 4,
filterIdentifiers = []
) { ) {
const result = { const result = {
contextTexts: [], contextTexts: [],
@ -339,6 +343,12 @@ const Zilliz = {
}); });
response.results.forEach((match) => { response.results.forEach((match) => {
if (match.score < similarityThreshold) return; if (match.score < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(match.metadata))) {
console.log(
"Zilliz: A source was filtered from context as it's parent document is pinned."
);
return;
}
result.contextTexts.push(match.metadata.text); result.contextTexts.push(match.metadata.text);
result.sourceDocuments.push(match); result.sourceDocuments.push(match);
result.scores.push(match.score); result.scores.push(match.score);