From b23cb1a90fd5ebe062a4567f1e68cd7200fbcac7 Mon Sep 17 00:00:00 2001 From: Timothy Carambat Date: Tue, 21 May 2024 14:43:39 -0500 Subject: [PATCH] Improve RAG results via chunkHeader append (#1473) --- .../ChatHistory/Citation/index.jsx | 7 ++++- server/utils/TextSplitter/index.js | 26 +++++++++++++++++-- server/utils/vectorDbProviders/astra/index.js | 4 +++ .../utils/vectorDbProviders/chroma/index.js | 4 +++ server/utils/vectorDbProviders/lance/index.js | 4 +++ .../utils/vectorDbProviders/milvus/index.js | 4 +++ .../utils/vectorDbProviders/pinecone/index.js | 4 +++ .../utils/vectorDbProviders/qdrant/index.js | 4 +++ .../utils/vectorDbProviders/weaviate/index.js | 4 +++ .../utils/vectorDbProviders/zilliz/index.js | 4 +++ 10 files changed, 62 insertions(+), 3 deletions(-) diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx index a3a579c9..de4c4f72 100644 --- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx @@ -115,6 +115,11 @@ function SkeletonLine() { ); } +function omitChunkHeader(text) { + if (!text.startsWith("<document_metadata>")) return text; + return text.split("</document_metadata>")[1].trim(); +} + function CitationDetailModal({ source, onClose }) { const { references, title, chunks } = source; const { isUrl, text: webpageUrl, href: linkTo } = parseChunkSource(source); @@ -167,7 +172,7 @@ function CitationDetailModal({ source, onClose }) {

- {HTMLDecode(text)} + {HTMLDecode(omitChunkHeader(text))}

{!!score && ( diff --git a/server/utils/TextSplitter/index.js b/server/utils/TextSplitter/index.js index f79fb87f..4162fa74 100644 --- a/server/utils/TextSplitter/index.js +++ b/server/utils/TextSplitter/index.js @@ -17,6 +17,7 @@ class TextSplitter { Config: { chunkSize: number, chunkOverlap: number, + chunkHeaderMeta: object | null, // Gets appended to top of each chunk as metadata } ------ */ @@ -44,6 +45,18 @@ class TextSplitter { return prefValue > limit ? limit : prefValue; } + stringifyHeader() { + if (!this.config.chunkHeaderMeta) return null; + let content = ""; + Object.entries(this.config.chunkHeaderMeta).map(([key, value]) => { + if (!key || !value) return; + content += `${key}: ${value}\n`; + }); + + if (!content) return null; + return `<document_metadata>\n${content}</document_metadata>\n\n`; + } + #setSplitter(config = {}) { // if (!config?.splitByFilename) {// TODO do something when specific extension is present? } return new RecursiveSplitter({ @@ -51,6 +64,7 @@ class TextSplitter { chunkOverlap: isNaN(config?.chunkOverlap) ? 20 : Number(config?.chunkOverlap), + chunkHeader: this.stringifyHeader(), }); } @@ -61,11 +75,12 @@ class TextSplitter { // Wrapper for Langchain default RecursiveCharacterTextSplitter class.
class RecursiveSplitter { - constructor({ chunkSize, chunkOverlap }) { + constructor({ chunkSize, chunkOverlap, chunkHeader = null }) { const { RecursiveCharacterTextSplitter, } = require("@langchain/textsplitters"); this.log(`Will split with`, { chunkSize, chunkOverlap }); + this.chunkHeader = chunkHeader; this.engine = new RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap, @@ -77,7 +92,14 @@ class RecursiveSplitter { } async _splitText(documentText) { - return this.engine.splitText(documentText); + if (!this.chunkHeader) return this.engine.splitText(documentText); + const strings = await this.engine.splitText(documentText); + const documents = await this.engine.createDocuments(strings, [], { + chunkHeader: this.chunkHeader, + }); + return documents + .filter((doc) => !!doc.pageContent) + .map((doc) => doc.pageContent); } } diff --git a/server/utils/vectorDbProviders/astra/index.js b/server/utils/vectorDbProviders/astra/index.js index 50e8ba34..30ff2bbf 100644 --- a/server/utils/vectorDbProviders/astra/index.js +++ b/server/utils/vectorDbProviders/astra/index.js @@ -157,6 +157,10 @@ const AstraDB = { { label: "text_splitter_chunk_overlap" }, 20 ), + chunkHeaderMeta: { + sourceDocument: metadata?.title, + published: metadata?.published || "unknown", + }, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js index d17883b7..90956a94 100644 --- a/server/utils/vectorDbProviders/chroma/index.js +++ b/server/utils/vectorDbProviders/chroma/index.js @@ -200,6 +200,10 @@ const Chroma = { { label: "text_splitter_chunk_overlap" }, 20 ), + chunkHeaderMeta: { + sourceDocument: metadata?.title, + published: metadata?.published || "unknown", + }, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js index db266295..54c12c04 100644 --- 
a/server/utils/vectorDbProviders/lance/index.js +++ b/server/utils/vectorDbProviders/lance/index.js @@ -198,6 +198,10 @@ const LanceDb = { { label: "text_splitter_chunk_overlap" }, 20 ), + chunkHeaderMeta: { + sourceDocument: metadata?.title, + published: metadata?.published || "unknown", + }, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/milvus/index.js b/server/utils/vectorDbProviders/milvus/index.js index 27309233..d720c265 100644 --- a/server/utils/vectorDbProviders/milvus/index.js +++ b/server/utils/vectorDbProviders/milvus/index.js @@ -192,6 +192,10 @@ const Milvus = { { label: "text_splitter_chunk_overlap" }, 20 ), + chunkHeaderMeta: { + sourceDocument: metadata?.title, + published: metadata?.published || "unknown", + }, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js index 9b68ef1b..d1aeb2f6 100644 --- a/server/utils/vectorDbProviders/pinecone/index.js +++ b/server/utils/vectorDbProviders/pinecone/index.js @@ -143,6 +143,10 @@ const PineconeDB = { { label: "text_splitter_chunk_overlap" }, 20 ), + chunkHeaderMeta: { + sourceDocument: metadata?.title, + published: metadata?.published || "unknown", + }, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js index e8511d0b..ff55c06f 100644 --- a/server/utils/vectorDbProviders/qdrant/index.js +++ b/server/utils/vectorDbProviders/qdrant/index.js @@ -217,6 +217,10 @@ const QDrant = { { label: "text_splitter_chunk_overlap" }, 20 ), + chunkHeaderMeta: { + sourceDocument: metadata?.title, + published: metadata?.published || "unknown", + }, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/weaviate/index.js 
b/server/utils/vectorDbProviders/weaviate/index.js index f19329a4..978e2557 100644 --- a/server/utils/vectorDbProviders/weaviate/index.js +++ b/server/utils/vectorDbProviders/weaviate/index.js @@ -259,6 +259,10 @@ const Weaviate = { { label: "text_splitter_chunk_overlap" }, 20 ), + chunkHeaderMeta: { + sourceDocument: metadata?.title, + published: metadata?.published || "unknown", + }, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/zilliz/index.js b/server/utils/vectorDbProviders/zilliz/index.js index a7ee0438..ebb59157 100644 --- a/server/utils/vectorDbProviders/zilliz/index.js +++ b/server/utils/vectorDbProviders/zilliz/index.js @@ -193,6 +193,10 @@ const Zilliz = { { label: "text_splitter_chunk_overlap" }, 20 ), + chunkHeaderMeta: { + sourceDocument: metadata?.title, + published: metadata?.published || "unknown", + }, }); const textChunks = await textSplitter.splitText(pageContent);