- {HTMLDecode(text)}
+ {HTMLDecode(omitChunkHeader(text))}
{!!score && (
diff --git a/server/utils/TextSplitter/index.js b/server/utils/TextSplitter/index.js
index f79fb87f..4162fa74 100644
--- a/server/utils/TextSplitter/index.js
+++ b/server/utils/TextSplitter/index.js
@@ -17,6 +17,7 @@ class TextSplitter {
Config: {
chunkSize: number,
chunkOverlap: number,
+ chunkHeaderMeta: object | null, // Gets prepended to the top of each chunk as metadata
}
------
*/
@@ -44,6 +45,18 @@ class TextSplitter {
return prefValue > limit ? limit : prefValue;
}
+ /**
+  * Serializes `config.chunkHeaderMeta` into the header text for chunks.
+  * Each entry becomes one "key: value" line; entries whose key or value is
+  * falsy are skipped.
+  * @returns {string|null} The header text, or null when no metadata is
+  *   configured or every entry was skipped.
+  */
+ stringifyHeader() {
+   const meta = this.config.chunkHeaderMeta;
+   if (!meta) return null;
+
+   // filter/map/join instead of calling map() only for its side effect on
+   // an outer string; the resulting text is the same.
+   const content = Object.entries(meta)
+     .filter(([key, value]) => key && value)
+     .map(([key, value]) => `${key}: ${value}\n`)
+     .join("");
+
+   if (!content) return null;
+   // NOTE(review): the literal below breaks across two lines and the
+   // continuation line carries no `+` marker, so part of the original line
+   // may be missing from this patch - confirm the exact header text against
+   // the upstream commit before applying.
+   return `
\n${content}\n\n`;
+ }
+
#setSplitter(config = {}) {
// if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
return new RecursiveSplitter({
@@ -51,6 +64,7 @@ class TextSplitter {
chunkOverlap: isNaN(config?.chunkOverlap)
? 20
: Number(config?.chunkOverlap),
+ chunkHeader: this.stringifyHeader(),
});
}
@@ -61,11 +75,12 @@ class TextSplitter {
// Wrapper for Langchain default RecursiveCharacterTextSplitter class.
class RecursiveSplitter {
- constructor({ chunkSize, chunkOverlap }) {
+ constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
const {
RecursiveCharacterTextSplitter,
} = require("@langchain/textsplitters");
this.log(`Will split with`, { chunkSize, chunkOverlap });
+ this.chunkHeader = chunkHeader;
this.engine = new RecursiveCharacterTextSplitter({
chunkSize,
chunkOverlap,
@@ -77,7 +92,14 @@ class RecursiveSplitter {
}
+ /**
+  * Splits the given text with the wrapped engine. When `this.chunkHeader` is
+  * set, the split strings are passed through the engine's `createDocuments`
+  * with that header as the `chunkHeader` option, and the non-empty
+  * `pageContent` values are returned.
+  * @param documentText - forwarded unchanged to the engine's `splitText`.
+  * @returns a promise for the engine's `splitText` result when no header is
+  *   set, otherwise for an array of `pageContent` values.
+  */
async _splitText(documentText) {
- return this.engine.splitText(documentText);
+ const pieces = await this.engine.splitText(documentText);
+ if (!this.chunkHeader) return pieces;
+
+ const headerOptions = { chunkHeader: this.chunkHeader };
+ const docs = await this.engine.createDocuments(pieces, [], headerOptions);
+
+ // Keep only documents that actually carry page content.
+ const texts = [];
+ for (const doc of docs) {
+   if (doc.pageContent) texts.push(doc.pageContent);
+ }
+ return texts;
}
}
diff --git a/server/utils/vectorDbProviders/astra/index.js b/server/utils/vectorDbProviders/astra/index.js
index 50e8ba34..30ff2bbf 100644
--- a/server/utils/vectorDbProviders/astra/index.js
+++ b/server/utils/vectorDbProviders/astra/index.js
@@ -157,6 +157,10 @@ const AstraDB = {
{ label: "text_splitter_chunk_overlap" },
20
),
+ chunkHeaderMeta: {
+ sourceDocument: metadata?.title,
+ published: metadata?.published || "unknown",
+ },
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js
index d17883b7..90956a94 100644
--- a/server/utils/vectorDbProviders/chroma/index.js
+++ b/server/utils/vectorDbProviders/chroma/index.js
@@ -200,6 +200,10 @@ const Chroma = {
{ label: "text_splitter_chunk_overlap" },
20
),
+ chunkHeaderMeta: {
+ sourceDocument: metadata?.title,
+ published: metadata?.published || "unknown",
+ },
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js
index db266295..54c12c04 100644
--- a/server/utils/vectorDbProviders/lance/index.js
+++ b/server/utils/vectorDbProviders/lance/index.js
@@ -198,6 +198,10 @@ const LanceDb = {
{ label: "text_splitter_chunk_overlap" },
20
),
+ chunkHeaderMeta: {
+ sourceDocument: metadata?.title,
+ published: metadata?.published || "unknown",
+ },
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/milvus/index.js b/server/utils/vectorDbProviders/milvus/index.js
index 27309233..d720c265 100644
--- a/server/utils/vectorDbProviders/milvus/index.js
+++ b/server/utils/vectorDbProviders/milvus/index.js
@@ -192,6 +192,10 @@ const Milvus = {
{ label: "text_splitter_chunk_overlap" },
20
),
+ chunkHeaderMeta: {
+ sourceDocument: metadata?.title,
+ published: metadata?.published || "unknown",
+ },
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js
index 9b68ef1b..d1aeb2f6 100644
--- a/server/utils/vectorDbProviders/pinecone/index.js
+++ b/server/utils/vectorDbProviders/pinecone/index.js
@@ -143,6 +143,10 @@ const PineconeDB = {
{ label: "text_splitter_chunk_overlap" },
20
),
+ chunkHeaderMeta: {
+ sourceDocument: metadata?.title,
+ published: metadata?.published || "unknown",
+ },
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js
index e8511d0b..ff55c06f 100644
--- a/server/utils/vectorDbProviders/qdrant/index.js
+++ b/server/utils/vectorDbProviders/qdrant/index.js
@@ -217,6 +217,10 @@ const QDrant = {
{ label: "text_splitter_chunk_overlap" },
20
),
+ chunkHeaderMeta: {
+ sourceDocument: metadata?.title,
+ published: metadata?.published || "unknown",
+ },
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js
index f19329a4..978e2557 100644
--- a/server/utils/vectorDbProviders/weaviate/index.js
+++ b/server/utils/vectorDbProviders/weaviate/index.js
@@ -259,6 +259,10 @@ const Weaviate = {
{ label: "text_splitter_chunk_overlap" },
20
),
+ chunkHeaderMeta: {
+ sourceDocument: metadata?.title,
+ published: metadata?.published || "unknown",
+ },
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/zilliz/index.js b/server/utils/vectorDbProviders/zilliz/index.js
index a7ee0438..ebb59157 100644
--- a/server/utils/vectorDbProviders/zilliz/index.js
+++ b/server/utils/vectorDbProviders/zilliz/index.js
@@ -193,6 +193,10 @@ const Zilliz = {
{ label: "text_splitter_chunk_overlap" },
20
),
+ chunkHeaderMeta: {
+ sourceDocument: metadata?.title,
+ published: metadata?.published || "unknown",
+ },
});
const textChunks = await textSplitter.splitText(pageContent);