Improve RAG results via chunkHeader append (#1473)

This commit is contained in:
Timothy Carambat 2024-05-21 14:43:39 -05:00 committed by GitHub
parent 4ede4c71fc
commit b23cb1a90f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 62 additions and 3 deletions

View File

@ -115,6 +115,11 @@ function SkeletonLine() {
); );
} }
/**
 * Strips a leading `<document_metadata>…</document_metadata>` header from a
 * chunk so only the chunk body remains.
 *
 * @param {string} text - Chunk text that may begin with a metadata header.
 * @returns {string} The trimmed text following the first closing tag. `text`
 *   is returned unchanged when it is not a string, does not start with the
 *   opening tag, or the header is never closed.
 */
function omitChunkHeader(text) {
  const openTag = "<document_metadata>";
  const closeTag = "</document_metadata>";
  if (typeof text !== "string" || !text.startsWith(openTag)) return text;

  const closeIdx = text.indexOf(closeTag, openTag.length);
  // An unterminated header (e.g. truncated chunk) is returned as-is rather
  // than throwing on a missing split segment.
  if (closeIdx === -1) return text;

  // Slice instead of split so a literal closing tag appearing later in the
  // chunk body does not cut off the rest of the content.
  return text.slice(closeIdx + closeTag.length).trim();
}
function CitationDetailModal({ source, onClose }) { function CitationDetailModal({ source, onClose }) {
const { references, title, chunks } = source; const { references, title, chunks } = source;
const { isUrl, text: webpageUrl, href: linkTo } = parseChunkSource(source); const { isUrl, text: webpageUrl, href: linkTo } = parseChunkSource(source);
@ -167,7 +172,7 @@ function CitationDetailModal({ source, onClose }) {
<div key={idx} className="pt-6 text-white"> <div key={idx} className="pt-6 text-white">
<div className="flex flex-col w-full justify-start pb-6 gap-y-1"> <div className="flex flex-col w-full justify-start pb-6 gap-y-1">
<p className="text-white whitespace-pre-line"> <p className="text-white whitespace-pre-line">
{HTMLDecode(text)} {HTMLDecode(omitChunkHeader(text))}
</p> </p>
{!!score && ( {!!score && (

View File

@ -17,6 +17,7 @@ class TextSplitter {
Config: { Config: {
chunkSize: number, chunkSize: number,
chunkOverlap: number, chunkOverlap: number,
chunkHeaderMeta: object | null, // Gets prepended to the top of each chunk as metadata
} }
------ ------
*/ */
@ -44,6 +45,18 @@ class TextSplitter {
return prefValue > limit ? limit : prefValue; return prefValue > limit ? limit : prefValue;
} }
stringifyHeader() {
if (!this.config.chunkHeaderMeta) return null;
let content = "";
Object.entries(this.config.chunkHeaderMeta).map(([key, value]) => {
if (!key || !value) return;
content += `${key}: ${value}\n`;
});
if (!content) return null;
return `<document_metadata>\n${content}</document_metadata>\n\n`;
}
#setSplitter(config = {}) { #setSplitter(config = {}) {
// if (!config?.splitByFilename) {// TODO do something when specific extension is present? } // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
return new RecursiveSplitter({ return new RecursiveSplitter({
@ -51,6 +64,7 @@ class TextSplitter {
chunkOverlap: isNaN(config?.chunkOverlap) chunkOverlap: isNaN(config?.chunkOverlap)
? 20 ? 20
: Number(config?.chunkOverlap), : Number(config?.chunkOverlap),
chunkHeader: this.stringifyHeader(),
}); });
} }
@ -61,11 +75,12 @@ class TextSplitter {
// Wrapper for Langchain default RecursiveCharacterTextSplitter class. // Wrapper for Langchain default RecursiveCharacterTextSplitter class.
class RecursiveSplitter { class RecursiveSplitter {
constructor({ chunkSize, chunkOverlap }) { constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
const { const {
RecursiveCharacterTextSplitter, RecursiveCharacterTextSplitter,
} = require("@langchain/textsplitters"); } = require("@langchain/textsplitters");
this.log(`Will split with`, { chunkSize, chunkOverlap }); this.log(`Will split with`, { chunkSize, chunkOverlap });
this.chunkHeader = chunkHeader;
this.engine = new RecursiveCharacterTextSplitter({ this.engine = new RecursiveCharacterTextSplitter({
chunkSize, chunkSize,
chunkOverlap, chunkOverlap,
@ -77,7 +92,14 @@ class RecursiveSplitter {
} }
async _splitText(documentText) { async _splitText(documentText) {
return this.engine.splitText(documentText); if (!this.chunkHeader) return this.engine.splitText(documentText);
const strings = await this.engine.splitText(documentText);
const documents = await this.engine.createDocuments(strings, [], {
chunkHeader: this.chunkHeader,
});
return documents
.filter((doc) => !!doc.pageContent)
.map((doc) => doc.pageContent);
} }
} }

View File

@ -157,6 +157,10 @@ const AstraDB = {
{ label: "text_splitter_chunk_overlap" }, { label: "text_splitter_chunk_overlap" },
20 20
), ),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View File

@ -200,6 +200,10 @@ const Chroma = {
{ label: "text_splitter_chunk_overlap" }, { label: "text_splitter_chunk_overlap" },
20 20
), ),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View File

@ -198,6 +198,10 @@ const LanceDb = {
{ label: "text_splitter_chunk_overlap" }, { label: "text_splitter_chunk_overlap" },
20 20
), ),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View File

@ -192,6 +192,10 @@ const Milvus = {
{ label: "text_splitter_chunk_overlap" }, { label: "text_splitter_chunk_overlap" },
20 20
), ),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View File

@ -143,6 +143,10 @@ const PineconeDB = {
{ label: "text_splitter_chunk_overlap" }, { label: "text_splitter_chunk_overlap" },
20 20
), ),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View File

@ -217,6 +217,10 @@ const QDrant = {
{ label: "text_splitter_chunk_overlap" }, { label: "text_splitter_chunk_overlap" },
20 20
), ),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View File

@ -259,6 +259,10 @@ const Weaviate = {
{ label: "text_splitter_chunk_overlap" }, { label: "text_splitter_chunk_overlap" },
20 20
), ),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View File

@ -193,6 +193,10 @@ const Zilliz = {
{ label: "text_splitter_chunk_overlap" }, { label: "text_splitter_chunk_overlap" },
20 20
), ),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);