Improve RAG results via chunkHeader append (#1473)

This commit is contained in:
Timothy Carambat 2024-05-21 14:43:39 -05:00 committed by GitHub
parent 4ede4c71fc
commit b23cb1a90f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 62 additions and 3 deletions

View File

@ -115,6 +115,11 @@ function SkeletonLine() {
);
}
/**
 * Strips a leading `<document_metadata>…</document_metadata>` header from a
 * chunk so that only the chunk's own text is shown.
 *
 * Only the first closing tag ends the header; any later occurrence of the
 * closing tag inside the chunk body is kept as regular content.
 *
 * @param {string} text - Chunk text that may start with a metadata header.
 * @returns {string} The trimmed text after the header, or `text` unchanged
 *   when it is not a string, has no leading header, or the header is never
 *   closed.
 */
function omitChunkHeader(text) {
  const openTag = "<document_metadata>";
  const closeTag = "</document_metadata>";
  if (typeof text !== "string" || !text.startsWith(openTag)) return text;

  // A header without its closing tag is malformed; show the text as-is
  // rather than throwing while rendering.
  const closeAt = text.indexOf(closeTag);
  if (closeAt === -1) return text;

  return text.slice(closeAt + closeTag.length).trim();
}
function CitationDetailModal({ source, onClose }) {
const { references, title, chunks } = source;
const { isUrl, text: webpageUrl, href: linkTo } = parseChunkSource(source);
@ -167,7 +172,7 @@ function CitationDetailModal({ source, onClose }) {
<div key={idx} className="pt-6 text-white">
<div className="flex flex-col w-full justify-start pb-6 gap-y-1">
<p className="text-white whitespace-pre-line">
{HTMLDecode(text)}
{HTMLDecode(omitChunkHeader(text))}
</p>
{!!score && (

View File

@ -17,6 +17,7 @@ class TextSplitter {
Config: {
chunkSize: number,
chunkOverlap: number,
chunkHeaderMeta: object | null, // Gets prepended to the top of each chunk as metadata
}
------
*/
@ -44,6 +45,18 @@ class TextSplitter {
return prefValue > limit ? limit : prefValue;
}
stringifyHeader() {
if (!this.config.chunkHeaderMeta) return null;
let content = "";
Object.entries(this.config.chunkHeaderMeta).map(([key, value]) => {
if (!key || !value) return;
content += `${key}: ${value}\n`;
});
if (!content) return null;
return `<document_metadata>\n${content}</document_metadata>\n\n`;
}
#setSplitter(config = {}) {
// if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
return new RecursiveSplitter({
@ -51,6 +64,7 @@ class TextSplitter {
chunkOverlap: isNaN(config?.chunkOverlap)
? 20
: Number(config?.chunkOverlap),
chunkHeader: this.stringifyHeader(),
});
}
@ -61,11 +75,12 @@ class TextSplitter {
// Wrapper for Langchain default RecursiveCharacterTextSplitter class.
class RecursiveSplitter {
constructor({ chunkSize, chunkOverlap }) {
constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
const {
RecursiveCharacterTextSplitter,
} = require("@langchain/textsplitters");
this.log(`Will split with`, { chunkSize, chunkOverlap });
this.chunkHeader = chunkHeader;
this.engine = new RecursiveCharacterTextSplitter({
chunkSize,
chunkOverlap,
@ -77,7 +92,14 @@ class RecursiveSplitter {
}
// Splits `documentText` with the wrapped splitter engine and returns its result.
// NOTE(review): this view appears to show the pre-change and post-change bodies
// of the method together; the unconditional return directly below looks like
// the removed line, which would make everything after it unreachable. Confirm
// against the actual file.
async _splitText(documentText) {
return this.engine.splitText(documentText);
// No header configured: return the engine's split result untouched.
if (!this.chunkHeader) return this.engine.splitText(documentText);
const strings = await this.engine.splitText(documentText);
// Pass the already-split strings through createDocuments with the chunkHeader
// option — presumably this prefixes each chunk with the header; verify against
// the splitter library's documentation.
const documents = await this.engine.createDocuments(strings, [], {
chunkHeader: this.chunkHeader,
});
// Drop documents whose pageContent is empty and return the pageContent values.
return documents
.filter((doc) => !!doc.pageContent)
.map((doc) => doc.pageContent);
}
}

View File

@ -157,6 +157,10 @@ const AstraDB = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -200,6 +200,10 @@ const Chroma = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -198,6 +198,10 @@ const LanceDb = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -192,6 +192,10 @@ const Milvus = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -143,6 +143,10 @@ const PineconeDB = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -217,6 +217,10 @@ const QDrant = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -259,6 +259,10 @@ const Weaviate = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -193,6 +193,10 @@ const Zilliz = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
});
const textChunks = await textSplitter.splitText(pageContent);