mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-10 17:00:11 +01:00
Improve RAG results via chunkHeader append (#1473)
This commit is contained in:
parent
4ede4c71fc
commit
b23cb1a90f
@ -115,6 +115,11 @@ function SkeletonLine() {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Removes a leading `<document_metadata>...</document_metadata>` header from a
 * chunk of citation text so only the chunk body is shown.
 *
 * The input is returned unchanged when it is not a string, does not start with
 * the opening tag, or has no closing tag (a malformed/truncated header).
 *
 * @param {string} text - Raw chunk text, possibly prefixed by a metadata header.
 * @returns {string} The trimmed text following the first closing tag, or the
 *   original value when no complete header is present.
 */
function omitChunkHeader(text) {
  const openingTag = "<document_metadata>";
  const closingTag = "</document_metadata>";

  if (typeof text !== "string" || !text.startsWith(openingTag)) return text;

  // Cut at the FIRST closing tag only, so body text that happens to contain
  // the closing tag again is preserved instead of being silently dropped.
  const closingIdx = text.indexOf(closingTag);
  if (closingIdx === -1) return text;

  return text.slice(closingIdx + closingTag.length).trim();
}
|
||||
|
||||
function CitationDetailModal({ source, onClose }) {
|
||||
const { references, title, chunks } = source;
|
||||
const { isUrl, text: webpageUrl, href: linkTo } = parseChunkSource(source);
|
||||
@ -167,7 +172,7 @@ function CitationDetailModal({ source, onClose }) {
|
||||
<div key={idx} className="pt-6 text-white">
|
||||
<div className="flex flex-col w-full justify-start pb-6 gap-y-1">
|
||||
<p className="text-white whitespace-pre-line">
|
||||
{HTMLDecode(text)}
|
||||
{HTMLDecode(omitChunkHeader(text))}
|
||||
</p>
|
||||
|
||||
{!!score && (
|
||||
|
@ -17,6 +17,7 @@ class TextSplitter {
|
||||
Config: {
|
||||
chunkSize: number,
|
||||
chunkOverlap: number,
|
||||
chunkHeaderMeta: object | null, // Gets prepended to the top of each chunk as metadata
|
||||
}
|
||||
------
|
||||
*/
|
||||
@ -44,6 +45,18 @@ class TextSplitter {
|
||||
return prefValue > limit ? limit : prefValue;
|
||||
}
|
||||
|
||||
stringifyHeader() {
|
||||
if (!this.config.chunkHeaderMeta) return null;
|
||||
let content = "";
|
||||
Object.entries(this.config.chunkHeaderMeta).map(([key, value]) => {
|
||||
if (!key || !value) return;
|
||||
content += `${key}: ${value}\n`;
|
||||
});
|
||||
|
||||
if (!content) return null;
|
||||
return `<document_metadata>\n${content}</document_metadata>\n\n`;
|
||||
}
|
||||
|
||||
#setSplitter(config = {}) {
|
||||
// if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
|
||||
return new RecursiveSplitter({
|
||||
@ -51,6 +64,7 @@ class TextSplitter {
|
||||
chunkOverlap: isNaN(config?.chunkOverlap)
|
||||
? 20
|
||||
: Number(config?.chunkOverlap),
|
||||
chunkHeader: this.stringifyHeader(),
|
||||
});
|
||||
}
|
||||
|
||||
@ -61,11 +75,12 @@ class TextSplitter {
|
||||
|
||||
// Wrapper for Langchain default RecursiveCharacterTextSplitter class.
|
||||
class RecursiveSplitter {
|
||||
constructor({ chunkSize, chunkOverlap }) {
|
||||
constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
|
||||
const {
|
||||
RecursiveCharacterTextSplitter,
|
||||
} = require("@langchain/textsplitters");
|
||||
this.log(`Will split with`, { chunkSize, chunkOverlap });
|
||||
this.chunkHeader = chunkHeader;
|
||||
this.engine = new RecursiveCharacterTextSplitter({
|
||||
chunkSize,
|
||||
chunkOverlap,
|
||||
@ -77,7 +92,14 @@ class RecursiveSplitter {
|
||||
}
|
||||
|
||||
async _splitText(documentText) {
|
||||
return this.engine.splitText(documentText);
|
||||
if (!this.chunkHeader) return this.engine.splitText(documentText);
|
||||
const strings = await this.engine.splitText(documentText);
|
||||
const documents = await this.engine.createDocuments(strings, [], {
|
||||
chunkHeader: this.chunkHeader,
|
||||
});
|
||||
return documents
|
||||
.filter((doc) => !!doc.pageContent)
|
||||
.map((doc) => doc.pageContent);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -157,6 +157,10 @@ const AstraDB = {
|
||||
{ label: "text_splitter_chunk_overlap" },
|
||||
20
|
||||
),
|
||||
chunkHeaderMeta: {
|
||||
sourceDocument: metadata?.title,
|
||||
published: metadata?.published || "unknown",
|
||||
},
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
|
@ -200,6 +200,10 @@ const Chroma = {
|
||||
{ label: "text_splitter_chunk_overlap" },
|
||||
20
|
||||
),
|
||||
chunkHeaderMeta: {
|
||||
sourceDocument: metadata?.title,
|
||||
published: metadata?.published || "unknown",
|
||||
},
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
|
@ -198,6 +198,10 @@ const LanceDb = {
|
||||
{ label: "text_splitter_chunk_overlap" },
|
||||
20
|
||||
),
|
||||
chunkHeaderMeta: {
|
||||
sourceDocument: metadata?.title,
|
||||
published: metadata?.published || "unknown",
|
||||
},
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
|
@ -192,6 +192,10 @@ const Milvus = {
|
||||
{ label: "text_splitter_chunk_overlap" },
|
||||
20
|
||||
),
|
||||
chunkHeaderMeta: {
|
||||
sourceDocument: metadata?.title,
|
||||
published: metadata?.published || "unknown",
|
||||
},
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
|
@ -143,6 +143,10 @@ const PineconeDB = {
|
||||
{ label: "text_splitter_chunk_overlap" },
|
||||
20
|
||||
),
|
||||
chunkHeaderMeta: {
|
||||
sourceDocument: metadata?.title,
|
||||
published: metadata?.published || "unknown",
|
||||
},
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
|
@ -217,6 +217,10 @@ const QDrant = {
|
||||
{ label: "text_splitter_chunk_overlap" },
|
||||
20
|
||||
),
|
||||
chunkHeaderMeta: {
|
||||
sourceDocument: metadata?.title,
|
||||
published: metadata?.published || "unknown",
|
||||
},
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
|
@ -259,6 +259,10 @@ const Weaviate = {
|
||||
{ label: "text_splitter_chunk_overlap" },
|
||||
20
|
||||
),
|
||||
chunkHeaderMeta: {
|
||||
sourceDocument: metadata?.title,
|
||||
published: metadata?.published || "unknown",
|
||||
},
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
|
@ -193,6 +193,10 @@ const Zilliz = {
|
||||
{ label: "text_splitter_chunk_overlap" },
|
||||
20
|
||||
),
|
||||
chunkHeaderMeta: {
|
||||
sourceDocument: metadata?.title,
|
||||
published: metadata?.published || "unknown",
|
||||
},
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user