mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-11 01:10:11 +01:00
Improve RAG results via chunkHeader append (#1473)
This commit is contained in:
parent
4ede4c71fc
commit
b23cb1a90f
@ -115,6 +115,11 @@ function SkeletonLine() {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Removes a leading `<document_metadata>...</document_metadata>` header from
 * a chunk of text so that only the chunk body is displayed.
 *
 * The input is returned unchanged when it is not a string, does not start
 * with the opening tag, or has no closing tag (malformed header) - the
 * function never throws on unexpected input.
 *
 * @param {string} text - Raw chunk text, possibly prefixed with a metadata header.
 * @returns {string} The trimmed text after the header, or `text` as-is when
 *   no well-formed leading header is present.
 */
function omitChunkHeader(text) {
  const openTag = "<document_metadata>";
  const closeTag = "</document_metadata>";

  if (typeof text !== "string" || !text.startsWith(openTag)) return text;

  // Only the FIRST closing tag ends the header. Slicing (instead of
  // split(...)[1]) keeps body text that happens to contain the closing tag
  // literally, and avoids indexing into a missing array element.
  const closeIdx = text.indexOf(closeTag);
  if (closeIdx === -1) return text;

  return text.slice(closeIdx + closeTag.length).trim();
}
|
||||||
|
|
||||||
function CitationDetailModal({ source, onClose }) {
|
function CitationDetailModal({ source, onClose }) {
|
||||||
const { references, title, chunks } = source;
|
const { references, title, chunks } = source;
|
||||||
const { isUrl, text: webpageUrl, href: linkTo } = parseChunkSource(source);
|
const { isUrl, text: webpageUrl, href: linkTo } = parseChunkSource(source);
|
||||||
@ -167,7 +172,7 @@ function CitationDetailModal({ source, onClose }) {
|
|||||||
<div key={idx} className="pt-6 text-white">
|
<div key={idx} className="pt-6 text-white">
|
||||||
<div className="flex flex-col w-full justify-start pb-6 gap-y-1">
|
<div className="flex flex-col w-full justify-start pb-6 gap-y-1">
|
||||||
<p className="text-white whitespace-pre-line">
|
<p className="text-white whitespace-pre-line">
|
||||||
{HTMLDecode(text)}
|
{HTMLDecode(omitChunkHeader(text))}
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
{!!score && (
|
{!!score && (
|
||||||
|
@ -17,6 +17,7 @@ class TextSplitter {
|
|||||||
Config: {
|
Config: {
|
||||||
chunkSize: number,
|
chunkSize: number,
|
||||||
chunkOverlap: number,
|
chunkOverlap: number,
|
||||||
|
chunkHeaderMeta: object | null, // Gets prepended to the top of each chunk as metadata
|
||||||
}
|
}
|
||||||
------
|
------
|
||||||
*/
|
*/
|
||||||
@ -44,6 +45,18 @@ class TextSplitter {
|
|||||||
return prefValue > limit ? limit : prefValue;
|
return prefValue > limit ? limit : prefValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
stringifyHeader() {
|
||||||
|
if (!this.config.chunkHeaderMeta) return null;
|
||||||
|
let content = "";
|
||||||
|
Object.entries(this.config.chunkHeaderMeta).map(([key, value]) => {
|
||||||
|
if (!key || !value) return;
|
||||||
|
content += `${key}: ${value}\n`;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!content) return null;
|
||||||
|
return `<document_metadata>\n${content}</document_metadata>\n\n`;
|
||||||
|
}
|
||||||
|
|
||||||
#setSplitter(config = {}) {
|
#setSplitter(config = {}) {
|
||||||
// if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
|
// if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
|
||||||
return new RecursiveSplitter({
|
return new RecursiveSplitter({
|
||||||
@ -51,6 +64,7 @@ class TextSplitter {
|
|||||||
chunkOverlap: isNaN(config?.chunkOverlap)
|
chunkOverlap: isNaN(config?.chunkOverlap)
|
||||||
? 20
|
? 20
|
||||||
: Number(config?.chunkOverlap),
|
: Number(config?.chunkOverlap),
|
||||||
|
chunkHeader: this.stringifyHeader(),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -61,11 +75,12 @@ class TextSplitter {
|
|||||||
|
|
||||||
// Wrapper for Langchain default RecursiveCharacterTextSplitter class.
|
// Wrapper for Langchain default RecursiveCharacterTextSplitter class.
|
||||||
class RecursiveSplitter {
|
class RecursiveSplitter {
|
||||||
constructor({ chunkSize, chunkOverlap }) {
|
constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
|
||||||
const {
|
const {
|
||||||
RecursiveCharacterTextSplitter,
|
RecursiveCharacterTextSplitter,
|
||||||
} = require("@langchain/textsplitters");
|
} = require("@langchain/textsplitters");
|
||||||
this.log(`Will split with`, { chunkSize, chunkOverlap });
|
this.log(`Will split with`, { chunkSize, chunkOverlap });
|
||||||
|
this.chunkHeader = chunkHeader;
|
||||||
this.engine = new RecursiveCharacterTextSplitter({
|
this.engine = new RecursiveCharacterTextSplitter({
|
||||||
chunkSize,
|
chunkSize,
|
||||||
chunkOverlap,
|
chunkOverlap,
|
||||||
@ -77,7 +92,14 @@ class RecursiveSplitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async _splitText(documentText) {
|
async _splitText(documentText) {
|
||||||
return this.engine.splitText(documentText);
|
if (!this.chunkHeader) return this.engine.splitText(documentText);
|
||||||
|
const strings = await this.engine.splitText(documentText);
|
||||||
|
const documents = await this.engine.createDocuments(strings, [], {
|
||||||
|
chunkHeader: this.chunkHeader,
|
||||||
|
});
|
||||||
|
return documents
|
||||||
|
.filter((doc) => !!doc.pageContent)
|
||||||
|
.map((doc) => doc.pageContent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -157,6 +157,10 @@ const AstraDB = {
|
|||||||
{ label: "text_splitter_chunk_overlap" },
|
{ label: "text_splitter_chunk_overlap" },
|
||||||
20
|
20
|
||||||
),
|
),
|
||||||
|
chunkHeaderMeta: {
|
||||||
|
sourceDocument: metadata?.title,
|
||||||
|
published: metadata?.published || "unknown",
|
||||||
|
},
|
||||||
});
|
});
|
||||||
const textChunks = await textSplitter.splitText(pageContent);
|
const textChunks = await textSplitter.splitText(pageContent);
|
||||||
|
|
||||||
|
@ -200,6 +200,10 @@ const Chroma = {
|
|||||||
{ label: "text_splitter_chunk_overlap" },
|
{ label: "text_splitter_chunk_overlap" },
|
||||||
20
|
20
|
||||||
),
|
),
|
||||||
|
chunkHeaderMeta: {
|
||||||
|
sourceDocument: metadata?.title,
|
||||||
|
published: metadata?.published || "unknown",
|
||||||
|
},
|
||||||
});
|
});
|
||||||
const textChunks = await textSplitter.splitText(pageContent);
|
const textChunks = await textSplitter.splitText(pageContent);
|
||||||
|
|
||||||
|
@ -198,6 +198,10 @@ const LanceDb = {
|
|||||||
{ label: "text_splitter_chunk_overlap" },
|
{ label: "text_splitter_chunk_overlap" },
|
||||||
20
|
20
|
||||||
),
|
),
|
||||||
|
chunkHeaderMeta: {
|
||||||
|
sourceDocument: metadata?.title,
|
||||||
|
published: metadata?.published || "unknown",
|
||||||
|
},
|
||||||
});
|
});
|
||||||
const textChunks = await textSplitter.splitText(pageContent);
|
const textChunks = await textSplitter.splitText(pageContent);
|
||||||
|
|
||||||
|
@ -192,6 +192,10 @@ const Milvus = {
|
|||||||
{ label: "text_splitter_chunk_overlap" },
|
{ label: "text_splitter_chunk_overlap" },
|
||||||
20
|
20
|
||||||
),
|
),
|
||||||
|
chunkHeaderMeta: {
|
||||||
|
sourceDocument: metadata?.title,
|
||||||
|
published: metadata?.published || "unknown",
|
||||||
|
},
|
||||||
});
|
});
|
||||||
const textChunks = await textSplitter.splitText(pageContent);
|
const textChunks = await textSplitter.splitText(pageContent);
|
||||||
|
|
||||||
|
@ -143,6 +143,10 @@ const PineconeDB = {
|
|||||||
{ label: "text_splitter_chunk_overlap" },
|
{ label: "text_splitter_chunk_overlap" },
|
||||||
20
|
20
|
||||||
),
|
),
|
||||||
|
chunkHeaderMeta: {
|
||||||
|
sourceDocument: metadata?.title,
|
||||||
|
published: metadata?.published || "unknown",
|
||||||
|
},
|
||||||
});
|
});
|
||||||
const textChunks = await textSplitter.splitText(pageContent);
|
const textChunks = await textSplitter.splitText(pageContent);
|
||||||
|
|
||||||
|
@ -217,6 +217,10 @@ const QDrant = {
|
|||||||
{ label: "text_splitter_chunk_overlap" },
|
{ label: "text_splitter_chunk_overlap" },
|
||||||
20
|
20
|
||||||
),
|
),
|
||||||
|
chunkHeaderMeta: {
|
||||||
|
sourceDocument: metadata?.title,
|
||||||
|
published: metadata?.published || "unknown",
|
||||||
|
},
|
||||||
});
|
});
|
||||||
const textChunks = await textSplitter.splitText(pageContent);
|
const textChunks = await textSplitter.splitText(pageContent);
|
||||||
|
|
||||||
|
@ -259,6 +259,10 @@ const Weaviate = {
|
|||||||
{ label: "text_splitter_chunk_overlap" },
|
{ label: "text_splitter_chunk_overlap" },
|
||||||
20
|
20
|
||||||
),
|
),
|
||||||
|
chunkHeaderMeta: {
|
||||||
|
sourceDocument: metadata?.title,
|
||||||
|
published: metadata?.published || "unknown",
|
||||||
|
},
|
||||||
});
|
});
|
||||||
const textChunks = await textSplitter.splitText(pageContent);
|
const textChunks = await textSplitter.splitText(pageContent);
|
||||||
|
|
||||||
|
@ -193,6 +193,10 @@ const Zilliz = {
|
|||||||
{ label: "text_splitter_chunk_overlap" },
|
{ label: "text_splitter_chunk_overlap" },
|
||||||
20
|
20
|
||||||
),
|
),
|
||||||
|
chunkHeaderMeta: {
|
||||||
|
sourceDocument: metadata?.title,
|
||||||
|
published: metadata?.published || "unknown",
|
||||||
|
},
|
||||||
});
|
});
|
||||||
const textChunks = await textSplitter.splitText(pageContent);
|
const textChunks = await textSplitter.splitText(pageContent);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user