mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-11 01:10:11 +01:00
85 lines
2.5 KiB
JavaScript
85 lines
2.5 KiB
JavaScript
|
/**
 * Reports whether `value` cannot be used as a number.
 *
 * `null` is checked explicitly because the coercing global `isNaN` converts
 * it to 0 and would report it as a valid number. Everything else goes through
 * the global `isNaN` on purpose (not `Number.isNaN`), so that numeric strings
 * such as "512" are still accepted as numbers.
 *
 * @param {*} value - Candidate value, typically a user or system setting.
 * @returns {boolean} `true` when `value` is `null` or coerces to `NaN`.
 */
function isNullOrNaN(value) {
  return value === null || isNaN(value);
}
|
||
|
|
||
|
/**
 * Entry point for splitting document text into chunks.
 *
 * Chooses a concrete splitter implementation from `config` (currently always
 * `RecursiveSplitter`) and delegates the actual splitting to it.
 */
class TextSplitter {
  #splitter;

  /**
   * @param {object} [config] - Splitter settings; see the comment in the body
   *   for the recognised keys. The object is stored as-is on `this.config`.
   */
  constructor(config = {}) {
    /*
      config can be a ton of things depending on what is required or optional by the specific splitter.
      Non-splitter related keys
      {
        splitByFilename: string, // TODO
      }
      ------
      Default: "RecursiveCharacterTextSplitter"
      Config: {
        chunkSize: number,
        chunkOverlap: number,
      }
      ------
    */
    this.config = config;
    this.#splitter = this.#setSplitter(config);
  }

  /**
   * Writes a message to the console prefixed with a colored class tag.
   *
   * @param {string} text - Message to print.
   * @param {...*} args - Extra values forwarded to `console.log`.
   */
  log(text, ...args) {
    console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
  }

  /**
   * Does a quick check to determine the text chunk length limit.
   * Embedder models have hard-set limits that cannot be exceeded, just like an
   * LLM context, so the default of 1000 can be overridden, but only up to the
   * model's maximum, which is sometimes user defined.
   *
   * @param {number|string|null} [preferred=null] - Requested chunk length.
   *   When null or not numeric, the embedder limit is used instead.
   * @param {number|string|null} [embedderLimit=1000] - Maximum chunk length
   *   the embedder accepts. When null or not numeric, 1000 is used.
   * @returns {number} The preferred length, capped at the embedder limit.
   */
  static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
    // An explicit null does not trigger the default parameter and would
    // coerce to 0 via Number(null), clamping every result to 0.
    const limit = isNullOrNaN(embedderLimit) ? 1000 : Number(embedderLimit);
    const prefValue = isNullOrNaN(preferred) ? limit : Number(preferred);

    if (prefValue > limit) {
      console.log(
        `\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${limit}. Will use ${limit}.`
      );
      return limit;
    }
    return prefValue;
  }

  /**
   * Builds the splitter implementation for the given config.
   *
   * Missing, null or non-numeric `chunkSize` / `chunkOverlap` fall back to
   * 1000 and 20 respectively. The null check matters because bare `isNaN(null)`
   * is false, which would otherwise turn a null setting into 0.
   *
   * @param {object} [config] - Same object that was passed to the constructor.
   * @returns {RecursiveSplitter} The splitter used by `splitText`.
   */
  #setSplitter(config = {}) {
    // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
    return new RecursiveSplitter({
      chunkSize: isNullOrNaN(config?.chunkSize)
        ? 1_000
        : Number(config?.chunkSize),
      chunkOverlap: isNullOrNaN(config?.chunkOverlap)
        ? 20
        : Number(config?.chunkOverlap),
    });
  }

  /**
   * Splits `documentText` with the configured splitter.
   *
   * @param {string} documentText - Text to split.
   * @returns {Promise<*>} Whatever the underlying splitter's `_splitText` resolves to.
   */
  async splitText(documentText) {
    return this.#splitter._splitText(documentText);
  }
}
|
||
|
|
||
|
// Wrapper for Langchain default RecursiveCharacterTextSplitter class.
|
||
|
class RecursiveSplitter {
|
||
|
constructor({ chunkSize, chunkOverlap }) {
|
||
|
const {
|
||
|
RecursiveCharacterTextSplitter,
|
||
|
} = require("langchain/text_splitter");
|
||
|
this.log(`Will split with`, { chunkSize, chunkOverlap });
|
||
|
this.engine = new RecursiveCharacterTextSplitter({
|
||
|
chunkSize,
|
||
|
chunkOverlap,
|
||
|
});
|
||
|
}
|
||
|
|
||
|
log(text, ...args) {
|
||
|
console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args);
|
||
|
}
|
||
|
|
||
|
async _splitText(documentText) {
|
||
|
return this.engine.splitText(documentText);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Expose TextSplitter as a named CommonJS export of this module.
module.exports.TextSplitter = TextSplitter;
|