/**
 * @typedef {object} DocumentMetadata
 * @property {string} id - eg; "123e4567-e89b-12d3-a456-426614174000"
 * @property {string} url - eg; "file://example.com/index.html"
 * @property {string} title - eg; "example.com/index.html"
 * @property {string} docAuthor - eg; "no author found"
 * @property {string} description - eg; "No description found."
 * @property {string} docSource - eg; "URL link uploaded by the user."
 * @property {string} chunkSource - eg; link://https://example.com
 * @property {string} published - ISO 8601 date string
 * @property {number} wordCount - Number of words in the document
 * @property {string} pageContent - The raw text content of the document
 * @property {number} token_count_estimate - Number of tokens in the document
 */

/**
 * True when `value` is null or coerces to NaN.
 * Note: the coercing global isNaN is intentional here so numeric strings
 * like "500" are treated as valid numbers; only null/undefined/non-numeric
 * inputs report true.
 * @param {*} value
 * @returns {boolean}
 */
function isNullOrNaN(value) {
  if (value === null) return true;
  return isNaN(value);
}

class TextSplitter {
  #splitter;

  constructor(config = {}) {
    /*
      config can be a ton of things depending on what is required or optional
      by the specific splitter.
      Non-splitter related keys
      {
        splitByFilename: string, // TODO
      }
      ------
      Default: "RecursiveCharacterTextSplitter"
      Config: {
        chunkSize: number,
        chunkOverlap: number,
        chunkHeaderMeta: object | null, // Gets appended to top of each chunk as metadata
      }
      ------
    */
    this.config = config;
    this.#splitter = this.#setSplitter(config);
  }

  log(text, ...args) {
    console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
  }

  /**
   * Does a quick check to determine the text chunk length limit.
   * Embedder models have hard-set limits that cannot be exceeded, just like
   * an LLM context, so here we want to allow override of the default 1000,
   * but up to the model's maximum, which is sometimes user defined.
   * @param {number|string|null} preferred - User-preferred chunk size; falls back to embedderLimit when null/NaN.
   * @param {number|string} embedderLimit - Hard upper bound imposed by the embedder model.
   * @returns {number} The chunk size to use, clamped to embedderLimit.
   */
  static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
    const prefValue = isNullOrNaN(preferred)
      ? Number(embedderLimit)
      : Number(preferred);
    const limit = Number(embedderLimit);
    if (prefValue > limit)
      console.log(
        `\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${embedderLimit}. Will use ${embedderLimit}.`
      );
    return prefValue > limit ? limit : prefValue;
  }

  /**
   * Creates an object of metadata to be prepended to each chunk.
   * @param {DocumentMetadata} metadata - Metadata to be prepended to each chunk.
   * @returns {{[key: ('title' | 'published' | 'source')]: string}|null} Plucked
   * metadata keyed by its output name, or null when no metadata was provided.
   * May be an empty object when metadata was present but nothing plucked.
   */
  static buildHeaderMeta(metadata = {}) {
    if (!metadata || Object.keys(metadata).length === 0) return null;

    // Maps an incoming metadata key to its output key (`as`) and a `pluck`
    // function that extracts/normalizes the value (null => skip).
    const PLUCK_MAP = {
      title: {
        as: "sourceDocument",
        pluck: (metadata) => {
          return metadata?.title || null;
        },
      },
      published: {
        as: "published",
        pluck: (metadata) => {
          return metadata?.published || null;
        },
      },
      chunkSource: {
        as: "source",
        pluck: (metadata) => {
          const validPrefixes = ["link://", "youtube://"];
          // If the chunkSource is a link or youtube link, we can add the URL
          // as its source in the metadata so the LLM can use it for context.
          // eg prompt: Where did you get this information? -> answer: "from https://example.com"
          if (
            !metadata?.chunkSource || // Exists
            !metadata?.chunkSource.length || // Is not empty
            typeof metadata.chunkSource !== "string" || // Is a string
            !validPrefixes.some(
              (prefix) => metadata.chunkSource.startsWith(prefix) // Has a valid prefix we respect
            )
          )
            return null;

          // We know a prefix is present, so we can split on it and return the rest.
          // If nothing is found, return null and it will not be added to the metadata.
          // (split always returns an array, so no optional chaining is needed.)
          let source = null;
          for (const prefix of validPrefixes) {
            source = metadata.chunkSource.split(prefix)[1] || null;
            if (source) break;
          }
          return source;
        },
      },
    };

    const pluckedData = {};
    Object.entries(PLUCK_MAP).forEach(([key, value]) => {
      if (!(key in metadata)) return; // Skip if the metadata key is not present.
      const pluckedValue = value.pluck(metadata);
      if (!pluckedValue) return; // Skip if the plucked value is null/empty.
      pluckedData[value.as] = pluckedValue;
    });

    return pluckedData;
  }

  /**
   * Creates a string of metadata to be prepended to each chunk.
   * @returns {string|null} Header text, or null when there is nothing to prepend.
   */
  stringifyHeader() {
    if (!this.config.chunkHeaderMeta) return null;
    let content = "";
    // forEach (not map) - we iterate only for the side effect of building `content`.
    // NOTE(review): falsy values (0, "") are skipped by `!value` - presumably
    // intentional; confirm before relying on numeric metadata here.
    Object.entries(this.config.chunkHeaderMeta).forEach(([key, value]) => {
      if (!key || !value) return;
      content += `${key}: ${value}\n`;
    });

    if (!content) return null;
    return `\n${content}\n\n`;
  }

  /**
   * Selects and configures the underlying splitter implementation.
   * @param {object} config - Splitter configuration (see constructor comment).
   * @returns {RecursiveSplitter}
   */
  #setSplitter(config = {}) {
    // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
    // Use isNullOrNaN (not bare isNaN): isNaN(null) is false because null
    // coerces to 0, so a null chunkSize/chunkOverlap would previously have
    // produced 0 instead of the 1000/20 defaults.
    return new RecursiveSplitter({
      chunkSize: isNullOrNaN(config?.chunkSize)
        ? 1_000
        : Number(config?.chunkSize),
      chunkOverlap: isNullOrNaN(config?.chunkOverlap)
        ? 20
        : Number(config?.chunkOverlap),
      chunkHeader: this.stringifyHeader(),
    });
  }

  /**
   * Splits the document text into chunks via the configured splitter.
   * @param {string} documentText
   * @returns {Promise<string[]>}
   */
  async splitText(documentText) {
    return this.#splitter._splitText(documentText);
  }
}

// Wrapper for Langchain default RecursiveCharacterTextSplitter class.
class RecursiveSplitter {
  constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
    // Lazy require keeps the dependency out of module load time.
    const {
      RecursiveCharacterTextSplitter,
    } = require("@langchain/textsplitters");
    this.log(`Will split with`, { chunkSize, chunkOverlap });
    this.chunkHeader = chunkHeader;
    this.engine = new RecursiveCharacterTextSplitter({
      chunkSize,
      chunkOverlap,
    });
  }

  log(text, ...args) {
    console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args);
  }

  /**
   * Splits text into chunk strings; when a chunkHeader is configured it is
   * prepended to each chunk via Langchain's createDocuments chunkHeader option.
   * @param {string} documentText
   * @returns {Promise<string[]>}
   */
  async _splitText(documentText) {
    if (!this.chunkHeader) return this.engine.splitText(documentText);
    const strings = await this.engine.splitText(documentText);
    const documents = await this.engine.createDocuments(strings, [], {
      chunkHeader: this.chunkHeader,
    });
    return documents
      .filter((doc) => !!doc.pageContent)
      .map((doc) => doc.pageContent);
  }
}

// Guarded so the file also loads under ESM-based tooling (where `module` is
// undefined); under CommonJS this exports exactly as before.
if (typeof module !== "undefined") module.exports.TextSplitter = TextSplitter;