anything-llm/server/utils/TextSplitter/index.js

/**
 * @typedef {object} DocumentMetadata
 * @property {string} id - eg; "123e4567-e89b-12d3-a456-426614174000"
 * @property {string} url - eg; "file://example.com/index.html"
 * @property {string} title - eg; "example.com/index.html"
 * @property {string} docAuthor - eg; "no author found"
 * @property {string} description - eg; "No description found."
 * @property {string} docSource - eg; "URL link uploaded by the user."
 * @property {string} chunkSource - eg; link://https://example.com
 * @property {string} published - ISO 8601 date string
 * @property {number} wordCount - Number of words in the document
 * @property {string} pageContent - The raw text content of the document
 * @property {number} token_count_estimate - Number of tokens in the document
 */

/**
 * Reports whether a value cannot be used as a number.
 *
 * The global (coercing) `isNaN` is used on purpose: numeric strings such as
 * "500" are treated as usable numbers, while `undefined` and non-numeric
 * strings are not. `null` is singled out because it would otherwise coerce to 0.
 *
 * @param {*} value - Candidate value to inspect.
 * @returns {boolean} `true` when the value is `null` or coerces to NaN.
 */
function isNullOrNaN(value) {
  return value === null || isNaN(value);
}

/**
 * Splits document text into chunks, optionally prefixing every chunk with a
 * small metadata header built from `config.chunkHeaderMeta`.
 */
class TextSplitter {
  #splitter;

  /**
   * @param {object} [config] - Splitter configuration. What is required or optional
   * depends on the specific splitter in use.
   *
   * Non-splitter related keys:
   *   - splitByFilename: string // TODO
   *
   * Default splitter: "RecursiveCharacterTextSplitter", which reads:
   *   - chunkSize: number
   *   - chunkOverlap: number
   *   - chunkHeaderMeta: object | null // Rendered at the top of each chunk as metadata
   */
  constructor(config = {}) {
    this.config = config;
    this.#splitter = this.#setSplitter(config);
  }

  /**
   * Writes a message to the console tagged with the class name.
   * @param {string} text - Message to log.
   * @param {...*} args - Extra values forwarded to console.log.
   */
  log(text, ...args) {
    console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
  }

  /**
   * Does a quick check to determine the text chunk length limit.
   * Embedder models have hard-set limits that cannot be exceeded, just like an LLM context,
   * so here we want to allow override of the default 1000, but only up to the model's maximum,
   * which is sometimes user defined.
   *
   * @param {number|string|null} [preferred] - Requested chunk size. When null or not
   * numeric the embedder limit is used instead.
   * @param {number|string} [embedderLimit] - Maximum chunk size supported by the embedder.
   * @returns {number} The preferred size, capped at the embedder limit.
   */
  static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
    const limit = Number(embedderLimit);
    const prefValue = isNullOrNaN(preferred) ? limit : Number(preferred);
    if (prefValue > limit) {
      console.log(
        `\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${embedderLimit}. Will use ${embedderLimit}.`
      );
      return limit;
    }
    return prefValue;
  }

  /**
   * Picks the document metadata fields that should appear in each chunk header.
   *
   * Only `title`, `published` and `chunkSource` are considered, and they are emitted
   * under the keys `sourceDocument`, `published` and `source` respectively. A field is
   * skipped when it is absent from `metadata` or when its plucked value is empty.
   *
   * @param {DocumentMetadata} metadata - Metadata of the document being chunked.
   * @returns {{sourceDocument?: string, published?: string, source?: string} | null}
   * Object of header fields (possibly empty), or null when `metadata` is missing,
   * not an object, or has no keys.
   */
  static buildHeaderMeta(metadata = {}) {
    if (
      !metadata ||
      typeof metadata !== "object" || // the `in` check below throws on primitives
      Object.keys(metadata).length === 0
    )
      return null;

    const PLUCK_MAP = {
      title: {
        as: "sourceDocument",
        pluck: (metadata) => metadata?.title || null,
      },
      published: {
        as: "published",
        pluck: (metadata) => metadata?.published || null,
      },
      chunkSource: {
        as: "source",
        pluck: (metadata) => {
          // If the chunkSource is a link or youtube link, we can add the URL
          // as its source in the metadata so the LLM can use it for context.
          // eg prompt: Where did you get this information? -> answer: "from https://example.com"
          const validPrefixes = ["link://", "youtube://"];
          const chunkSource = metadata?.chunkSource;
          if (typeof chunkSource !== "string") return null;

          const prefix = validPrefixes.find((candidate) =>
            chunkSource.startsWith(candidate)
          );
          if (!prefix) return null;

          // Strip only the leading prefix. Splitting on the prefix would cut the
          // URL short whenever the same marker shows up again later in the string
          // (eg; inside a query parameter).
          return chunkSource.slice(prefix.length) || null;
        },
      },
    };

    const pluckedData = {};
    for (const [key, { as, pluck }] of Object.entries(PLUCK_MAP)) {
      if (!(key in metadata)) continue; // Skip if the metadata key is not present.
      const pluckedValue = pluck(metadata);
      if (!pluckedValue) continue; // Skip if the plucked value is null/empty.
      pluckedData[as] = pluckedValue;
    }

    return pluckedData;
  }

  /**
   * Creates the string of metadata to be prepended to each chunk.
   *
   * Every truthy key/value pair of `config.chunkHeaderMeta` becomes a `key: value`
   * line wrapped in a `<document_metadata>` element.
   *
   * @returns {string|null} The header text, or null when there is no header metadata
   * or none of its entries has a value.
   */
  stringifyHeader() {
    if (!this.config.chunkHeaderMeta) return null;

    let content = "";
    for (const [key, value] of Object.entries(this.config.chunkHeaderMeta)) {
      if (!key || !value) continue;
      content += `${key}: ${value}\n`;
    }

    if (!content) return null;
    return `<document_metadata>\n${content}</document_metadata>\n\n`;
  }

  /**
   * Builds the underlying splitter from the config, falling back to a chunk size of
   * 1000 and an overlap of 20 when those values are not numeric.
   * @param {object} [config] - Same object that was given to the constructor.
   * @returns {RecursiveSplitter} The splitter used by `splitText`.
   */
  #setSplitter(config = {}) {
    // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }

    // A null chunkSize would coerce to 0 through Number(), which is never a usable
    // chunk size, so it is treated like a missing value (same rule as determineMaxChunkSize).
    const chunkSize = isNullOrNaN(config?.chunkSize)
      ? 1_000
      : Number(config?.chunkSize);
    const chunkOverlap = isNaN(config?.chunkOverlap)
      ? 20
      : Number(config?.chunkOverlap);

    return new RecursiveSplitter({
      chunkSize,
      chunkOverlap,
      chunkHeader: this.stringifyHeader(),
    });
  }

  /**
   * Splits the given text with the configured splitter.
   * @param {string} documentText - Raw text of the document.
   * @returns {Promise<string[]>} Resolves with the text chunks.
   */
  async splitText(documentText) {
    return this.#splitter._splitText(documentText);
  }
}

/**
 * Thin wrapper around Langchain's default RecursiveCharacterTextSplitter that can
 * also attach a header string to the chunks it produces.
 */
class RecursiveSplitter {
  /**
   * @param {object} options
   * @param {number} options.chunkSize - Forwarded to the Langchain splitter.
   * @param {number} options.chunkOverlap - Forwarded to the Langchain splitter.
   * @param {string|null} [options.chunkHeader] - Header handed to the engine when
   * building documents; no header is applied when falsy.
   */
  constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
    // Loaded lazily so the dependency is only resolved when a splitter is created.
    const langchainSplitters = require("@langchain/textsplitters");
    const EngineClass = langchainSplitters.RecursiveCharacterTextSplitter;
    const sizing = { chunkSize, chunkOverlap };

    this.log(`Will split with`, sizing);
    this.chunkHeader = chunkHeader;
    this.engine = new EngineClass({ ...sizing });
  }

  /**
   * Writes a message to the console tagged with the class name.
   * @param {string} text - Message to log.
   * @param {...*} args - Extra values forwarded to console.log.
   */
  log(text, ...args) {
    console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args);
  }

  /**
   * Splits text into chunks. Without a header the engine's split result is returned
   * as-is. With a header, the split strings are passed back to the engine's
   * `createDocuments` together with the `chunkHeader` option, and the non-empty
   * `pageContent` of each resulting document is returned.
   *
   * @param {string} documentText - Raw text to split.
   * @returns {Promise<string[]>} Resolves with the chunk strings.
   */
  async _splitText(documentText) {
    const header = this.chunkHeader;
    if (!header) return this.engine.splitText(documentText);

    const pieces = await this.engine.splitText(documentText);
    const docs = await this.engine.createDocuments(pieces, [], {
      chunkHeader: header,
    });

    const chunks = [];
    for (const doc of docs) {
      // Drop documents that ended up without any content.
      if (doc.pageContent) chunks.push(doc.pageContent);
    }
    return chunks;
  }
}

// Expose the TextSplitter class on this module's exports.
module.exports.TextSplitter = TextSplitter;
Add header static class for metadata assembly (#2567) * Add header static class for metadata assembly * update comments * patch header parsing for links 2024-11-04 20:47:46 +01:00			`/**`
			`* @typedef {object} DocumentMetadata`
			`* @property {string} id - eg; "123e4567-e89b-12d3-a456-426614174000"`
			`* @property {string} url - eg; "file://example.com/index.html"`
			`* @property {string} title - eg; "example.com/index.html"`
			`* @property {string} docAuthor - eg; "no author found"`
			`* @property {string} description - eg; "No description found."`
			`* @property {string} docSource - eg; "URL link uploaded by the user."`
			`* @property {string} chunkSource - eg; link://https://example.com`
			`* @property {string} published - ISO 8601 date string`
			`* @property {number} wordCount - Number of words in the document`
			`* @property {string} pageContent - The raw text content of the document`
			`* @property {number} token_count_estimate - Number of tokens in the document`
			`*/`

Enable customization of chunk length and overlap (#1059) * Enable customization of chunk length and overlap * fix onboarding link show max limit in UI and prevent overlap >= chunk size 2024-04-07 01:38:07 +02:00			`function isNullOrNaN(value) {`
			`if (value === null) return true;`
			`return isNaN(value);`
			`}`

			`class TextSplitter {`
			`#splitter;`
			`constructor(config = {}) {`
			`/*`
			`config can be a ton of things depending on what is required or optional by the specific splitter.`
			`Non-splitter related keys`
			`{`
			`splitByFilename: string, // TODO`
			`}`
			`------`
			`Default: "RecursiveCharacterTextSplitter"`
			`Config: {`
			`chunkSize: number,`
			`chunkOverlap: number,`
Improve RAG results via chunkHeader append (#1473) 2024-05-21 21:43:39 +02:00			`chunkHeaderMeta: object \| null, // Gets appended to top of each chunk as metadata`
Enable customization of chunk length and overlap (#1059) * Enable customization of chunk length and overlap * fix onboarding link show max limit in UI and prevent overlap >= chunk size 2024-04-07 01:38:07 +02:00			`}`
			`------`
			`*/`
			`this.config = config;`
			`this.#splitter = this.#setSplitter(config);`
			`}`

			`log(text, ...args) {`
			console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
			`}`

Add header static class for metadata assembly (#2567) * Add header static class for metadata assembly * update comments * patch header parsing for links 2024-11-04 20:47:46 +01:00			`/**`
			`* Does a quick check to determine the text chunk length limit.`
			`* Embedder models have hard-set limits that cannot be exceeded, just like an LLM context`
			`* so here we want to allow override of the default 1000, but up to the models maximum, which is`
			`* sometimes user defined.`
			`*/`
Enable customization of chunk length and overlap (#1059) * Enable customization of chunk length and overlap * fix onboarding link show max limit in UI and prevent overlap >= chunk size 2024-04-07 01:38:07 +02:00			`static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {`
			`const prefValue = isNullOrNaN(preferred)`
			`? Number(embedderLimit)`
			`: Number(preferred);`
			`const limit = Number(embedderLimit);`
			`if (prefValue > limit)`
			`console.log(`
			`\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${embedderLimit}. Will use ${embedderLimit}.`
			`);`
			`return prefValue > limit ? limit : prefValue;`
			`}`

Add header static class for metadata assembly (#2567) * Add header static class for metadata assembly * update comments * patch header parsing for links 2024-11-04 20:47:46 +01:00			`/**`
			`* Creates a string of metadata to be prepended to each chunk.`
			`* @param {DocumentMetadata} metadata - Metadata to be prepended to each chunk.`
			`* @returns {{[key: ('title' \| 'published' \| 'source')]: string}} Object of metadata that will be prepended to each chunk.`
			`*/`
			`static buildHeaderMeta(metadata = {}) {`
			`if (!metadata \|\| Object.keys(metadata).length === 0) return null;`
			`const PLUCK_MAP = {`
			`title: {`
			`as: "sourceDocument",`
			`pluck: (metadata) => {`
			`return metadata?.title \|\| null;`
			`},`
			`},`
			`published: {`
			`as: "published",`
			`pluck: (metadata) => {`
			`return metadata?.published \|\| null;`
			`},`
			`},`
			`chunkSource: {`
			`as: "source",`
			`pluck: (metadata) => {`
			`const validPrefixes = ["link://", "youtube://"];`
			`// If the chunkSource is a link or youtube link, we can add the URL`
			`// as its source in the metadata so the LLM can use it for context.`
			`// eg prompt: Where did you get this information? -> answer: "from https://example.com"`
			`if (`
			`!metadata?.chunkSource \|\| // Exists`
			`!metadata?.chunkSource.length \|\| // Is not empty`
			`typeof metadata.chunkSource !== "string" \|\| // Is a string`
			`!validPrefixes.some(`
			`(prefix) => metadata.chunkSource.startsWith(prefix) // Has a valid prefix we respect`
			`)`
			`)`
			`return null;`

			`// We know a prefix is present, so we can split on it and return the rest.`
			`// If nothing is found, return null and it will not be added to the metadata.`
			`let source = null;`
			`for (const prefix of validPrefixes) {`
			`source = metadata.chunkSource.split(prefix)?.[1] \|\| null;`
			`if (source) break;`
			`}`

			`return source;`
			`},`
			`},`
			`};`

			`const pluckedData = {};`
			`Object.entries(PLUCK_MAP).forEach(([key, value]) => {`
			`if (!(key in metadata)) return; // Skip if the metadata key is not present.`
			`const pluckedValue = value.pluck(metadata);`
			`if (!pluckedValue) return; // Skip if the plucked value is null/empty.`
			`pluckedData[value.as] = pluckedValue;`
			`});`

			`return pluckedData;`
			`}`

			`/**`
			`* Creates a string of metadata to be prepended to each chunk.`
			`*/`
Improve RAG results via chunkHeader append (#1473) 2024-05-21 21:43:39 +02:00			`stringifyHeader() {`
			`if (!this.config.chunkHeaderMeta) return null;`
			`let content = "";`
			`Object.entries(this.config.chunkHeaderMeta).map(([key, value]) => {`
			`if (!key \|\| !value) return;`
			content += `${key}: ${value}\n`;
			`});`

			`if (!content) return null;`
			return `<document_metadata>\n${content}</document_metadata>\n\n`;
			`}`

Enable customization of chunk length and overlap (#1059) * Enable customization of chunk length and overlap * fix onboarding link show max limit in UI and prevent overlap >= chunk size 2024-04-07 01:38:07 +02:00			`#setSplitter(config = {}) {`
			`// if (!config?.splitByFilename) {// TODO do something when specific extension is present? }`
			`return new RecursiveSplitter({`
			`chunkSize: isNaN(config?.chunkSize) ? 1_000 : Number(config?.chunkSize),`
			`chunkOverlap: isNaN(config?.chunkOverlap)`
			`? 20`
			`: Number(config?.chunkOverlap),`
Improve RAG results via chunkHeader append (#1473) 2024-05-21 21:43:39 +02:00			`chunkHeader: this.stringifyHeader(),`
Enable customization of chunk length and overlap (#1059) * Enable customization of chunk length and overlap * fix onboarding link show max limit in UI and prevent overlap >= chunk size 2024-04-07 01:38:07 +02:00			`});`
			`}`

			`async splitText(documentText) {`
			`return this.#splitter._splitText(documentText);`
			`}`
			`}`

			`// Wrapper for Langchain default RecursiveCharacterTextSplitter class.`
			`class RecursiveSplitter {`
Improve RAG results via chunkHeader append (#1473) 2024-05-21 21:43:39 +02:00			`constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {`
Enable customization of chunk length and overlap (#1059) * Enable customization of chunk length and overlap * fix onboarding link show max limit in UI and prevent overlap >= chunk size 2024-04-07 01:38:07 +02:00			`const {`
			`RecursiveCharacterTextSplitter,`
bump langchain deps (#1231) * bump langchain deps * patch native and ollama providers remove deprecated deps --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com> 2024-04-30 21:04:24 +02:00			`} = require("@langchain/textsplitters");`
Enable customization of chunk length and overlap (#1059) * Enable customization of chunk length and overlap * fix onboarding link show max limit in UI and prevent overlap >= chunk size 2024-04-07 01:38:07 +02:00			this.log(`Will split with`, { chunkSize, chunkOverlap });
Improve RAG results via chunkHeader append (#1473) 2024-05-21 21:43:39 +02:00			`this.chunkHeader = chunkHeader;`
Enable customization of chunk length and overlap (#1059) * Enable customization of chunk length and overlap * fix onboarding link show max limit in UI and prevent overlap >= chunk size 2024-04-07 01:38:07 +02:00			`this.engine = new RecursiveCharacterTextSplitter({`
			`chunkSize,`
			`chunkOverlap,`
			`});`
			`}`

			`log(text, ...args) {`
			console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args);
			`}`

			`async _splitText(documentText) {`
Improve RAG results via chunkHeader append (#1473) 2024-05-21 21:43:39 +02:00			`if (!this.chunkHeader) return this.engine.splitText(documentText);`
			`const strings = await this.engine.splitText(documentText);`
			`const documents = await this.engine.createDocuments(strings, [], {`
			`chunkHeader: this.chunkHeader,`
			`});`
			`return documents`
			`.filter((doc) => !!doc.pageContent)`
			`.map((doc) => doc.pageContent);`
Enable customization of chunk length and overlap (#1059) * Enable customization of chunk length and overlap * fix onboarding link show max limit in UI and prevent overlap >= chunk size 2024-04-07 01:38:07 +02:00			`}`
			`}`

			`module.exports.TextSplitter = TextSplitter;`