// anything-llm/collector/processRawText/index.js

const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../utils/files");
const { tokenizeString } = require("../utils/tokenizer");
const { default: slugify } = require("slugify");

/**
 * Drops the last `.extension` from the input (when it contains a dot) and
 * slugifies the remainder with slugify's `lower` option enabled.
 *
 * Any dots left after removing the last segment are replaced with `-` before
 * slugifying, so `my.file.name.txt` is passed to slugify as `my-file-name`.
 *
 * @param {string} input - Raw title or filename. `null`/`undefined` are treated
 *   as an empty string and other non-string values are coerced with `String()`,
 *   so a missing title does not throw a TypeError on `.includes`.
 * @returns {string} The value returned by `slugify` for the stripped input.
 */
function stripAndSlug(input) {
  let text = input;
  if (typeof text !== "string") {
    text = text == null ? "" : String(text);
  }

  if (!text.includes(".")) return slugify(text, { lower: true });

  const withoutExtension = text.split(".").slice(0, -1).join("-");
  return slugify(withoutExtension, { lower: true });
}

/**
 * Resolvers that turn caller-supplied metadata into the fields stored on a
 * raw-text document. Every resolver receives the whole metadata object and
 * returns the value for the key it is named after.
 */
const METADATA_KEYS = {
  possible: {
    // A parseable http(s) URL becomes a `web://` identifier; anything else
    // falls back to a `file://` identifier derived from the title.
    url: ({ url, title }) => {
      let validUrl = false;
      try {
        const u = new URL(url);
        validUrl = ["https:", "http:"].includes(u.protocol);
      } catch {
        // `new URL` throws on missing or unparseable input. That only means
        // there is no web origin, so the file:// fallback below applies.
      }

      // String() keeps this safe when `url` is a URL-like object, not a string.
      if (validUrl) return `web://${String(url).toLowerCase()}.website`;
      return `file://${stripAndSlug(title)}.txt`;
    },
    title: ({ title }) => `${stripAndSlug(title)}.txt`,
    docAuthor: ({ docAuthor }) =>
      typeof docAuthor === "string" ? docAuthor : "no author specified",
    description: ({ description }) =>
      typeof description === "string" ? description : "no description found",
    docSource: ({ docSource }) =>
      typeof docSource === "string" ? docSource : "no source set",
    chunkSource: ({ chunkSource, title }) =>
      typeof chunkSource === "string" ? chunkSource : `${stripAndSlug(title)}.txt`,
    // Formats a numeric timestamp with `toLocaleString()`; falls back to the
    // current time when no usable timestamp is supplied.
    published: ({ published }) => {
      // Number(null) and Number("") are both 0, which would otherwise stamp a
      // missing value as the 1970 epoch, so treat them as "not provided".
      const isBlank =
        published == null ||
        (typeof published === "string" && published.trim() === "");
      const date = isBlank ? new Date() : new Date(Number(published));

      // Non-numeric, non-finite or out-of-range input gives an invalid Date,
      // whose toLocaleString() would be the literal text "Invalid Date".
      if (Number.isNaN(date.getTime())) return new Date().toLocaleString();
      return date.toLocaleString();
    },
  },
};

/**
 * Builds a document record from raw text plus caller-supplied metadata and
 * passes it to `writeToServerDocuments`.
 *
 * @param {string} textContent - Raw text stored as the document's `pageContent`.
 * @param {object} [metadata={}] - Caller-supplied metadata. It is handed to each
 *   METADATA_KEYS resolver and its `title` is used for the log line and the
 *   output filename.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success: false` with an empty `documents` array when there is no text to
 *   process; otherwise `success: true` with the value returned by
 *   `writeToServerDocuments` as the only entry in `documents`.
 */
async function processRawText(textContent, metadata = {}) {
  console.log(`-- Working Raw Text doc ${metadata.title} --`);

  // Non-string input cannot be split or tokenized, and whitespace-only input
  // carries nothing worth storing, so both are reported as empty.
  if (typeof textContent !== "string" || textContent.trim().length === 0) {
    return {
      success: false,
      reason: "textContent was empty - nothing to process.",
      documents: [],
    };
  }

  const { possible } = METADATA_KEYS;
  const data = {
    id: v4(),
    url: possible.url(metadata),
    title: possible.title(metadata),
    docAuthor: possible.docAuthor(metadata),
    description: possible.description(metadata),
    docSource: possible.docSource(metadata),
    chunkSource: possible.chunkSource(metadata),
    published: possible.published(metadata),
    // Split on runs of any whitespace so newlines, tabs and repeated spaces
    // do not skew the count the way a single-space split does.
    wordCount: textContent.trim().split(/\s+/).length,
    pageContent: textContent,
    token_count_estimate: tokenizeString(textContent).length,
  };

  const document = writeToServerDocuments(
    data,
    `raw-${stripAndSlug(metadata.title)}-${data.id}`
  );
  console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}

module.exports = { processRawText }
Add support to upload rawText document via api (#692) * Add support to upload rawText document via api * update API doc endpoint with correct textContent key * update response swagger doc 2024-02-08 00:17:32 +01:00			`const { v4 } = require("uuid");`
			`const { writeToServerDocuments } = require("../utils/files");`
			`const { tokenizeString } = require("../utils/tokenizer");`
			`const { default: slugify } = require("slugify");`

			`// Will remove the last .extension from the input`
			`// and stringify the input + move to lowercase.`
			`function stripAndSlug(input) {`
			`if (!input.includes('.')) return slugify(input, { lower: true });`
			`return slugify(input.split('.').slice(0, -1).join('-'), { lower: true })`
			`}`

			`const METADATA_KEYS = {`
			`possible: {`
			`url: ({ url, title }) => {`
			`let validUrl;`
			`try {`
			`const u = new URL(url);`
			`validUrl = ["https:", "http:"].includes(u.protocol);`
			`} catch { }`

			if (validUrl) return `web://${url.toLowerCase()}.website`;
			return `file://${stripAndSlug(title)}.txt`;
			`},`
			title: ({ title }) => `${stripAndSlug(title)}.txt`,
			`docAuthor: ({ docAuthor }) => { return typeof docAuthor === 'string' ? docAuthor : 'no author specified' },`
			`description: ({ description }) => { return typeof description === 'string' ? description : 'no description found' },`
			`docSource: ({ docSource }) => { return typeof docSource === 'string' ? docSource : 'no source set' },`
			chunkSource: ({ chunkSource, title }) => { return typeof chunkSource === 'string' ? chunkSource : `${stripAndSlug(title)}.txt` },
			`published: ({ published }) => {`
			`if (isNaN(Number(published))) return new Date().toLocaleString();`
			`return new Date(Number(published)).toLocaleString()`
			`},`
			`}`
			`}`

			`async function processRawText(textContent, metadata) {`
			console.log(`-- Working Raw Text doc ${metadata.title} --`);
			`if (!textContent \|\| textContent.length === 0) {`
			`return {`
			`success: false,`
			`reason: "textContent was empty - nothing to process.",`
			`documents: [],`
			`};`
			`}`

			`const data = {`
			`id: v4(),`
			`url: METADATA_KEYS.possible.url(metadata),`
			`title: METADATA_KEYS.possible.title(metadata),`
			`docAuthor: METADATA_KEYS.possible.docAuthor(metadata),`
			`description: METADATA_KEYS.possible.description(metadata),`
			`docSource: METADATA_KEYS.possible.docSource(metadata),`
			`chunkSource: METADATA_KEYS.possible.chunkSource(metadata),`
			`published: METADATA_KEYS.possible.published(metadata),`
			`wordCount: textContent.split(" ").length,`
			`pageContent: textContent,`
			`token_count_estimate: tokenizeString(textContent).length,`
			`};`

			`const document = writeToServerDocuments(`
			`data,`
			`raw-${stripAndSlug(metadata.title)}-${data.id}`
			`);`
			console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`);
			`return { success: true, reason: null, documents: [document] };`
			`}`

			`module.exports = { processRawText }`