anything-llm/collector/processRawText/index.js

69 lines
2.6 KiB
JavaScript
Raw Normal View History

const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../utils/files");
const { tokenizeString } = require("../utils/tokenizer");
const { default: slugify } = require("slugify");
/**
 * Produces a lowercase slug from `input` after dropping its final
 * `.extension` segment.
 *
 * When `input` contains at least one dot, everything after the last dot is
 * discarded and any remaining dot-separated pieces are joined with `-` before
 * slugifying. Input without a dot is slugified unchanged.
 *
 * @param {string} input - Name to convert (e.g. a title or filename).
 * @returns {string} The slugified, lowercased name without its last extension.
 */
function stripAndSlug(input) {
  const hasExtension = input.includes('.');
  const stem = hasExtension
    ? input.split('.').slice(0, -1).join('-')
    : input;
  return slugify(stem, { lower: true });
}
/**
 * Resolvers that turn caller-supplied metadata into the fields stored on a
 * raw-text document. Each resolver receives the whole metadata object and
 * returns the value for its key, substituting a default when the supplied
 * value is missing or of the wrong type.
 */
const METADATA_KEYS = {
  possible: {
    // http(s) URLs become `web://<lowercased url>.website`; anything else
    // falls back to a `file://` reference derived from the title.
    url: ({ url, title }) => {
      let validUrl = false;
      try {
        const { protocol } = new URL(url);
        validUrl = ["https:", "http:"].includes(protocol);
      } catch {
        // `url` is missing or unparsable: deliberately fall through to the
        // file:// form below.
      }
      if (validUrl) return `web://${url.toLowerCase()}.website`;
      return `file://${stripAndSlug(title)}.txt`;
    },
    title: ({ title }) => `${stripAndSlug(title)}.txt`,
    docAuthor: ({ docAuthor }) =>
      typeof docAuthor === "string" ? docAuthor : "no author specified",
    description: ({ description }) =>
      typeof description === "string" ? description : "no description found",
    docSource: ({ docSource }) =>
      typeof docSource === "string" ? docSource : "no source set",
    chunkSource: ({ chunkSource, title }) =>
      typeof chunkSource === "string"
        ? chunkSource
        : `${stripAndSlug(title)}.txt`,
    // `published` is read as a numeric timestamp (passed to `new Date`).
    // Falls back to the current time when the value is absent, blank,
    // non-numeric, or outside the range a Date can represent.
    published: ({ published }) => {
      // null, "", whitespace-only strings and booleans all coerce to 0 or 1
      // via Number(), which would silently yield a 1970 date.
      const isBlank =
        published == null ||
        typeof published === "boolean" ||
        (typeof published === "string" && published.trim() === "");
      const date = new Date(isBlank ? NaN : Number(published));
      // getTime() is NaN for any invalid Date, including out-of-range input.
      if (Number.isNaN(date.getTime())) return new Date().toLocaleString();
      return date.toLocaleString();
    },
  },
};
/**
 * Builds a document record from raw text plus caller-supplied metadata and
 * passes it to `writeToServerDocuments`.
 *
 * @param {string} textContent - Raw text to store as the document's page content.
 * @param {object} metadata - Caller-supplied metadata. `title` is read
 *   directly; the whole object is handed to each METADATA_KEYS resolver.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success: false` with a reason and no documents when there is no usable
 *   text; otherwise `success: true` with the single written document.
 */
async function processRawText(textContent, metadata) {
  console.log(`-- Working Raw Text doc ${metadata.title} --`);

  // Only a non-empty string can be split and tokenized below; report anything
  // else as empty input instead of letting `.split` throw a TypeError.
  if (typeof textContent !== "string" || textContent.length === 0) {
    return {
      success: false,
      reason: "textContent was empty - nothing to process.",
      documents: [],
    };
  }

  const resolvers = METADATA_KEYS.possible;
  const data = {
    id: v4(),
    url: resolvers.url(metadata),
    title: resolvers.title(metadata),
    docAuthor: resolvers.docAuthor(metadata),
    description: resolvers.description(metadata),
    docSource: resolvers.docSource(metadata),
    chunkSource: resolvers.chunkSource(metadata),
    published: resolvers.published(metadata),
    // Split on any whitespace run and drop empties so newlines, tabs and
    // repeated spaces do not distort the count.
    wordCount: textContent.split(/\s+/).filter(Boolean).length,
    pageContent: textContent,
    token_count_estimate: tokenizeString(textContent).length,
  };

  // The filename embeds the generated id next to the slugged title.
  const document = writeToServerDocuments(
    data,
    `raw-${stripAndSlug(metadata.title)}-${data.id}`
  );
  console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}
// Only processRawText is exported; the other definitions in this file stay module-private.
module.exports = { processRawText }