mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-19 20:50:09 +01:00
69 lines
2.6 KiB
JavaScript
69 lines
2.6 KiB
JavaScript
|
const { v4 } = require("uuid");
|
||
|
const { writeToServerDocuments } = require("../utils/files");
|
||
|
const { tokenizeString } = require("../utils/tokenizer");
|
||
|
const { default: slugify } = require("slugify");
|
||
|
|
||
|
/**
 * Builds a lowercase slug from a title or filename.
 *
 * When the input contains at least one ".", everything after the last "." is
 * treated as a file extension and dropped; the remaining dot-separated parts
 * are joined with "-" before being handed to `slugify`. Input without a "."
 * is slugified as-is.
 *
 * @param {*} input - Title or filename. Non-string values are coerced with
 *   `String()`; `null`/`undefined` are treated as an empty string.
 * @returns {string} The result of `slugify(stem, { lower: true })`.
 */
function stripAndSlug(input) {
  // The value may not be a string (e.g. a number or undefined); normalise
  // first so the string methods below never throw a TypeError.
  const text = input == null ? "" : String(input);
  if (!text.includes(".")) return slugify(text, { lower: true });

  // Drop the last dot-separated segment (the extension) and re-join the rest.
  const stem = text.split(".").slice(0, -1).join("-");
  return slugify(stem, { lower: true });
}
|
||
|
|
||
|
/**
 * Normalizers for the caller-supplied metadata fields of a raw-text document.
 * Each function under `possible` receives the whole metadata object and
 * returns the value to store for that field, substituting a default when the
 * supplied value is missing or of the wrong type.
 */
const METADATA_KEYS = {
  possible: {
    // http(s) URLs become a lowercased `web://<url>.website` identifier;
    // anything else falls back to a `file://` identifier built from the title.
    url: ({ url, title }) => {
      let isWebUrl = false;
      try {
        const { protocol } = new URL(url);
        isWebUrl = protocol === "https:" || protocol === "http:";
      } catch {
        // Not parseable as a URL - use the file:// form below.
      }

      if (isWebUrl) return `web://${url.toLowerCase()}.website`;
      return `file://${stripAndSlug(title)}.txt`;
    },
    title: ({ title }) => `${stripAndSlug(title)}.txt`,
    docAuthor: ({ docAuthor }) =>
      typeof docAuthor === "string" ? docAuthor : "no author specified",
    description: ({ description }) =>
      typeof description === "string" ? description : "no description found",
    docSource: ({ docSource }) =>
      typeof docSource === "string" ? docSource : "no source set",
    chunkSource: ({ chunkSource, title }) =>
      typeof chunkSource === "string"
        ? chunkSource
        : `${stripAndSlug(title)}.txt`,
    // Numeric timestamp (or numeric string) -> locale date string. Anything
    // that does not yield a valid date falls back to the current time.
    published: ({ published }) => {
      // Number(null), Number("") and Number(false) are all 0, which would
      // silently turn a missing value into the 1970 epoch - treat as absent.
      const isMissing =
        published == null ||
        typeof published === "boolean" ||
        (typeof published === "string" && published.trim() === "");
      const date = new Date(isMissing ? NaN : Number(published));

      // Also covers NaN, +/-Infinity and out-of-range timestamps, which would
      // otherwise be stored as the literal string "Invalid Date".
      if (Number.isNaN(date.getTime())) return new Date().toLocaleString();
      return date.toLocaleString();
    },
  },
};
|
||
|
|
||
|
/**
 * Stores a piece of raw text, together with its normalized metadata, as a
 * server document via `writeToServerDocuments`.
 *
 * @param {string} textContent - The text to store. Non-string or
 *   whitespace-only values are rejected with a failure result.
 * @param {object} metadata - Caller-supplied metadata. `title` is used for
 *   logging and for the output name; every stored field is produced by the
 *   matching normalizer in `METADATA_KEYS.possible`.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   On success `documents` holds the single value returned by
 *   `writeToServerDocuments`; on failure it is empty and `reason` explains why.
 */
async function processRawText(textContent, metadata) {
  console.log(`-- Working Raw Text doc ${metadata.title} --`);

  // Reject anything that is not usable text up front: a non-string would
  // throw in the string calls below, and blank text has nothing to store.
  if (typeof textContent !== "string" || textContent.trim().length === 0) {
    return {
      success: false,
      reason: "textContent was empty - nothing to process.",
      documents: [],
    };
  }

  const fields = METADATA_KEYS.possible;
  const data = {
    id: v4(),
    url: fields.url(metadata),
    title: fields.title(metadata),
    docAuthor: fields.docAuthor(metadata),
    description: fields.description(metadata),
    docSource: fields.docSource(metadata),
    chunkSource: fields.chunkSource(metadata),
    published: fields.published(metadata),
    // Split on whitespace runs so newlines, tabs and repeated spaces do not
    // skew the count (the text is known to be non-blank at this point).
    wordCount: textContent.trim().split(/\s+/).length,
    pageContent: textContent,
    token_count_estimate: tokenizeString(textContent).length,
  };

  const document = writeToServerDocuments(
    data,
    `raw-${stripAndSlug(metadata.title)}-${data.id}`
  );
  console.log(`[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}
|
||
|
|
||
|
// Public API of this module: only processRawText is exported.
module.exports = { processRawText }
|