diff --git a/README.md b/README.md index ff50a8587..68653bddc 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,8 @@ Some cool features of AnythingLLM - [LM Studio (all models)](https://lmstudio.ai) - [LocalAi (all models)](https://localai.io/) - [Together AI (chat models)](https://www.together.ai/) +- [Perplexity (chat models)](https://www.perplexity.ai/) +- [OpenRouter (chat models)](https://openrouter.ai/) - [Mistral](https://mistral.ai/) **Supported Embedding models:** @@ -80,6 +82,7 @@ Some cool features of AnythingLLM - [Azure OpenAI](https://azure.microsoft.com/en-us/products/ai-services/openai-service) - [LM Studio (all)](https://lmstudio.ai) - [LocalAi (all)](https://localai.io/) +- [Ollama (all)](https://ollama.ai/) **Supported Vector Databases:** @@ -108,8 +111,8 @@ Mintplex Labs & the community maintain a number of deployment methods, scripts, |----------------------------------------|----:|-----|---------------|------------| | [![Deploy on Docker][docker-btn]][docker-deploy] | [![Deploy on AWS][aws-btn]][aws-deploy] | [![Deploy on GCP][gcp-btn]][gcp-deploy] | [![Deploy on DigitalOcean][do-btn]][aws-deploy] | [![Deploy on Render.com][render-btn]][render-deploy] | -| Railway | -|----------------------------------------| +| Railway | +| --------------------------------------------------- | | [![Deploy on Railway][railway-btn]][railway-deploy] | [or set up a production AnythingLLM instance without Docker →](./BARE_METAL.md) diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js index 9efd3a70f..569a2cde2 100644 --- a/collector/processSingleFile/index.js +++ b/collector/processSingleFile/index.js @@ -4,7 +4,7 @@ const { WATCH_DIRECTORY, SUPPORTED_FILETYPE_CONVERTERS, } = require("../utils/constants"); -const { trashFile } = require("../utils/files"); +const { trashFile, isTextType } = require("../utils/files"); const RESERVED_FILES = ["__HOTDIR__.md"]; async function processSingleFile(targetFilename) { @@ -31,17 +31,25 @@ 
async function processSingleFile(targetFilename) { }; } - if (!Object.keys(SUPPORTED_FILETYPE_CONVERTERS).includes(fileExtension)) { - trashFile(fullFilePath); - return { - success: false, - reason: `File extension ${fileExtension} not supported for parsing.`, - documents: [], - }; + let processFileAs = fileExtension; + if (!SUPPORTED_FILETYPE_CONVERTERS.hasOwnProperty(fileExtension)) { + if (isTextType(fullFilePath)) { + console.log( + `\x1b[33m[Collector]\x1b[0m The provided filetype of ${fileExtension} does not have a preset and will be processed as .txt.` + ); + processFileAs = ".txt"; + } else { + trashFile(fullFilePath); + return { + success: false, + reason: `File extension ${fileExtension} not supported for parsing and cannot be assumed as text file type.`, + documents: [], + }; + } } const FileTypeProcessor = require(SUPPORTED_FILETYPE_CONVERTERS[ - fileExtension + processFileAs ]); return await FileTypeProcessor({ fullFilePath, diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index fc4e2d9cc..269567c40 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -1,10 +1,38 @@ const fs = require("fs"); const path = require("path"); +const { getType } = require("mime"); const documentsFolder = process.env.NODE_ENV === "production" ? path.resolve("/storage/documents") // hardcoded to Render storage mount. : path.resolve(__dirname, "../../../server/storage/documents"); +function isTextType(filepath) { + if (!fs.existsSync(filepath)) return false; + // These are types of mime primary classes that for sure + // cannot also be forced into a text type. 
+ const nonTextTypes = ["multipart", "image", "model", "audio", "video"]; + // These are full-mimes we for sure cannot parse or interpret as text + // documents + const BAD_MIMES = [ + "application/octet-stream", + "application/zip", + "application/pkcs8", + "application/vnd.microsoft.portable-executable", + "application/x-msdownload", + ]; + + try { + const mime = getType(filepath); + if (BAD_MIMES.includes(mime)) return false; + + const type = mime.split("/")[0]; + if (nonTextTypes.includes(type)) return false; + return true; + } catch { + return false; + } +} + function trashFile(filepath) { if (!fs.existsSync(filepath)) return; @@ -97,6 +125,7 @@ async function wipeCollectorStorage() { module.exports = { documentsFolder, trashFile, + isTextType, createdDate, writeToServerDocuments, wipeCollectorStorage, diff --git a/docker/.env.example b/docker/.env.example index b14d3c6ed..ba33bd5c0 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -48,6 +48,14 @@ GID='1000' # MISTRAL_API_KEY='example-mistral-ai-api-key' # MISTRAL_MODEL_PREF='mistral-tiny' +# LLM_PROVIDER='perplexity' +# PERPLEXITY_API_KEY='my-perplexity-key' +# PERPLEXITY_MODEL_PREF='codellama-34b-instruct' + +# LLM_PROVIDER='openrouter' +# OPENROUTER_API_KEY='my-openrouter-key' +# OPENROUTER_MODEL_PREF='openrouter/auto' + # LLM_PROVIDER='huggingface' # HUGGING_FACE_LLM_ENDPOINT=https://uuid-here.us-east-1.aws.endpoints.huggingface.cloud # HUGGING_FACE_LLM_API_KEY=hf_xxxxxx @@ -71,6 +79,11 @@ GID='1000' # EMBEDDING_MODEL_PREF='text-embedding-ada-002' # EMBEDDING_MODEL_MAX_CHUNK_LENGTH=1000 # The max chunk size in chars a string to embed can be +# EMBEDDING_ENGINE='ollama' +# EMBEDDING_BASE_PATH='http://127.0.0.1:11434' +# EMBEDDING_MODEL_PREF='nomic-embed-text:latest' +# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=8192 + ########################################### ######## Vector Database Selection ######## ########################################### diff --git 
a/frontend/src/components/EmbeddingSelection/AzureAiOptions/index.jsx b/frontend/src/components/EmbeddingSelection/AzureAiOptions/index.jsx index c782c51f3..209c0aa21 100644 --- a/frontend/src/components/EmbeddingSelection/AzureAiOptions/index.jsx +++ b/frontend/src/components/EmbeddingSelection/AzureAiOptions/index.jsx @@ -9,7 +9,7 @@ export default function AzureAiOptions({ settings }) { -
+
{name}
-
- {description} -
+
{description}
diff --git a/frontend/src/components/EmbeddingSelection/LocalAiOptions/index.jsx b/frontend/src/components/EmbeddingSelection/LocalAiOptions/index.jsx index 651d3e950..8c611cd31 100644 --- a/frontend/src/components/EmbeddingSelection/LocalAiOptions/index.jsx +++ b/frontend/src/components/EmbeddingSelection/LocalAiOptions/index.jsx @@ -19,7 +19,7 @@ export default function LocalAiOptions({ settings }) { setBasePathValue(e.target.value)} @@ -41,7 +41,7 @@ export default function LocalAiOptions({ settings }) { e.target.blur()} @@ -62,7 +62,7 @@ export default function LocalAiOptions({ settings }) {