diff --git a/aws/cloudformation/DEPLOY.md b/aws/cloudformation/DEPLOY.md index 7f1ee382..4b0d21e1 100644 --- a/aws/cloudformation/DEPLOY.md +++ b/aws/cloudformation/DEPLOY.md @@ -11,7 +11,7 @@ The output of this cloudformation stack will be: **Requirements** - An AWS account with billing information. - - AnythingLLM can run within the free tier using a t2.micro and 10Gib SSD hard disk volume + - AnythingLLM (GUI + document processor) must use a t2.small minimum and 10Gib SSD hard disk volume - `.env` file that is filled out with your settings and set up in the `docker/` folder ## How to deploy on AWS diff --git a/aws/cloudformation/cf_template.template b/aws/cloudformation/cf_template.template index ee6996b3..405be8cd 100644 --- a/aws/cloudformation/cf_template.template +++ b/aws/cloudformation/cf_template.template @@ -5,13 +5,13 @@ "InstanceType": { "Description": "EC2 instance type", "Type": "String", - "Default": "t2.micro" + "Default": "t2.small" }, "InstanceVolume": { "Description": "Storage size of disk on Instance in GB", "Type": "Number", "Default": 10, - "MinValue": 2 + "MinValue": 4 } }, "Resources": { @@ -96,7 +96,6 @@ "!SUB::USER::CONTENT!", "UID=\"1000\"\n", "GID=\"1000\"\n", - "CLOUD_BUILD=1\n", "END\n", "cd ../frontend\n", "rm -rf .env.production\n", @@ -105,6 +104,17 @@ "VITE_API_BASE=\"/api\"\n", "END\n", "sudo docker-compose -f /home/ec2-user/anything-llm/docker/docker-compose.yml up -d\n", + "echo \"Container ID: $(sudo docker ps --latest --quiet)\"\n", + "sudo docker container exec -u 0 -t $(sudo docker ps --latest --quiet) mkdir -p /app/server/storage /app/server/storage/documents /app/server/storage/vector-cache /app/server/storage/lancedb\n", + "echo \"Placeholder folders in storage created.\"\n", + "sudo docker container exec -u 0 -t $(sudo docker ps --latest --quiet) touch /app/server/storage/anythingllm.db\n", + "echo \"SQLite DB placeholder set.\"\n", + "sudo docker container exec -u 0 -t $(sudo docker ps --latest --quiet) chown -R 
anythingllm:anythingllm /app/collector /app/server\n", + "echo \"File permissions corrected.\"\n", + "export ONLINE=$(curl -Is http://localhost:3001/api/ping | head -n 1|cut -d$' ' -f2)\n", + "echo \"Health check: $ONLINE\"\n", + "if [ \"$ONLINE\" = 200 ] ; then echo \"Running migrations...\" && curl -Is http://localhost:3001/api/migrate | head -n 1|cut -d$' ' -f2; fi\n", + "echo \"Setup complete! AnythingLLM instance is now online!\"\n", "\n", "--//--\n" ] diff --git a/collector/.gitignore b/collector/.gitignore index f62b514d..3aee7f97 100644 --- a/collector/.gitignore +++ b/collector/.gitignore @@ -1,6 +1,8 @@ outputs/*/*.json hotdir/* hotdir/processed/* +hotdir/failed/* !hotdir/__HOTDIR__.md !hotdir/processed +!hotdir/failed diff --git a/collector/README.md b/collector/README.md index eb46b2fa..c7af796b 100644 --- a/collector/README.md +++ b/collector/README.md @@ -43,3 +43,9 @@ If collection fails at any point in the process it will pick up where it last ba - [Enable YouTube Data APIV3](https://console.cloud.google.com/apis/library/youtube.googleapis.com) - Once enabled generate a Credential key for this API - Paste your key after `GOOGLE_APIS_KEY=` in your `collector/.env` file. + +### Running the document processing API locally +From the `collector` directory with the `v-env` active run `flask run --host '0.0.0.0' --port 8888`. +Now uploads from the frontend will be processed as if you ran the `watch.py` script manually. + +**Docker**: If you run this application via docker the API is already started for you and no additional action is needed. 
\ No newline at end of file diff --git a/collector/api.py b/collector/api.py new file mode 100644 index 00000000..71d3af59 --- /dev/null +++ b/collector/api.py @@ -0,0 +1,21 @@ +from flask import Flask, json, request +from scripts.watch.process_single import process_single +from scripts.watch.filetypes import ACCEPTED_MIMES +api = Flask(__name__) + +WATCH_DIRECTORY = "hotdir" +@api.route('/process', methods=['POST']) +def process_file(): + content = request.json + target_filename = content.get('filename') + print(f"Processing {target_filename}") + success, reason = process_single(WATCH_DIRECTORY, target_filename) + return json.dumps({'filename': target_filename, 'success': success, 'reason': reason}) + +@api.route('/accepts', methods=['GET']) +def get_accepted_filetypes(): + return json.dumps(ACCEPTED_MIMES) + +@api.route('/', methods=['GET']) +def root(): + return "
Use POST /process with filename key in JSON body in order to process a file. File by that name must exist in hotdir already.
" \ No newline at end of file diff --git a/collector/requirements.txt b/collector/requirements.txt index ddae5487..1ab1d706 100644 --- a/collector/requirements.txt +++ b/collector/requirements.txt @@ -9,6 +9,7 @@ async-timeout==4.0.2 attrs==23.1.0 backoff==2.2.1 beautifulsoup4==4.12.2 +blinker==1.6.2 bs4==0.0.1 certifi==2023.5.7 cffi==1.15.1 @@ -24,21 +25,26 @@ docx2txt==0.8 et-xmlfile==1.1.0 exceptiongroup==1.1.1 fake-useragent==1.1.3 +Flask==2.3.2 frozenlist==1.3.3 grapheme==0.6.0 greenlet==2.0.2 +gunicorn==20.1.0 h11==0.14.0 httpcore==0.16.3 httpx==0.23.3 idna==3.4 -InquirerPy==0.3.4 importlib-metadata==6.6.0 importlib-resources==5.12.0 +inquirerpy==0.3.4 install==1.3.5 +itsdangerous==2.1.2 +Jinja2==3.1.2 joblib==1.2.0 langchain==0.0.189 lxml==4.9.2 Markdown==3.4.3 +MarkupSafe==2.1.3 marshmallow==3.19.0 marshmallow-enum==1.5.1 monotonic==1.6 @@ -55,6 +61,7 @@ packaging==23.1 pandas==1.5.3 parse==1.19.0 pdfminer.six==20221105 +pfzy==0.3.4 Pillow==9.5.0 prompt-toolkit==3.0.38 pycparser==2.21 @@ -96,6 +103,7 @@ uuid==1.30 w3lib==2.1.1 wcwidth==0.2.6 websockets==10.4 +Werkzeug==2.3.6 wrapt==1.14.1 xlrd==2.0.1 XlsxWriter==3.1.2 diff --git a/collector/scripts/watch/convert/as_docx.py b/collector/scripts/watch/convert/as_docx.py index a639a143..ade70e57 100644 --- a/collector/scripts/watch/convert/as_docx.py +++ b/collector/scripts/watch/convert/as_docx.py @@ -9,6 +9,7 @@ def as_docx(**kwargs): parent_dir = kwargs.get('directory', 'hotdir') filename = kwargs.get('filename') ext = kwargs.get('ext', '.txt') + remove = kwargs.get('remove_on_complete', False) fullpath = f"{parent_dir}/{filename}{ext}" loader = Docx2txtLoader(fullpath) @@ -28,13 +29,14 @@ def as_docx(**kwargs): } write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") - move_source(parent_dir, f"{filename}{ext}") + move_source(parent_dir, f"{filename}{ext}", remove=remove) print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") def as_odt(**kwargs): parent_dir = 
kwargs.get('directory', 'hotdir') filename = kwargs.get('filename') ext = kwargs.get('ext', '.txt') + remove = kwargs.get('remove_on_complete', False) fullpath = f"{parent_dir}/{filename}{ext}" loader = UnstructuredODTLoader(fullpath) @@ -54,5 +56,5 @@ def as_odt(**kwargs): } write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") - move_source(parent_dir, f"{filename}{ext}") + move_source(parent_dir, f"{filename}{ext}", remove=remove) print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") \ No newline at end of file diff --git a/collector/scripts/watch/convert/as_markdown.py b/collector/scripts/watch/convert/as_markdown.py index 4c68d473..49cf538c 100644 --- a/collector/scripts/watch/convert/as_markdown.py +++ b/collector/scripts/watch/convert/as_markdown.py @@ -9,6 +9,7 @@ def as_markdown(**kwargs): parent_dir = kwargs.get('directory', 'hotdir') filename = kwargs.get('filename') ext = kwargs.get('ext', '.txt') + remove = kwargs.get('remove_on_complete', False) fullpath = f"{parent_dir}/{filename}{ext}" loader = UnstructuredMarkdownLoader(fullpath) @@ -28,5 +29,5 @@ def as_markdown(**kwargs): } write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") - move_source(parent_dir, f"{filename}{ext}") + move_source(parent_dir, f"{filename}{ext}", remove=remove) print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") \ No newline at end of file diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py index 53cee00a..605c29c1 100644 --- a/collector/scripts/watch/convert/as_pdf.py +++ b/collector/scripts/watch/convert/as_pdf.py @@ -9,6 +9,7 @@ def as_pdf(**kwargs): parent_dir = kwargs.get('directory', 'hotdir') filename = kwargs.get('filename') ext = kwargs.get('ext', '.txt') + remove = kwargs.get('remove_on_complete', False) fullpath = f"{parent_dir}/{filename}{ext}" loader = PyPDFLoader(fullpath) @@ -32,5 +33,5 @@ def as_pdf(**kwargs): } 
write_to_server_documents(data, f"{slugify(filename)}-pg{pg_num}-{data.get('id')}") - move_source(parent_dir, f"{filename}{ext}") + move_source(parent_dir, f"{filename}{ext}", remove=remove) print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") \ No newline at end of file diff --git a/collector/scripts/watch/convert/as_text.py b/collector/scripts/watch/convert/as_text.py index b7935d62..a9935b48 100644 --- a/collector/scripts/watch/convert/as_text.py +++ b/collector/scripts/watch/convert/as_text.py @@ -8,6 +8,7 @@ def as_text(**kwargs): parent_dir = kwargs.get('directory', 'hotdir') filename = kwargs.get('filename') ext = kwargs.get('ext', '.txt') + remove = kwargs.get('remove_on_complete', False) fullpath = f"{parent_dir}/{filename}{ext}" content = open(fullpath).read() @@ -24,5 +25,5 @@ def as_text(**kwargs): } write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") - move_source(parent_dir, f"{filename}{ext}") + move_source(parent_dir, f"{filename}{ext}", remove=remove) print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") \ No newline at end of file diff --git a/collector/scripts/watch/filetypes.py b/collector/scripts/watch/filetypes.py index 5e2d818f..fc93c5b8 100644 --- a/collector/scripts/watch/filetypes.py +++ b/collector/scripts/watch/filetypes.py @@ -9,4 +9,11 @@ FILETYPES = { '.pdf': as_pdf, '.docx': as_docx, '.odt': as_odt, +} + +ACCEPTED_MIMES = { + 'text/plain': ['.txt', '.md'], + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], + 'application/vnd.oasis.opendocument.text': ['.odt'], + 'application/pdf': ['.pdf'], } \ No newline at end of file diff --git a/collector/scripts/watch/main.py b/collector/scripts/watch/main.py index f3bd3a1c..152e13bc 100644 --- a/collector/scripts/watch/main.py +++ b/collector/scripts/watch/main.py @@ -1,5 +1,6 @@ import os from .filetypes import FILETYPES +from .utils import move_source RESERVED = ['__HOTDIR__.md'] def 
watch_for_changes(directory): @@ -10,7 +11,8 @@ def watch_for_changes(directory): if filename in ['.DS_Store'] or fileext == '': continue if fileext not in FILETYPES.keys(): - print(f"{fileext} not a supported file type for conversion. Please remove from hot directory.") + print(f"{fileext} not a supported file type for conversion. Removing from hot directory.") + move_source(new_destination_filename=raw_doc, failed=True) continue FILETYPES[fileext]( diff --git a/collector/scripts/watch/process_single.py b/collector/scripts/watch/process_single.py new file mode 100644 index 00000000..f41219eb --- /dev/null +++ b/collector/scripts/watch/process_single.py @@ -0,0 +1,35 @@ +import os +from .filetypes import FILETYPES +from .utils import move_source + +RESERVED = ['__HOTDIR__.md'] + +# This script will do a one-off processing of a specific document that exists in hotdir. +# For this function we remove the original source document since there is no need to keep it and it will +# only occupy additional disk space. +def process_single(directory, target_doc): + if os.path.isdir(f"{directory}/{target_doc}") or target_doc in RESERVED: return (False, "Not a file") + + if os.path.exists(f"{directory}/{target_doc}") is False: + print(f"{directory}/{target_doc} does not exist.") + return (False, f"{directory}/{target_doc} does not exist.") + + filename, fileext = os.path.splitext(target_doc) + if filename in ['.DS_Store'] or fileext == '': return (False, "File has no extension") # always a (success, reason) tuple - api.py unpacks the result + if fileext == '.lock': + print(f"{filename} is locked - skipping until unlocked") + return (False, f"{filename} is locked - skipping until unlocked") + + if fileext not in FILETYPES.keys(): + print(f"{fileext} not a supported file type for conversion. It will not be processed.") + move_source(new_destination_filename=target_doc, failed=True, remove=True) + return (False, f"{fileext} not a supported file type for conversion. 
It will not be processed.") + + FILETYPES[fileext]( + directory=directory, + filename=filename, + ext=fileext, + remove_on_complete=True # remove source document to save disk space. + ) + + return (True, None) diff --git a/collector/scripts/watch/utils.py b/collector/scripts/watch/utils.py index 6b66b07c..b15b0b3b 100644 --- a/collector/scripts/watch/utils.py +++ b/collector/scripts/watch/utils.py @@ -15,8 +15,13 @@ def file_creation_time(path_to_file): except AttributeError: return datetime.today().strftime('%Y-%m-%d %H:%M:%S') -def move_source(working_dir='hotdir', new_destination_filename= ''): - destination = f"{working_dir}/processed" +def move_source(working_dir='hotdir', new_destination_filename='', failed=False, remove=False): + if remove and os.path.exists(f"{working_dir}/{new_destination_filename}"): + print(f"{new_destination_filename} deleted from filesystem") + os.remove(f"{working_dir}/{new_destination_filename}") + return + + destination = f"{working_dir}/processed" if not failed else f"{working_dir}/failed" if os.path.exists(destination) == False: os.mkdir(destination) diff --git a/collector/wsgi.py b/collector/wsgi.py new file mode 100644 index 00000000..a6f402e6 --- /dev/null +++ b/collector/wsgi.py @@ -0,0 +1,4 @@ +from api import api + +if __name__ == '__main__': + api.run(debug=False) \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 29c07817..f69f041e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -4,7 +4,6 @@ FROM ubuntu:jammy-20230522 AS base # Build arguments ARG ARG_UID ARG ARG_GID -ARG ARG_CLOUD_BUILD=0 # Default to local docker build # Install system dependencies RUN DEBIAN_FRONTEND=noninteractive apt-get update && \ @@ -32,13 +31,15 @@ RUN groupadd -g $ARG_GID anythingllm && \ useradd -u $ARG_UID -m -d /app -s /bin/bash -g anythingllm anythingllm && \ mkdir -p /app/frontend/ /app/server/ /app/collector/ && chown -R anythingllm:anythingllm /app -# Copy the docker entrypoint and healthcheck 
scripts +# Copy docker helper scripts COPY ./docker/docker-entrypoint.sh /usr/local/bin/ COPY ./docker/docker-healthcheck.sh /usr/local/bin/ +COPY ./docker/dual_boot.sh /usr/local/bin/ # Ensure the scripts are executable RUN chmod +x /usr/local/bin/docker-entrypoint.sh && \ - chmod +x /usr/local/bin/docker-healthcheck.sh + chmod +x /usr/local/bin/docker-healthcheck.sh && \ + chmod +x /usr/local/bin/dual_boot.sh USER anythingllm @@ -89,18 +90,7 @@ EXPOSE 3001 HEALTHCHECK --interval=1m --timeout=10s --start-period=1m \ CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1 -# Docker will still install deps as root so need to force chown -# or else -USER root -RUN if [ "$ARG_CLOUD_BUILD" = 1 ] ; then \ - echo "Reowning all files as user!" && \ - mkdir -p app/server/storage app/server/storage/documents app/server/storage/vector-cache app/server/storage/lancedb && \ - touch anythingllm.db && \ - chown -R anythingllm:anythingllm /app/collector /app/server; \ - fi -USER anythingllm - # Run the server ENTRYPOINT ["docker-entrypoint.sh"] -CMD ["node", "/app/server/index.js"] +CMD /bin/bash /usr/local/bin/dual_boot.sh \ No newline at end of file diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 1958bbb0..9e5bff08 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -3,8 +3,6 @@ version: '3.9' networks: anything-llm: driver: bridge - # chroma_net: - # external: true services: anything-llm: @@ -17,7 +15,6 @@ services: args: ARG_UID: ${UID} ARG_GID: ${GID} - ARG_CLOUD_BUILD: ${CLOUD_BUILD} volumes: - "../server/storage:/app/server/storage" - "../collector/hotdir/:/app/collector/hotdir" @@ -29,4 +26,3 @@ services: - .env networks: - anything-llm - # - chroma_net diff --git a/docker/dual_boot.sh b/docker/dual_boot.sh new file mode 100644 index 00000000..37587178 --- /dev/null +++ b/docker/dual_boot.sh @@ -0,0 +1,5 @@ +#!/bin/bash +node /app/server/index.js & +{ FLASK_ENV=production FLASK_APP=wsgi.py cd collector && gunicorn 
--workers 4 --bind 0.0.0.0:8888 wsgi:api; } & +wait -n +exit $? \ No newline at end of file diff --git a/frontend/package.json b/frontend/package.json index 8b14670d..a5aad549 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -20,7 +20,9 @@ "react": "^18.2.0", "react-device-detect": "^2.2.2", "react-dom": "^18.2.0", + "react-dropzone": "^14.2.3", "react-feather": "^2.0.10", + "react-loading-icons": "^1.1.0", "react-loading-skeleton": "^3.1.0", "react-router-dom": "^6.3.0", "text-case": "^1.0.9", diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx index b332011d..7abc1fac 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx @@ -135,10 +135,10 @@ export default function Directory({ {showDetails && (+
{key}: {value}
); diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx index b6ce6924..cce5e2af 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx @@ -6,6 +6,7 @@ import { useParams } from "react-router-dom"; import Directory from "./Directory"; import ConfirmationModal from "./ConfirmationModal"; import CannotRemoveModal from "./CannotRemoveModal"; +import { AlertTriangle } from "react-feather"; export default function DocumentSettings({ workspace }) { const { slug } = useParams(); @@ -17,16 +18,21 @@ export default function DocumentSettings({ workspace }) { const [selectedFiles, setSelectFiles] = useState([]); const [vectordb, setVectorDB] = useState(null); const [showingNoRemovalModal, setShowingNoRemovalModal] = useState(false); + const [hasFiles, setHasFiles] = useState(true); useEffect(() => { async function fetchKeys() { const localFiles = await System.localFiles(); const settings = await System.keys(); const originalDocs = workspace.documents.map((doc) => doc.docpath) || []; + const hasAnyFiles = localFiles.items.some( + (folder) => folder?.items?.length > 0 + ); setDirectories(localFiles); setOriginalDocuments([...originalDocs]); setSelectFiles([...originalDocs]); setVectorDB(settings?.VectorDB); + setHasFiles(hasAnyFiles); setLoading(false); } fetchKeys(); @@ -162,6 +168,16 @@ export default function DocumentSettings({ workspace }) { )}+ You don't have any files uploaded. Upload a file via the "Upload + Docs" tab. +
+Select folders to add or remove from workspace. diff --git a/frontend/src/components/Modals/MangeWorkspace/Upload/FileUploadProgress/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Upload/FileUploadProgress/index.jsx new file mode 100644 index 00000000..8c5054c3 --- /dev/null +++ b/frontend/src/components/Modals/MangeWorkspace/Upload/FileUploadProgress/index.jsx @@ -0,0 +1,73 @@ +import React, { useState, useEffect, memo } from "react"; +import Workspace from "../../../../../models/workspace"; +import truncate from "truncate"; +import { humanFileSize, milliToHms } from "../../../../../utils/numbers"; +import { CheckCircle, XCircle } from "react-feather"; +import { Grid } from "react-loading-icons"; + +function FileUploadProgressComponent({ + slug, + file, + rejected = false, + reason = null, +}) { + const [timerMs, setTimerMs] = useState(10); + const [status, setStatus] = useState(rejected ? "failed" : "uploading"); // rejected files are never uploaded + + useEffect(() => { + async function uploadFile() { + const start = Number(new Date()); + const formData = new FormData(); + formData.append("file", file, file.name); + const timer = setInterval(() => { + setTimerMs(Number(new Date()) - start); + }, 100); + + // Chunk streaming not working in production so we just sit and wait + await Workspace.uploadFile(slug, formData); + setStatus("complete"); + clearInterval(timer); + } + !!file && !rejected && uploadFile(); + }, []); + + if (rejected) { + return ( +
+ {truncate(file.name, 30)} +
++ {reason} +
++ {truncate(file.name, 30)} +
++ {humanFileSize(file.size)} | {milliToHms(timerMs)} +
++ Checking document processor is online - please wait. +
++ this should only take a few moments. +
++ Document processor is offline. +
++ you cannot upload documents from the UI right now +
++ Click to upload or drag + and drop +
+ +
+ supported file extensions are{" "}
+
+ {Object.values(fileTypes).flat().join(" ")}
+
+
+ Add documents to your workspace. +
++ These files will be uploaded to the document processor running on + this AnythingLLM instance. These files are not sent or shared with + a third party. +
+ {process.env.NODE_ENV !== "production" && ( +
+ python document processor app
+
{" "}
+ running for these documents to process.
+