From 61c415ef4dbe8cec4e72dddd07bc83d31a94f85b Mon Sep 17 00:00:00 2001 From: Timothy Carambat Date: Wed, 24 Jul 2024 17:57:57 -0700 Subject: [PATCH] Enable web-scraping without `--cap-add` on Railway (#1960) * patch Render&Railway --- ...uild-and-push-render-deployment-image.yaml | 1 + collector/processLink/convert/generic.js | 4 +++ docker/render.Dockerfile | 28 +++++++++---------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build-and-push-render-deployment-image.yaml b/.github/workflows/build-and-push-render-deployment-image.yaml index 31cf8ddf8..4887f3fa9 100644 --- a/.github/workflows/build-and-push-render-deployment-image.yaml +++ b/.github/workflows/build-and-push-render-deployment-image.yaml @@ -17,6 +17,7 @@ on: - 'render.yaml' - 'embed/**/*' # Embed should be published to frontend (yarn build:publish) if any changes are introduced - 'server/utils/agents/aibitat/example/**/*' # Do not push new image for local dev testing of new aibitat images. + - 'docker/vex/*' # CVE exceptions we know are not a risk jobs: push_to_registries: diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index a05463abf..dbcd9fd96 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -57,6 +57,10 @@ async function getPageContent(link) { const loader = new PuppeteerWebBaseLoader(link, { launchOptions: { headless: "new", + args: [ + '--no-sandbox', + '--disable-setuid-sandbox' + ] }, gotoOptions: { waitUntil: "domcontentloaded", diff --git a/docker/render.Dockerfile b/docker/render.Dockerfile index cc18c3df1..a02b04e36 100644 --- a/docker/render.Dockerfile +++ b/docker/render.Dockerfile @@ -1,4 +1,4 @@ -# This is the dockerfile spefically to be used with Render.com docker deployments. Do not use +# This is the dockerfile specifically to be used with Render.com & Railway.app docker deployments. 
Do not use # locally or in other environments as it will not be supported. # Setup base image @@ -61,32 +61,30 @@ RUN yarn build && \ WORKDIR /app # Install server dependencies -FROM base as server-build +FROM base AS backend-build COPY ./server /app/server/ WORKDIR /app/server RUN yarn install --production --network-timeout 100000 && yarn cache clean WORKDIR /app -# Build collector deps (this also downloads proper chrome for collector in /app/.cache so that needs to be -# transferred properly in prod-build stage. -FROM base AS collector-build -COPY ./collector /app/collector +# Install collector dependencies (& puppeteer) +COPY ./collector/ ./collector/ WORKDIR /app/collector ENV PUPPETEER_DOWNLOAD_BASE_URL=https://storage.googleapis.com/chrome-for-testing-public RUN yarn install --production --network-timeout 100000 && yarn cache clean -WORKDIR /app -FROM base AS production-build +FROM backend-build AS production-build WORKDIR /app -# Copy the server -COPY --chown=anythingllm:anythingllm --from=server-build /app/server/ /app/server/ -# Copy built static frontend files to the server public directory COPY --chown=anythingllm:anythingllm --from=frontend-build /app/frontend/dist /app/server/public -# Copy the collector -COPY --chown=anythingllm:anythingllm --from=collector-build /app/collector/ /app/collector/ -COPY --chown=anythingllm:anythingllm --from=collector-build /app/.cache/puppeteer /app/.cache/puppeteer +USER root +RUN chown -R anythingllm:anythingllm /app/server && \ + chown -R anythingllm:anythingllm /app/collector +USER anythingllm + +# Chrome scraping fixes for puppeteer +# Fix path to chrome executable as the runner will assume the file is in `/root/.cache` +ENV PUPPETEER_EXECUTABLE_PATH=/app/.cache/puppeteer/chrome/linux-119.0.6045.105/chrome-linux64/chrome -# Setup the environment ENV NODE_ENV=production ENV ANYTHING_LLM_RUNTIME=docker ENV STORAGE_DIR=$STORAGE_DIR