Enable web-scraping without --cap-add on Railway (#1960)

* patch Render & Railway
This commit is contained in:
Timothy Carambat 2024-07-24 17:57:57 -07:00 committed by GitHub
parent a99deaa89e
commit 61c415ef4d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 18 additions and 15 deletions

View File

@ -17,6 +17,7 @@ on:
- 'render.yaml'
- 'embed/**/*' # Embed should be published to frontend (yarn build:publish) if any changes are introduced
- 'server/utils/agents/aibitat/example/**/*' # Do not push new image for local dev testing of new aibitat images.
- 'docker/vex/*' # CVE exceptions we know are not a risk
jobs:
push_to_registries:

View File

@ -57,6 +57,10 @@ async function getPageContent(link) {
const loader = new PuppeteerWebBaseLoader(link, {
launchOptions: {
headless: "new",
args: [
'--no-sandbox',
'--disable-setuid-sandbox'
]
},
gotoOptions: {
waitUntil: "domcontentloaded",

View File

@ -1,4 +1,4 @@
# This is the dockerfile specifically to be used with Render.com docker deployments. Do not use
# This is the dockerfile specifically to be used with Render.com & Railway.app docker deployments. Do not use
# locally or in other environments as it will not be supported.
# Setup base image
@ -61,32 +61,30 @@ RUN yarn build && \
WORKDIR /app
# Install server dependencies
FROM base as server-build
FROM base AS backend-build
COPY ./server /app/server/
WORKDIR /app/server
RUN yarn install --production --network-timeout 100000 && yarn cache clean
WORKDIR /app
# Build collector deps (this also downloads proper chrome for collector in /app/.cache so that needs to be
# transferred properly in prod-build stage.
FROM base AS collector-build
COPY ./collector /app/collector
# Install collector dependencies (& puppeteer)
COPY ./collector/ ./collector/
WORKDIR /app/collector
ENV PUPPETEER_DOWNLOAD_BASE_URL=https://storage.googleapis.com/chrome-for-testing-public
RUN yarn install --production --network-timeout 100000 && yarn cache clean
WORKDIR /app
FROM base AS production-build
FROM backend-build AS production-build
WORKDIR /app
# Copy the server
COPY --chown=anythingllm:anythingllm --from=server-build /app/server/ /app/server/
# Copy built static frontend files to the server public directory
COPY --chown=anythingllm:anythingllm --from=frontend-build /app/frontend/dist /app/server/public
# Copy the collector
COPY --chown=anythingllm:anythingllm --from=collector-build /app/collector/ /app/collector/
COPY --chown=anythingllm:anythingllm --from=collector-build /app/.cache/puppeteer /app/.cache/puppeteer
USER root
RUN chown -R anythingllm:anythingllm /app/server && \
chown -R anythingllm:anythingllm /app/collector
USER anythingllm
# Chrome scraping fixes for puppeteer
# Fix path to chrome executable as the runner will assume the file is in `/root/.cache`
ENV PUPPETEER_EXECUTABLE_PATH=/app/.cache/puppeteer/chrome/linux-119.0.6045.105/chrome-linux64/chrome
# Setup the environment
ENV NODE_ENV=production
ENV ANYTHING_LLM_RUNTIME=docker
ENV STORAGE_DIR=$STORAGE_DIR