Enable web-scraping without --cap-add on Railway (#1960)

* patch Render & Railway
This commit is contained in:
Timothy Carambat 2024-07-24 17:57:57 -07:00 committed by GitHub
parent a99deaa89e
commit 61c415ef4d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 18 additions and 15 deletions

View File

@ -17,6 +17,7 @@ on:
- 'render.yaml'
- 'embed/**/*' # Embed should be published to frontend (yarn build:publish) if any changes are introduced
- 'server/utils/agents/aibitat/example/**/*' # Do not push new image for local dev testing of new aibitat images.
- 'docker/vex/*' # CVE exceptions we know are not a risk
jobs:
push_to_registries:

View File

@ -57,6 +57,10 @@ async function getPageContent(link) {
const loader = new PuppeteerWebBaseLoader(link, {
launchOptions: {
headless: "new",
args: [
'--no-sandbox',
'--disable-setuid-sandbox'
]
},
gotoOptions: {
waitUntil: "domcontentloaded",

View File

@ -1,4 +1,4 @@
# This is the dockerfile specifically to be used with Render.com docker deployments. Do not use
# This is the dockerfile specifically to be used with Render.com & Railway.app docker deployments. Do not use
# locally or in other environments as it will not be supported.
# Setup base image
@ -61,32 +61,30 @@ RUN yarn build && \
WORKDIR /app
# Install server dependencies
FROM base as server-build
FROM base AS backend-build
COPY ./server /app/server/
WORKDIR /app/server
RUN yarn install --production --network-timeout 100000 && yarn cache clean
WORKDIR /app
# Build collector deps (this also downloads proper chrome for collector in /app/.cache so that needs to be
# transferred properly in prod-build stage.
FROM base AS collector-build
COPY ./collector /app/collector
# Install collector dependencies (& puppeteer)
COPY ./collector/ ./collector/
WORKDIR /app/collector
ENV PUPPETEER_DOWNLOAD_BASE_URL=https://storage.googleapis.com/chrome-for-testing-public
RUN yarn install --production --network-timeout 100000 && yarn cache clean
WORKDIR /app
FROM base AS production-build
FROM backend-build AS production-build
WORKDIR /app
# Copy the server
COPY --chown=anythingllm:anythingllm --from=server-build /app/server/ /app/server/
# Copy built static frontend files to the server public directory
COPY --chown=anythingllm:anythingllm --from=frontend-build /app/frontend/dist /app/server/public
# Copy the collector
COPY --chown=anythingllm:anythingllm --from=collector-build /app/collector/ /app/collector/
COPY --chown=anythingllm:anythingllm --from=collector-build /app/.cache/puppeteer /app/.cache/puppeteer
USER root
RUN chown -R anythingllm:anythingllm /app/server && \
chown -R anythingllm:anythingllm /app/collector
USER anythingllm
# Chrome scraping fixes for puppeteer
# Fix path to chrome executable as the runner will assume the file is in `/root/.cache`
ENV PUPPETEER_EXECUTABLE_PATH=/app/.cache/puppeteer/chrome/linux-119.0.6045.105/chrome-linux64/chrome
# Setup the environment
ENV NODE_ENV=production
ENV ANYTHING_LLM_RUNTIME=docker
ENV STORAGE_DIR=$STORAGE_DIR