patch: update storage for bulk-website scraper for render

timothycarambat 2024-05-14 12:59:14 -07:00
parent c8dac6177a
commit d603d0fd51


@@ -4,7 +4,7 @@ const {
 } = require("langchain/document_loaders/web/puppeteer");
 const { default: slugify } = require("slugify");
 const { parse } = require("node-html-parser");
-const { writeToServerDocuments } = require("../../files");
+const { writeToServerDocuments, documentsFolder } = require("../../files");
 const { tokenizeString } = require("../../tokenizer");
 const path = require("path");
 const fs = require("fs");
@@ -135,20 +135,14 @@ async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
   const outFolder = slugify(
     `${slugify(websiteName)}-${v4().slice(0, 4)}`
   ).toLowerCase();
-  const outFolderPath =
-    process.env.NODE_ENV === "development"
-      ? path.resolve(
-          __dirname,
-          `../../../../server/storage/documents/${outFolder}`
-        )
-      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
+  const outFolderPath = path.resolve(documentsFolder, outFolder);
+
   console.log("Discovering links...");
   const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
   console.log(`Found ${linksToScrape.length} links to scrape.`);
 
-  if (!fs.existsSync(outFolderPath))
-    fs.mkdirSync(outFolderPath, { recursive: true });
+  if (!fs.existsSync(outFolderPath)) fs.mkdirSync(outFolderPath, { recursive: true });
 
   console.log("Starting bulk scraping...");
   const scrapedData = await bulkScrapePages(linksToScrape, outFolderPath);
   console.log(`Scraped ${scrapedData.length} pages.`);
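
The net effect is that `websiteScraper()` no longer hard-codes where documents live, so the storage layout can change in one place, presumably what a Render deployment (where `STORAGE_DIR` would point at the mounted persistent disk, per the commit title) needs. A hypothetical usage with an invented slug:

// Hypothetical example; "example-com-1a2b" stands in for the real
// `${slugify(websiteName)}-${v4().slice(0, 4)}` output.
const path = require("path");
const { documentsFolder } = require("../../files");

const outFolderPath = path.resolve(documentsFolder, "example-com-1a2b");
// development: <repo>/server/storage/documents/example-com-1a2b
// production:  $STORAGE_DIR/documents/example-com-1a2b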