mirror of https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-19 20:50:09 +01:00
patch: update storage for bulk-website scraper for render
This commit is contained in:
parent c8dac6177a
commit d603d0fd51
@@ -4,7 +4,7 @@ const {
 } = require("langchain/document_loaders/web/puppeteer");
 const { default: slugify } = require("slugify");
 const { parse } = require("node-html-parser");
-const { writeToServerDocuments } = require("../../files");
+const { writeToServerDocuments, documentsFolder } = require("../../files");
 const { tokenizeString } = require("../../tokenizer");
 const path = require("path");
 const fs = require("fs");
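The import change above relies on a documentsFolder export from ../../files that this diff does not show. A minimal sketch of what such an export might look like, assuming it centralizes the development/production branching the scraper previously inlined (the relative path depth and names are illustrative, not from the diff):

// Hypothetical sketch of a documentsFolder export in ../../files.
// Not shown in this diff; path depth and env handling are assumptions.
const path = require("path");

const documentsFolder =
  process.env.NODE_ENV === "development"
    ? path.resolve(__dirname, "../../../server/storage/documents")
    : path.resolve(process.env.STORAGE_DIR, "documents");

module.exports = { documentsFolder /* , writeToServerDocuments, ... */ };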
@@ -135,20 +135,14 @@ async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
   const outFolder = slugify(
     `${slugify(websiteName)}-${v4().slice(0, 4)}`
   ).toLowerCase();
-  const outFolderPath =
-    process.env.NODE_ENV === "development"
-      ? path.resolve(
-          __dirname,
-          `../../../../server/storage/documents/${outFolder}`
-        )
-      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
+  const outFolderPath = path.resolve(documentsFolder, outFolder);
 
   console.log("Discovering links...");
   const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
   console.log(`Found ${linksToScrape.length} links to scrape.`);
 
-  if (!fs.existsSync(outFolderPath))
-    fs.mkdirSync(outFolderPath, { recursive: true });
+  if (!fs.existsSync(outFolderPath)) fs.mkdirSync(outFolderPath, { recursive: true });
+
   console.log("Starting bulk scraping...");
   const scrapedData = await bulkScrapePages(linksToScrape, outFolderPath);
   console.log(`Scraped ${scrapedData.length} pages.`);
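The net effect of this hunk is that the scraper no longer computes environment-specific paths itself: it resolves the output folder against the shared documentsFolder and creates it with a single guarded statement. A small sketch of the resulting behavior, with illustrative values (the real documentsFolder comes from ../../files):

// Illustrative values only; not taken from the diff.
const fs = require("fs");
const path = require("path");

const documentsFolder = "/app/server/storage/documents"; // example value
const outFolder = "example-site-1a2b"; // slugified site name + uuid slice

const outFolderPath = path.resolve(documentsFolder, outFolder);
if (!fs.existsSync(outFolderPath))
  fs.mkdirSync(outFolderPath, { recursive: true });
// outFolderPath -> "/app/server/storage/documents/example-site-1a2b"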