commit c8dac6177a
Merge branch 'master' of github.com:Mintplex-Labs/anything-llm into render
@@ -76,7 +76,7 @@ function extractLinks(html, baseUrl) {
   return Array.from(extractedLinks);
 }
 
-async function bulkScrapePages(links, outputFolder) {
+async function bulkScrapePages(links, outFolderPath) {
   const scrapedData = [];
 
   for (let i = 0; i < links.length; i++) {
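Note: bulkScrapePages already received its destination as the second argument; the rename from outputFolder to outFolderPath simply tracks the env-aware path that websiteScraper now computes (see the last hunk). A hypothetical call, with made-up arguments for illustration:

// Hypothetical usage; the links and destination path are invented.
const docs = await bulkScrapePages(
  ["https://example.com/a", "https://example.com/b"],
  "/tmp/documents/example-com-1a2b"
);
console.log(`Wrote ${docs.length} documents.`);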
@@ -118,7 +118,7 @@ async function bulkScrapePages(links, outputFolder) {
       token_count_estimate: tokenizeString(content).length,
     };
 
-    writeToServerDocuments(data, data.title, outputFolder);
+    writeToServerDocuments(data, data.title, outFolderPath);
     scrapedData.push(data);
 
     console.log(`Successfully scraped ${link}.`);
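For context, token_count_estimate sizes each scraped page in LLM tokens before it is written to disk. A minimal sketch of that kind of estimate, assuming the js-tiktoken package; the repo's actual tokenizeString helper lives elsewhere in the collector and may differ:

// Sketch of a token-count estimate (assumption: js-tiktoken, cl100k_base).
const { getEncoding } = require("js-tiktoken");

function tokenizeString(text) {
  return getEncoding("cl100k_base").encode(text);
}

console.log(tokenizeString("Some scraped page content").length); // e.g. 4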
@@ -132,19 +132,25 @@ async function bulkScrapePages(links, outputFolder) {
 
 async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
   const websiteName = new URL(startUrl).hostname;
-  const outputFolder = path.resolve(
-    __dirname,
-    `../../../../server/storage/documents/${slugify(websiteName)}`
-  );
-  fs.mkdirSync(outputFolder, { recursive: true });
+  const outFolder = slugify(
+    `${slugify(websiteName)}-${v4().slice(0, 4)}`
+  ).toLowerCase();
+  const outFolderPath =
+    process.env.NODE_ENV === "development"
+      ? path.resolve(
+          __dirname,
+          `../../../../server/storage/documents/${outFolder}`
+        )
+      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
 
   console.log("Discovering links...");
   const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
   console.log(`Found ${linksToScrape.length} links to scrape.`);
 
+  if (!fs.existsSync(outFolderPath))
+    fs.mkdirSync(outFolderPath, { recursive: true });
   console.log("Starting bulk scraping...");
-  const scrapedData = await bulkScrapePages(linksToScrape, outputFolder);
+  const scrapedData = await bulkScrapePages(linksToScrape, outFolderPath);
   console.log(`Scraped ${scrapedData.length} pages.`);
 
   return scrapedData;
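For context, the new output-path logic does two things: the folder name gains a short random suffix so repeated scrapes of the same host do not collide, and outside of development the path resolves under STORAGE_DIR rather than relative to the collector source tree. A standalone sketch of the same idea, assuming the slugify and uuid packages this file already uses; the hostname and STORAGE_DIR values are illustrative only:

const path = require("path");
const slugify = require("slugify");
const { v4 } = require("uuid");

// Illustrative inputs only.
process.env.STORAGE_DIR = process.env.STORAGE_DIR || "/storage";
const websiteName = new URL("https://docs.example.com/start").hostname;

// Slugged hostname plus 4 chars of a uuid keeps each run's folder unique.
const outFolder = slugify(
  `${slugify(websiteName)}-${v4().slice(0, 4)}`
).toLowerCase();

// Development keeps the old relative layout; production uses STORAGE_DIR.
const outFolderPath =
  process.env.NODE_ENV === "development"
    ? path.resolve(__dirname, `../../../../server/storage/documents/${outFolder}`)
    : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

console.log(outFolderPath); // e.g. /storage/documents/docs.example.com-1a2b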