Mirror of https://github.com/Mintplex-Labs/anything-llm.git (synced 2024-11-14 02:20:12 +01:00)
[FEAT] Website depth scraping data connector (#1191)
* WIP website depth scraping (sort of works)
* website depth data connector stable + add maxLinks option
* linting + loading small UI tweak
* refactor website depth data connector for stability, speed, & readability
* patch: remove console log; guard clause on URL validity check; reasonable overrides

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in: parent b6be43be95 · commit 612a7e1662
@@ -1,5 +1,6 @@
const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
const { reqBody } = require("../utils/http");
const { validURL } = require("../utils/url");

function extensions(app) {
  if (!app) return;

@@ -86,6 +87,25 @@ function extensions(app) {
    }
  );

  app.post(
    "/ext/website-depth",
    [verifyPayloadIntegrity],
    async function (request, response) {
      try {
        const websiteDepth = require("../utils/extensions/WebsiteDepth");
        const { url, depth = 1, maxLinks = 20 } = reqBody(request);
        if (!validURL(url)) return response.status(400).json({ success: false, reason: "Not a valid URL." });

        const scrapedData = await websiteDepth(url, depth, maxLinks);
        response.status(200).json({ success: true, data: scrapedData });
      } catch (e) {
        console.error(e);
        response.status(400).json({ success: false, reason: e.message });
      }
      return;
    }
  );

  app.post(
    "/ext/confluence",
    [verifyPayloadIntegrity],
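For reference, a minimal sketch (not part of the commit) of the body this collector route expects. Values are placeholders; depth defaults to 1 and maxLinks to 20 when omitted, and because the route is guarded by verifyPayloadIntegrity it is normally reached through the main server rather than called directly.

// Illustrative request body for POST /ext/website-depth on the collector.
// `url` must pass validURL() or the route responds with a failure reason.
const exampleBody = {
  url: "https://example.com", // placeholder starting page
  depth: 2,                   // link levels to follow from the start URL
  maxLinks: 20,               // hard cap on total pages scraped
};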
collector/utils/extensions/WebsiteDepth/index.js (new file, 153 lines)
@@ -0,0 +1,153 @@
const { v4 } = require("uuid");
const {
  PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");
const { default: slugify } = require("slugify");
const { parse } = require("node-html-parser");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const path = require("path");
const fs = require("fs");

// Walks same-origin links from the start URL, bounded by depth and maxLinks.
async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
  const baseUrl = new URL(startUrl).origin;
  const discoveredLinks = new Set();
  const pendingLinks = [startUrl];
  let currentLevel = 0;
  depth = depth < 1 ? 1 : depth;
  maxLinks = maxLinks < 1 ? 1 : maxLinks;

  // Check depth and if there are any links left to scrape
  while (currentLevel < depth && pendingLinks.length > 0) {
    const newLinks = await getPageLinks(pendingLinks[0], baseUrl);
    pendingLinks.shift();

    for (const link of newLinks) {
      if (!discoveredLinks.has(link)) {
        discoveredLinks.add(link);
        pendingLinks.push(link);
      }

      // Exit out if we reach maxLinks
      if (discoveredLinks.size >= maxLinks) {
        return Array.from(discoveredLinks).slice(0, maxLinks);
      }
    }

    if (pendingLinks.length === 0) {
      currentLevel++;
    }
  }

  return Array.from(discoveredLinks);
}

// Loads a single page with Puppeteer and returns its same-origin links.
async function getPageLinks(url, baseUrl) {
  try {
    const loader = new PuppeteerWebBaseLoader(url, {
      launchOptions: { headless: "new" },
      gotoOptions: { waitUntil: "domcontentloaded" },
    });
    const docs = await loader.load();
    const html = docs[0].pageContent;
    const links = extractLinks(html, baseUrl);
    return links;
  } catch (error) {
    console.error(`Failed to get page links from ${url}.`, error);
    return [];
  }
}

// Pulls anchor hrefs out of raw HTML and resolves them against the base URL,
// keeping only links on the same origin.
function extractLinks(html, baseUrl) {
  const root = parse(html);
  const links = root.querySelectorAll("a");
  const extractedLinks = new Set();

  for (const link of links) {
    const href = link.getAttribute("href");
    if (href) {
      const absoluteUrl = new URL(href, baseUrl).href;
      if (absoluteUrl.startsWith(baseUrl)) {
        extractedLinks.add(absoluteUrl);
      }
    }
  }

  return Array.from(extractedLinks);
}

// Scrapes the visible text of each link and persists it as a server document.
async function bulkScrapePages(links, outputFolder) {
  const scrapedData = [];

  for (let i = 0; i < links.length; i++) {
    const link = links[i];
    console.log(`Scraping ${i + 1}/${links.length}: ${link}`);

    try {
      const loader = new PuppeteerWebBaseLoader(link, {
        launchOptions: { headless: "new" },
        gotoOptions: { waitUntil: "domcontentloaded" },
        async evaluate(page, browser) {
          const result = await page.evaluate(() => document.body.innerText);
          await browser.close();
          return result;
        },
      });
      const docs = await loader.load();
      const content = docs[0].pageContent;

      if (!content.length) {
        console.warn(`Empty content for ${link}. Skipping.`);
        continue;
      }

      const url = new URL(link);
      const filename = (url.host + "-" + url.pathname).replace(".", "_");

      const data = {
        id: v4(),
        url: "file://" + slugify(filename) + ".html",
        title: slugify(filename) + ".html",
        docAuthor: "no author found",
        description: "No description found.",
        docSource: "URL link uploaded by the user.",
        chunkSource: `link://${link}`,
        published: new Date().toLocaleString(),
        wordCount: content.split(" ").length,
        pageContent: content,
        token_count_estimate: tokenizeString(content).length,
      };

      writeToServerDocuments(data, data.title, outputFolder);
      scrapedData.push(data);

      console.log(`Successfully scraped ${link}.`);
    } catch (error) {
      console.error(`Failed to scrape ${link}.`, error);
    }
  }

  return scrapedData;
}

// Entry point: discover links from startUrl, scrape them, and return the docs.
async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
  const websiteName = new URL(startUrl).hostname;
  const outputFolder = path.resolve(
    __dirname,
    `../../../../server/storage/documents/${slugify(websiteName)}`
  );

  fs.mkdirSync(outputFolder, { recursive: true });

  console.log("Discovering links...");
  const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
  console.log(`Found ${linksToScrape.length} links to scrape.`);

  console.log("Starting bulk scraping...");
  const scrapedData = await bulkScrapePages(linksToScrape, outputFolder);
  console.log(`Scraped ${scrapedData.length} pages.`);

  return scrapedData;
}

module.exports = websiteScraper;
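A minimal sketch of invoking the exported scraper outside the HTTP route, assuming a script somewhere inside the collector (the require path below is illustrative):

// Hypothetical direct usage of the WebsiteDepth module; adjust the require
// path to wherever your script lives relative to collector/utils/extensions.
const websiteDepth = require("./utils/extensions/WebsiteDepth");

(async () => {
  // Follows same-origin links up to 2 levels deep, caps discovery at 10 links,
  // scrapes each page with Puppeteer, and writes the resulting documents to
  // server/storage/documents/<hostname>/.
  const docs = await websiteDepth("https://example.com", 2, 10);
  console.log(`Scraped ${docs.length} pages.`);
})();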
@@ -1,10 +1,12 @@
import Github from "./github.svg";
import YouTube from "./youtube.svg";
import Link from "./link.svg";
import Confluence from "./confluence.jpeg";

const ConnectorImages = {
  github: Github,
  youtube: YouTube,
  websiteDepth: Link,
  confluence: Confluence,
};
(New image asset added, 9.5 KiB; file diff suppressed because one or more lines are too long.)
@@ -0,0 +1,134 @@
import React, { useState } from "react";
import System from "@/models/system";
import showToast from "@/utils/toast";
import pluralize from "pluralize";

export default function WebsiteDepthOptions() {
  const [loading, setLoading] = useState(false);

  const handleSubmit = async (e) => {
    e.preventDefault();
    const form = new FormData(e.target);

    try {
      setLoading(true);
      showToast("Scraping website - this may take a while.", "info", {
        clear: true,
        autoClose: false,
      });

      const { data, error } = await System.dataConnectors.websiteDepth.scrape({
        url: form.get("url"),
        depth: parseInt(form.get("depth")),
        maxLinks: parseInt(form.get("maxLinks")),
      });

      if (!!error) {
        showToast(error, "error", { clear: true });
        setLoading(false);
        return;
      }

      showToast(
        `Successfully scraped ${data.length} ${pluralize(
          "page",
          data.length
        )}!`,
        "success",
        { clear: true }
      );
      e.target.reset();
      setLoading(false);
    } catch (e) {
      console.error(e);
      showToast(e.message, "error", { clear: true });
      setLoading(false);
    }
  };

  return (
    <div className="flex w-full">
      <div className="flex flex-col w-full px-1 md:pb-6 pb-16">
        <form className="w-full" onSubmit={handleSubmit}>
          <div className="w-full flex flex-col py-2">
            <div className="w-full flex flex-col gap-4">
              <div className="flex flex-col pr-10">
                <div className="flex flex-col gap-y-1 mb-4">
                  <label className="text-white text-sm font-bold">
                    Website URL
                  </label>
                  <p className="text-xs font-normal text-white/50">
                    URL of the website you want to scrape.
                  </p>
                </div>
                <input
                  type="url"
                  name="url"
                  className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
                  placeholder="https://example.com"
                  required={true}
                  autoComplete="off"
                  spellCheck={false}
                />
              </div>
              <div className="flex flex-col pr-10">
                <div className="flex flex-col gap-y-1 mb-4">
                  <label className="text-white text-sm font-bold">Depth</label>
                  <p className="text-xs font-normal text-white/50">
                    This is the number of child-links that the worker should
                    follow from the origin URL.
                  </p>
                </div>
                <input
                  type="number"
                  name="depth"
                  min="1"
                  max="5"
                  className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
                  required={true}
                  defaultValue="1"
                />
              </div>
              <div className="flex flex-col pr-10">
                <div className="flex flex-col gap-y-1 mb-4">
                  <label className="text-white text-sm font-bold">
                    Max Links
                  </label>
                  <p className="text-xs font-normal text-white/50">
                    Maximum number of links to scrape.
                  </p>
                </div>
                <input
                  type="number"
                  name="maxLinks"
                  min="1"
                  className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
                  required={true}
                  defaultValue="20"
                />
              </div>
            </div>
          </div>

          <div className="flex flex-col gap-y-2 w-full pr-10">
            <button
              type="submit"
              disabled={loading}
              className={`mt-2 w-full ${
                loading ? "cursor-not-allowed animate-pulse" : ""
              } justify-center border border-slate-200 px-4 py-2 rounded-lg text-[#222628] text-sm font-bold items-center flex gap-x-2 bg-slate-200 hover:bg-slate-300 hover:text-slate-800 disabled:bg-slate-300 disabled:cursor-not-allowed`}
            >
              {loading ? "Scraping website..." : "Submit"}
            </button>
            {loading && (
              <p className="text-xs text-white/50">
                Once complete, all scraped pages will be available for embedding
                into workspaces in the document picker.
              </p>
            )}
          </div>
        </form>
      </div>
    </div>
  );
}
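The component above delegates the network call to System.dataConnectors.websiteDepth.scrape, which this diff adds to the frontend model further below. A hedged sketch of calling it outside the form handler (URL and limits are placeholders):

// Illustrative standalone call; scrape() resolves to { data, error } where
// data is the array of scraped documents on success and error is a message
// string on failure.
import System from "@/models/system";

async function scrapeExample() {
  const { data, error } = await System.dataConnectors.websiteDepth.scrape({
    url: "https://example.com", // placeholder
    depth: 1,
    maxLinks: 20,
  });
  if (error) console.error("Scrape failed:", error);
  else console.log(`Scraped ${data.length} pages.`);
}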
@@ -5,6 +5,7 @@ import YoutubeOptions from "./Connectors/Youtube";
import ConfluenceOptions from "./Connectors/Confluence";
import { useState } from "react";
import ConnectorOption from "./ConnectorOption";
import WebsiteDepthOptions from "./Connectors/WebsiteDepth";

export const DATA_CONNECTORS = {
  github: {

@@ -21,6 +22,12 @@ export const DATA_CONNECTORS = {
      "Import the transcription of an entire YouTube video from a link.",
    options: <YoutubeOptions />,
  },
  "website-depth": {
    name: "Bulk Link Scraper",
    image: ConnectorImages.websiteDepth,
    description: "Scrape a website and its sub-links up to a certain depth.",
    options: <WebsiteDepthOptions />,
  },
  confluence: {
    name: "Confluence",
    image: ConnectorImages.confluence,
@@ -60,6 +60,24 @@ const DataConnector = {
      });
    },
  },
  websiteDepth: {
    scrape: async ({ url, depth, maxLinks }) => {
      return await fetch(`${API_BASE}/ext/website-depth`, {
        method: "POST",
        headers: baseHeaders(),
        body: JSON.stringify({ url, depth, maxLinks }),
      })
        .then((res) => res.json())
        .then((res) => {
          if (!res.success) throw new Error(res.reason);
          return { data: res.data, error: null };
        })
        .catch((e) => {
          console.error(e);
          return { data: null, error: e.message };
        });
    },
  },

  confluence: {
    collect: async function ({ pageUrl, username, accessToken }) {
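When the call succeeds, each entry in data mirrors the document object built by bulkScrapePages in the collector module earlier in this diff. A rough sketch of the shape a caller can expect (field values here are illustrative):

// Approximate shape of one element of `data`; see bulkScrapePages() above.
const exampleDocument = {
  id: "a-uuid-v4-string",
  url: "file://example_com-about.html", // slugified host + pathname
  title: "example_com-about.html",
  docAuthor: "no author found",
  description: "No description found.",
  docSource: "URL link uploaded by the user.",
  chunkSource: "link://https://example.com/about",
  published: "4/20/2024, 10:00:00 AM", // locale timestamp
  wordCount: 1234,
  pageContent: "Visible text extracted from the page...",
  token_count_estimate: 1500,
};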
@@ -93,6 +93,27 @@ function extensionEndpoints(app) {
      }
    }
  );
  app.post(
    "/ext/website-depth",
    [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],
    async (request, response) => {
      try {
        const responseFromProcessor =
          await new CollectorApi().forwardExtensionRequest({
            endpoint: "/ext/website-depth",
            method: "POST",
            body: request.body,
          });
        await Telemetry.sendTelemetry("extension_invoked", {
          type: "website_depth",
        });
        response.status(200).json(responseFromProcessor);
      } catch (e) {
        console.error(e);
        response.sendStatus(500).end();
      }
    }
  );
}

module.exports = { extensionEndpoints };