Compare commits

...

7 Commits

Author SHA1 Message Date
Sean Hatfield 64e6a8dd47
Merge d7d36460b0 into d72f1af361 2024-04-27 14:56:11 +01:00
Timothy Carambat d72f1af361
Improve uploader experience (#1205)
* Improve uploader experience
- Wipe upload container (fadeout) after uploading
- debounce fetchKeys by 1s

* patch unneeded exports
2024-04-26 17:41:42 -07:00
Sean Hatfield 360f17cd58
[FIX] Move to Workspace popup UI bug fix (#1204)
fix for transparent popup menu container
2024-04-26 17:38:41 -07:00
shatfield4 d7d36460b0 linting + loading small ui tweak 2024-04-26 16:28:27 -07:00
shatfield4 b62327f80a Merge branch 'master' into 1190-feat-website-scraping-depth 2024-04-26 14:52:03 -07:00
shatfield4 a2a3104928 website depth data connector stable + add maxLinks option 2024-04-26 13:29:56 -07:00
shatfield4 0f157cfde5 WIP website depth scraping (sort of works) 2024-04-25 17:51:16 -07:00
12 changed files with 436 additions and 13 deletions

View File

@@ -86,6 +86,23 @@ function extensions(app) {
}
);
app.post(
"/ext/website-depth",
[verifyPayloadIntegrity],
async function (request, response) {
try {
const websiteDepth = require("../utils/extensions/WebsiteDepth");
const { url, depth, maxLinks } = reqBody(request);
const scrapedData = await websiteDepth(url, depth, maxLinks);
response.status(200).json({ success: true, data: scrapedData });
} catch (e) {
console.error(e);
response.status(400).json({ success: false, reason: e.message });
}
return;
}
);
app.post(
"/ext/confluence",
[verifyPayloadIntegrity],
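
For reference, a minimal sketch of the request/response contract this new collector endpoint implies (the shapes follow from the handler above; the base URL and the signed payload demanded by verifyPayloadIntegrity are assumptions, not shown here):

// POST <collector-base-url>/ext/website-depth  (must pass verifyPayloadIntegrity)
// body:    { "url": "https://example.com", "depth": 2, "maxLinks": 20 }
// success: 200 { "success": true, "data": [ /* scraped page objects */ ] }
// failure: 400 { "success": false, "reason": "<error message>" }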

View File

@@ -0,0 +1,141 @@
const { v4 } = require("uuid");
const {
PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");
const { default: slugify } = require("slugify");
const { parse } = require("node-html-parser");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const path = require("path");
const fs = require("fs");
async function websiteDepth(startUrl, depth = 1, maxLinks = 20) {
const scrapedData = [];
const visitedUrls = new Set();
const websiteName = new URL(startUrl).hostname;
const outputFolder = path.resolve(
__dirname,
`../../../../server/storage/documents/${slugify(websiteName)}`
);
if (!fs.existsSync(outputFolder)) {
fs.mkdirSync(outputFolder, { recursive: true });
}
async function scrapeLevel(currentLink, currentLevel) {
if (
currentLevel > depth ||
visitedUrls.has(currentLink) ||
visitedUrls.size >= maxLinks
)
return;
visitedUrls.add(currentLink);
console.log(`-- Working URL ${currentLink} --`);
const content = await getPageContent(currentLink);
// getPageContent returns null on failure, so guard before reading .length
if (!content?.length) {
console.error(`Resulting URL content was empty at ${currentLink}.`);
return;
}
const url = new URL(currentLink);
const filename = (url.host + "-" + url.pathname).replace(".", "_");
const data = {
id: v4(),
url: "file://" + slugify(filename) + ".html",
title: slugify(filename) + ".html",
docAuthor: "no author found",
description: "No description found.",
docSource: "URL link uploaded by the user.",
chunkSource: `link://${currentLink}`,
published: new Date().toLocaleString(),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
};
scrapedData.push(data);
const links = extractLinks(await getPageHTML(currentLink), url.origin);
for (const link of links) {
if (visitedUrls.size >= maxLinks) break;
await scrapeLevel(link, currentLevel + 1);
}
}
await scrapeLevel(startUrl, 0);
for (const data of scrapedData) {
writeToServerDocuments(data, data.title, outputFolder);
console.log(
`[SUCCESS]: URL ${data.chunkSource} converted & ready for embedding.\n`
);
}
return scrapedData;
}
async function getPageContent(link) {
try {
const loader = new PuppeteerWebBaseLoader(link, {
launchOptions: { headless: "new" },
gotoOptions: { waitUntil: "domcontentloaded" },
async evaluate(page, browser) {
const result = await page.evaluate(() => document.body.innerText);
await browser.close();
return result;
},
});
const docs = await loader.load();
return docs[0].pageContent;
} catch (error) {
console.error("getPageContent failed to be fetched by Puppeteer.", error);
return null;
}
}
async function getPageHTML(link) {
try {
const loader = new PuppeteerWebBaseLoader(link, {
launchOptions: { headless: "new" },
gotoOptions: { waitUntil: "domcontentloaded" },
async evaluate(page, browser) {
const result = await page.evaluate(() => document.body.innerHTML);
await browser.close();
return result;
},
});
const docs = await loader.load();
return docs[0].pageContent;
} catch (error) {
console.error("getPageHTML failed to be fetched by Puppeteer.", error);
return null;
}
}
function extractLinks(html, baseUrl) {
// getPageHTML may return null on a failed fetch; treat that as "no links"
if (!html) return [];
const root = parse(html);
const links = root.querySelectorAll("a");
const extractedLinks = new Set();
for (const link of links) {
const href = link.getAttribute("href");
if (href && (href.startsWith("/") || href.startsWith(baseUrl))) {
const absoluteUrl = new URL(href, baseUrl).href;
extractedLinks.add(absoluteUrl);
}
}
return Array.from(extractedLinks);
}
module.exports = websiteDepth;
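
A minimal usage sketch of the module above, assuming it is required from a script inside the collector where the relative path resolves (the path and URL here are hypothetical):

const websiteDepth = require("./utils/extensions/WebsiteDepth");

(async () => {
  // Crawl the start page plus same-origin links up to 2 levels deep,
  // visiting at most 10 unique URLs in total.
  const pages = await websiteDepth("https://example.com", 2, 10);
  console.log(`Scraped ${pages.length} page(s).`);
})();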

View File

@@ -1,10 +1,12 @@
import Github from "./github.svg";
import YouTube from "./youtube.svg";
import Link from "./link.svg";
import Confluence from "./confluence.jpeg";
const ConnectorImages = {
github: Github,
youtube: YouTube,
websiteDepth: Link,
confluence: Confluence,
};

File diff suppressed because one or more lines are too long

(Image added; preview omitted, size 9.5 KiB)

View File

@@ -0,0 +1,135 @@
import React, { useState } from "react";
import System from "@/models/system";
import showToast from "@/utils/toast";
import pluralize from "pluralize";
export default function WebsiteDepthOptions() {
const [loading, setLoading] = useState(false);
const handleSubmit = async (e) => {
e.preventDefault();
const form = new FormData(e.target);
try {
setLoading(true);
showToast("Scraping website - this may take a while.", "info", {
clear: true,
autoClose: false,
});
const { data, error } = await System.dataConnectors.websiteDepth.scrape({
url: form.get("url"),
depth: parseInt(form.get("depth")),
maxLinks: parseInt(form.get("maxLinks")),
});
console.log({ data, error });
if (!!error) {
showToast(error, "error", { clear: true });
setLoading(false);
return;
}
showToast(
`Successfully scraped ${data.length} ${pluralize(
"page",
data.length
)}!`,
"success",
{ clear: true }
);
e.target.reset();
setLoading(false);
} catch (e) {
console.error(e);
showToast(e.message, "error", { clear: true });
setLoading(false);
}
};
return (
<div className="flex w-full">
<div className="flex flex-col w-full px-1 md:pb-6 pb-16">
<form className="w-full" onSubmit={handleSubmit}>
<div className="w-full flex flex-col py-2">
<div className="w-full flex flex-col gap-4">
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold">
Website URL
</label>
<p className="text-xs font-normal text-white/50">
URL of the website you want to scrape.
</p>
</div>
<input
type="url"
name="url"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
placeholder="https://example.com"
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold">Depth</label>
<p className="text-xs font-normal text-white/50">
Depth of the website scraping (number of levels to scrape).
</p>
</div>
<input
type="number"
name="depth"
min="1"
max="5"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
required={true}
defaultValue="1"
/>
</div>
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold">
Max Links
</label>
<p className="text-xs font-normal text-white/50">
Maximum number of links to scrape.
</p>
</div>
<input
type="number"
name="maxLinks"
min="1"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
required={true}
defaultValue="20"
/>
</div>
</div>
</div>
<div className="flex flex-col gap-y-2 w-full pr-10">
<button
type="submit"
disabled={loading}
className={`mt-2 w-full ${
loading ? "cursor-not-allowed animate-pulse" : ""
} justify-center border border-slate-200 px-4 py-2 rounded-lg text-[#222628] text-sm font-bold items-center flex gap-x-2 bg-slate-200 hover:bg-slate-300 hover:text-slate-800 disabled:bg-slate-300 disabled:cursor-not-allowed`}
>
{loading ? "Scraping website..." : "Submit"}
</button>
{loading && (
<p className="text-xs text-white/50">
Once complete, all scraped pages will be available for embedding
into workspaces in the document picker.
</p>
)}
</div>
</form>
</div>
</div>
);
}

View File

@@ -5,6 +5,7 @@ import YoutubeOptions from "./Connectors/Youtube";
import ConfluenceOptions from "./Connectors/Confluence";
import { useState } from "react";
import ConnectorOption from "./ConnectorOption";
import WebsiteDepthOptions from "./Connectors/WebsiteDepth";
export const DATA_CONNECTORS = {
github: {
@@ -21,6 +22,13 @@ export const DATA_CONNECTORS = {
"Import the transcription of an entire YouTube video from a link.",
options: <YoutubeOptions />,
},
"website-depth": {
name: "Website Depth",
image: ConnectorImages.websiteDepth,
description:
"Scrape a website and its links on a page up to a certain depth.",
options: <WebsiteDepthOptions />,
},
confluence: {
name: "Confluence",
image: ConnectorImages.confluence,

View File

@@ -261,8 +261,8 @@ function Directory({
)}
</div>
{amountSelected !== 0 && (
<div className="absolute bottom-[12px] left-0 right-0 flex justify-center">
<div className="mx-auto bg-white/40 rounded-lg py-1 px-2">
<div className="absolute bottom-[12px] left-0 right-0 flex justify-center pointer-events-none">
<div className="mx-auto bg-white/40 rounded-lg py-1 px-2 pointer-events-auto">
<div className="flex flex-row items-center gap-x-2">
<button
onClick={moveToWorkspace}

View File

@@ -7,7 +7,9 @@ import PreLoader from "../../../../../Preloader";
function FileUploadProgressComponent({
slug,
uuid,
file,
setFiles,
rejected = false,
reason = null,
onUploadSuccess,
@@ -18,6 +20,19 @@ function FileUploadProgressComponent({
const [timerMs, setTimerMs] = useState(10);
const [status, setStatus] = useState("pending");
const [error, setError] = useState("");
const [isFadingOut, setIsFadingOut] = useState(false);
const fadeOut = (cb) => {
setIsFadingOut(true);
cb?.();
};
const beginFadeOut = () => {
setIsFadingOut(false);
setFiles((prev) => {
return prev.filter((item) => item.uid !== uuid);
});
};
useEffect(() => {
async function uploadFile() {
@@ -44,13 +59,22 @@
clearInterval(timer);
onUploadSuccess();
}
// Begin fadeout timer to clear uploader queue.
setTimeout(() => {
fadeOut(() => setTimeout(() => beginFadeOut(), 300));
}, 5000);
}
!!file && !rejected && uploadFile();
}, []);
if (rejected) {
return (
<div className="h-14 px-2 py-2 flex items-center gap-x-4 rounded-lg bg-white/5 border border-white/40">
<div
className={`${
isFadingOut ? "file-upload-fadeout" : "file-upload"
} h-14 px-2 py-2 flex items-center gap-x-4 rounded-lg bg-white/5 border border-white/40`}
>
<div className="w-6 h-6 flex-shrink-0">
<XCircle className="w-6 h-6 stroke-white bg-red-500 rounded-full p-1 w-full h-full" />
</div>
@@ -66,7 +90,11 @@
if (status === "failed") {
return (
<div className="h-14 px-2 py-2 flex items-center gap-x-4 rounded-lg bg-white/5 border border-white/40 overflow-y-auto">
<div
className={`${
isFadingOut ? "file-upload-fadeout" : "file-upload"
} h-14 px-2 py-2 flex items-center gap-x-4 rounded-lg bg-white/5 border border-white/40 overflow-y-auto`}
>
<div className="w-6 h-6 flex-shrink-0">
<XCircle className="w-6 h-6 stroke-white bg-red-500 rounded-full p-1 w-full h-full" />
</div>
@@ -81,7 +109,11 @@
}
return (
<div className="h-14 px-2 py-2 flex items-center gap-x-4 rounded-lg bg-white/5 border border-white/40">
<div
className={`${
isFadingOut ? "file-upload-fadeout" : "file-upload"
} h-14 px-2 py-2 flex items-center gap-x-4 rounded-lg bg-white/5 border border-white/40`}
>
<div className="w-6 h-6 flex-shrink-0">
{status !== "complete" ? (
<div className="flex items-center justify-center">
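
To make the new lifecycle concrete: for a successful upload, the hooks above plus the 0.3s CSS animations added later in this diff imply roughly this timeline (times approximate):

// t = 0ms     upload resolves, status becomes "complete"
// t = 5000ms  outer setTimeout fires: fadeOut() flips isFadingOut, swapping
//             the card onto the .file-upload-fadeout animation (300ms)
// t = 5300ms  inner setTimeout fires: beginFadeOut() filters this card's
//             uuid out of the uploader queue via setFiles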

View File

@@ -6,6 +6,7 @@ import { useDropzone } from "react-dropzone";
import { v4 } from "uuid";
import FileUploadProgress from "./FileUploadProgress";
import Workspace from "../../../../../models/workspace";
import debounce from "lodash.debounce";
export default function UploadFile({
workspace,
@@ -39,14 +40,9 @@
setFetchingUrl(false);
};
const handleUploadSuccess = () => {
fetchKeys(true);
showToast("File uploaded successfully", "success", { clear: true });
};
const handleUploadError = (message) => {
showToast(`Error uploading file: ${message}`, "error");
};
// Don't spam fetchKeys, wait 1s between calls at least.
const handleUploadSuccess = debounce(() => fetchKeys(true), 1000);
const handleUploadError = (_msg) => null; // stubbed.
const onDrop = async (acceptedFiles, rejections) => {
const newAccepted = acceptedFiles.map((file) => {
@@ -115,6 +111,8 @@ export default function UploadFile({
<FileUploadProgress
key={file.uid}
file={file.file}
uuid={file.uid}
setFiles={setFiles}
slug={workspace.slug}
rejected={file?.rejected}
reason={file?.reason}
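
The debounced handler above is what implements the "wait 1s between calls" note. A rough sketch of the behavior, using lodash.debounce's default trailing-edge semantics:

const refresh = debounce(() => fetchKeys(true), 1000);
refresh(); // upload 1 finishes: a 1s timer starts
refresh(); // upload 2 finishes 200ms later: the timer resets
// ~1s after the last call, fetchKeys(true) runs exactly once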

View File

@@ -692,3 +692,53 @@ does not extend the close button beyond the viewport. */
.text-tremor-content {
padding-bottom: 10px;
}
.file-upload {
-webkit-animation: fadein 0.3s linear forwards;
animation: fadein 0.3s linear forwards;
}
.file-upload-fadeout {
-webkit-animation: fadeout 0.3s linear forwards;
animation: fadeout 0.3s linear forwards;
}
@-webkit-keyframes fadein {
0% {
opacity: 0;
}
100% {
opacity: 1;
}
}
@keyframes fadein {
0% {
opacity: 0;
}
100% {
opacity: 1;
}
}
@-webkit-keyframes fadeout {
0% {
opacity: 1;
}
100% {
opacity: 0;
}
}
@keyframes fadeout {
0% {
opacity: 1;
}
100% {
opacity: 0;
}
}

View File

@@ -60,6 +60,24 @@ const DataConnector = {
});
},
},
websiteDepth: {
scrape: async ({ url, depth, maxLinks }) => {
return await fetch(`${API_BASE}/ext/website-depth`, {
method: "POST",
headers: baseHeaders(),
body: JSON.stringify({ url, depth, maxLinks }),
})
.then((res) => res.json())
.then((res) => {
if (!res.success) throw new Error(res.reason);
return { data: res.data, error: null };
})
.catch((e) => {
console.error(e);
return { data: null, error: e.message };
});
},
},
confluence: {
collect: async function ({ pageUrl, username, accessToken }) {
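
A minimal sketch of consuming the new helper from a component, mirroring what WebsiteDepthOptions does above (the "@/models/system" alias is the one that file already imports):

import System from "@/models/system";

async function scrapeExample() {
  const { data, error } = await System.dataConnectors.websiteDepth.scrape({
    url: "https://example.com",
    depth: 1,
    maxLinks: 20,
  });
  if (error) throw new Error(error);
  return data; // array of scraped page objects from the collector
}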

View File

@@ -93,6 +93,27 @@ function extensionEndpoints(app) {
}
}
);
app.post(
"/ext/website-depth",
[validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],
async (request, response) => {
try {
const responseFromProcessor =
await new CollectorApi().forwardExtensionRequest({
endpoint: "/ext/website-depth",
method: "POST",
body: request.body,
});
await Telemetry.sendTelemetry("extension_invoked", {
type: "website_depth",
});
response.status(200).json(responseFromProcessor);
} catch (e) {
console.error(e);
// sendStatus() already finalizes the response; no extra .end() needed
response.sendStatus(500);
}
}
);
}
module.exports = { extensionEndpoints };