const { v4 } = require("uuid"); const { PuppeteerWebBaseLoader, } = require("langchain/document_loaders/web/puppeteer"); const { writeToServerDocuments } = require("../../utils/files"); const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); async function scrapeGenericUrl(link) { console.log(`-- Working URL ${link} --`); const content = await getPageContent(link); if (!content.length) { console.error(`Resulting URL content was empty at ${link}.`); return { success: false, reason: `No URL content found at ${link}.`, documents: [], }; } const url = new URL(link); const filename = (url.host + "-" + url.pathname).replace(".", "_"); const data = { id: v4(), url: "file://" + slugify(filename) + ".html", title: slugify(filename) + ".html", docAuthor: "no author found", description: "No description found.", docSource: "URL link uploaded by the user.", chunkSource: `link://${link}`, published: new Date().toLocaleString(), wordCount: content.split(" ").length, pageContent: content, token_count_estimate: tokenizeString(content).length, }; const document = writeToServerDocuments( data, `url-${slugify(filename)}-${data.id}` ); console.log(`[SUCCESS]: URL ${link} converted & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; } async function getPageContent(link) { try { let pageContents = []; const loader = new PuppeteerWebBaseLoader(link, { launchOptions: { headless: "new", }, gotoOptions: { waitUntil: "domcontentloaded", }, async evaluate(page, browser) { const result = await page.evaluate(() => document.body.innerText); await browser.close(); return result; }, }); const docs = await loader.load(); for (const doc of docs) { pageContents.push(doc.pageContent); } return pageContents.join(" "); } catch (error) { console.error("getPageContent failed!", error); } return null; } module.exports = { scrapeGenericUrl, };