anything-llm/server/utils/agents/aibitat/plugins/web-scraping.js

const { CollectorApi } = require("../../../collectorApi");
const { summarizeContent } = require("../utils/summarize");

const webScraping = {
  name: "web-scraping",
  startupConfig: {
    params: {},
  },
  plugin: function () {
    return {
      name: this.name,
      setup(aibitat) {
        aibitat.function({
          super: aibitat,
          name: this.name,
          controller: new AbortController(),
          description:
            "Scrapes the content of a webpage or online resource from a URL.",
          parameters: {
            $schema: "http://json-schema.org/draft-07/schema#",
            type: "object",
            properties: {
              url: {
                type: "string",
                format: "uri",
                description: "A web URL.",
              },
            },
            additionalProperties: false,
          },
          handler: async function ({ url }) {
            try {
              if (url) return await this.scrape(url);
              return "There is nothing we can do. This function call returns no information.";
            } catch (error) {
              return `There was an error while calling the function. No data or response was found. Let the user know this was the error: ${error.message}`;
            }
          },

          /**
           * Scrape a website and summarize the content based on objective if the content is too large.
           * Objective is the original objective & task that user give to the agent, url is the url of the website to be scraped.
           * Here we can leverage the document collector to get raw website text quickly.
           *
           * @param url
           * @returns
           */
          scrape: async function (url) {
            this.super.introspect(
              `${this.caller}: Scraping the content of ${url}`
            );
            const { success, content } =
              await new CollectorApi().getLinkContent(url);

            if (!success) {
              this.super.introspect(
                `${this.caller}: could not scrape ${url}. I can't use this page's content.`
              );
              throw new Error(
                `URL could not be scraped and no content was found.`
              );
            }

            if (content?.length <= 8000) {
              return content;
            }

            this.super.introspect(
              `${this.caller}: This page's content is way too long. I will summarize it right now.`
            );
            this.super.onAbort(() => {
              this.super.handlerProps.log(
                "Abort was triggered, exiting summarization early."
              );
              this.controller.abort();
            });
            return summarizeContent(this.controller.signal, content);
          },
        });
      },
    };
  },
};

module.exports = {
  webScraping,
};
Agent support for `@agent` default agent inside workspace chat (#1093) V1 of agent support via built-in `@agent` that can be invoked alongside normal workspace RAG chat. 2024-04-16 19:50:10 +02:00			`const { CollectorApi } = require("../../../collectorApi");`
			`const { summarizeContent } = require("../utils/summarize");`

			`const webScraping = {`
			`name: "web-scraping",`
			`startupConfig: {`
			`params: {},`
			`},`
			`plugin: function () {`
			`return {`
			`name: this.name,`
			`setup(aibitat) {`
			`aibitat.function({`
			`super: aibitat,`
			`name: this.name,`
			`controller: new AbortController(),`
			`description:`
			`"Scrapes the content of a webpage or online resource from a URL.",`
			`parameters: {`
			`$schema: "http://json-schema.org/draft-07/schema#",`
			`type: "object",`
			`properties: {`
			`url: {`
			`type: "string",`
			`format: "uri",`
			`description: "A web URL.",`
			`},`
			`},`
			`additionalProperties: false,`
			`},`
			`handler: async function ({ url }) {`
			`try {`
			`if (url) return await this.scrape(url);`
			`return "There is nothing we can do. This function call returns no information.";`
			`} catch (error) {`
			return `There was an error while calling the function. No data or response was found. Let the user know this was the error: ${error.message}`;
			`}`
			`},`

			`/**`
			`* Scrape a website and summarize the content based on objective if the content is too large.`
			`* Objective is the original objective & task that user give to the agent, url is the url of the website to be scraped.`
			`* Here we can leverage the document collector to get raw website text quickly.`
			`*`
			`* @param url`
			`* @returns`
			`*/`
			`scrape: async function (url) {`
			`this.super.introspect(`
			`${this.caller}: Scraping the content of ${url}`
			`);`
			`const { success, content } =`
			`await new CollectorApi().getLinkContent(url);`

			`if (!success) {`
			`this.super.introspect(`
			`${this.caller}: could not scrape ${url}. I can't use this page's content.`
			`);`
			`throw new Error(`
			`URL could not be scraped and no content was found.`
			`);`
			`}`

			`if (content?.length <= 8000) {`
			`return content;`
			`}`

			`this.super.introspect(`
			`${this.caller}: This page's content is way too long. I will summarize it right now.`
			`);`
			`this.super.onAbort(() => {`
			`this.super.handlerProps.log(`
			`"Abort was triggered, exiting summarization early."`
			`);`
			`this.controller.abort();`
			`});`
			`return summarizeContent(this.controller.signal, content);`
			`},`
			`});`
			`},`
			`};`
			`},`
			`};`

			`module.exports = {`
			`webScraping,`
			`};`