2024-04-16 19:50:10 +02:00
|
|
|
const { CollectorApi } = require("../../../collectorApi");
|
2024-04-17 23:04:51 +02:00
|
|
|
const Provider = require("../providers/ai-provider");
|
2024-04-16 19:50:10 +02:00
|
|
|
const { summarizeContent } = require("../utils/summarize");
|
|
|
|
|
|
|
|
const webScraping = {
|
|
|
|
name: "web-scraping",
|
|
|
|
startupConfig: {
|
|
|
|
params: {},
|
|
|
|
},
|
|
|
|
plugin: function () {
|
|
|
|
return {
|
|
|
|
name: this.name,
|
|
|
|
setup(aibitat) {
|
|
|
|
aibitat.function({
|
|
|
|
super: aibitat,
|
|
|
|
name: this.name,
|
|
|
|
controller: new AbortController(),
|
|
|
|
description:
|
2024-05-08 01:35:47 +02:00
|
|
|
"Scrapes the content of a webpage or online resource from a provided URL.",
|
|
|
|
examples: [
|
|
|
|
{
|
2024-07-22 20:05:34 +02:00
|
|
|
prompt: "What is anythingllm.com about?",
|
|
|
|
call: JSON.stringify({ url: "https://anythingllm.com" }),
|
2024-05-08 01:35:47 +02:00
|
|
|
},
|
|
|
|
{
|
|
|
|
prompt: "Scrape https://example.com",
|
2024-05-23 17:29:25 +02:00
|
|
|
call: JSON.stringify({ url: "https://example.com" }),
|
2024-05-08 01:35:47 +02:00
|
|
|
},
|
|
|
|
],
|
2024-04-16 19:50:10 +02:00
|
|
|
parameters: {
|
|
|
|
$schema: "http://json-schema.org/draft-07/schema#",
|
|
|
|
type: "object",
|
|
|
|
properties: {
|
|
|
|
url: {
|
|
|
|
type: "string",
|
|
|
|
format: "uri",
|
2024-05-08 01:35:47 +02:00
|
|
|
description:
|
|
|
|
"A complete web address URL including protocol. Assumes https if not provided.",
|
2024-04-16 19:50:10 +02:00
|
|
|
},
|
|
|
|
},
|
|
|
|
additionalProperties: false,
|
|
|
|
},
|
|
|
|
handler: async function ({ url }) {
|
|
|
|
try {
|
|
|
|
if (url) return await this.scrape(url);
|
|
|
|
return "There is nothing we can do. This function call returns no information.";
|
|
|
|
} catch (error) {
|
|
|
|
return `There was an error while calling the function. No data or response was found. Let the user know this was the error: ${error.message}`;
|
|
|
|
}
|
|
|
|
},
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Scrape a website and summarize the content based on objective if the content is too large.
|
|
|
|
* Objective is the original objective & task that user give to the agent, url is the url of the website to be scraped.
|
|
|
|
* Here we can leverage the document collector to get raw website text quickly.
|
|
|
|
*
|
|
|
|
* @param url
|
|
|
|
* @returns
|
|
|
|
*/
|
|
|
|
scrape: async function (url) {
|
|
|
|
this.super.introspect(
|
|
|
|
`${this.caller}: Scraping the content of ${url}`
|
|
|
|
);
|
|
|
|
const { success, content } =
|
|
|
|
await new CollectorApi().getLinkContent(url);
|
|
|
|
|
|
|
|
if (!success) {
|
|
|
|
this.super.introspect(
|
|
|
|
`${this.caller}: could not scrape ${url}. I can't use this page's content.`
|
|
|
|
);
|
|
|
|
throw new Error(
|
|
|
|
`URL could not be scraped and no content was found.`
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2024-04-17 23:04:51 +02:00
|
|
|
if (!content || content?.length === 0) {
|
|
|
|
throw new Error("There was no content to be collected or read.");
|
|
|
|
}
|
|
|
|
|
2024-08-15 21:13:28 +02:00
|
|
|
const { TokenManager } = require("../../../helpers/tiktoken");
|
|
|
|
if (
|
|
|
|
new TokenManager(this.super.model).countFromString(content) <
|
|
|
|
Provider.contextLimit(this.super.provider, this.super.model)
|
|
|
|
) {
|
2024-04-16 19:50:10 +02:00
|
|
|
return content;
|
|
|
|
}
|
|
|
|
|
|
|
|
this.super.introspect(
|
|
|
|
`${this.caller}: This page's content is way too long. I will summarize it right now.`
|
|
|
|
);
|
|
|
|
this.super.onAbort(() => {
|
|
|
|
this.super.handlerProps.log(
|
|
|
|
"Abort was triggered, exiting summarization early."
|
|
|
|
);
|
|
|
|
this.controller.abort();
|
|
|
|
});
|
2024-06-10 23:31:39 +02:00
|
|
|
|
|
|
|
return summarizeContent({
|
|
|
|
provider: this.super.provider,
|
|
|
|
model: this.super.model,
|
|
|
|
controllerSignal: this.controller.signal,
|
|
|
|
content,
|
|
|
|
});
|
2024-04-16 19:50:10 +02:00
|
|
|
},
|
|
|
|
});
|
|
|
|
},
|
|
|
|
};
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
// Public surface of this module: the web-scraping agent plugin definition.
module.exports = { webScraping };
|