Mirror of https://github.com/Mintplex-Labs/anything-llm.git
Enable web scraping based on a URL and a simple filter. (#73)
parent 81b2159329
commit e7ba028497
@@ -1,7 +1,7 @@
 import os
 from InquirerPy import inquirer
 from scripts.youtube import youtube
-from scripts.link import link, links
+from scripts.link import link, links, crawler
 from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
@@ -42,6 +42,7 @@ def main():
         choices=[
             {"name": "Single URL", "value": "Single URL"},
             {"name": "Multiple URLs", "value": "Multiple URLs"},
+            {"name": "URL Crawler", "value": "URL Crawler"},
             {"name": "Abort", "value": "Abort"},
         ],
     ).execute()
@@ -51,6 +52,9 @@ def main():
     if method == 'Multiple URLs':
         links()
         exit(0)
+    if method == 'URL Crawler':
+        crawler()
+        exit(0)

     if method == 'Abort': exit(0)
     if method == 'YouTube Channel':
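For context, the hunks above wire a new "URL Crawler" entry into the collector's InquirerPy menu. A minimal sketch of how the fragment fits together, assuming the surrounding call follows InquirerPy's standard select pattern (the message string is an assumption, not taken from the commit):

from InquirerPy import inquirer

# Sketch only: the choices and the dispatch mirror the diff above;
# the message text is illustrative.
method = inquirer.select(
    message="What would you like to collect?",
    choices=[
        {"name": "Single URL", "value": "Single URL"},
        {"name": "Multiple URLs", "value": "Multiple URLs"},
        {"name": "URL Crawler", "value": "URL Crawler"},
        {"name": "Abort", "value": "Abort"},
    ],
).execute()

if method == 'URL Crawler':
    crawler()
    exit(0)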
@@ -4,6 +4,8 @@ from requests_html import HTMLSession
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
+import requests
+from bs4 import BeautifulSoup

 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
@@ -64,6 +66,29 @@ def link():
         print(f"////////////////////////////")
     exit(0)

+def crawler():
+    prompt = "Paste in root URI of the pages of interest: "
+    new_link = input(prompt)
+    filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")
+    # extract the scheme and hostname from the uri provided
+    root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
+    links = []
+    urls = new_link
+    links.append(new_link)
+    grab = requests.get(urls)
+    soup = BeautifulSoup(grab.text, 'html.parser')
+
+    # traverse the anchor tags found in the page
+    for link in soup.find_all("a"):
+        data = link.get('href').strip()
+        if filter_value in data:
+            print(data)
+            links.append(root_site + data)
+        else:
+            print(data + " does not apply for linking...")
+    # parse the links found
+    parse_links(links)
+
 def links():
     links = []
     prompt = "Paste in the URL of an online article or blog: "
@@ -86,7 +111,6 @@ def links():
     parse_links(links)


-
 # parse links from array
 def parse_links(links):
     totalTokens = 0
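Two sharp edges in crawler() as committed: link.get('href') returns None for anchor tags that have no href, so the chained .strip() can raise AttributeError, and root_site + data only forms a valid URL when the href is a site-relative path (an absolute href would be doubled up). A minimal defensive sketch of the same link-collection step, not part of the commit; the helper name collect_links is illustrative:

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def collect_links(new_link, filter_value):
    # Illustrative rewrite of the loop in crawler(): keep links whose
    # href contains filter_value, skip anchors without an href, and
    # resolve relative paths against the fetched page's URL.
    links = [new_link]
    soup = BeautifulSoup(requests.get(new_link).text, "html.parser")
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if href is None:
            continue  # an <a> with no href would crash .strip() in the original
        href = href.strip()
        if filter_value in href:
            # urljoin leaves absolute URLs untouched and joins relative ones
            links.append(urljoin(new_link, href))
    return links

urljoin also covers scheme-relative and query-only hrefs, which the scheme + "://" + hostname concatenation in the commit does not.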