Enable web scraping based on a URL and a simple filter. (#73)

AntonioCiolino 2023-06-16 20:29:11 -04:00 committed by GitHub
parent 81b2159329
commit e7ba028497
2 changed files with 30 additions and 2 deletions

View File

@@ -1,7 +1,7 @@
 import os
 from InquirerPy import inquirer
 from scripts.youtube import youtube
-from scripts.link import link, links
+from scripts.link import link, links, crawler
 from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
@@ -42,6 +42,7 @@ def main():
         choices=[
             {"name": "Single URL", "value": "Single URL"},
             {"name": "Multiple URLs", "value": "Multiple URLs"},
+            {"name": "URL Crawler", "value": "URL Crawler"},
             {"name": "Abort", "value": "Abort"},
         ],
     ).execute()
@@ -51,6 +52,9 @@ def main():
     if method == 'Multiple URLs':
         links()
         exit(0)
+    if method == 'URL Crawler':
+        crawler()
+        exit(0)
     if method == 'Abort': exit(0)
     if method == 'YouTube Channel':

View File

@@ -4,6 +4,8 @@ from requests_html import HTMLSession
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
+import requests
+from bs4 import BeautifulSoup

 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
@@ -64,6 +66,29 @@ def link():
     print(f"////////////////////////////")
     exit(0)
+
+def crawler():
+    prompt = "Paste in root URI of the pages of interest: "
+    new_link = input(prompt)
+    filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")
+    # extract the scheme+host root from the URI provided
+    # (urlparse is expected to be imported from urllib.parse earlier in this file)
+    root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
+    links = [new_link]
+    grab = requests.get(new_link)
+    soup = BeautifulSoup(grab.text, 'html.parser')
+    # traverse anchor tags from soup; href=True skips anchors without an href
+    for link in soup.find_all("a", href=True):
+        data = link.get('href').strip()
+        if filter_value in data:
+            print(data)
+            # hrefs are treated as site-relative paths here
+            links.append(root_site + data)
+        else:
+            print(data + " does not apply for linking...")
+    # parse the links found
+    parse_links(links)
+
 def links():
     links = []
     prompt = "Paste in the URL of an online article or blog: "
@@ -86,7 +111,6 @@ def links():
     parse_links(links)

 # parse links from array
 def parse_links(links):
     totalTokens = 0
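
For context, the crawl-and-filter pattern the new crawler() adds can be exercised on its own. The sketch below is illustrative rather than part of the commit: collect_links and the example filter value are hypothetical, and it resolves hrefs with urllib.parse.urljoin instead of the string concatenation used in the diff, so both relative and absolute links work.

from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

def collect_links(root_url, filter_value):
    # fetch root_url and return it plus every link whose href contains filter_value
    links = [root_url]
    resp = requests.get(root_url)
    soup = BeautifulSoup(resp.text, "html.parser")
    # href=True skips anchor tags that have no href attribute
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        if filter_value in href:
            # urljoin handles site-relative and absolute hrefs alike
            links.append(urljoin(root_url, href))
    return links

# example URL taken from the comment in the diff; the filter value is hypothetical
print(collect_links("https://tim.blog/2022/08/09/nft-insider-trading-policy/", "/2022/"))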