Mirror of https://github.com/Mintplex-Labs/anything-llm.git
Enable web scraping based on a URL and a simple filter. (#73)
parent 81b2159329
commit e7ba028497
@@ -1,7 +1,7 @@
 import os
 from InquirerPy import inquirer
 from scripts.youtube import youtube
-from scripts.link import link, links
+from scripts.link import link, links, crawler
 from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
@@ -42,6 +42,7 @@ def main():
         choices=[
             {"name": "Single URL", "value": "Single URL"},
             {"name": "Multiple URLs", "value": "Multiple URLs"},
+            {"name": "URL Crawler", "value": "URL Crawler"},
             {"name": "Abort", "value": "Abort"},
         ],
     ).execute()
@@ -51,6 +52,9 @@ def main():
     if method == 'Multiple URLs':
         links()
         exit(0)
+    if method == 'URL Crawler':
+        crawler()
+        exit(0)

     if method == 'Abort': exit(0)
     if method == 'YouTube Channel':
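For context, the hunks above wire a new "URL Crawler" entry into the collector's InquirerPy menu. A minimal sketch of how the fragment fits together, assuming the surrounding call follows InquirerPy's standard select pattern (the message string is an assumption, not taken from the commit):

from InquirerPy import inquirer

# Sketch only: the choices and the dispatch mirror the diff above;
# the message text is illustrative.
method = inquirer.select(
    message="What would you like to collect?",
    choices=[
        {"name": "Single URL", "value": "Single URL"},
        {"name": "Multiple URLs", "value": "Multiple URLs"},
        {"name": "URL Crawler", "value": "URL Crawler"},
        {"name": "Abort", "value": "Abort"},
    ],
).execute()

if method == 'URL Crawler':
    crawler()
    exit(0)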
@@ -4,6 +4,8 @@ from requests_html import HTMLSession
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
+import requests
+from bs4 import BeautifulSoup

 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
@@ -64,6 +66,29 @@ def link():
         print(f"////////////////////////////")
     exit(0)

+def crawler():
+    prompt = "Paste in root URI of the pages of interest: "
+    new_link = input(prompt)
+    filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")
+    # extract the scheme and hostname from the uri provided
+    root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
+    links = []
+    urls = new_link
+    links.append(new_link)
+    grab = requests.get(urls)
+    soup = BeautifulSoup(grab.text, 'html.parser')
+
+    # traverse the anchor tags found in the page
+    for link in soup.find_all("a"):
+        data = link.get('href').strip()
+        if filter_value in data:
+            print(data)
+            links.append(root_site + data)
+        else:
+            print(data + " does not apply for linking...")
+    # parse the links found
+    parse_links(links)
+
 def links():
     links = []
     prompt = "Paste in the URL of an online article or blog: "
@@ -86,7 +111,6 @@ def links():
     parse_links(links)


-
 # parse links from array
 def parse_links(links):
     totalTokens = 0
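Two sharp edges in crawler() as committed: link.get('href') returns None for anchor tags that have no href, so the chained .strip() can raise AttributeError, and root_site + data only forms a valid URL when the href is a site-relative path (an absolute href would be doubled up). A minimal defensive sketch of the same link-collection step, not part of the commit; the helper name collect_links is illustrative:

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def collect_links(new_link, filter_value):
    # Illustrative rewrite of the loop in crawler(): keep links whose
    # href contains filter_value, skip anchors without an href, and
    # resolve relative paths against the fetched page's URL.
    links = [new_link]
    soup = BeautifulSoup(requests.get(new_link).text, "html.parser")
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if href is None:
            continue  # an <a> with no href would crash .strip() in the original
        href = href.strip()
        if filter_value in href:
            # urljoin leaves absolute URLs untouched and joins relative ones
            links.append(urljoin(new_link, href))
    return links

urljoin also covers scheme-relative and query-only hrefs, which the scheme + "://" + hostname concatenation in the commit does not.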