Adds ability to import sitemaps to include a website (#51)

* Adds ability to import sitemaps to include a website

* adds example sitemap url
This commit is contained in:
Skid Vis 2023-06-14 13:04:17 -05:00 committed by GitHub
parent 040e0d3df7
commit bd32f97a21
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 96 additions and 48 deletions

View File

@ -5,6 +5,7 @@ from scripts.link import link, links
from scripts.substack import substack from scripts.substack import substack
from scripts.medium import medium from scripts.medium import medium
from scripts.gitbook import gitbook from scripts.gitbook import gitbook
from scripts.sitemap import sitemap
def main(): def main():
if os.name == 'nt': if os.name == 'nt':
@ -13,7 +14,8 @@ def main():
'2': 'Article or Blog Link', '2': 'Article or Blog Link',
'3': 'Substack', '3': 'Substack',
'4': 'Medium', '4': 'Medium',
'5': 'Gitbook' '5': 'Gitbook',
'6': 'Sitemap',
} }
print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.") print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.")
print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n") print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n")
@ -29,6 +31,7 @@ def main():
{"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"}, {"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"},
{"name": "Gitbook", "value": "Gitbook"}, {"name": "Gitbook", "value": "Gitbook"},
{"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"}, {"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"},
{"name": "Sitemap", "value": "Sitemap"},
{"name": "Abort", "value": "Abort"}, {"name": "Abort", "value": "Abort"},
], ],
).execute() ).execute()
@ -62,6 +65,9 @@ def main():
if method == 'Gitbook': if method == 'Gitbook':
gitbook() gitbook()
exit(0) exit(0)
if method == 'Sitemap':
sitemap()
exit(0)
print("Selection was not valid.") print("Selection was not valid.")
exit(1) exit(1)

View File

@ -4,6 +4,7 @@ from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta from .link_utils import append_meta
from .utils import tokenize, ada_v2_cost from .utils import tokenize, ada_v2_cost
from requests.exceptions import ReadTimeout
# Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/ # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
def link(): def link():
@ -83,57 +84,71 @@ def links():
print("No valid links provided!") print("No valid links provided!")
exit(1) exit(1)
totalTokens = 0 parse_links(links)
for link in links:
print(f"Working on {link}...")
session = HTMLSession()
req = session.get(link)
if(req.ok == False):
print(f"Could not reach {link} - skipping!")
continue
req.html.render()
full_text = None
with tempfile.NamedTemporaryFile(mode = "w") as tmp:
tmp.write(req.html.html)
tmp.seek(0)
loader = UnstructuredHTMLLoader(tmp.name)
data = loader.load()[0]
full_text = data.page_content
tmp.close()
link = append_meta(req, full_text, True)
if(len(full_text) > 0):
source = urlparse(req.url)
output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
output_path = f"./outputs/website-logs"
transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
if os.path.isdir(output_path) == False:
os.makedirs(output_path)
# parse links from array
def parse_links(links):
    """Fetch, JS-render, and persist the page content of every URL in `links`.

    For each link: skips PDFs, downloads and renders the page with
    requests-html, extracts text via UnstructuredHTMLLoader, attaches
    metadata with `append_meta`, and writes one JSON document to the
    local log directory and one to the server document store.

    Args:
        links (list[str]): page URLs to ingest.

    Side effects:
        Creates ./outputs/website-logs and
        ../server/storage/documents/website-<host> directories as needed,
        writes JSON files into them, and prints a token-cost summary.
    """
    totalTokens = 0
    for link in links:
        if link.endswith(".pdf"):
            print(f"Skipping PDF file: {link}")
            continue

        print(f"Working on {link}...")
        session = HTMLSession()
        # Close the session on EVERY exit path (skip, error, success) —
        # previously it was only closed after a successful write, leaking
        # sessions for skipped links.
        try:
            try:
                req = session.get(link, timeout=20)
            except ReadTimeout:
                # ReadTimeout was imported but never handled; a single slow
                # host used to abort the whole batch. Skip the link instead.
                print(f"Could not reach {link} - skipping!")
                continue

            if not req.ok:
                print(f"Could not reach {link} - skipping!")
                continue

            # Render client-side JS so dynamically injected content is captured.
            req.html.render(timeout=10)

            full_text = None
            with tempfile.NamedTemporaryFile(mode="w") as tmp:
                tmp.write(req.html.html)
                tmp.seek(0)
                loader = UnstructuredHTMLLoader(tmp.name)
                data = loader.load()[0]
                full_text = data.page_content
                tmp.close()

            # `link` is rebound from URL string to the metadata dict here.
            link = append_meta(req, full_text, True)
            if len(full_text) > 0:
                source = urlparse(req.url)
                output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
                output_path = f"./outputs/website-logs"

                transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
                transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"

                os.makedirs(output_path, exist_ok=True)
                os.makedirs(transaction_output_dir, exist_ok=True)

                full_text = append_meta(req, full_text)
                tokenCount = len(tokenize(full_text))
                link['pageContent'] = full_text
                link['token_count_estimate'] = tokenCount
                totalTokens += tokenCount

                with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
                    json.dump(link, file, ensure_ascii=True, indent=4)

                with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
                    json.dump(link, file, ensure_ascii=True, indent=4)
            else:
                print(f"Could not parse any meaningful data from {link}.")
                continue
        finally:
            session.close()

    print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
    print(f"////////////////////////////")
    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
    print(f"////////////////////////////")

View File

@ -0,0 +1,27 @@
import requests
import xml.etree.ElementTree as ET
from scripts.link import parse_links
def parse_sitemap(url):
    """Download a sitemap XML document and return every <loc> URL it lists.

    Args:
        url (str): address of a sitemap following the sitemaps.org 0.9 schema.

    Returns:
        list[str]: all page URLs found in the sitemap (possibly empty).

    Raises:
        requests.RequestException: if the sitemap cannot be fetched in time
            or the server answers with an HTTP error status.
        xml.etree.ElementTree.ParseError: if the response is not valid XML.
    """
    # Bound the request so a dead or slow host cannot hang the importer forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to XML-parse an error page.
    response.raise_for_status()

    root = ET.fromstring(response.content)

    # Sitemap elements live in the sitemaps.org namespace; build the
    # Clark-notation prefix once instead of repeating the full URI.
    ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    return [
        loc.text
        for element in root.iter(f'{ns}url')
        for loc in element.iter(f'{ns}loc')
    ]
# Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml
def sitemap():
    """Prompt for a sitemap URL, expand it into page URLs, and ingest each page."""
    target = input("Enter the URL of the sitemap: ")

    # Guard clause: bail out before doing any network work.
    if not target:
        print("No valid sitemap provided!")
        exit(1)

    # Expand the sitemap into individual page links, then hand the whole
    # batch to the shared link-ingestion pipeline.
    parse_links(parse_sitemap(target))