Adds ability to import sitemaps to include a website (#51)

* Adds ability to import sitemaps to include a website

* adds example sitemap url
This commit is contained in:
Skid Vis 2023-06-14 13:04:17 -05:00 committed by GitHub
parent 040e0d3df7
commit bd32f97a21
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 96 additions and 48 deletions

View File

@ -5,6 +5,7 @@ from scripts.link import link, links
from scripts.substack import substack from scripts.substack import substack
from scripts.medium import medium from scripts.medium import medium
from scripts.gitbook import gitbook from scripts.gitbook import gitbook
from scripts.sitemap import sitemap
def main(): def main():
if os.name == 'nt': if os.name == 'nt':
@ -13,7 +14,8 @@ def main():
'2': 'Article or Blog Link', '2': 'Article or Blog Link',
'3': 'Substack', '3': 'Substack',
'4': 'Medium', '4': 'Medium',
'5': 'Gitbook' '5': 'Gitbook',
'6': 'Sitemap',
} }
print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.") print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.")
print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n") print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n")
@ -29,6 +31,7 @@ def main():
{"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"}, {"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"},
{"name": "Gitbook", "value": "Gitbook"}, {"name": "Gitbook", "value": "Gitbook"},
{"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"}, {"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"},
{"name": "Sitemap", "value": "Sitemap"},
{"name": "Abort", "value": "Abort"}, {"name": "Abort", "value": "Abort"},
], ],
).execute() ).execute()
@ -62,6 +65,9 @@ def main():
if method == 'Gitbook': if method == 'Gitbook':
gitbook() gitbook()
exit(0) exit(0)
if method == 'Sitemap':
sitemap()
exit(0)
print("Selection was not valid.") print("Selection was not valid.")
exit(1) exit(1)

View File

@ -4,6 +4,7 @@ from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta from .link_utils import append_meta
from .utils import tokenize, ada_v2_cost from .utils import tokenize, ada_v2_cost
from requests.exceptions import ReadTimeout
# Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/ # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
def link(): def link():
@ -83,57 +84,71 @@ def links():
print("No valid links provided!") print("No valid links provided!")
exit(1) exit(1)
totalTokens = 0 parse_links(links)
for link in links:
print(f"Working on {link}...")
session = HTMLSession()
req = session.get(link)
if(req.ok == False):
print(f"Could not reach {link} - skipping!")
continue
req.html.render()
full_text = None
with tempfile.NamedTemporaryFile(mode = "w") as tmp:
tmp.write(req.html.html)
tmp.seek(0)
loader = UnstructuredHTMLLoader(tmp.name)
data = loader.load()[0]
full_text = data.page_content
tmp.close()
link = append_meta(req, full_text, True)
if(len(full_text) > 0):
source = urlparse(req.url)
output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
output_path = f"./outputs/website-logs"
transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
if os.path.isdir(output_path) == False:
os.makedirs(output_path)
# parse links from array
def parse_links(links):
    """Fetch, JS-render, and persist the page content of every URL in `links`.

    For each link: skips PDFs, downloads and renders the page with
    requests-html, extracts text via UnstructuredHTMLLoader, attaches
    metadata with `append_meta`, and writes one JSON document to the
    local log directory and one to the server document store.

    Args:
        links (list[str]): page URLs to ingest.

    Side effects:
        Creates ./outputs/website-logs and
        ../server/storage/documents/website-<host> directories as needed,
        writes JSON files into them, and prints a token-cost summary.
    """
    totalTokens = 0
    for link in links:
        if link.endswith(".pdf"):
            print(f"Skipping PDF file: {link}")
            continue

        print(f"Working on {link}...")
        session = HTMLSession()
        # Close the session on EVERY exit path (skip, error, success) —
        # previously it was only closed after a successful write, leaking
        # sessions for skipped links.
        try:
            try:
                req = session.get(link, timeout=20)
            except ReadTimeout:
                # ReadTimeout was imported but never handled; a single slow
                # host used to abort the whole batch. Skip the link instead.
                print(f"Could not reach {link} - skipping!")
                continue

            if not req.ok:
                print(f"Could not reach {link} - skipping!")
                continue

            # Render client-side JS so dynamically injected content is captured.
            req.html.render(timeout=10)

            full_text = None
            with tempfile.NamedTemporaryFile(mode="w") as tmp:
                tmp.write(req.html.html)
                tmp.seek(0)
                loader = UnstructuredHTMLLoader(tmp.name)
                data = loader.load()[0]
                full_text = data.page_content
                tmp.close()

            # `link` is rebound from URL string to the metadata dict here.
            link = append_meta(req, full_text, True)
            if len(full_text) > 0:
                source = urlparse(req.url)
                output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
                output_path = f"./outputs/website-logs"

                transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
                transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"

                os.makedirs(output_path, exist_ok=True)
                os.makedirs(transaction_output_dir, exist_ok=True)

                full_text = append_meta(req, full_text)
                tokenCount = len(tokenize(full_text))
                link['pageContent'] = full_text
                link['token_count_estimate'] = tokenCount
                totalTokens += tokenCount

                with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
                    json.dump(link, file, ensure_ascii=True, indent=4)

                with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
                    json.dump(link, file, ensure_ascii=True, indent=4)
            else:
                print(f"Could not parse any meaningful data from {link}.")
                continue
        finally:
            session.close()

    print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
    print(f"////////////////////////////")
    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
    print(f"////////////////////////////")

View File

@ -0,0 +1,27 @@
import requests
import xml.etree.ElementTree as ET
from scripts.link import parse_links
def parse_sitemap(url):
    """Download a sitemap XML document and return every <loc> URL it lists.

    Args:
        url (str): address of a sitemap following the sitemaps.org 0.9 schema.

    Returns:
        list[str]: all page URLs found in the sitemap (possibly empty).

    Raises:
        requests.RequestException: if the sitemap cannot be fetched in time
            or the server answers with an HTTP error status.
        xml.etree.ElementTree.ParseError: if the response is not valid XML.
    """
    # Bound the request so a dead or slow host cannot hang the importer forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to XML-parse an error page.
    response.raise_for_status()

    root = ET.fromstring(response.content)

    # Sitemap elements live in the sitemaps.org namespace; build the
    # Clark-notation prefix once instead of repeating the full URI.
    ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    return [
        loc.text
        for element in root.iter(f'{ns}url')
        for loc in element.iter(f'{ns}loc')
    ]
# Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml
def sitemap():
    """Prompt for a sitemap URL, expand it into page URLs, and ingest each page."""
    target = input("Enter the URL of the sitemap: ")

    # Guard clause: bail out before doing any network work.
    if not target:
        print("No valid sitemap provided!")
        exit(1)

    # Expand the sitemap into individual page links, then hand the whole
    # batch to the shared link-ingestion pipeline.
    parse_links(parse_sitemap(target))