"""Interactive collectors that turn web pages into JSON documents.

Each collector prompts on stdin for one or more URLs, renders the page,
extracts its text, and writes the result (plus a token-count estimate) to
``./outputs/website-logs`` and ``../server/storage/documents/website-<host>``.
"""
import json
import os
import sys
import tempfile
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from langchain.document_loaders import UnstructuredHTMLLoader
from requests_html import HTMLSession

from .link_utils import append_meta
from .utils import ada_v2_cost, tokenize

# Seconds allowed for the HTTP request and for the JavaScript render step.
_REQUEST_TIMEOUT = 20
_RENDER_TIMEOUT = 10

_OUTPUT_PATH = "./outputs/website-logs"
_DOCUMENTS_ROOT = "../server/storage/documents"


class _PageUnreachable(Exception):
    """The URL could not be fetched (request error or non-OK response)."""


class _PageEmpty(Exception):
    """The page was fetched but no text could be extracted from it."""


def _extract_text(html):
    """Return the text UnstructuredHTMLLoader extracts from an HTML string."""
    with tempfile.NamedTemporaryFile(mode="w") as tmp:
        tmp.write(html)
        # The loader reopens the file by name, so the buffered HTML has to be
        # on disk before it reads.
        tmp.flush()
        documents = UnstructuredHTMLLoader(tmp.name).load()
    return documents[0].page_content if documents else ""


def _write_document(req, metadata, full_text):
    """Write one page's JSON to both output directories; return its token count."""
    source = urlparse(req.url)
    path_slug = source.path.replace('/', '_')
    output_filename = f"website-{source.netloc}-{path_slug}.json"
    transaction_output_filename = f"article-{path_slug}.json"
    transaction_output_dir = f"{_DOCUMENTS_ROOT}/website-{source.netloc}"

    os.makedirs(_OUTPUT_PATH, exist_ok=True)
    os.makedirs(transaction_output_dir, exist_ok=True)

    # NOTE(review): append_meta without the third argument is assumed to return
    # the text that should be stored and tokenized - confirm in link_utils.
    full_text = append_meta(req, full_text)
    token_count = len(tokenize(full_text))
    metadata['pageContent'] = full_text
    metadata['token_count_estimate'] = token_count

    targets = (
        f"{_OUTPUT_PATH}/{output_filename}",
        f"{transaction_output_dir}/{transaction_output_filename}",
    )
    for target in targets:
        with open(target, 'w', encoding='utf-8') as file:
            json.dump(metadata, file, ensure_ascii=True, indent=4)
    return token_count


def _collect_page(url):
    """Fetch, render, extract and store a single page.

    Returns:
        The token count of the stored document.

    Raises:
        _PageUnreachable: the request failed or the response was not OK.
        _PageEmpty: the page produced no text.
    """
    session = HTMLSession()
    try:
        try:
            req = session.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as err:
            raise _PageUnreachable(url) from err
        if not req.ok:
            raise _PageUnreachable(url)

        req.html.render(timeout=_RENDER_TIMEOUT)
        full_text = _extract_text(req.html.html)

        # NOTE(review): with True as third argument append_meta is assumed to
        # return the metadata mapping (it is key-assigned and JSON-dumped
        # later) - confirm in link_utils. Called before the emptiness check,
        # as the original code did.
        metadata = append_meta(req, full_text, True)
        if not full_text:
            raise _PageEmpty(url)
        return _write_document(req, metadata, full_text)
    finally:
        # Always release the session, including on skipped or failed pages.
        session.close()


def _print_cost_summary(token_count):
    """Print the embedding cost estimate for ``token_count`` tokens."""
    print("////////////////////////////")
    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(token_count)} using {token_count} tokens.")
    print("////////////////////////////")


def link():
    """Prompt for one article URL, collect it, and exit the process.

    Exits with status 0 on success and 1 when the URL is empty, unreachable,
    or yields no text.
    """
    print("[NOTICE]: The first time running this process it will download supporting libraries.\n\n")
    fqdn_link = input("Paste in the URL of an online article or blog: ")
    if not fqdn_link:
        print("Invalid URL!")
        sys.exit(1)

    try:
        token_count = _collect_page(fqdn_link)
    except _PageUnreachable:
        print("Could not reach this url!")
        sys.exit(1)
    except _PageEmpty:
        print("Could not parse any meaningful data from this link or url.")
        sys.exit(1)

    print("\n\n[Success]: article or link content fetched!")
    _print_cost_summary(token_count)
    sys.exit(0)


def crawler():
    """Prompt for a root URL and a filter, then collect every matching link.

    Anchors on the root page whose ``href`` contains the filter value are
    resolved against the root URL and passed, together with the root URL
    itself, to ``parse_links``.
    """
    prompt = "Paste in root URI of the pages of interest: "
    new_link = input(prompt)
    filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")

    found = [new_link]
    try:
        grab = requests.get(new_link, timeout=_REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        print("Could not reach this url!")
        sys.exit(1)
    soup = BeautifulSoup(grab.text, 'html.parser')

    # Walk every anchor on the root page.
    for anchor in soup.find_all("a"):
        href = anchor.get('href')
        if href is None:
            continue
        if filter_value in href:
            href = href.strip()
            print(href)
            # urljoin copes with absolute, rooted and relative hrefs alike;
            # plain string concatenation only worked for rooted paths.
            found.append(urljoin(new_link, href))
        else:
            print(href + " does not apply for linking...")

    # Collect the links found.
    parse_links(found)


def links():
    """Prompt repeatedly for article URLs, then collect all of them.

    An empty entry ends input. Duplicate URLs are dropped, keeping the order
    in which they were first pasted. Exits with status 1 if nothing was
    entered.
    """
    queue = []
    prompt = "Paste in the URL of an online article or blog: "
    while True:
        new_link = input(prompt)
        if not new_link:
            break
        queue.append(new_link)
        prompt = f"\n{len(queue)} links in queue. Submit an empty value when done pasting in links to execute collection.\nPaste in the next URL of an online article or blog: "

    queue = list(dict.fromkeys(queue))
    if not queue:
        print("No valid links provided!")
        sys.exit(1)

    parse_links(queue)


def parse_links(links):
    """Collect every URL in ``links`` and print a combined cost estimate.

    Unreachable or empty pages are reported and skipped; the final summary
    counts only the pages that were actually stored.
    """
    total_tokens = 0
    fetched = 0
    for url in links:
        print(f"Working on {url}...")
        try:
            total_tokens += _collect_page(url)
        except _PageUnreachable:
            print(f"Could not reach {url} - skipping!")
            continue
        except _PageEmpty:
            print(f"Could not parse any meaningful data from {url}.")
            continue
        fetched += 1

    print(f"\n\n[Success]: {fetched} article or link contents fetched!")
    _print_cost_summary(total_tokens)