Mirror of https://github.com/Mintplex-Labs/anything-llm.git (synced 2024-11-04 22:10:12 +01:00)
Adds ability to import sitemaps to include a website (#51)

* Adds ability to import sitemaps to include a website
* adds example sitemap url

parent 040e0d3df7
commit bd32f97a21
collector/main.py

@@ -5,6 +5,7 @@ from scripts.link import link, links
 from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
+from scripts.sitemap import sitemap

 def main():
   if os.name == 'nt':
@@ -13,7 +14,8 @@ def main():
     '2': 'Article or Blog Link',
     '3': 'Substack',
     '4': 'Medium',
-    '5': 'Gitbook'
+    '5': 'Gitbook',
+    '6': 'Sitemap',
   }
   print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.")
   print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n")
@@ -29,6 +31,7 @@ def main():
       {"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"},
       {"name": "Gitbook", "value": "Gitbook"},
       {"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"},
+      {"name": "Sitemap", "value": "Sitemap"},
       {"name": "Abort", "value": "Abort"},
     ],
   ).execute()
@@ -62,6 +65,9 @@ def main():
   if method == 'Gitbook':
     gitbook()
     exit(0)
+  if method == 'Sitemap':
+    sitemap()
+    exit(0)

   print("Selection was not valid.")
   exit(1)
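
Taken together, the main.py changes wire menu option 6 to the new sitemap() entry point. A hypothetical non-interactive equivalent of that selection (not part of the commit; assumes it runs from the collector/ directory so the scripts.* imports and relative output paths resolve):

# Hypothetical driver (not in the commit): what choosing "Sitemap" amounts to,
# minus the input() prompt inside sitemap().
from scripts.sitemap import parse_sitemap
from scripts.link import parse_links

urls = parse_sitemap("https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml")
parse_links(urls)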
collector/scripts/link.py

@@ -4,6 +4,7 @@ from requests_html import HTMLSession
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
+from requests.exceptions import ReadTimeout

 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
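
The added ReadTimeout import pairs with the timeout arguments introduced on the fetch and render calls below; a minimal sketch of the guard pattern it enables (assumed shape for illustration only, the guarded call itself is not visible in this diff):

# Sketch only: how a ReadTimeout from a slow host could be caught and skipped.
from requests.exceptions import ReadTimeout
from requests_html import HTMLSession

session = HTMLSession()
try:
    req = session.get("https://example.com/article", timeout=20)
except ReadTimeout:
    print("Request timed out - skipping!")
    req = None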
@@ -83,57 +84,71 @@ def links():
     print("No valid links provided!")
     exit(1)

-  totalTokens = 0
-  for link in links:
-    print(f"Working on {link}...")
-    session = HTMLSession()
-    req = session.get(link)
-    if(req.ok == False):
-      print(f"Could not reach {link} - skipping!")
-      continue
-
-    req.html.render()
-    full_text = None
-    with tempfile.NamedTemporaryFile(mode = "w") as tmp:
-      tmp.write(req.html.html)
-      tmp.seek(0)
-      loader = UnstructuredHTMLLoader(tmp.name)
-      data = loader.load()[0]
-      full_text = data.page_content
-      tmp.close()
-
-    link = append_meta(req, full_text, True)
-    if(len(full_text) > 0):
-      source = urlparse(req.url)
-      output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
-      output_path = f"./outputs/website-logs"
-
-      transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
-      transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
-
-      if os.path.isdir(output_path) == False:
-        os.makedirs(output_path)
-
-      if os.path.isdir(transaction_output_dir) == False:
-        os.makedirs(transaction_output_dir)
-
-      full_text = append_meta(req, full_text)
-      tokenCount = len(tokenize(full_text))
-      link['pageContent'] = full_text
-      link['token_count_estimate'] = tokenCount
-      totalTokens += tokenCount
-
-      with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
-        json.dump(link, file, ensure_ascii=True, indent=4)
-
-      with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
-        json.dump(link, file, ensure_ascii=True, indent=4)
-    else:
-      print(f"Could not parse any meaningful data from {link}.")
-      continue
-
-  print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
-  print(f"////////////////////////////")
-  print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
-  print(f"////////////////////////////")
-  exit(0)
+  parse_links(links)
+
+
+# parse links from array
+def parse_links(links):
+  totalTokens = 0
+  for link in links:
+    if link.endswith(".pdf"):
+      print(f"Skipping PDF file: {link}")
+      continue
+
+    print(f"Working on {link}...")
+    session = HTMLSession()
+
+    req = session.get(link, timeout=20)
+
+    if not req.ok:
+      print(f"Could not reach {link} - skipping!")
+      continue
+
+    req.html.render(timeout=10)
+
+    full_text = None
+    with tempfile.NamedTemporaryFile(mode="w") as tmp:
+      tmp.write(req.html.html)
+      tmp.seek(0)
+      loader = UnstructuredHTMLLoader(tmp.name)
+      data = loader.load()[0]
+      full_text = data.page_content
+      tmp.close()
+
+    link = append_meta(req, full_text, True)
+    if len(full_text) > 0:
+      source = urlparse(req.url)
+      output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
+      output_path = f"./outputs/website-logs"
+
+      transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
+      transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
+
+      if not os.path.isdir(output_path):
+        os.makedirs(output_path)
+
+      if not os.path.isdir(transaction_output_dir):
+        os.makedirs(transaction_output_dir)
+
+      full_text = append_meta(req, full_text)
+      tokenCount = len(tokenize(full_text))
+      link['pageContent'] = full_text
+      link['token_count_estimate'] = tokenCount
+      totalTokens += tokenCount
+
+      with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
+        json.dump(link, file, ensure_ascii=True, indent=4)
+
+      with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
+        json.dump(link, file, ensure_ascii=True, indent=4)
+
+      req.session.close()
+    else:
+      print(f"Could not parse any meaningful data from {link}.")
+      continue
+
+  print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
+  print(f"////////////////////////////")
+  print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
+  print(f"////////////////////////////")
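
Each successfully scraped page is written out twice: once to ./outputs/website-logs for inspection and once into the server's document storage for embedding. A sketch of the record shape, limited to the two keys assigned in parse_links (append_meta() merges in further metadata not visible in this diff):

# Partial shape sketch (illustrative only): just the fields set in parse_links.
record = {
    "pageContent": "full page text extracted by UnstructuredHTMLLoader",
    "token_count_estimate": 1024,  # len(tokenize(full_text))
}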
collector/scripts/sitemap.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+import requests
+import xml.etree.ElementTree as ET
+from scripts.link import parse_links
+
+def parse_sitemap(url):
+  response = requests.get(url)
+  root = ET.fromstring(response.content)
+
+  urls = []
+  for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
+    for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
+      urls.append(loc.text)
+
+  return urls
+
+# Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml
+def sitemap():
+  sitemap_url = input("Enter the URL of the sitemap: ")
+
+  if(len(sitemap_url) == 0):
+    print("No valid sitemap provided!")
+    exit(1)
+
+  url_array = parse_sitemap(sitemap_url)
+
+  #parse links from array
+  parse_links(url_array)
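
parse_sitemap() assumes a standard sitemaps.org urlset document, fed in as bytes (response.content). A self-contained sketch of the same namespace-aware extraction against a minimal inline sitemap (illustrative only, not part of the commit):

# Illustrative only: the <urlset>/<url>/<loc> shape parse_sitemap() expects.
import xml.etree.ElementTree as ET

SAMPLE_SITEMAP = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/post-1</loc></url>
  <url><loc>https://example.com/post-2</loc></url>
</urlset>"""

NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
root = ET.fromstring(SAMPLE_SITEMAP)
urls = [loc.text for url in root.iter(NS + 'url') for loc in url.iter(NS + 'loc')]
print(urls)  # ['https://example.com/post-1', 'https://example.com/post-2']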