Mirror of https://github.com/Mintplex-Labs/anything-llm.git (synced 2024-11-19 20:50:09 +01:00)
Adds ability to import sitemaps to include a website (#51)
* Adds ability to import sitemaps to include a website
* adds example sitemap url
commit bd32f97a21 (parent 040e0d3df7)
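End to end, the new flow is: option 6 in the collector menu calls sitemap(), which prompts for a sitemap URL; parse_sitemap() pulls every <loc> entry out of the sitemap XML; and parse_links() (newly factored out of links() in scripts/link.py) scrapes and stores each page. The same path can be exercised without the menu; a minimal sketch, assuming it is run from the collector directory so the scripts package resolves:

from scripts.sitemap import parse_sitemap
from scripts.link import parse_links

# Collect every <loc> URL from the sitemap, then scrape each page.
urls = parse_sitemap("https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml")
parse_links(urls)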
collector/main.py

@@ -5,6 +5,7 @@ from scripts.link import link, links
 from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
+from scripts.sitemap import sitemap

 def main():
   if os.name == 'nt':
@@ -13,7 +14,8 @@ def main():
     '2': 'Article or Blog Link',
     '3': 'Substack',
     '4': 'Medium',
-    '5': 'Gitbook'
+    '5': 'Gitbook',
+    '6': 'Sitemap',
   }
   print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.")
   print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n")
@@ -29,6 +31,7 @@ def main():
       {"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"},
       {"name": "Gitbook", "value": "Gitbook"},
       {"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"},
+      {"name": "Sitemap", "value": "Sitemap"},
       {"name": "Abort", "value": "Abort"},
     ],
   ).execute()
@@ -62,6 +65,9 @@ def main():
   if method == 'Gitbook':
     gitbook()
     exit(0)
+  if method == 'Sitemap':
+    sitemap()
+    exit(0)

   print("Selection was not valid.")
   exit(1)
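The wiring mirrors the existing collectors: one entry in the numeric methods dict, one InquirerPy choice, and one dispatch branch that runs the collector and exits. A stripped-down sketch of that shape (simplified for illustration, not the literal contents of main.py):

from scripts.gitbook import gitbook
from scripts.sitemap import sitemap

def dispatch(method):
    # Simplified version of the branching in main(): each collector
    # runs once, then the process exits with a success code.
    collectors = {'Gitbook': gitbook, 'Sitemap': sitemap}
    if method not in collectors:
        print("Selection was not valid.")
        exit(1)
    collectors[method]()
    exit(0)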
collector/scripts/link.py

@@ -4,6 +4,7 @@ from requests_html import HTMLSession
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
+from requests.exceptions import ReadTimeout

 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
@@ -83,57 +84,71 @@ def links():
     print("No valid links provided!")
     exit(1)

-  totalTokens = 0
-  for link in links:
-    print(f"Working on {link}...")
-    session = HTMLSession()
-    req = session.get(link)
-    if(req.ok == False):
-      print(f"Could not reach {link} - skipping!")
-      continue
-
-    req.html.render()
-    full_text = None
-    with tempfile.NamedTemporaryFile(mode = "w") as tmp:
-      tmp.write(req.html.html)
-      tmp.seek(0)
-      loader = UnstructuredHTMLLoader(tmp.name)
-      data = loader.load()[0]
-      full_text = data.page_content
-      tmp.close()
-
-    link = append_meta(req, full_text, True)
-    if(len(full_text) > 0):
-      source = urlparse(req.url)
-      output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
-      output_path = f"./outputs/website-logs"
-
-      transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
-      transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
-
-      if os.path.isdir(output_path) == False:
-        os.makedirs(output_path)
-
-      if os.path.isdir(transaction_output_dir) == False:
-        os.makedirs(transaction_output_dir)
-
-      full_text = append_meta(req, full_text)
-      tokenCount = len(tokenize(full_text))
-      link['pageContent'] = full_text
-      link['token_count_estimate'] = tokenCount
-      totalTokens += tokenCount
-
-      with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
-        json.dump(link, file, ensure_ascii=True, indent=4)
-
-      with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
-        json.dump(link, file, ensure_ascii=True, indent=4)
-    else:
-      print(f"Could not parse any meaningful data from {link}.")
-      continue
-
-  print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
-  print(f"////////////////////////////")
-  print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
-  print(f"////////////////////////////")
-  exit(0)
+  parse_links(links)
+
+
+# parse links from array
+def parse_links(links):
+  totalTokens = 0
+  for link in links:
+    if link.endswith(".pdf"):
+      print(f"Skipping PDF file: {link}")
+      continue
+
+    print(f"Working on {link}...")
+    session = HTMLSession()
+
+    req = session.get(link, timeout=20)
+
+    if not req.ok:
+      print(f"Could not reach {link} - skipping!")
+      continue
+
+    req.html.render(timeout=10)
+
+    full_text = None
+    with tempfile.NamedTemporaryFile(mode="w") as tmp:
+      tmp.write(req.html.html)
+      tmp.seek(0)
+      loader = UnstructuredHTMLLoader(tmp.name)
+      data = loader.load()[0]
+      full_text = data.page_content
+      tmp.close()

+    link = append_meta(req, full_text, True)
+    if len(full_text) > 0:
+      source = urlparse(req.url)
+      output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
+      output_path = f"./outputs/website-logs"
+
+      transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
+      transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
+
+      if not os.path.isdir(output_path):
+        os.makedirs(output_path)
+
+      if not os.path.isdir(transaction_output_dir):
+        os.makedirs(transaction_output_dir)
+
+      full_text = append_meta(req, full_text)
+      tokenCount = len(tokenize(full_text))
+      link['pageContent'] = full_text
+      link['token_count_estimate'] = tokenCount
+      totalTokens += tokenCount
+
+      with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
+        json.dump(link, file, ensure_ascii=True, indent=4)
+
+      with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
+        json.dump(link, file, ensure_ascii=True, indent=4)
+
+      req.session.close()
+    else:
+      print(f"Could not parse any meaningful data from {link}.")
+      continue
+
+  print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
+  print(f"////////////////////////////")
+  print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
+  print(f"////////////////////////////")
collector/scripts/sitemap.py (new file, 27 lines)

@@ -0,0 +1,27 @@
+import requests
+import xml.etree.ElementTree as ET
+from scripts.link import parse_links
+
+def parse_sitemap(url):
+  response = requests.get(url)
+  root = ET.fromstring(response.content)
+
+  urls = []
+  for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
+    for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
+      urls.append(loc.text)
+
+  return urls
+
+# Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml
+def sitemap():
+  sitemap_url = input("Enter the URL of the sitemap: ")
+
+  if(len(sitemap_url) == 0):
+    print("No valid sitemap provided!")
+    exit(1)
+
+  url_array = parse_sitemap(sitemap_url)
+
+  # parse links from array
+  parse_links(url_array)
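The fully qualified '{http://www.sitemaps.org/schemas/sitemap/0.9}' prefix in the two iter() calls is what makes the lookup work: sitemap XML declares that namespace as its default, so bare tag names like 'url' and 'loc' would match nothing in ElementTree. A quick standalone check of the same extraction logic, using an inline sample document rather than a live sitemap:

import xml.etree.ElementTree as ET

SAMPLE = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/post-1</loc></url>
  <url><loc>https://example.com/post-2</loc></url>
</urlset>"""

root = ET.fromstring(SAMPLE)
urls = []
# Same namespaced iteration as parse_sitemap(): find each <url>, then its <loc>.
for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
    for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
        urls.append(loc.text)

print(urls)  # ['https://example.com/post-1', 'https://example.com/post-2']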