mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-05 06:20:10 +01:00
Blocks images in sitemaps from being parsed. (#56)
* Adds ability to import sitemaps to include a website * adds example sitemap url * adds filter to bypass common image formats * moves filetype ignoring to sitemap script
This commit is contained in:
parent
2403806949
commit
4118c9dcf3
@ -4,7 +4,6 @@ from requests_html import HTMLSession
|
||||
from langchain.document_loaders import UnstructuredHTMLLoader
|
||||
from .link_utils import append_meta
|
||||
from .utils import tokenize, ada_v2_cost
|
||||
from requests.exceptions import ReadTimeout
|
||||
|
||||
# Example article URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
|
||||
def link():
|
||||
@ -92,10 +91,6 @@ def links():
|
||||
def parse_links(links):
|
||||
totalTokens = 0
|
||||
for link in links:
|
||||
if link.endswith(".pdf"):
|
||||
print(f"Skipping PDF file: {link}")
|
||||
continue
|
||||
|
||||
print(f"Working on {link}...")
|
||||
session = HTMLSession()
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
import requests
|
||||
import xml.etree.ElementTree as ET
|
||||
from scripts.link import parse_links
|
||||
import re
|
||||
|
||||
def parse_sitemap(url):
|
||||
response = requests.get(url)
|
||||
@ -9,7 +10,10 @@ def parse_sitemap(url):
|
||||
urls = []
|
||||
for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
|
||||
for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
|
||||
urls.append(loc.text)
|
||||
if not has_extension_to_ignore(loc.text):
|
||||
urls.append(loc.text)
|
||||
else:
|
||||
print(f"Skipping filetype: {loc.text}")
|
||||
|
||||
return urls
|
||||
|
||||
@ -25,3 +29,11 @@ def sitemap():
|
||||
|
||||
#parse links from array
|
||||
parse_links(url_array)
|
||||
|
||||
def has_extension_to_ignore(string):
    """Return True when a sitemap URL points at a file type that should be skipped.

    Only the *path* component of the URL is inspected, so a query string or
    fragment after the file name (``photo.png?size=large``) is ignored, and
    an extension-like token that merely appears in the host name or in a
    directory name (``/guides.pdf-tips/intro``) does not cause a false
    positive. The comparison is case-insensitive.

    Args:
        string: The URL text taken from a sitemap ``<loc>`` element. May be
            ``None`` or empty (an empty ``<loc/>`` yields ``None``).

    Returns:
        bool: True if the URL path ends with one of the ignored extensions,
        False otherwise (including for empty or unparsable input).
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from urllib.parse import urlsplit

    # Images plus PDFs: none of these can be parsed as an HTML page.
    ignored_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf')

    if not string:
        return False

    try:
        # Sitemap <loc> text frequently carries surrounding whitespace.
        path = urlsplit(string.strip()).path
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal); nothing to match on.
        return False

    # str.endswith accepts a tuple, so one call covers every extension.
    return path.lower().endswith(ignored_extensions)
|
Loading…
Reference in New Issue
Block a user