Blocks images in sitemaps from being parsed. (#56)

* Adds ability to import sitemaps to include a website

* adds example sitemap url

* adds filter to bypass common image formats

* moves filetype ignoring to sitemap script
This commit is contained in:
Skid Vis 2023-06-15 01:00:03 -05:00 committed by GitHub
parent 2403806949
commit 4118c9dcf3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 14 additions and 7 deletions

View File

@ -4,7 +4,6 @@ from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta
from .utils import tokenize, ada_v2_cost
from requests.exceptions import ReadTimeout
# Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
def link():
@ -92,10 +91,6 @@ def links():
def parse_links(links):
totalTokens = 0
for link in links:
if link.endswith(".pdf"):
print(f"Skipping PDF file: {link}")
continue
print(f"Working on {link}...")
session = HTMLSession()

View File

@ -1,6 +1,7 @@
import requests
import xml.etree.ElementTree as ET
from scripts.link import parse_links
import re
def parse_sitemap(url):
response = requests.get(url)
@ -9,7 +10,10 @@ def parse_sitemap(url):
urls = []
for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
urls.append(loc.text)
if not has_extension_to_ignore(loc.text):
urls.append(loc.text)
else:
print(f"Skipping filetype: {loc.text}")
return urls
@ -25,3 +29,11 @@ def sitemap():
#parse links from array
parse_links(url_array)
def has_extension_to_ignore(string):
    """Return True when a sitemap URL points at a file type that is skipped.

    Only the *path* component of the URL is inspected, and only its suffix,
    so an extension-like fragment in the host name (``www.gif.com``) or in
    the middle of a path segment (``/my.pdf-guide/``) does not cause a page
    to be skipped. The comparison is case-insensitive, and any query string
    or fragment (``photo.jpg?width=300``) is ignored.

    Args:
        string: The URL text taken from a sitemap ``<loc>`` element. May be
            ``None`` or empty when the element has no text.

    Returns:
        bool: True if the URL path ends with one of the ignored extensions,
        False otherwise (including for ``None`` / empty input).
    """
    # Function-scope import keeps this block self-contained; the module's
    # top-level imports are left untouched.
    from urllib.parse import urlparse

    # Common image formats plus PDF -- none of these are parseable HTML.
    ignored_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf')

    # An empty <loc/> yields None; there is no extension to ignore.
    if not string:
        return False

    # <loc> text is frequently padded with whitespace/newlines in the XML.
    path = urlparse(string.strip()).path
    # str.endswith accepts a tuple, so one call covers every extension.
    return path.lower().endswith(ignored_extensions)