From 4118c9dcf39208ab082585c37f99c7b3c68e8658 Mon Sep 17 00:00:00 2001 From: Skid Vis Date: Thu, 15 Jun 2023 01:00:03 -0500 Subject: [PATCH] Blocks images in sitemaps from being parsed. (#56) * Adds ability to import sitemaps to include a website * adds example sitemap url * adds filter to bypass common image formats * moves filetype ignoring to sitemap script --- collector/scripts/link.py | 7 +------ collector/scripts/sitemap.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/collector/scripts/link.py b/collector/scripts/link.py index 0dad18c6..8bcc02e0 100644 --- a/collector/scripts/link.py +++ b/collector/scripts/link.py @@ -4,7 +4,6 @@ from requests_html import HTMLSession from langchain.document_loaders import UnstructuredHTMLLoader from .link_utils import append_meta from .utils import tokenize, ada_v2_cost -from requests.exceptions import ReadTimeout # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/ def link(): @@ -91,11 +90,7 @@ def links(): # parse links from array def parse_links(links): totalTokens = 0 - for link in links: - if link.endswith(".pdf"): - print(f"Skipping PDF file: {link}") - continue - + for link in links: print(f"Working on {link}...") session = HTMLSession() diff --git a/collector/scripts/sitemap.py b/collector/scripts/sitemap.py index 3895bcef..e780bd9c 100644 --- a/collector/scripts/sitemap.py +++ b/collector/scripts/sitemap.py @@ -1,6 +1,7 @@ import requests import xml.etree.ElementTree as ET from scripts.link import parse_links +import re def parse_sitemap(url): response = requests.get(url) @@ -9,7 +10,10 @@ def parse_sitemap(url): urls = [] for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'): for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'): - urls.append(loc.text) + if not has_extension_to_ignore(loc.text): + urls.append(loc.text) + else: + print(f"Skipping filetype: {loc.text}") return urls @@ -25,3 +29,11 @@ def 
# File extensions (lower-case, with leading dot) that the sitemap importer
# must not hand to the HTML link parser: common image formats plus PDF.
_IGNORED_EXTENSIONS = frozenset(
    {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf'}
)


def has_extension_to_ignore(string):
    """Return True when the URL's path ends in an ignored file extension.

    Only the path component is inspected, so a query string or fragment
    (``photo.png?size=large``) does not hide the extension, and an
    extension-like token in the hostname or in the middle of a path segment
    (``my.gif.example.com``, ``convert.pdf-to-word``) does not cause a
    false positive. The comparison is case-insensitive.

    Args:
        string: URL text taken from a sitemap ``<loc>`` element. May be
            ``None`` or empty when the element has no text.

    Returns:
        bool: True if the URL points at a file type to skip, else False.
        ``None`` and empty/whitespace-only input return False.
    """
    # Imported locally so this function does not depend on edits to the
    # module's import block.
    import posixpath
    from urllib.parse import urlsplit

    if not string:
        return False

    # Sitemap generators frequently pad <loc> text with whitespace/newlines.
    candidate = string.strip()
    if not candidate:
        return False

    try:
        path = urlsplit(candidate).path
    except ValueError:
        # urlsplit rejects some malformed authorities (e.g. a broken IPv6
        # literal); fall back to trimming query and fragment by hand.
        path = candidate.split('#', 1)[0].split('?', 1)[0]

    extension = posixpath.splitext(path)[1]
    return extension.lower() in _IGNORED_EXTENSIONS