be able to parse relative and FQDN links from root reliably (#138)

This commit is contained in:
Timothy Carambat 2023-07-05 14:40:54 -07:00 committed by GitHub
parent 032c9d27b6
commit d7315b0e53
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -69,7 +69,7 @@ def link():
def crawler():
prompt = "Paste in root URI of the pages of interest: "
new_link = input(prompt)
filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")
filter_value = input("Add a filter value for the url to ensure links don't wander too far. eg: 'my-domain.com': ")
#extract this from the uri provided
root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
links = []
@@ -82,12 +82,17 @@ def crawler():
for link in soup.find_all("a"):
data = link.get('href')
if (data is not None):
if filter_value in data:
data = data.strip()
fullpath = data if data[0] != '/' else f"{root_site}{data}"
try:
destination = urlparse(fullpath).scheme + "://" + urlparse(fullpath).hostname + (urlparse(fullpath).path if urlparse(fullpath).path is not None else '')
if filter_value in destination:
data = destination.strip()
print (data)
links.append(root_site + data)
links.append(data)
else:
print (data + " does not apply for linking...")
except:
print (data + " does not apply for linking...")
#parse the links found
parse_links(links)