Updated Link scraper to avoid NoneType error. (#90)

* Enable web scraping based on a URL and a simple filter. * ignore yarn * Updated Link scraper to avoid NoneType error.
2024-11-18 20:20:11 +01:00 · 2023-06-19 15:07:26 -04:00 · 2023-06-19 15:07:26 -04:00 · a52b0ae655
commit a52b0ae655
parent 4072369f44
2 changed files with 9 additions and 7 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,4 @@ __pycache__
 v-env
 .DS_Store
 aws_cf_deploy_anything_llm.json
-
+yarn.lock
--- a/collector/scripts/link.py
+++ b/collector/scripts/link.py
@@ -80,8 +80,10 @@ def crawler():

  # traverse paragraphs from soup
  for link in soup.find_all("a"):
-    data = link.get('href').strip()
+    data = link.get('href')
+    if (data is not None):
      if filter_value in data:
+        data = data.strip()
        print (data)
        links.append(root_site + data)
      else: