From a52b0ae655389270662318a85c36d4e0aa7ae320 Mon Sep 17 00:00:00 2001 From: AntonioCiolino Date: Mon, 19 Jun 2023 15:07:26 -0400 Subject: [PATCH] Updated Link scraper to avoid NoneType error. (#90) * Enable web scraping based on a url and a simple filter. * ignore yarn * Updated Link scraper to avoid NoneType error. --- .gitignore | 2 +- collector/scripts/link.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 0725f47c..a1d96b6e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,4 @@ __pycache__ v-env .DS_Store aws_cf_deploy_anything_llm.json - +yarn.lock diff --git a/collector/scripts/link.py b/collector/scripts/link.py index 17a532cb..2bc604e9 100644 --- a/collector/scripts/link.py +++ b/collector/scripts/link.py @@ -80,12 +80,14 @@ def crawler(): # traverse paragraphs from soup for link in soup.find_all("a"): - data = link.get('href').strip() - if filter_value in data: - print (data) - links.append(root_site + data) - else: - print (data + " does not apply for linking...") + data = link.get('href') + if (data is not None): + if filter_value in data: + data = data.strip() + print (data) + links.append(root_site + data) + else: + print (data + " does not apply for linking...") #parse the links found parse_links(links)