Updated Link scraper to avoid NoneType error. (#90)

* Enable web scraping based on a url and a simple filter.

* ignore yarn

* Updated Link scraper to avoid NoneType error.
This commit is contained in:
AntonioCiolino 2023-06-19 15:07:26 -04:00 committed by GitHub
parent 4072369f44
commit a52b0ae655
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 7 deletions

2
.gitignore vendored
View File

@@ -7,4 +7,4 @@ __pycache__
v-env
.DS_Store
aws_cf_deploy_anything_llm.json
yarn.lock

View File

@@ -80,12 +80,14 @@ def crawler():
# traverse paragraphs from soup
for link in soup.find_all("a"):
data = link.get('href').strip()
if filter_value in data:
print (data)
links.append(root_site + data)
else:
print (data + " does not apply for linking...")
data = link.get('href')
if (data is not None):
if filter_value in data:
data = data.strip()
print (data)
links.append(root_site + data)
else:
print (data + " does not apply for linking...")
#parse the links found
parse_links(links)