Updated Link scraper to avoid NoneType error. (#90)

* Enable web scraping based on a URL and a simple filter.

* ignore yarn

* Updated Link scraper to avoid NoneType error.
This commit is contained in:
AntonioCiolino 2023-06-19 15:07:26 -04:00 committed by GitHub
parent 4072369f44
commit a52b0ae655
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 7 deletions

2
.gitignore vendored
View File

@ -7,4 +7,4 @@ __pycache__
v-env v-env
.DS_Store .DS_Store
aws_cf_deploy_anything_llm.json aws_cf_deploy_anything_llm.json
yarn.lock

View File

@ -80,8 +80,10 @@ def crawler():
# traverse paragraphs from soup # traverse paragraphs from soup
for link in soup.find_all("a"): for link in soup.find_all("a"):
data = link.get('href').strip() data = link.get('href')
if (data is not None):
if filter_value in data: if filter_value in data:
data = data.strip()
print (data) print (data)
links.append(root_site + data) links.append(root_site + data)
else: else: