mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-04 22:10:12 +01:00
Updated Link scraper to avoid NoneType error. (#90)
* Enable web scraping based on a URL and a simple filter. * ignore yarn * Updated Link scraper to avoid NoneType error.
This commit is contained in:
parent
4072369f44
commit
a52b0ae655
2
.gitignore
vendored
2
.gitignore
vendored
@ -7,4 +7,4 @@ __pycache__
|
||||
v-env
|
||||
.DS_Store
|
||||
aws_cf_deploy_anything_llm.json
|
||||
|
||||
yarn.lock
|
||||
|
@ -80,12 +80,14 @@ def crawler():
|
||||
|
||||
# traverse paragraphs from soup
|
||||
for link in soup.find_all("a"):
|
||||
data = link.get('href').strip()
|
||||
if filter_value in data:
|
||||
print (data)
|
||||
links.append(root_site + data)
|
||||
else:
|
||||
print (data + " does not apply for linking...")
|
||||
data = link.get('href')
|
||||
if (data is not None):
|
||||
if filter_value in data:
|
||||
data = data.strip()
|
||||
print (data)
|
||||
links.append(root_site + data)
|
||||
else:
|
||||
print (data + " does not apply for linking...")
|
||||
#parse the links found
|
||||
parse_links(links)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user