mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-18 20:20:11 +01:00
Updated Link scraper to avoid NoneType error. (#90)
* Enable web scraping based on a URL and a simple filter. * ignore yarn * Updated Link scraper to avoid NoneType error.
This commit is contained in:
parent
4072369f44
commit
a52b0ae655
2
.gitignore
vendored
2
.gitignore
vendored
@ -7,4 +7,4 @@ __pycache__
|
|||||||
v-env
|
v-env
|
||||||
.DS_Store
|
.DS_Store
|
||||||
aws_cf_deploy_anything_llm.json
|
aws_cf_deploy_anything_llm.json
|
||||||
|
yarn.lock
|
||||||
|
@ -80,8 +80,10 @@ def crawler():
|
|||||||
|
|
||||||
# traverse paragraphs from soup
|
# traverse paragraphs from soup
|
||||||
for link in soup.find_all("a"):
|
for link in soup.find_all("a"):
|
||||||
data = link.get('href').strip()
|
data = link.get('href')
|
||||||
|
if (data is not None):
|
||||||
if filter_value in data:
|
if filter_value in data:
|
||||||
|
data = data.strip()
|
||||||
print (data)
|
print (data)
|
||||||
links.append(root_site + data)
|
links.append(root_site + data)
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user