mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-19 20:50:09 +01:00
be able to parse relative and FQDN links from root reliably (#138)
This commit is contained in:
parent
032c9d27b6
commit
d7315b0e53
@ -69,7 +69,7 @@ def link():
|
|||||||
def crawler():
|
def crawler():
|
||||||
prompt = "Paste in root URI of the pages of interest: "
|
prompt = "Paste in root URI of the pages of interest: "
|
||||||
new_link = input(prompt)
|
new_link = input(prompt)
|
||||||
filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")
|
filter_value = input("Add a filter value for the url to ensure links don't wander too far. eg: 'my-domain.com': ")
|
||||||
#extract this from the uri provided
|
#extract this from the uri provided
|
||||||
root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
|
root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
|
||||||
links = []
|
links = []
|
||||||
@ -82,11 +82,16 @@ def crawler():
|
|||||||
for link in soup.find_all("a"):
|
for link in soup.find_all("a"):
|
||||||
data = link.get('href')
|
data = link.get('href')
|
||||||
if (data is not None):
|
if (data is not None):
|
||||||
if filter_value in data:
|
fullpath = data if data[0] != '/' else f"{root_site}{data}"
|
||||||
data = data.strip()
|
try:
|
||||||
print (data)
|
destination = urlparse(fullpath).scheme + "://" + urlparse(fullpath).hostname + (urlparse(fullpath).path if urlparse(fullpath).path is not None else '')
|
||||||
links.append(root_site + data)
|
if filter_value in destination:
|
||||||
else:
|
data = destination.strip()
|
||||||
|
print (data)
|
||||||
|
links.append(data)
|
||||||
|
else:
|
||||||
|
print (data + " does not apply for linking...")
|
||||||
|
except:
|
||||||
print (data + " does not apply for linking...")
|
print (data + " does not apply for linking...")
|
||||||
#parse the links found
|
#parse the links found
|
||||||
parse_links(links)
|
parse_links(links)
|
||||||
|
Loading…
Reference in New Issue
Block a user