import os, json, tempfile
from urllib.parse import urlparse
from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta, AsyncHTMLSessionFixed
from .utils import tokenize, ada_v2_cost
import requests
from bs4 import BeautifulSoup
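
# Helpers for collecting online article/blog content into the document store:
#   link()                    - interactively fetch a single URL
#   links()                   - queue several URLs and fetch them in one pass
#   crawler()                 - discover links under a root URI, then fetch each one
#   process_single_link(url)  - async helper that fetches and saves one URL
# Each page is rendered, converted to plain text with UnstructuredHTMLLoader, and
# written out as a JSON document for embedding.
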
# Example URL: https://tim.blog/2022/08/09/nft-insider-trading-policy/
def link():
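    """Prompt for the URL of a single online article or blog, render and extract
    its text, then write the result to the local website-logs output and the
    server's custom-documents folder."""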
    totalTokens = 0
    print("[NOTICE]: The first time running this process it will download supporting libraries.\n\n")

    fqdn_link = input("Paste in the URL of an online article or blog: ")
    if len(fqdn_link) == 0:
        print("Invalid URL!")
        exit(1)

    # Use a JS-capable session so client-rendered content is captured.
    session = HTMLSession()
    req = session.get(fqdn_link)
    if not req.ok:
        print("Could not reach this URL!")
        exit(1)

    # Render client-side JavaScript, then hand the rendered HTML to
    # UnstructuredHTMLLoader through a temporary file to extract plain text.
    req.html.render()
    full_text = None
    with tempfile.NamedTemporaryFile(mode="w") as tmp:
        tmp.write(req.html.html)
        tmp.seek(0)
        loader = UnstructuredHTMLLoader(tmp.name)
        data = loader.load()[0]
        full_text = data.page_content

    # Build the document (metadata + page content) and persist it to both the
    # local log folder and the server's custom-documents folder.
    link = append_meta(req, full_text, True)
    if len(full_text) > 0:
        totalTokens += len(tokenize(full_text))
        source = urlparse(req.url)
        output_filename = f"website-{source.netloc}-{source.path.replace('/', '_')}.json"
        output_path = "./outputs/website-logs"

        transaction_output_filename = f"website-{source.path.replace('/', '_')}.json"
        transaction_output_dir = "../server/storage/documents/custom-documents"

        if not os.path.isdir(output_path):
            os.makedirs(output_path)
        if not os.path.isdir(transaction_output_dir):
            os.makedirs(transaction_output_dir)

        with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
            json.dump(link, file, ensure_ascii=True, indent=4)
        with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
            json.dump(link, file, ensure_ascii=True, indent=4)
    else:
        print("Could not parse any meaningful data from this link or URL.")
        exit(1)

    print("\n\n[Success]: article or link content fetched!")
    print("////////////////////////////")
    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens is {ada_v2_cost(totalTokens)} for {totalTokens} tokens.")
    print("////////////////////////////")
    exit(0)

async def process_single_link(url):
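    """Asynchronously fetch, render, and extract a single URL. Returns a
    (success, message) tuple and, on success, writes the parsed document to the
    server's custom-documents folder."""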
    session = None
    try:
        print(f"Working on {url}...")
        # Async session (see link_utils.AsyncHTMLSessionFixed) for fetching and rendering the page.
        session = AsyncHTMLSessionFixed()
        req = await session.get(url)
        await req.html.arender()
        await session.close()

        if not req.ok:
            return False, "Could not reach this URL."

        # Extract plain text from the rendered HTML via a temporary file.
        full_text = None
        with tempfile.NamedTemporaryFile(mode="w") as tmp:
            tmp.write(req.html.html)
            tmp.seek(0)
            loader = UnstructuredHTMLLoader(tmp.name)
            data = loader.load()[0]
            full_text = data.page_content

        if full_text:
            link_meta = append_meta(req, full_text, True)
            source = urlparse(req.url)
            transaction_output_dir = "../server/storage/documents/custom-documents"
            transaction_output_filename = f"website-{source.netloc}-{source.path.replace('/', '_')}.json"
            if not os.path.isdir(transaction_output_dir):
                os.makedirs(transaction_output_dir)

            file_path = os.path.join(transaction_output_dir, transaction_output_filename)
            with open(file_path, 'w', encoding='utf-8') as file:
                json.dump(link_meta, file, ensure_ascii=False, indent=4)
            return True, "Content fetched and saved."
        else:
            return False, "Could not parse any meaningful data from this URL."
    except Exception as e:
        if session is not None:
            await session.close()  # Kill the hanging session.
        return False, str(e)

def crawler():
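    """Prompt for a root URI, collect every anchor on that page whose resolved
    URL contains a user-supplied filter value, then fetch each discovered link
    with parse_links()."""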
    prompt = "Paste in root URI of the pages of interest: "
    new_link = input(prompt)
    filter_value = input("Add a filter value for the URL to ensure links don't wander too far. eg: 'my-domain.com': ")

    # Extract the scheme and host from the URI provided so relative hrefs can be resolved.
    root = urlparse(new_link)
    root_site = f"{root.scheme}://{root.hostname}"
    links = [new_link]

    grab = requests.get(new_link)
    soup = BeautifulSoup(grab.text, 'html.parser')

    # Walk every anchor tag on the page and keep the links that match the filter.
    for link in soup.find_all("a"):
        data = link.get('href')
        if data is None:
            continue

        fullpath = data if data[0] != '/' else f"{root_site}{data}"
        try:
            parsed = urlparse(fullpath)
            destination = parsed.scheme + "://" + parsed.hostname + (parsed.path if parsed.path is not None else '')
            if filter_value in destination:
                data = destination.strip()
                print(data)
                links.append(data)
            else:
                print(data + " does not apply for linking...")
        except Exception:
            print(data + " does not apply for linking...")

    # Parse the links found.
    parse_links(links)

def links():
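    """Collect URLs interactively until an empty value is submitted, then fetch
    them all with parse_links()."""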
    links = []
    prompt = "Paste in the URL of an online article or blog: "
    done = False

    while not done:
        new_link = input(prompt)
        if len(new_link) == 0:
            done = True
            links = [*set(links)]  # De-duplicate before collection.
            continue

        links.append(new_link)
        prompt = f"\n{len(links)} links in queue. Submit an empty value when done pasting in links to execute collection.\nPaste in the next URL of an online article or blog: "

    if len(links) == 0:
        print("No valid links provided!")
        exit(1)

    parse_links(links)


# Parse links from an array of URLs.
def parse_links(links):
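    """Fetch, render, and extract each URL in links, writing a JSON document per
    page and reporting the total token count and estimated embedding cost."""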
    totalTokens = 0

    for link in links:
        print(f"Working on {link}...")
        session = HTMLSession()

        req = session.get(link, timeout=20)
        if not req.ok:
            print(f"Could not reach {link} - skipping!")
            continue

        req.html.render(timeout=10)

        # Extract plain text from the rendered HTML via a temporary file.
        full_text = None
        with tempfile.NamedTemporaryFile(mode="w") as tmp:
            tmp.write(req.html.html)
            tmp.seek(0)
            loader = UnstructuredHTMLLoader(tmp.name)
            data = loader.load()[0]
            full_text = data.page_content

        # The session is no longer needed once the page has been fetched and rendered.
        req.session.close()

        link_meta = append_meta(req, full_text, True)
        if len(full_text) > 0:
            source = urlparse(req.url)
            output_filename = f"website-{source.netloc}-{source.path.replace('/', '_')}.json"
            output_path = "./outputs/website-logs"

            transaction_output_filename = f"website-{source.path.replace('/', '_')}.json"
            transaction_output_dir = "../server/storage/documents/custom-documents"

            if not os.path.isdir(output_path):
                os.makedirs(output_path)
            if not os.path.isdir(transaction_output_dir):
                os.makedirs(transaction_output_dir)

            full_text = append_meta(req, full_text)
            tokenCount = len(tokenize(full_text))
            totalTokens += tokenCount

            with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
                json.dump(link_meta, file, ensure_ascii=True, indent=4)
            with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
                json.dump(link_meta, file, ensure_ascii=True, indent=4)
        else:
            print(f"Could not parse any meaningful data from {link}.")
            continue

    print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
    print("////////////////////////////")
    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens is {ada_v2_cost(totalTokens)} for {totalTokens} tokens.")
    print("////////////////////////////")