import os, json, tempfile
from urllib.parse import urlparse
from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta
from .utils import tokenize, ada_v2_cost
from requests.exceptions import ReadTimeout
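
# Interactive collectors that scrape one or more article/blog URLs, render
# any JavaScript with requests_html, extract the readable text with
# Unstructured, and write the result (plus a token-count estimate) to JSON
# for later embedding.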
# Example article URL: https://tim.blog/2022/08/09/nft-insider-trading-policy/
def link():
    print("[NOTICE]: The first time running this process it will download supporting libraries.\n\n")
    fqdn_link = input("Paste in the URL of an online article or blog: ")
    if len(fqdn_link) == 0:
        print("Invalid URL!")
        exit(1)

    session = HTMLSession()
    req = session.get(fqdn_link)
    if not req.ok:
        print("Could not reach this url!")
        exit(1)

    # Render the page so JavaScript-driven content is present in the HTML.
    req.html.render()
    full_text = None
    # Write the rendered HTML to a temp file so UnstructuredHTMLLoader can
    # parse it from disk and hand back the readable page text.
    with tempfile.NamedTemporaryFile(mode="w") as tmp:
        tmp.write(req.html.html)
        tmp.seek(0)
        loader = UnstructuredHTMLLoader(tmp.name)
        data = loader.load()[0]
        full_text = data.page_content

    link = append_meta(req, full_text, True)
    if len(full_text) > 0:
        source = urlparse(req.url)
        output_filename = f"website-{source.netloc}-{source.path.replace('/', '_')}.json"
        output_path = "./outputs/website-logs"
        transaction_output_filename = f"article-{source.path.replace('/', '_')}.json"
        transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
        if not os.path.isdir(output_path):
            os.makedirs(output_path)
        if not os.path.isdir(transaction_output_dir):
            os.makedirs(transaction_output_dir)

        full_text = append_meta(req, full_text)
        tokenCount = len(tokenize(full_text))
        link['pageContent'] = full_text
        link['token_count_estimate'] = tokenCount

        with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
            json.dump(link, file, ensure_ascii=True, indent=4)
        with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
            json.dump(link, file, ensure_ascii=True, indent=4)
    else:
        print("Could not parse any meaningful data from this link or url.")
        exit(1)

    print("\n\n[Success]: article or link content fetched!")
    print("////////////////////////////")
    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(tokenCount)} using {tokenCount} tokens.")
    print("////////////////////////////")
    exit(0)
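
# `ada_v2_cost` lives in .utils and is not defined in this file. Given the
# $0.0004 / 1K-token rate quoted in the success messages above, a minimal
# sketch of the calculation it presumably performs (an assumption, not the
# actual implementation):
#
#     def ada_v2_cost(token_count: int) -> str:
#         return f"${token_count / 1000 * 0.0004}"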
def links():
    links = []
    prompt = "Paste in the URL of an online article or blog: "
    done = False

    while not done:
        new_link = input(prompt)
        if len(new_link) == 0:
            done = True
            links = [*set(links)]  # drop duplicate URLs before processing
            continue

        links.append(new_link)
        prompt = f"\n{len(links)} links in queue. Submit an empty value when done pasting in links to execute collection.\nPaste in the next URL of an online article or blog: "

    if len(links) == 0:
        print("No valid links provided!")
        exit(1)

    parse_links(links)
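
# Illustrative session (the prompts are the ones printed above; the URL is
# made up):
#
#   Paste in the URL of an online article or blog: https://example.com/post
#   1 links in queue. Submit an empty value when done pasting in links to execute collection.
#   Paste in the next URL of an online article or blog: <enter>
#   Working on https://example.com/post...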
# Parse links from an array of URLs.
def parse_links(links):
    totalTokens = 0
    for link in links:
        if link.endswith(".pdf"):
            print(f"Skipping PDF file: {link}")
            continue

        print(f"Working on {link}...")
        session = HTMLSession()
        # session.get can raise ReadTimeout if the server stalls past 20s.
        try:
            req = session.get(link, timeout=20)
        except ReadTimeout:
            print(f"Timed out fetching {link} - skipping!")
            continue

        if not req.ok:
            print(f"Could not reach {link} - skipping!")
            continue

        req.html.render(timeout=10)
        full_text = None
        with tempfile.NamedTemporaryFile(mode="w") as tmp:
            tmp.write(req.html.html)
            tmp.seek(0)
            loader = UnstructuredHTMLLoader(tmp.name)
            data = loader.load()[0]
            full_text = data.page_content

        # Use a separate name for the metadata dict so the `link` URL is
        # still available for the error message below.
        parsed = append_meta(req, full_text, True)
        if len(full_text) > 0:
            source = urlparse(req.url)
            output_filename = f"website-{source.netloc}-{source.path.replace('/', '_')}.json"
            output_path = "./outputs/website-logs"
            transaction_output_filename = f"article-{source.path.replace('/', '_')}.json"
            transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"

            if not os.path.isdir(output_path):
                os.makedirs(output_path)
            if not os.path.isdir(transaction_output_dir):
                os.makedirs(transaction_output_dir)

            full_text = append_meta(req, full_text)
            tokenCount = len(tokenize(full_text))
            parsed['pageContent'] = full_text
            parsed['token_count_estimate'] = tokenCount
            totalTokens += tokenCount

            with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
                json.dump(parsed, file, ensure_ascii=True, indent=4)
            with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
                json.dump(parsed, file, ensure_ascii=True, indent=4)
            req.session.close()
        else:
            print(f"Could not parse any meaningful data from {link}.")
            continue

    print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
    print("////////////////////////////")
    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
    print("////////////////////////////")