2023-06-04 04:28:07 +02:00
import os , json , requests , tempfile
from requests_html import HTMLSession
from langchain . document_loaders import UnstructuredHTMLLoader
2023-11-16 23:36:26 +01:00
from . watch . utils import guid
2023-06-04 04:28:07 +02:00
def fetch_all_publications(subdomain):
    """Fetch every archive entry for ``{subdomain}.substack.com``.

    Pages through the Substack archive API (``offset`` paging) and caches
    the combined result as JSON under ``./outputs/substack-logs/``. On a
    cache hit the stored JSON is returned without any network access.

    Args:
        subdomain: The newsletter's substack subdomain (the part before
            ``.substack.com``).

    Returns:
        A list of publication dicts as returned by the archive API
        (possibly empty if the first request fails).
    """
    file_path = f"./outputs/substack-logs/substack-{subdomain}.json"
    # exist_ok avoids the check-then-create race the old isdir() guard had.
    os.makedirs("./outputs/substack-logs", exist_ok=True)

    if os.path.exists(file_path):
        with open(file_path, "r") as file:
            print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")
            return json.load(file)

    publications = []
    offset = 0
    while True:
        url = f"https://{subdomain}.substack.com/api/v1/archive?sort=new&offset={offset}"
        response = requests.get(url)
        if not response.ok:
            print("Bad response - exiting collection")
            break

        data = response.json()
        if not data:
            # Empty page means we've paged past the end of the archive.
            break
        publications.extend(data)
        offset = len(publications)

    with open(file_path, "w+", encoding="utf-8") as json_file:
        json.dump(publications, json_file, ensure_ascii=True, indent=2)
    # Fixed: the old message claimed "channel-{subdomain}.json", but the file
    # actually written above is "substack-{subdomain}.json".
    print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/substack-{subdomain}.json")

    return publications
def only_valid_publications(publications=None):
    """Filter a list of publication dicts down to readable newsletter posts.

    A publication is kept when it is a newsletter AND is either free
    (``audience == 'everyone'``) or a paid post that sends a free preview.

    Args:
        publications: Iterable of publication dicts from the archive API.
            Defaults to an empty list.

    Returns:
        A new list containing only the valid publications, in input order.
    """
    # Fixed: the old signature used a mutable default (publications=[]),
    # which is shared across calls. None + fallback is the safe idiom.
    if publications is None:
        publications = []
    valid_publications = []
    for publication in publications:
        is_paid = publication.get('audience') != 'everyone'
        # Skip paid posts with no free preview, and anything that isn't a
        # newsletter (e.g. podcast entries).
        if (is_paid and publication.get('should_send_free_preview') != True) or publication.get('type') != 'newsletter':
            continue
        valid_publications.append(publication)
    return valid_publications
def get_content(article_link):
    """Download ``article_link``, render its JavaScript, and return the
    page's extracted plain text.

    Args:
        article_link: Absolute URL of the article to fetch.

    Returns:
        The extracted page text, or ``None`` when the link is empty/falsy
        or the HTTP request fails.
    """
    print(f"Fetching {article_link}")
    # Fixed: `not article_link` also rejects None; the old
    # `len(article_link) == 0` raised TypeError for None input.
    if not article_link:
        print("Invalid URL!")
        return None
    session = HTMLSession()
    req = session.get(article_link)
    if not req.ok:
        print("Could not reach this url!")
        return None

    # Execute the page's JavaScript so dynamically-inserted content is present.
    req.html.render()
    full_text = None
    # UnstructuredHTMLLoader wants a file path, so spill the rendered HTML to
    # a temp file. Explicit utf-8 prevents encode errors on platforms whose
    # default encoding can't represent the page's characters.
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as tmp:
        tmp.write(req.html.html)
        tmp.seek(0)
        loader = UnstructuredHTMLLoader(tmp.name)
        data = loader.load()[0]
        full_text = data.page_content
        # Fixed: dropped the explicit tmp.close() inside the `with` block —
        # the context manager already closes (and deletes) the file on exit.
    return full_text
def append_meta(publication, text):
    """Prepend a JSON metadata header to an article's text.

    Builds a small metadata dict (fresh guid plus selected fields copied
    from the publication dict) and returns it serialized ahead of the
    article content.
    """
    # (output key, source key on the publication dict)
    field_map = (
        ('url', 'canonical_url'),
        ('thumbnail', 'cover_image'),
        ('title', 'title'),
        ('subtitle', 'subtitle'),
        ('description', 'description'),
        ('createdAt', 'post_date'),
        ('wordCount', 'wordcount'),
    )
    meta = {'id': guid()}
    for out_key, src_key in field_map:
        meta[out_key] = publication.get(src_key)
    header = "Newsletter Metadata:\n" + json.dumps(meta)
    return header + "\n\nArticle Content:\n" + text