anything-llm/collector/scripts/substack_utils.py

import os, json, requests, tempfile
from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader

def fetch_all_publications(subdomain):
  """Return every archive entry for `<subdomain>.substack.com`.

  Results are cached as JSON in
  `./outputs/substack-logs/substack-<subdomain>.json`. When that file exists
  it is returned as-is and no network request is made; delete the file to
  force a refetch.

  Args:
    subdomain: The newsletter's subdomain (the part before `.substack.com`).

  Returns:
    The list of publication objects as decoded from the archive API (or from
    the cache file). If a request fails part-way through, the entries
    collected so far are returned but are NOT cached.

  Raises:
    requests.exceptions.RequestException: If a request cannot be completed
      (connection error, timeout, ...).
  """
  log_dir = "./outputs/substack-logs"
  file_path = f"{log_dir}/substack-{subdomain}.json"

  os.makedirs(log_dir, exist_ok=True)

  if os.path.exists(file_path):
    # The cache is written as UTF-8 below, so read it back the same way.
    with open(file_path, "r", encoding="utf-8") as file:
      print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")
      return json.load(file)

  publications = []
  completed = True

  while True:
    # The archive endpoint is paged by offset; the next page starts after
    # everything collected so far.
    url = f"https://{subdomain}.substack.com/api/v1/archive?sort=new&offset={len(publications)}"
    # Without a timeout requests can wait on a stalled server indefinitely.
    response = requests.get(url, timeout=30)
    if not response.ok:
      print("Bad response - exiting collection")
      completed = False
      break

    data = response.json()
    if not data:
      # An empty page means the archive is exhausted.
      break

    publications.extend(data)

  if completed:
    with open(file_path, 'w+', encoding='utf-8') as json_file:
      json.dump(publications, json_file, ensure_ascii=True, indent=2)
    print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/substack-{subdomain}.json")
  else:
    # Caching a truncated list would make every later run return incomplete
    # data, because an existing cache file short-circuits the fetch above.
    print(f"{len(publications)} publications found for author {subdomain}.substack.com. Not caching because collection did not complete.")

  return publications

def only_valid_publications(publications=None):
  """Filter publications down to newsletters whose content is readable for free.

  A publication is kept when its `type` is `'newsletter'` and either its
  `audience` is `'everyone'` or `should_send_free_preview` equals True.

  Args:
    publications: Iterable of publication dicts; `None` (the default) is
      treated as an empty list.

  Returns:
    A new list with the publications that passed the filter, in input order.
  """
  if publications is None:
    return []

  return [
    publication
    for publication in publications
    if publication.get('type') == 'newsletter'
    and (
      publication.get('audience') == 'everyone'
      # `== True` (not truthiness) is deliberate: it is the exact negation of
      # the `!= True` check this filter has always used.
      or publication.get('should_send_free_preview') == True  # noqa: E712
    )
  ]

def get_content(article_link):
  """Fetch an article page, render it, and return its extracted text.

  Args:
    article_link: URL of the article. `None` or an empty string is rejected.

  Returns:
    The `page_content` of the first document produced by
    `UnstructuredHTMLLoader` for the rendered page, or `None` when the link
    is missing or the response is not OK.
  """
  print(f"Fetching {article_link}")
  if not article_link:
    # Covers None as well as "", so a missing URL does not raise on len().
    print("Invalid URL!")
    return None

  # Using the session as a context manager guarantees it is closed on every
  # path, including the early return and any exception raised by render().
  with HTMLSession() as session:
    req = session.get(article_link)
    if not req.ok:
      print("Could not reach this url!")
      return None

    req.html.render()
    rendered_html = req.html.html

  # The loader takes a file path, so the markup is staged in a throwaway
  # directory. A regular file inside a TemporaryDirectory is already closed
  # when the loader opens it, which (unlike reopening an open
  # NamedTemporaryFile by name) works on every platform.
  with tempfile.TemporaryDirectory() as tmp_dir:
    html_path = os.path.join(tmp_dir, "article.html")
    # Explicit UTF-8 so non-ASCII articles do not depend on the locale encoding.
    with open(html_path, "w", encoding="utf-8") as html_file:
      html_file.write(rendered_html)
    documents = UnstructuredHTMLLoader(html_path).load()

  return documents[0].page_content

def append_meta(publication, text):
  """Prefix article text with a JSON metadata header for the publication.

  Args:
    publication: Publication dict; missing keys are emitted as JSON `null`.
    text: The article body to append after the header.

  Returns:
    A string of the form
    "Newsletter Metadata:\\n<json>\\n\\nArticle Content:\\n<text>".
  """
  # (output key, key read from the publication) — order fixes the JSON layout.
  field_map = (
    ('url', 'canonical_url'),
    ('thumbnail', 'cover_image'),
    ('title', 'title'),
    ('subtitle', 'subtitle'),
    ('description', 'description'),
    ('createdAt', 'post_date'),
    ('wordCount', 'wordcount'),
  )
  meta = {out_key: publication.get(source_key) for out_key, source_key in field_map}
  header = "Newsletter Metadata:\n" + json.dumps(meta)
  return header + "\n\nArticle Content:\n" + text
inital commit ⚡ 2023-06-04 04:28:07 +02:00			`import os, json, requests, tempfile`
			`from requests_html import HTMLSession`
			`from langchain.document_loaders import UnstructuredHTMLLoader`

			`def fetch_all_publications(subdomain):`
			`file_path = f"./outputs/substack-logs/substack-{subdomain}.json"`

			`if os.path.isdir("./outputs/substack-logs") == False:`
			`os.makedirs("./outputs/substack-logs")`

			`if os.path.exists(file_path):`
			`with open(file_path, "r") as file:`
			`print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")`
			`return json.load(file)`

			`collecting = True`
			`offset = 0`
			`publications = []`

			`while collecting is True:`
			`url = f"https://{subdomain}.substack.com/api/v1/archive?sort=new&offset={offset}"`
			`response = requests.get(url)`
			`if(response.ok == False):`
			`print("Bad response - exiting collection")`
			`collecting = False`
			`continue`

			`data = response.json()`

			`if(len(data) ==0 ):`
			`collecting = False`
			`continue`

			`for publication in data:`
			`publications.append(publication)`
			`offset = len(publications)`

			`with open(file_path, 'w+', encoding='utf-8') as json_file:`
			`json.dump(publications, json_file, ensure_ascii=True, indent=2)`
			`print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/channel-{subdomain}.json")`

			`return publications`

			`def only_valid_publications(publications= []):`
			`valid_publications = []`
			`for publication in publications:`
			`is_paid = publication.get('audience') != 'everyone'`
			`if (is_paid and publication.get('should_send_free_preview') != True) or publication.get('type') != 'newsletter': continue`
			`valid_publications.append(publication)`
			`return valid_publications`

			`def get_content(article_link):`
			`print(f"Fetching {article_link}")`
			`if(len(article_link) == 0):`
			`print("Invalid URL!")`
			`return None`

			`session = HTMLSession()`
			`req = session.get(article_link)`
			`if(req.ok == False):`
			`print("Could not reach this url!")`
			`return None`

			`req.html.render()`

			`full_text = None`
			`with tempfile.NamedTemporaryFile(mode = "w") as tmp:`
			`tmp.write(req.html.html)`
			`tmp.seek(0)`
			`loader = UnstructuredHTMLLoader(tmp.name)`
			`data = loader.load()[0]`
			`full_text = data.page_content`
			`tmp.close()`
			`return full_text`

			`def append_meta(publication, text):`
			`meta = {`
			`'url': publication.get('canonical_url'),`
			`'thumbnail': publication.get('cover_image'),`
			`'title': publication.get('title'),`
			`'subtitle': publication.get('subtitle'),`
			`'description': publication.get('description'),`
			`'createdAt': publication.get('post_date'),`
			`'wordCount': publication.get('wordcount')`
			`}`
			`return "Newsletter Metadata:\n"+json.dumps(meta)+"\n\nArticle Content:\n" + text`