anything-llm/collector/scripts/substack.py

import os, json
from urllib.parse import urlparse
from .utils import tokenize, ada_v2_cost
from .substack_utils import fetch_all_publications, only_valid_publications, get_content, append_meta
from alive_progress import alive_it

# Example substack URL: https://swyx.substack.com/
def substack():
  """Interactively collect one author's Substack posts into JSON documents.

  Prompts for an ``author.substack.com`` URL (with or without a scheme),
  fetches that subdomain's publications via ``fetch_all_publications`` and
  filters them with ``only_valid_publications``. Each remaining post is
  written to ``../server/storage/documents/substack-<subdomain>/publication-<id>.json``
  (relative to the current working directory). Posts whose file already
  exists, or whose fetched content is empty, are skipped. A cost estimate
  for the tokens collected during this run is printed at the end.

  Raises:
    SystemExit: always. Exit code 1 when the URL is empty/invalid or there is
      nothing to collect; exit code 0 once collection has finished.
  """
  author_url = input("Enter the substack URL of the author you want to collect: ").strip()
  if not author_url:
    print("Not a valid author.substack.com URL")
    raise SystemExit(1)

  try:
    source = urlparse(author_url)
    if not source.netloc:
      # urlparse only recognises a netloc that is introduced by '//', so a
      # bare "author.substack.com" lands in .path; retry with a scheme.
      source = urlparse(f"https://{author_url}")
    # hostname (unlike netloc) is lower-cased and excludes port/credentials.
    host = source.hostname or ''
  except ValueError:
    # Malformed netloc (e.g. unbalanced IPv6 brackets) - treat as invalid.
    host = ''

  labels = host.split('.')
  # Require exactly "<author>.substack.com"; a plain substring test would
  # also accept hosts such as "substack.com.example".
  if len(labels) != 3 or not labels[0] or labels[1:] != ['substack', 'com']:
    print("This does not appear to be a valid author.substack.com URL")
    raise SystemExit(1)

  subdomain = labels[0]
  publications = fetch_all_publications(subdomain)
  valid_publications = only_valid_publications(publications)

  if not valid_publications:
    print("There are no public or free preview newsletters by this creator - nothing to collect.")
    raise SystemExit(1)

  print(f"{len(valid_publications)} of {len(publications)} publications are readable publically text posts - collecting those.")

  total_token_count = 0
  collected_count = 0
  transaction_output_dir = f"../server/storage/documents/substack-{subdomain}"
  # exist_ok avoids the check-then-create race of isdir() + makedirs().
  os.makedirs(transaction_output_dir, exist_ok=True)

  for publication in alive_it(valid_publications):
    pub_file_path = f"{transaction_output_dir}/publication-{publication.get('id')}.json"
    # Already collected on a previous run - do not fetch it again.
    if os.path.exists(pub_file_path):
      continue

    full_text = get_content(publication.get('canonical_url'))
    if not full_text:
      continue

    full_text = append_meta(publication, full_text)
    token_count = len(tokenize(full_text))
    item = {
      'id': publication.get('id'),
      'url': publication.get('canonical_url'),
      'thumbnail': publication.get('cover_image'),
      'title': publication.get('title'),
      'subtitle': publication.get('subtitle'),
      'description': publication.get('description'),
      'published': publication.get('post_date'),
      'wordCount': publication.get('wordcount'),
      'pageContent': full_text,
      'token_count_estimate': token_count,
    }

    # Serialise before opening the file: if encoding fails, no truncated
    # file is left behind to be mistaken for a finished download next run.
    payload = json.dumps(item, ensure_ascii=True, indent=4)
    with open(pub_file_path, 'w', encoding='utf-8') as file:
      file.write(payload)

    total_token_count += token_count
    collected_count += 1

  # Report what was actually written in this run, not the candidate count.
  print(f"[Success]: {collected_count} scraped and fetched!")
  skipped_count = len(valid_publications) - collected_count
  if skipped_count:
    print(f"{skipped_count} publications were skipped (already collected or no readable content).")
  print(f"\n\n////////////////////////////")
  print(f"Your estimated cost to embed all of this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(total_token_count)} using {total_token_count} tokens.")
  print(f"////////////////////////////\n\n")
  raise SystemExit(0)
inital commit ⚡ 2023-06-04 04:28:07 +02:00			`import os, json`
			`from urllib.parse import urlparse`
			`from .utils import tokenize, ada_v2_cost`
			`from .substack_utils import fetch_all_publications, only_valid_publications, get_content, append_meta`
			`from alive_progress import alive_it`

			`# Example substack URL: https://swyx.substack.com/`
			`def substack():`
			`author_url = input("Enter the substack URL of the author you want to collect: ")`
			`if(author_url == ''):`
			`print("Not a valid author.substack.com URL")`
			`exit(1)`

			`source = urlparse(author_url)`
			`if('substack.com' not in source.netloc or len(source.netloc.split('.')) != 3):`
			`print("This does not appear to be a valid author.substack.com URL")`
			`exit(1)`

			`subdomain = source.netloc.split('.')[0]`
			`publications = fetch_all_publications(subdomain)`
			`valid_publications = only_valid_publications(publications)`

			`if(len(valid_publications)==0):`
			`print("There are no public or free preview newsletters by this creator - nothing to collect.")`
			`exit(1)`

			`print(f"{len(valid_publications)} of {len(publications)} publications are readable publically text posts - collecting those.")`

			`totalTokenCount = 0`
Docker support (#34) * Updates for Linux for frontend/server * frontend/server docker * updated Dockerfile for deps related to node vectordb * updates for collector in docker * docker deps for ODT processing * ignore another collector dir * storage mount improvements; run as UID * fix pypandoc version typo * permissions fixes 2023-06-13 20:26:11 +02:00			`transaction_output_dir = f"../server/storage/documents/substack-{subdomain}"`
inital commit ⚡ 2023-06-04 04:28:07 +02:00			`if os.path.isdir(transaction_output_dir) == False:`
			`os.makedirs(transaction_output_dir)`

			`for publication in alive_it(valid_publications):`
			`pub_file_path = transaction_output_dir + f"/publication-{publication.get('id')}.json"`
			`if os.path.exists(pub_file_path) == True: continue`

			`full_text = get_content(publication.get('canonical_url'))`
			`if full_text is None or len(full_text) == 0: continue`

			`full_text = append_meta(publication, full_text)`
			`item = {`
			`'id': publication.get('id'),`
			`'url': publication.get('canonical_url'),`
			`'thumbnail': publication.get('cover_image'),`
			`'title': publication.get('title'),`
			`'subtitle': publication.get('subtitle'),`
			`'description': publication.get('description'),`
			`'published': publication.get('post_date'),`
			`'wordCount': publication.get('wordcount'),`
			`'pageContent': full_text,`
			`}`

			`tokenCount = len(tokenize(full_text))`
			`item['token_count_estimate'] = tokenCount`

			`totalTokenCount += tokenCount`
			`with open(pub_file_path, 'w', encoding='utf-8') as file:`
			`json.dump(item, file, ensure_ascii=True, indent=4)`

			`print(f"[Success]: {len(valid_publications)} scraped and fetched!")`
			`print(f"\n\n////////////////////////////")`
			`print(f"Your estimated cost to embed all of this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokenCount)} using {totalTokenCount} tokens.")`
			`print(f"////////////////////////////\n\n")`
			`exit(0)`