anything-llm/collector/scripts/substack.py
frasergr 9f33b3dfcb
Docker support (#34)
* Updates for Linux for frontend/server

* frontend/server docker

* updated Dockerfile for deps related to node vectordb

* updates for collector in docker

* docker deps for ODT processing

* ignore another collector dir

* storage mount improvements; run as UID

* fix pypandoc version typo

* permissions fixes
2023-06-13 11:26:11 -07:00

78 lines
2.7 KiB
Python

import os, json
from urllib.parse import urlparse
from .utils import tokenize, ada_v2_cost
from .substack_utils import fetch_all_publications, only_valid_publications, get_content, append_meta
from alive_progress import alive_it
# Example substack URL: https://swyx.substack.com/
def substack():
author_url = input("Enter the substack URL of the author you want to collect: ")
if(author_url == ''):
print("Not a valid author.substack.com URL")
exit(1)
source = urlparse(author_url)
if('substack.com' not in source.netloc or len(source.netloc.split('.')) != 3):
print("This does not appear to be a valid author.substack.com URL")
exit(1)
subdomain = source.netloc.split('.')[0]
publications = fetch_all_publications(subdomain)
valid_publications = only_valid_publications(publications)
if(len(valid_publications)==0):
print("There are no public or free preview newsletters by this creator - nothing to collect.")
exit(1)
print(f"{len(valid_publications)} of {len(publications)} publications are readable publically text posts - collecting those.")
totalTokenCount = 0
transaction_output_dir = f"../server/storage/documents/substack-{subdomain}"
if os.path.isdir(transaction_output_dir) == False:
os.makedirs(transaction_output_dir)
for publication in alive_it(valid_publications):
pub_file_path = transaction_output_dir + f"/publication-{publication.get('id')}.json"
if os.path.exists(pub_file_path) == True: continue
full_text = get_content(publication.get('canonical_url'))
if full_text is None or len(full_text) == 0: continue
full_text = append_meta(publication, full_text)
item = {
'id': publication.get('id'),
'url': publication.get('canonical_url'),
'thumbnail': publication.get('cover_image'),
'title': publication.get('title'),
'subtitle': publication.get('subtitle'),
'description': publication.get('description'),
'published': publication.get('post_date'),
'wordCount': publication.get('wordcount'),
'pageContent': full_text,
}
tokenCount = len(tokenize(full_text))
item['token_count_estimate'] = tokenCount
totalTokenCount += tokenCount
with open(pub_file_path, 'w', encoding='utf-8') as file:
json.dump(item, file, ensure_ascii=True, indent=4)
print(f"[Success]: {len(valid_publications)} scraped and fetched!")
print(f"\n\n////////////////////////////")
print(f"Your estimated cost to embed all of this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokenCount)} using {totalTokenCount} tokens.")
print(f"////////////////////////////\n\n")
exit(0)