mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-11-14 10:30:10 +01:00
9f33b3dfcb
* Updates for Linux for frontend/server * frontend/server docker * updated Dockerfile for deps related to node vectordb * updates for collector in docker * docker deps for ODT processing * ignore another collector dir * storage mount improvements; run as UID * fix pypandoc version typo * permissions fixes
71 lines
2.2 KiB
Python
71 lines
2.2 KiB
Python
import os, json
|
|
from urllib.parse import urlparse
|
|
from .utils import tokenize, ada_v2_cost
|
|
from .medium_utils import get_username, fetch_recent_publications, append_meta
|
|
from alive_progress import alive_it
|
|
|
|
# Example medium URL: https://medium.com/@yujiangtham or https://davidall.medium.com
|
|
def medium():
    """Interactively collect a Medium author's publications as JSON documents.

    Prompts on stdin for the author's Medium URL, resolves the author handle
    with ``get_username`` and fetches the publications with
    ``fetch_recent_publications``. Each publication that has page content is
    written to ``../server/storage/documents/medium-<handle>/publication-<id>.json``
    (path relative to the current working directory). Publications whose
    output file already exists are skipped, so re-running does not overwrite
    earlier results. A token-count estimate and the embedding cost reported by
    ``ada_v2_cost`` are printed at the end.

    Raises:
        SystemExit: always. The exit code is 1 when the URL is blank, when no
            handle can be derived from it, or when no publications are
            returned; it is 0 after a completed run.
    """
    print("[NOTICE]: This method will only get the 10 most recent publishings.")

    # Strip surrounding whitespace so that a blank-looking answer (spaces, tabs)
    # is rejected here instead of being handed to get_username.
    author_url = input("Enter the medium URL of the author you want to collect: ").strip()
    if not author_url:
        print("Not a valid medium.com/@author URL")
        # SystemExit is raised directly: the ``exit`` helper is injected by the
        # ``site`` module and is not guaranteed to exist in every runtime.
        raise SystemExit(1)

    handle = get_username(author_url)
    if handle is None:
        print("This does not appear to be a valid medium.com/@author URL")
        raise SystemExit(1)

    publications = fetch_recent_publications(handle)
    if not publications:
        print("There are no public or free publications by this creator - nothing to collect.")
        raise SystemExit(1)

    total_token_count = 0
    transaction_output_dir = f"../server/storage/documents/medium-{handle}"
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(transaction_output_dir, exist_ok=True)

    for publication in alive_it(publications):
        pub_file_path = f"{transaction_output_dir}/publication-{publication.get('id')}.json"
        # Already collected on a previous run: leave the existing file alone.
        if os.path.exists(pub_file_path):
            continue

        full_text = publication.get('pageContent')
        if not full_text:
            continue

        full_text = append_meta(publication, full_text)
        token_count = len(tokenize(full_text))
        item = {
            'id': publication.get('id'),
            'url': publication.get('url'),
            'title': publication.get('title'),
            'published': publication.get('published'),
            'wordCount': len(full_text.split(' ')),
            'pageContent': full_text,
            'token_count_estimate': token_count,
        }
        total_token_count += token_count

        with open(pub_file_path, 'w', encoding='utf-8') as file:
            json.dump(item, file, ensure_ascii=True, indent=4)

    print(f"[Success]: {len(publications)} scraped and fetched!")
    print("\n\n////////////////////////////")
    print(f"Your estimated cost to embed all of this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(total_token_count)} using {total_token_count} tokens.")
    print("////////////////////////////\n\n")
    raise SystemExit(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|