anything-llm/collector/scripts/medium_utils.py

import os, json, requests, re
from bs4 import BeautifulSoup

def get_username(author_url):
    # Extract the Medium handle from either URL style:
    # medium.com/@handle or handle.medium.com
    if '@' in author_url:
        pattern = r"medium\.com/@([\w-]+)"
        match = re.search(pattern, author_url)
        return match.group(1) if match else None
    else:
        # Author is given as a subdomain
        pattern = r"([\w-]+)\.medium\.com"
        match = re.search(pattern, author_url)
        return match.group(1) if match else None
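
# Usage sketch for get_username (hypothetical URLs); both forms resolve
# to the same handle:
#   get_username("https://medium.com/@some-author")  -> "some-author"
#   get_username("https://some-author.medium.com/")  -> "some-author"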

def get_docid(medium_docpath):
    # Medium permalinks expose the document id as medium.com/p/<id>
    pattern = r"medium\.com/p/([\w-]+)"
    match = re.search(pattern, medium_docpath)
    return match.group(1) if match else None
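
# Usage sketch for get_docid (hypothetical post id):
#   get_docid("https://medium.com/p/ab12cd34ef56")  -> "ab12cd34ef56"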

def fetch_recent_publications(handle):
    # Medium exposes an author's recent posts as an RSS feed.
    rss_link = f"https://medium.com/feed/@{handle}"
    response = requests.get(rss_link)
    if not response.ok:
        print(f"Could not fetch RSS results for author {handle}.")
        return []

    xml = response.content
    soup = BeautifulSoup(xml, 'xml')
    items = soup.find_all('item')
    publications = []

    if not os.path.isdir("./outputs/medium-logs"):
        os.makedirs("./outputs/medium-logs")

    # Serve cached results if this author was fetched before.
    file_path = f"./outputs/medium-logs/medium-{handle}.json"
    if os.path.exists(file_path):
        with open(file_path, "r") as file:
            print(f"Returning cached data for author {handle}. If you do not wish to use stored data, delete the file for this author to allow refetching.")
            return json.load(file)

    for item in items:
        # Collect the post's tags from its <category> elements.
        tags = [tag.text for tag in item.find_all('category')]
        # The full post body is HTML inside <content:encoded>; strip it to plain text.
        content = BeautifulSoup(item.find('content:encoded').text, 'html.parser')
        data = {
            'id': get_docid(item.find('guid').text),
            'title': item.find('title').text,
            'url': item.find('link').text.split('?')[0],
            'tags': ','.join(tags),
            'published': item.find('pubDate').text,
            'pageContent': content.get_text()
        }
        publications.append(data)

    with open(file_path, 'w+', encoding='utf-8') as json_file:
        json.dump(publications, json_file, ensure_ascii=True, indent=2)

    print(f"{len(publications)} articles found for author medium.com/@{handle}. Saved to medium-logs/medium-{handle}.json")
    return publications
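
# Usage sketch (hypothetical handle); fetches, caches, and returns posts:
#   posts = fetch_recent_publications("some-author")
#   for post in posts:
#       print(post['title'], post['url'])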

def append_meta(publication, text):
    # Prefix the article text with a JSON block of its metadata so the
    # content keeps its provenance when passed downstream.
    meta = {
        'url': publication.get('url'),
        'tags': publication.get('tags'),
        'title': publication.get('title'),
        'createdAt': publication.get('published'),
        'wordCount': len(text.split(' '))
    }
    return "Article Metadata:\n" + json.dumps(meta) + "\n\nArticle Content:\n" + text