import os, json, requests, re
from bs4 import BeautifulSoup


def get_username(author_url):
    # Handle-style profile URL, e.g. medium.com/@author
    if '@' in author_url:
        pattern = r"medium\.com/@([\w-]+)"
        match = re.search(pattern, author_url)
        return match.group(1) if match else None
    else:
        # Subdomain-style profile URL, e.g. author.medium.com
        # (escape the dot so it is not treated as a regex wildcard)
        pattern = r"([\w-]+)\.medium\.com"
        match = re.search(pattern, author_url)
        return match.group(1) if match else None

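# Illustrative only: both supported profile formats resolve to the same
# handle (the URLs below are placeholders, not real accounts).
#   get_username("https://medium.com/@some-author")  -> "some-author"
#   get_username("https://some-author.medium.com/")  -> "some-author"
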
def get_docid(medium_docpath):
    # Post GUIDs in the feed look like https://medium.com/p/<id>; extract the id.
    pattern = r"medium\.com/p/([\w-]+)"
    match = re.search(pattern, medium_docpath)
    return match.group(1) if match else None

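# Illustrative (placeholder id, not a real post):
#   get_docid("https://medium.com/p/1a2b3c4d5e6f") -> "1a2b3c4d5e6f"
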
def fetch_recent_publications(handle):
    # Medium exposes each author's recent posts as an RSS feed.
    rss_link = f"https://medium.com/feed/@{handle}"
    response = requests.get(rss_link)
    if not response.ok:
        print("Could not fetch RSS results for author.")
        return []

    xml = response.content
    soup = BeautifulSoup(xml, 'xml')
    items = soup.find_all('item')
    publications = []

    # Cache results on disk so repeat runs do not re-hit the feed.
    os.makedirs("./outputs/medium-logs", exist_ok=True)

    file_path = f"./outputs/medium-logs/medium-{handle}.json"

    if os.path.exists(file_path):
        with open(file_path, "r") as file:
            print(f"Returning cached data for Author {handle}. If you do not wish to use stored data then delete the file for this author to allow refetching.")
            return json.load(file)

    for item in items:
        tags = [tag.text for tag in item.find_all('category')]
        # The post body arrives as HTML inside <content:encoded>; parse it so
        # we can emit plain text below.
        content = BeautifulSoup(item.find('content:encoded').text, 'html.parser')
        data = {
            'id': get_docid(item.find('guid').text),
            'title': item.find('title').text,
            'url': item.find('link').text.split('?')[0],  # drop tracking query params
            'tags': ','.join(tags),
            'published': item.find('pubDate').text,
            'pageContent': content.get_text()
        }
        publications.append(data)

    with open(file_path, 'w+', encoding='utf-8') as json_file:
        json.dump(publications, json_file, ensure_ascii=True, indent=2)
        print(f"{len(publications)} articles found for author medium.com/@{handle}. Saved to medium-logs/medium-{handle}.json")

    return publications

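# Shape of each publication dict returned above (values illustrative):
#   {
#     'id': '1a2b3c4d5e6f',          # Medium post id extracted from the guid
#     'title': 'Post title',
#     'url': 'https://medium.com/@some-author/post-slug-1a2b3c4d5e6f',
#     'tags': 'tag1,tag2',
#     'published': 'Mon, 01 Jan 2024 00:00:00 GMT',
#     'pageContent': 'plain-text article body...'
#   }
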
def append_meta(publication, text):
    meta = {
        'url': publication.get('url'),
        'tags': publication.get('tags'),
        'title': publication.get('title'),
        'createdAt': publication.get('published'),
        'wordCount': len(text.split())  # split() handles runs of whitespace and newlines
    }
    return "Article Metadata:\n" + json.dumps(meta) + "\n\nArticle Content:\n" + text

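if __name__ == "__main__":
    # Illustrative usage sketch: "some-author" is a placeholder handle, and
    # fetching requires live network access to medium.com.
    handle = get_username("https://medium.com/@some-author")
    if handle:
        for pub in fetch_recent_publications(handle):
            document = append_meta(pub, pub['pageContent'])
            print(document[:200])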