import os, json, requests, re
from bs4 import BeautifulSoup


def get_username(author_url):
    # Handle-style profile URL, e.g. medium.com/@author
    if '@' in author_url:
        pattern = r"medium\.com/@([\w-]+)"
        match = re.search(pattern, author_url)
        return match.group(1) if match else None
    else:
        # Subdomain-style profile URL, e.g. author.medium.com
        # (escape the dot so it is not treated as a regex wildcard)
        pattern = r"([\w-]+)\.medium\.com"
        match = re.search(pattern, author_url)
        return match.group(1) if match else None

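# Illustrative only: both supported profile formats resolve to the same
# handle (the URLs below are placeholders, not real accounts).
#   get_username("https://medium.com/@some-author")  -> "some-author"
#   get_username("https://some-author.medium.com/")  -> "some-author"
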
def get_docid(medium_docpath):
    # Post GUIDs in the feed look like https://medium.com/p/<id>; extract the id.
    pattern = r"medium\.com/p/([\w-]+)"
    match = re.search(pattern, medium_docpath)
    return match.group(1) if match else None

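# Illustrative (placeholder id, not a real post):
#   get_docid("https://medium.com/p/1a2b3c4d5e6f") -> "1a2b3c4d5e6f"
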
def fetch_recent_publications(handle):
    # Medium exposes each author's recent posts as an RSS feed.
    rss_link = f"https://medium.com/feed/@{handle}"
    response = requests.get(rss_link)
    if not response.ok:
        print("Could not fetch RSS results for author.")
        return []

    xml = response.content
    soup = BeautifulSoup(xml, 'xml')
    items = soup.find_all('item')
    publications = []

    # Cache results on disk so repeat runs do not re-hit the feed.
    os.makedirs("./outputs/medium-logs", exist_ok=True)

    file_path = f"./outputs/medium-logs/medium-{handle}.json"

    if os.path.exists(file_path):
        with open(file_path, "r") as file:
            print(f"Returning cached data for Author {handle}. If you do not wish to use stored data then delete the file for this author to allow refetching.")
            return json.load(file)

    for item in items:
        tags = [tag.text for tag in item.find_all('category')]
        # The post body arrives as HTML inside <content:encoded>; parse it so
        # we can emit plain text below.
        content = BeautifulSoup(item.find('content:encoded').text, 'html.parser')
        data = {
            'id': get_docid(item.find('guid').text),
            'title': item.find('title').text,
            'url': item.find('link').text.split('?')[0],  # drop tracking query params
            'tags': ','.join(tags),
            'published': item.find('pubDate').text,
            'pageContent': content.get_text()
        }
        publications.append(data)

    with open(file_path, 'w+', encoding='utf-8') as json_file:
        json.dump(publications, json_file, ensure_ascii=True, indent=2)
        print(f"{len(publications)} articles found for author medium.com/@{handle}. Saved to medium-logs/medium-{handle}.json")

    return publications

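# Shape of each publication dict returned above (values illustrative):
#   {
#     'id': '1a2b3c4d5e6f',          # Medium post id extracted from the guid
#     'title': 'Post title',
#     'url': 'https://medium.com/@some-author/post-slug-1a2b3c4d5e6f',
#     'tags': 'tag1,tag2',
#     'published': 'Mon, 01 Jan 2024 00:00:00 GMT',
#     'pageContent': 'plain-text article body...'
#   }
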
def append_meta(publication, text):
    meta = {
        'url': publication.get('url'),
        'tags': publication.get('tags'),
        'title': publication.get('title'),
        'createdAt': publication.get('published'),
        'wordCount': len(text.split())  # split() handles runs of whitespace and newlines
    }
    return "Article Metadata:\n" + json.dumps(meta) + "\n\nArticle Content:\n" + text

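if __name__ == "__main__":
    # Illustrative usage sketch: "some-author" is a placeholder handle, and
    # fetching requires live network access to medium.com.
    handle = get_username("https://medium.com/@some-author")
    if handle:
        for pub in fetch_recent_publications(handle):
            document = append_meta(pub, pub['pageContent'])
            print(document[:200])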