From f40309cfdb1749fb8db1139759a4af576c6198a8 Mon Sep 17 00:00:00 2001
From: Sean Hatfield
Date: Thu, 16 Nov 2023 14:36:26 -0800
Subject: [PATCH] Add id to all metadata to prevent errors in frontend document picker (#378)

add id to all metadata to prevent errors in frontend document picker

Co-authored-by: timothycarambat
---
 collector/scripts/substack_utils.py | 12 +++++++-----
 collector/scripts/twitter.py        |  8 +++++---
 collector/scripts/yt_utils.py       | 14 ++++++++------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/collector/scripts/substack_utils.py b/collector/scripts/substack_utils.py
index c9530321..b6b5f083 100644
--- a/collector/scripts/substack_utils.py
+++ b/collector/scripts/substack_utils.py
@@ -1,13 +1,14 @@
 import os, json, requests, tempfile
 from requests_html import HTMLSession
 from langchain.document_loaders import UnstructuredHTMLLoader
+from .watch.utils import guid
 
 def fetch_all_publications(subdomain):
   file_path = f"./outputs/substack-logs/substack-{subdomain}.json"
   if os.path.isdir("./outputs/substack-logs") == False:
     os.makedirs("./outputs/substack-logs")
-  
+
   if os.path.exists(file_path):
     with open(file_path, "r") as file:
       print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")
@@ -24,7 +25,7 @@ def fetch_all_publications(subdomain):
       print("Bad response - exiting collection")
       collecting = False
       continue
-    
+
     data = response.json()
 
     if(len(data) ==0 ):
@@ -34,11 +35,11 @@ def fetch_all_publications(subdomain):
     for publication in data:
       publications.append(publication)
     offset = len(publications)
-  
+
   with open(file_path, 'w+', encoding='utf-8') as json_file:
     json.dump(publications, json_file, ensure_ascii=True, indent=2)
   print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/channel-{subdomain}.json")
-  
+
   return publications
 
 def only_valid_publications(publications= []):
@@ -60,7 +61,7 @@ def get_content(article_link):
   if(req.ok == False):
     print("Could not reach this url!")
     return None
-  
+
   req.html.render()
 
   full_text = None
@@ -75,6 +76,7 @@ def get_content(article_link):
 
 def append_meta(publication, text):
   meta = {
+    'id': guid(),
     'url': publication.get('canonical_url'),
     'thumbnail': publication.get('cover_image'),
     'title': publication.get('title'),
diff --git a/collector/scripts/twitter.py b/collector/scripts/twitter.py
index a5c02948..4c085071 100644
--- a/collector/scripts/twitter.py
+++ b/collector/scripts/twitter.py
@@ -7,13 +7,14 @@ import os, time
 import pandas as pd
 import json
 from .utils import tokenize, ada_v2_cost
+from .watch.utils import guid
 
 def twitter():
   #get user and number of tweets to read
   username = input("user timeline to read from (blank to ignore): ")
   searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ")
   tweetCount = input("Gather the last number of tweets: ")
-  
+
   # Read your API keys to call the API.
  consumer_key = os.environ.get("TW_CONSUMER_KEY")
  consumer_secret = os.environ.get("TW_CONSUMER_SECRET")
@@ -43,7 +44,7 @@ def twitter():
     [tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text]
     for tweet in tweets
   ]
-  
+
   # Creation of column list to rename the columns in the dataframe
   columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]
 
@@ -76,7 +77,7 @@ def twitter():
 
   with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
     json.dump(meta_link, file, ensure_ascii=True, indent=4)
-  
+
   # print(f"{transaction_output_dir}/{transaction_output_filename}")
   print(f"{tokenCount} tokens written over {tweets_df.shape[0]} records.")
 
@@ -92,6 +93,7 @@ def twitter_meta(row, metadata_only = False):
   url = f"http://twitter.com/anyuser/status/{row['id']}"
   title = f"Tweet {row['id']}"
   meta = {
+    'id': guid(),
     'url': url,
     'title': title,
     'description': 'Tweet from ' + row["Screen Name"],
diff --git a/collector/scripts/yt_utils.py b/collector/scripts/yt_utils.py
index b4c74bbf..c13fb26e 100644
--- a/collector/scripts/yt_utils.py
+++ b/collector/scripts/yt_utils.py
@@ -1,6 +1,7 @@
 import json, requests, os, re
 from slugify import slugify
 from dotenv import load_dotenv
+from .watch.utils import guid
 load_dotenv()
 
 def is_yt_short(videoId):
@@ -20,13 +21,13 @@ def get_channel_id(channel_link):
     if(response.ok == False):
       print("Handle => ChannelId mapping endpoint is too slow - use regular youtube.com/channel URL")
       return None
-    
+
     json_data = response.json()
     return json_data.get('items')[0].get('id')
   else:
     pattern = r"youtube\.com/channel/([\w-]+)"
     match = re.search(pattern, channel_link)
-    return match.group(1) if match else None 
+    return match.group(1) if match else None
 
 def clean_text(text):
@@ -34,6 +35,7 @@
 
 def append_meta(video, duration, text):
   meta = {
+    'id': guid(),
     'youtubeURL': f"https://youtube.com/watch?v={video.get('id')}",
     'thumbnail': video.get('thumbnail'),
     'description': video.get('description'),
@@ -63,7 +65,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
   if(os.getenv('GOOGLE_APIS_KEY') == None):
     print("GOOGLE_APIS_KEY env variable not set!")
     exit(1)
-  
+
   done = False
   currentPage = None
   pageTokens = []
@@ -93,7 +95,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
 
     for item in response.get('items'):
       if 'id' in item and 'videoId' in item.get('id'):
-        if is_yt_short(item.get('id').get('videoId')): 
+        if is_yt_short(item.get('id').get('videoId')):
           print(f"Filtering out YT Short {item.get('id').get('videoId')}")
           continue
 
@@ -109,12 +111,12 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
         'published': item.get('snippet').get('publishTime'),
       }
       items.append(newItem)
-      
+
     pageTokens.append(currentPage)
 
   data['items'] = items
   with open(file_path, 'w+', encoding='utf-8') as json_file:
     json.dump(data, json_file, ensure_ascii=True, indent=2)
   print(f"{len(items)} videos found for channel {data.get('channelTitle')}. Saved to channel-logs/channel-{channel_id}.json")
-  
+
   return data
\ No newline at end of file
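
Note: the guid helper that each file now imports from collector/scripts/watch/utils.py is not included in this patch. As a rough sketch only (an assumption about that module, not its actual contents), such a helper just has to return a unique string to use as the document id that the frontend document picker expects on every metadata record, for example by wrapping Python's uuid module:

    # Hypothetical sketch of the guid() helper assumed to live in
    # collector/scripts/watch/utils.py; the real implementation may differ.
    from uuid import uuid4

    def guid():
      # Return a random, unique string id for a collected document's metadata.
      return str(uuid4())

Presumably any stable, per-document unique string would satisfy the picker; the point of the patch is simply that the id key is always present.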