From f40309cfdb1749fb8db1139759a4af576c6198a8 Mon Sep 17 00:00:00 2001
From: Sean Hatfield
Date: Thu, 16 Nov 2023 14:36:26 -0800
Subject: [PATCH] Add id to all metadata to prevent errors in frontend document picker (#378)

add id to all metadata to prevent errors in frontend document picker

Co-authored-by: timothycarambat
---
 collector/scripts/substack_utils.py | 12 +++++++-----
 collector/scripts/twitter.py        |  8 +++++---
 collector/scripts/yt_utils.py       | 14 ++++++++------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/collector/scripts/substack_utils.py b/collector/scripts/substack_utils.py
index c9530321..b6b5f083 100644
--- a/collector/scripts/substack_utils.py
+++ b/collector/scripts/substack_utils.py
@@ -1,13 +1,14 @@
 import os, json, requests, tempfile
 from requests_html import HTMLSession
 from langchain.document_loaders import UnstructuredHTMLLoader
+from .watch.utils import guid
 
 def fetch_all_publications(subdomain):
   file_path = f"./outputs/substack-logs/substack-{subdomain}.json"
   if os.path.isdir("./outputs/substack-logs") == False:
     os.makedirs("./outputs/substack-logs")
-  
+
   if os.path.exists(file_path):
     with open(file_path, "r") as file:
       print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")
@@ -24,7 +25,7 @@ def fetch_all_publications(subdomain):
       print("Bad response - exiting collection")
       collecting = False
       continue
-    
+
     data = response.json()
 
     if(len(data) ==0 ):
@@ -34,11 +35,11 @@ def fetch_all_publications(subdomain):
     for publication in data:
       publications.append(publication)
     offset = len(publications)
-  
+
   with open(file_path, 'w+', encoding='utf-8') as json_file:
     json.dump(publications, json_file, ensure_ascii=True, indent=2)
   print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/channel-{subdomain}.json")
-  
+
   return publications
 
 def only_valid_publications(publications= []):
@@ -60,7 +61,7 @@ def get_content(article_link):
   if(req.ok == False):
     print("Could not reach this url!")
     return None
-  
+
   req.html.render()
 
   full_text = None
@@ -75,6 +76,7 @@ def get_content(article_link):
 
 def append_meta(publication, text):
   meta = {
+    'id': guid(),
     'url': publication.get('canonical_url'),
     'thumbnail': publication.get('cover_image'),
     'title': publication.get('title'),
diff --git a/collector/scripts/twitter.py b/collector/scripts/twitter.py
index a5c02948..4c085071 100644
--- a/collector/scripts/twitter.py
+++ b/collector/scripts/twitter.py
@@ -7,13 +7,14 @@ import os, time
 import pandas as pd
 import json
 from .utils import tokenize, ada_v2_cost
+from .watch.utils import guid
 
 def twitter():
   #get user and number of tweets to read
   username = input("user timeline to read from (blank to ignore): ")
   searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ")
   tweetCount = input("Gather the last number of tweets: ")
-  
+
   # Read your API keys to call the API.
  consumer_key = os.environ.get("TW_CONSUMER_KEY")
  consumer_secret = os.environ.get("TW_CONSUMER_SECRET")
@@ -43,7 +44,7 @@ def twitter():
     [tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text]
     for tweet in tweets
   ]
-  
+
   # Creation of column list to rename the columns in the dataframe
   columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]
 
@@ -76,7 +77,7 @@ def twitter():
 
   with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
     json.dump(meta_link, file, ensure_ascii=True, indent=4)
-  
+
   # print(f"{transaction_output_dir}/{transaction_output_filename}")
   print(f"{tokenCount} tokens written over {tweets_df.shape[0]} records.")
 
@@ -92,6 +93,7 @@ def twitter_meta(row, metadata_only = False):
   url = f"http://twitter.com/anyuser/status/{row['id']}"
   title = f"Tweet {row['id']}"
   meta = {
+    'id': guid(),
     'url': url,
     'title': title,
     'description': 'Tweet from ' + row["Screen Name"],
diff --git a/collector/scripts/yt_utils.py b/collector/scripts/yt_utils.py
index b4c74bbf..c13fb26e 100644
--- a/collector/scripts/yt_utils.py
+++ b/collector/scripts/yt_utils.py
@@ -1,6 +1,7 @@
 import json, requests, os, re
 from slugify import slugify
 from dotenv import load_dotenv
+from .watch.utils import guid
 load_dotenv()
 
 def is_yt_short(videoId):
@@ -20,13 +21,13 @@ def get_channel_id(channel_link):
     if(response.ok == False):
       print("Handle => ChannelId mapping endpoint is too slow - use regular youtube.com/channel URL")
       return None
-    
+
     json_data = response.json()
     return json_data.get('items')[0].get('id')
   else:
     pattern = r"youtube\.com/channel/([\w-]+)"
     match = re.search(pattern, channel_link)
-    return match.group(1) if match else None 
+    return match.group(1) if match else None
 
 def clean_text(text):
@@ -34,6 +35,7 @@
 
 def append_meta(video, duration, text):
   meta = {
+    'id': guid(),
     'youtubeURL': f"https://youtube.com/watch?v={video.get('id')}",
     'thumbnail': video.get('thumbnail'),
     'description': video.get('description'),
@@ -63,7 +65,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
   if(os.getenv('GOOGLE_APIS_KEY') == None):
     print("GOOGLE_APIS_KEY env variable not set!")
     exit(1)
-  
+
   done = False
   currentPage = None
   pageTokens = []
@@ -93,7 +95,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
 
     for item in response.get('items'):
       if 'id' in item and 'videoId' in item.get('id'):
-        if is_yt_short(item.get('id').get('videoId')): 
+        if is_yt_short(item.get('id').get('videoId')):
           print(f"Filtering out YT Short {item.get('id').get('videoId')}")
           continue
 
@@ -109,12 +111,12 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
         'published': item.get('snippet').get('publishTime'),
       }
       items.append(newItem)
-      
+
     pageTokens.append(currentPage)
 
   data['items'] = items
   with open(file_path, 'w+', encoding='utf-8') as json_file:
     json.dump(data, json_file, ensure_ascii=True, indent=2)
   print(f"{len(items)} videos found for channel {data.get('channelTitle')}. Saved to channel-logs/channel-{channel_id}.json")
-  
+
   return data
\ No newline at end of file
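
Note: the guid helper that each file now imports from collector/scripts/watch/utils.py is not included in this patch. As a rough sketch only (an assumption about that module, not its actual contents), such a helper just has to return a unique string to use as the document id that the frontend document picker expects on every metadata record, for example by wrapping Python's uuid module:

    # Hypothetical sketch of the guid() helper assumed to live in
    # collector/scripts/watch/utils.py; the real implementation may differ.
    from uuid import uuid4

    def guid():
      # Return a random, unique string id for a collected document's metadata.
      return str(uuid4())

Presumably any stable, per-document unique string would satisfy the picker; the point of the patch is simply that the id key is always present.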