Add id to all metadata to prevent errors in frontend document picker (#378)

add id to all metadata to prevent errors in frontend document picker

Co-authored-by: timothycarambat <rambat1010@gmail.com>
Sean Hatfield 2023-11-16 14:36:26 -08:00 committed by GitHub
parent 73f342eb19
commit f40309cfdb
3 changed files with 20 additions and 14 deletions
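
Each of the three collector scripts below now imports a guid helper from .watch.utils and stamps every metadata dict it produces with a unique id, which is what the frontend document picker was missing. The helper's implementation is not part of this diff; a minimal sketch of what such a helper typically looks like, assuming it simply wraps Python's uuid module:

from uuid import uuid4

def guid():
    # Sketch only: return a random, globally unique string id for a
    # document record. The real .watch.utils.guid may differ.
    return str(uuid4())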

@@ -1,13 +1,14 @@
import os, json, requests, tempfile
from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .watch.utils import guid
def fetch_all_publications(subdomain):
file_path = f"./outputs/substack-logs/substack-{subdomain}.json"
if os.path.isdir("./outputs/substack-logs") == False:
os.makedirs("./outputs/substack-logs")
if os.path.exists(file_path):
with open(file_path, "r") as file:
print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")
@@ -24,7 +25,7 @@ def fetch_all_publications(subdomain):
print("Bad response - exiting collection")
collecting = False
continue
data = response.json()
if(len(data) ==0 ):
@@ -34,11 +35,11 @@ def fetch_all_publications(subdomain):
for publication in data:
publications.append(publication)
offset = len(publications)
with open(file_path, 'w+', encoding='utf-8') as json_file:
json.dump(publications, json_file, ensure_ascii=True, indent=2)
print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/channel-{subdomain}.json")
return publications
def only_valid_publications(publications= []):
@@ -60,7 +61,7 @@ def get_content(article_link):
if(req.ok == False):
print("Could not reach this url!")
return None
req.html.render()
full_text = None
@@ -75,6 +76,7 @@ def get_content(article_link):
def append_meta(publication, text):
meta = {
'id': guid(),
'url': publication.get('canonical_url'),
'thumbnail': publication.get('cover_image'),
'title': publication.get('title'),
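
All three collectors build their metadata dicts the same way, so the same sanity check works for each of them. A hypothetical helper (not part of this commit) that confirms every record the picker will list carries a distinct, non-empty id:

def check_ids(metadatas):
    # metadatas: a list of dicts as returned by append_meta / twitter_meta.
    seen = set()
    for meta in metadatas:
        doc_id = meta.get('id')
        assert doc_id, f"metadata missing id: {meta.get('title')}"
        assert doc_id not in seen, f"duplicate id: {doc_id}"
        seen.add(doc_id)
    return True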

@@ -7,13 +7,14 @@ import os, time
import pandas as pd
import json
from .utils import tokenize, ada_v2_cost
from .watch.utils import guid
def twitter():
#get user and number of tweets to read
username = input("user timeline to read from (blank to ignore): ")
searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ")
tweetCount = input("Gather the last number of tweets: ")
# Read your API keys to call the API.
consumer_key = os.environ.get("TW_CONSUMER_KEY")
consumer_secret = os.environ.get("TW_CONSUMER_SECRET")
@@ -43,7 +44,7 @@ def twitter():
[tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text]
for tweet in tweets
]
# Creation of column list to rename the columns in the dataframe
columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]
@@ -76,7 +77,7 @@ def twitter():
with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
json.dump(meta_link, file, ensure_ascii=True, indent=4)
# print(f"{transaction_output_dir}/{transaction_output_filename}")
print(f"{tokenCount} tokens written over {tweets_df.shape[0]} records.")
@@ -92,6 +93,7 @@ def twitter_meta(row, metadata_only = False):
url = f"http://twitter.com/anyuser/status/{row['id']}"
title = f"Tweet {row['id']}"
meta = {
'id': guid(),
'url': url,
'title': title,
'description': 'Tweet from ' + row["Screen Name"],
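
Note that twitter_meta already handles a per-tweet identifier: row['id'] is the numeric tweet id used to build the url and title above. The new 'id': guid() entry is independent of it and gives the frontend picker the same uniform document key it now gets from the Substack and YouTube collectors. A small illustration with made-up row values (guid is the helper imported from .watch.utils):

row = {'id': 1725051337777, 'Screen Name': 'someuser'}   # hypothetical dataframe row
meta = {
    'id': guid(),                                        # document-level key for the picker
    'url': f"http://twitter.com/anyuser/status/{row['id']}",  # still derived from the tweet id
    'title': f"Tweet {row['id']}",
    'description': 'Tweet from ' + row['Screen Name'],
}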

@@ -1,6 +1,7 @@
import json, requests, os, re
from slugify import slugify
from dotenv import load_dotenv
from .watch.utils import guid
load_dotenv()
def is_yt_short(videoId):
@@ -20,13 +21,13 @@ def get_channel_id(channel_link):
if(response.ok == False):
print("Handle => ChannelId mapping endpoint is too slow - use regular youtube.com/channel URL")
return None
json_data = response.json()
return json_data.get('items')[0].get('id')
else:
pattern = r"youtube\.com/channel/([\w-]+)"
match = re.search(pattern, channel_link)
return match.group(1) if match else None
def clean_text(text):
@@ -34,6 +35,7 @@ def clean_text(text):
def append_meta(video, duration, text):
meta = {
'id': guid(),
'youtubeURL': f"https://youtube.com/watch?v={video.get('id')}",
'thumbnail': video.get('thumbnail'),
'description': video.get('description'),
@@ -63,7 +65,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
if(os.getenv('GOOGLE_APIS_KEY') == None):
print("GOOGLE_APIS_KEY env variable not set!")
exit(1)
done = False
currentPage = None
pageTokens = []
@@ -93,7 +95,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
for item in response.get('items'):
if 'id' in item and 'videoId' in item.get('id'):
if is_yt_short(item.get('id').get('videoId')):
print(f"Filtering out YT Short {item.get('id').get('videoId')}")
continue
@@ -109,12 +111,12 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
'published': item.get('snippet').get('publishTime'),
}
items.append(newItem)
pageTokens.append(currentPage)
data['items'] = items
with open(file_path, 'w+', encoding='utf-8') as json_file:
json.dump(data, json_file, ensure_ascii=True, indent=2)
print(f"{len(items)} videos found for channel {data.get('channelTitle')}. Saved to channel-logs/channel-{channel_id}.json")
return data
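
One caveat worth noting: documents collected before this change were written without an id, and those are exactly the records that trip up the frontend picker. The simplest fix is to delete and re-collect them, but a hedged backfill sketch along these lines could also patch old records in place (the glob path is an assumption; point it at wherever your processed document JSON actually lives):

import json, glob
from uuid import uuid4

for path in glob.glob("./outputs/**/*.json", recursive=True):
    with open(path, "r", encoding="utf-8") as f:
        doc = json.load(f)
    # Only patch single-document metadata objects that predate the id field.
    if isinstance(doc, dict) and "id" not in doc:
        doc["id"] = str(uuid4())
        with open(path, "w", encoding="utf-8") as f:
            json.dump(doc, f, ensure_ascii=True, indent=2)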