Add id to all metadata to prevent errors in frontend document picker (#378)

Add id to all metadata to prevent errors in frontend document picker. Co-authored-by: timothycarambat <rambat1010@gmail.com>
2024-07-07 09:30:07 +02:00 · 2023-11-16 14:36:26 -08:00 · 2023-11-16 14:36:26 -08:00 · f40309cfdb
commit f40309cfdb
parent 73f342eb19
3 changed files with 20 additions and 14 deletions
--- a/collector/scripts/substack_utils.py
+++ b/collector/scripts/substack_utils.py
@ -1,6 +1,7 @@
 import os, json, requests, tempfile
 from requests_html import HTMLSession
 from langchain.document_loaders import UnstructuredHTMLLoader
+from .watch.utils import guid

 def fetch_all_publications(subdomain):
  file_path = f"./outputs/substack-logs/substack-{subdomain}.json"
@ -75,6 +76,7 @@ def get_content(article_link):

 def append_meta(publication, text):
  meta = {
+    'id': guid(),
    'url': publication.get('canonical_url'),
    'thumbnail': publication.get('cover_image'),
    'title': publication.get('title'),
--- a/collector/scripts/twitter.py
+++ b/collector/scripts/twitter.py
@ -7,6 +7,7 @@ import os, time
 import pandas as pd
 import json
 from .utils import tokenize, ada_v2_cost
+from .watch.utils import guid

 def twitter():
    #get user and number of tweets to read
@ -92,6 +93,7 @@ def twitter_meta(row, metadata_only = False):
  url = f"http://twitter.com/anyuser/status/{row['id']}"
  title = f"Tweet {row['id']}"
  meta = {
+    'id': guid(),
    'url': url,
    'title':  title,
    'description': 'Tweet from ' + row["Screen Name"],
--- a/collector/scripts/yt_utils.py
+++ b/collector/scripts/yt_utils.py
@ -1,6 +1,7 @@
 import json, requests, os, re
 from slugify import slugify
 from dotenv import load_dotenv
+from .watch.utils import guid
 load_dotenv()

 def is_yt_short(videoId):
@ -34,6 +35,7 @@ def clean_text(text):

 def append_meta(video, duration, text):
  meta = {
+    'id': guid(),
    'youtubeURL': f"https://youtube.com/watch?v={video.get('id')}",
    'thumbnail': video.get('thumbnail'),
    'description': video.get('description'),