Add id to all metadata to prevent errors in frontend document picker (#378)

add id to all metadata to prevent errors in frontend document picker

Co-authored-by: timothycarambat <rambat1010@gmail.com>
Sean Hatfield 2023-11-16 14:36:26 -08:00 committed by GitHub
parent 73f342eb19
commit f40309cfdb
3 changed files with 20 additions and 14 deletions
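
Each of the three collector scripts below now imports a guid helper from .watch.utils and stamps every metadata dict it produces with a unique id, which is what the frontend document picker was missing. The helper's implementation is not part of this diff; a minimal sketch of what such a helper typically looks like, assuming it simply wraps Python's uuid module:

from uuid import uuid4

def guid():
    # Sketch only: return a random, globally unique string id for a
    # document record. The real .watch.utils.guid may differ.
    return str(uuid4())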

@@ -1,13 +1,14 @@
import os, json, requests, tempfile
from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .watch.utils import guid
def fetch_all_publications(subdomain):
file_path = f"./outputs/substack-logs/substack-{subdomain}.json"
if os.path.isdir("./outputs/substack-logs") == False:
os.makedirs("./outputs/substack-logs")
if os.path.exists(file_path):
with open(file_path, "r") as file:
print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")
@@ -24,7 +25,7 @@ def fetch_all_publications(subdomain):
print("Bad response - exiting collection")
collecting = False
continue
data = response.json()
if(len(data) ==0 ):
@@ -34,11 +35,11 @@ def fetch_all_publications(subdomain):
for publication in data:
publications.append(publication)
offset = len(publications)
with open(file_path, 'w+', encoding='utf-8') as json_file:
json.dump(publications, json_file, ensure_ascii=True, indent=2)
print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/channel-{subdomain}.json")
return publications
def only_valid_publications(publications= []):
@@ -60,7 +61,7 @@ def get_content(article_link):
if(req.ok == False):
print("Could not reach this url!")
return None
req.html.render()
full_text = None
@@ -75,6 +76,7 @@ def get_content(article_link):
def append_meta(publication, text):
meta = {
'id': guid(),
'url': publication.get('canonical_url'),
'thumbnail': publication.get('cover_image'),
'title': publication.get('title'),
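
All three collectors build their metadata dicts the same way, so the same sanity check works for each of them. A hypothetical helper (not part of this commit) that confirms every record the picker will list carries a distinct, non-empty id:

def check_ids(metadatas):
    # metadatas: a list of dicts as returned by append_meta / twitter_meta.
    seen = set()
    for meta in metadatas:
        doc_id = meta.get('id')
        assert doc_id, f"metadata missing id: {meta.get('title')}"
        assert doc_id not in seen, f"duplicate id: {doc_id}"
        seen.add(doc_id)
    return True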

@@ -7,13 +7,14 @@ import os, time
import pandas as pd
import json
from .utils import tokenize, ada_v2_cost
from .watch.utils import guid
def twitter():
#get user and number of tweets to read
username = input("user timeline to read from (blank to ignore): ")
searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ")
tweetCount = input("Gather the last number of tweets: ")
# Read your API keys to call the API.
consumer_key = os.environ.get("TW_CONSUMER_KEY")
consumer_secret = os.environ.get("TW_CONSUMER_SECRET")
@@ -43,7 +44,7 @@ def twitter():
[tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text]
for tweet in tweets
]
# Creation of column list to rename the columns in the dataframe
columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]
@@ -76,7 +77,7 @@ def twitter():
with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
json.dump(meta_link, file, ensure_ascii=True, indent=4)
# print(f"{transaction_output_dir}/{transaction_output_filename}")
print(f"{tokenCount} tokens written over {tweets_df.shape[0]} records.")
@@ -92,6 +93,7 @@ def twitter_meta(row, metadata_only = False):
url = f"http://twitter.com/anyuser/status/{row['id']}"
title = f"Tweet {row['id']}"
meta = {
'id': guid(),
'url': url,
'title': title,
'description': 'Tweet from ' + row["Screen Name"],
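
Note that twitter_meta already handles a per-tweet identifier: row['id'] is the numeric tweet id used to build the url and title above. The new 'id': guid() entry is independent of it and gives the frontend picker the same uniform document key it now gets from the Substack and YouTube collectors. A small illustration with made-up row values (guid is the helper imported from .watch.utils):

row = {'id': 1725051337777, 'Screen Name': 'someuser'}   # hypothetical dataframe row
meta = {
    'id': guid(),                                        # document-level key for the picker
    'url': f"http://twitter.com/anyuser/status/{row['id']}",  # still derived from the tweet id
    'title': f"Tweet {row['id']}",
    'description': 'Tweet from ' + row['Screen Name'],
}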

@@ -1,6 +1,7 @@
import json, requests, os, re
from slugify import slugify
from dotenv import load_dotenv
from .watch.utils import guid
load_dotenv()
def is_yt_short(videoId):
@@ -20,13 +21,13 @@ def get_channel_id(channel_link):
if(response.ok == False):
print("Handle => ChannelId mapping endpoint is too slow - use regular youtube.com/channel URL")
return None
json_data = response.json()
return json_data.get('items')[0].get('id')
else:
pattern = r"youtube\.com/channel/([\w-]+)"
match = re.search(pattern, channel_link)
return match.group(1) if match else None
def clean_text(text):
@@ -34,6 +35,7 @@ def clean_text(text):
def append_meta(video, duration, text):
meta = {
'id': guid(),
'youtubeURL': f"https://youtube.com/watch?v={video.get('id')}",
'thumbnail': video.get('thumbnail'),
'description': video.get('description'),
@@ -63,7 +65,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
if(os.getenv('GOOGLE_APIS_KEY') == None):
print("GOOGLE_APIS_KEY env variable not set!")
exit(1)
done = False
currentPage = None
pageTokens = []
@@ -93,7 +95,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
for item in response.get('items'):
if 'id' in item and 'videoId' in item.get('id'):
if is_yt_short(item.get('id').get('videoId')):
print(f"Filtering out YT Short {item.get('id').get('videoId')}")
continue
@@ -109,12 +111,12 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
'published': item.get('snippet').get('publishTime'),
}
items.append(newItem)
pageTokens.append(currentPage)
data['items'] = items
with open(file_path, 'w+', encoding='utf-8') as json_file:
json.dump(data, json_file, ensure_ascii=True, indent=2)
print(f"{len(items)} videos found for channel {data.get('channelTitle')}. Saved to channel-logs/channel-{channel_id}.json")
return data
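
One caveat worth noting: documents collected before this change were written without an id, and those are exactly the records that trip up the frontend picker. The simplest fix is to delete and re-collect them, but a hedged backfill sketch along these lines could also patch old records in place (the glob path is an assumption; point it at wherever your processed document JSON actually lives):

import json, glob
from uuid import uuid4

for path in glob.glob("./outputs/**/*.json", recursive=True):
    with open(path, "r", encoding="utf-8") as f:
        doc = json.load(f)
    # Only patch single-document metadata objects that predate the id field.
    if isinstance(doc, dict) and "id" not in doc:
        doc["id"] = str(uuid4())
        with open(path, "w", encoding="utf-8") as f:
            json.dump(doc, f, ensure_ascii=True, indent=2)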