mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2024-10-04 01:40:12 +02:00
Add id to all metadata to prevent errors in frontend document picker (#378)
add id to all metadata to prevent errors in frontend document picker Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
73f342eb19
commit
f40309cfdb
@ -1,13 +1,14 @@
|
|||||||
import os, json, requests, tempfile
|
import os, json, requests, tempfile
|
||||||
from requests_html import HTMLSession
|
from requests_html import HTMLSession
|
||||||
from langchain.document_loaders import UnstructuredHTMLLoader
|
from langchain.document_loaders import UnstructuredHTMLLoader
|
||||||
|
from .watch.utils import guid
|
||||||
|
|
||||||
def fetch_all_publications(subdomain):
|
def fetch_all_publications(subdomain):
|
||||||
file_path = f"./outputs/substack-logs/substack-{subdomain}.json"
|
file_path = f"./outputs/substack-logs/substack-{subdomain}.json"
|
||||||
|
|
||||||
if os.path.isdir("./outputs/substack-logs") == False:
|
if os.path.isdir("./outputs/substack-logs") == False:
|
||||||
os.makedirs("./outputs/substack-logs")
|
os.makedirs("./outputs/substack-logs")
|
||||||
|
|
||||||
if os.path.exists(file_path):
|
if os.path.exists(file_path):
|
||||||
with open(file_path, "r") as file:
|
with open(file_path, "r") as file:
|
||||||
print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")
|
print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")
|
||||||
@ -24,7 +25,7 @@ def fetch_all_publications(subdomain):
|
|||||||
print("Bad response - exiting collection")
|
print("Bad response - exiting collection")
|
||||||
collecting = False
|
collecting = False
|
||||||
continue
|
continue
|
||||||
|
|
||||||
data = response.json()
|
data = response.json()
|
||||||
|
|
||||||
if(len(data) ==0 ):
|
if(len(data) ==0 ):
|
||||||
@ -34,11 +35,11 @@ def fetch_all_publications(subdomain):
|
|||||||
for publication in data:
|
for publication in data:
|
||||||
publications.append(publication)
|
publications.append(publication)
|
||||||
offset = len(publications)
|
offset = len(publications)
|
||||||
|
|
||||||
with open(file_path, 'w+', encoding='utf-8') as json_file:
|
with open(file_path, 'w+', encoding='utf-8') as json_file:
|
||||||
json.dump(publications, json_file, ensure_ascii=True, indent=2)
|
json.dump(publications, json_file, ensure_ascii=True, indent=2)
|
||||||
print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/channel-{subdomain}.json")
|
print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/channel-{subdomain}.json")
|
||||||
|
|
||||||
return publications
|
return publications
|
||||||
|
|
||||||
def only_valid_publications(publications= []):
|
def only_valid_publications(publications= []):
|
||||||
@ -60,7 +61,7 @@ def get_content(article_link):
|
|||||||
if(req.ok == False):
|
if(req.ok == False):
|
||||||
print("Could not reach this url!")
|
print("Could not reach this url!")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
req.html.render()
|
req.html.render()
|
||||||
|
|
||||||
full_text = None
|
full_text = None
|
||||||
@ -75,6 +76,7 @@ def get_content(article_link):
|
|||||||
|
|
||||||
def append_meta(publication, text):
|
def append_meta(publication, text):
|
||||||
meta = {
|
meta = {
|
||||||
|
'id': guid(),
|
||||||
'url': publication.get('canonical_url'),
|
'url': publication.get('canonical_url'),
|
||||||
'thumbnail': publication.get('cover_image'),
|
'thumbnail': publication.get('cover_image'),
|
||||||
'title': publication.get('title'),
|
'title': publication.get('title'),
|
||||||
|
@ -7,13 +7,14 @@ import os, time
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import json
|
import json
|
||||||
from .utils import tokenize, ada_v2_cost
|
from .utils import tokenize, ada_v2_cost
|
||||||
|
from .watch.utils import guid
|
||||||
|
|
||||||
def twitter():
|
def twitter():
|
||||||
#get user and number of tweets to read
|
#get user and number of tweets to read
|
||||||
username = input("user timeline to read from (blank to ignore): ")
|
username = input("user timeline to read from (blank to ignore): ")
|
||||||
searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ")
|
searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ")
|
||||||
tweetCount = input("Gather the last number of tweets: ")
|
tweetCount = input("Gather the last number of tweets: ")
|
||||||
|
|
||||||
# Read your API keys to call the API.
|
# Read your API keys to call the API.
|
||||||
consumer_key = os.environ.get("TW_CONSUMER_KEY")
|
consumer_key = os.environ.get("TW_CONSUMER_KEY")
|
||||||
consumer_secret = os.environ.get("TW_CONSUMER_SECRET")
|
consumer_secret = os.environ.get("TW_CONSUMER_SECRET")
|
||||||
@ -43,7 +44,7 @@ def twitter():
|
|||||||
[tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text]
|
[tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text]
|
||||||
for tweet in tweets
|
for tweet in tweets
|
||||||
]
|
]
|
||||||
|
|
||||||
# Creation of column list to rename the columns in the dataframe
|
# Creation of column list to rename the columns in the dataframe
|
||||||
columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]
|
columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]
|
||||||
|
|
||||||
@ -76,7 +77,7 @@ def twitter():
|
|||||||
|
|
||||||
with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
|
with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
|
||||||
json.dump(meta_link, file, ensure_ascii=True, indent=4)
|
json.dump(meta_link, file, ensure_ascii=True, indent=4)
|
||||||
|
|
||||||
# print(f"{transaction_output_dir}/{transaction_output_filename}")
|
# print(f"{transaction_output_dir}/{transaction_output_filename}")
|
||||||
|
|
||||||
print(f"{tokenCount} tokens written over {tweets_df.shape[0]} records.")
|
print(f"{tokenCount} tokens written over {tweets_df.shape[0]} records.")
|
||||||
@ -92,6 +93,7 @@ def twitter_meta(row, metadata_only = False):
|
|||||||
url = f"http://twitter.com/anyuser/status/{row['id']}"
|
url = f"http://twitter.com/anyuser/status/{row['id']}"
|
||||||
title = f"Tweet {row['id']}"
|
title = f"Tweet {row['id']}"
|
||||||
meta = {
|
meta = {
|
||||||
|
'id': guid(),
|
||||||
'url': url,
|
'url': url,
|
||||||
'title': title,
|
'title': title,
|
||||||
'description': 'Tweet from ' + row["Screen Name"],
|
'description': 'Tweet from ' + row["Screen Name"],
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import json, requests, os, re
|
import json, requests, os, re
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
from .watch.utils import guid
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
def is_yt_short(videoId):
|
def is_yt_short(videoId):
|
||||||
@ -20,13 +21,13 @@ def get_channel_id(channel_link):
|
|||||||
if(response.ok == False):
|
if(response.ok == False):
|
||||||
print("Handle => ChannelId mapping endpoint is too slow - use regular youtube.com/channel URL")
|
print("Handle => ChannelId mapping endpoint is too slow - use regular youtube.com/channel URL")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
json_data = response.json()
|
json_data = response.json()
|
||||||
return json_data.get('items')[0].get('id')
|
return json_data.get('items')[0].get('id')
|
||||||
else:
|
else:
|
||||||
pattern = r"youtube\.com/channel/([\w-]+)"
|
pattern = r"youtube\.com/channel/([\w-]+)"
|
||||||
match = re.search(pattern, channel_link)
|
match = re.search(pattern, channel_link)
|
||||||
return match.group(1) if match else None
|
return match.group(1) if match else None
|
||||||
|
|
||||||
|
|
||||||
def clean_text(text):
|
def clean_text(text):
|
||||||
@ -34,6 +35,7 @@ def clean_text(text):
|
|||||||
|
|
||||||
def append_meta(video, duration, text):
|
def append_meta(video, duration, text):
|
||||||
meta = {
|
meta = {
|
||||||
|
'id': guid(),
|
||||||
'youtubeURL': f"https://youtube.com/watch?v={video.get('id')}",
|
'youtubeURL': f"https://youtube.com/watch?v={video.get('id')}",
|
||||||
'thumbnail': video.get('thumbnail'),
|
'thumbnail': video.get('thumbnail'),
|
||||||
'description': video.get('description'),
|
'description': video.get('description'),
|
||||||
@ -63,7 +65,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
|
|||||||
if(os.getenv('GOOGLE_APIS_KEY') == None):
|
if(os.getenv('GOOGLE_APIS_KEY') == None):
|
||||||
print("GOOGLE_APIS_KEY env variable not set!")
|
print("GOOGLE_APIS_KEY env variable not set!")
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
done = False
|
done = False
|
||||||
currentPage = None
|
currentPage = None
|
||||||
pageTokens = []
|
pageTokens = []
|
||||||
@ -93,7 +95,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
|
|||||||
|
|
||||||
for item in response.get('items'):
|
for item in response.get('items'):
|
||||||
if 'id' in item and 'videoId' in item.get('id'):
|
if 'id' in item and 'videoId' in item.get('id'):
|
||||||
if is_yt_short(item.get('id').get('videoId')):
|
if is_yt_short(item.get('id').get('videoId')):
|
||||||
print(f"Filtering out YT Short {item.get('id').get('videoId')}")
|
print(f"Filtering out YT Short {item.get('id').get('videoId')}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -109,12 +111,12 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
|
|||||||
'published': item.get('snippet').get('publishTime'),
|
'published': item.get('snippet').get('publishTime'),
|
||||||
}
|
}
|
||||||
items.append(newItem)
|
items.append(newItem)
|
||||||
|
|
||||||
pageTokens.append(currentPage)
|
pageTokens.append(currentPage)
|
||||||
|
|
||||||
data['items'] = items
|
data['items'] = items
|
||||||
with open(file_path, 'w+', encoding='utf-8') as json_file:
|
with open(file_path, 'w+', encoding='utf-8') as json_file:
|
||||||
json.dump(data, json_file, ensure_ascii=True, indent=2)
|
json.dump(data, json_file, ensure_ascii=True, indent=2)
|
||||||
print(f"{len(items)} videos found for channel {data.get('channelTitle')}. Saved to channel-logs/channel-{channel_id}.json")
|
print(f"{len(items)} videos found for channel {data.get('channelTitle')}. Saved to channel-logs/channel-{channel_id}.json")
|
||||||
|
|
||||||
return data
|
return data
|
Loading…
Reference in New Issue
Block a user