Mirror of https://github.com/Mintplex-Labs/anything-llm.git

Commit c6d030f390: Merge branch 'master' of github.com:Mintplex-Labs/anything-llm
@@ -48,4 +48,15 @@ Now uploads from the frontend will be processed as if you ran the `watch.py` scr
 - ![GCP Project Bar](../images/gcp-project-bar.png)
 - [Enable YouTube Data APIV3](https://console.cloud.google.com/apis/library/youtube.googleapis.com)
 - Once enabled, generate a credential key for this API
-- Paste your key after `GOOGLE_APIS_KEY=` in your `collector/.env` file.
+- Paste your key after `GOOGLE_APIS_KEY=` in your `collector/.env` file.
+
+### Using the Twitter API
+**Required to get data from Twitter with tweepy**
+- Go to https://developer.twitter.com/en/portal/dashboard with your Twitter account
+- Create a new Project App
+- Get your 4 keys and place them in your `collector/.env` file
+  * TW_CONSUMER_KEY
+  * TW_CONSUMER_SECRET
+  * TW_ACCESS_TOKEN
+  * TW_ACCESS_TOKEN_SECRET
+Populate the `.env` with the values
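For reference, a minimal sketch of the relevant `collector/.env` entries. The variable names come from this change; the values are placeholders to replace with your own credentials.

```
# collector/.env -- sample placeholder values only
GOOGLE_APIS_KEY=<your YouTube Data API v3 key>
TW_CONSUMER_KEY=<your consumer key>
TW_CONSUMER_SECRET=<your consumer secret>
TW_ACCESS_TOKEN=<your access token>
TW_ACCESS_TOKEN_SECRET=<your access token secret>
```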
@@ -6,6 +6,7 @@ from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
 from scripts.sitemap import sitemap
+from scripts.twitter import twitter
 
 def main():
     if os.name == 'nt':
@@ -15,7 +16,8 @@ def main():
             '3': 'Substack',
             '4': 'Medium',
             '5': 'Gitbook',
-            '6': 'Sitemap',
+            '6': 'Twitter',
+            '7': 'Sitemap',
         }
         print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.")
         print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n")
@@ -30,7 +32,7 @@ def main():
             {"name": "Medium", "value": "Medium"},
             {"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"},
             {"name": "Gitbook", "value": "Gitbook"},
-            {"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"},
+            {"name": "Twitter", "value": "Twitter"},
             {"name": "Sitemap", "value": "Sitemap"},
             {"name": "Abort", "value": "Abort"},
         ],
@@ -71,8 +73,10 @@ def main():
         exit(0)
     if method == 'Sitemap':
         sitemap()
         exit(0)
+    if method == 'Twitter':
+        twitter()
+        exit(0)
 
     print("Selection was not valid.")
     exit(1)
@@ -109,4 +109,5 @@ xlrd==2.0.1
 XlsxWriter==3.1.2
 yarl==1.9.2
 youtube-transcript-api==0.6.0
-zipp==3.15.0
+zipp==3.15.0
+tweepy==4.14.0
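After pulling this change, reinstall the collector requirements (for example with `pip install -r requirements.txt`) so the new tweepy pin is available. As an optional sanity check, a small sketch (not part of the commit):

```python
# Optional sanity check: confirm the installed tweepy matches the pin above and
# exposes the OAuth1UserHandler class that collector/scripts/twitter.py relies on.
import tweepy

print(tweepy.__version__)                    # expected: 4.14.0
print(hasattr(tweepy, "OAuth1UserHandler"))  # expected: True
```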
@@ -69,7 +69,7 @@ def link():
 def crawler():
     prompt = "Paste in root URI of the pages of interest: "
     new_link = input(prompt)
-    filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")
+    filter_value = input("Add a filter value for the url to ensure links don't wander too far. eg: 'my-domain.com': ")
     #extract this from the uri provided
     root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
     links = []
@@ -82,11 +82,16 @@ def crawler():
     for link in soup.find_all("a"):
         data = link.get('href')
         if (data is not None):
-            if filter_value in data:
-                data = data.strip()
-                print (data)
-                links.append(root_site + data)
-            else:
+            fullpath = data if data[0] != '/' else f"{root_site}{data}"
+            try:
+                destination = urlparse(fullpath).scheme + "://" + urlparse(fullpath).hostname + (urlparse(fullpath).path if urlparse(fullpath).path is not None else '')
+                if filter_value in destination:
+                    data = destination.strip()
+                    print (data)
+                    links.append(data)
+                else:
+                    print (data + " does not apply for linking...")
+            except:
                 print (data + " does not apply for linking...")
     #parse the links found
     parse_links(links)
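A standalone sketch (not part of the commit) of what the new crawler logic does with relative versus absolute hrefs before applying the filter; `root_site`, `filter_value`, and the sample hrefs are made-up values:

```python
from urllib.parse import urlparse

root_site = "https://my-domain.com"  # derived from the root URI the user pastes in
filter_value = "my-domain.com"       # the substring filter the user supplies

for data in ["/docs/intro", "https://my-domain.com/blog/post", "https://other-site.com/page"]:
    # Relative hrefs get the root site prepended; absolute hrefs are kept as-is.
    fullpath = data if data[0] != '/' else f"{root_site}{data}"
    parsed = urlparse(fullpath)
    destination = parsed.scheme + "://" + parsed.hostname + (parsed.path or '')
    if filter_value in destination:
        print("keep:", destination)
    else:
        print("skip:", destination)
```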
collector/scripts/twitter.py (new file, 101 lines)
@@ -0,0 +1,101 @@
"""
Tweepy implementation of twitter reader. Requires the 4 twitter keys to operate.
"""

import tweepy
import os, time
import pandas as pd
import json
from .utils import tokenize, ada_v2_cost

def twitter():
    #get user and number of tweets to read
    username = input("user timeline to read from (blank to ignore): ")
    searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ")
    tweetCount = input("Gather the last number of tweets: ")

    # Read your API keys to call the API.
    consumer_key = os.environ.get("TW_CONSUMER_KEY")
    consumer_secret = os.environ.get("TW_CONSUMER_SECRET")
    access_token = os.environ.get("TW_ACCESS_TOKEN")
    access_token_secret = os.environ.get("TW_ACCESS_TOKEN_SECRET")

    # Check if any of the required environment variables is missing.
    if not consumer_key or not consumer_secret or not access_token or not access_token_secret:
        raise EnvironmentError("One of the twitter API environment variables are missing.")

    # Pass in our twitter API authentication key
    auth = tweepy.OAuth1UserHandler(
        consumer_key, consumer_secret, access_token, access_token_secret
    )

    # Instantiate the tweepy API
    api = tweepy.API(auth, wait_on_rate_limit=True)

    try:
        if (searchQuery == ''):
            tweets = api.user_timeline(screen_name=username, tweet_mode = 'extended', count=tweetCount)
        else:
            tweets = api.search_tweets(q=searchQuery, tweet_mode = 'extended', count=tweetCount)

        # Pulling Some attributes from the tweet
        attributes_container = [
            [tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text]
            for tweet in tweets
        ]

        # Creation of column list to rename the columns in the dataframe
        columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]

        # Creation of Dataframe
        tweets_df = pd.DataFrame(attributes_container, columns=columns)

        totalTokens = 0
        for index, row in tweets_df.iterrows():
            meta_link = twitter_meta(row, True)
            output_filename = f"twitter-{username}-{row['Date Created']}.json"
            output_path = f"./outputs/twitter-logs"

            transaction_output_filename = f"tweet-{username}-{row['id']}.json"
            transaction_output_dir = f"../server/storage/documents/twitter-{username}"

            if not os.path.isdir(output_path):
                os.makedirs(output_path)

            if not os.path.isdir(transaction_output_dir):
                os.makedirs(transaction_output_dir)

            full_text = twitter_meta(row)
            tokenCount = len(tokenize(full_text))
            meta_link['pageContent'] = full_text
            meta_link['token_count_estimate'] = tokenCount
            totalTokens += tokenCount

            with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
                json.dump(meta_link, file, ensure_ascii=True, indent=4)

            with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
                json.dump(meta_link, file, ensure_ascii=True, indent=4)

            # print(f"{transaction_output_dir}/{transaction_output_filename}")

        print(f"{totalTokens} tokens written over {tweets_df.shape[0]} records.")

    except BaseException as e:
        print("Status Failed: ", str(e))
        time.sleep(3)


def twitter_meta(row, metadata_only = False):
    # Note that /anyuser is a known twitter hack for not knowing the user's handle
    # https://stackoverflow.com/questions/897107/can-i-fetch-the-tweet-from-twitter-if-i-know-the-tweets-id
    url = f"http://twitter.com/anyuser/status/{row['id']}"
    title = f"Tweet {row['id']}"
    meta = {
        'url': url,
        'title': title,
        'description': 'Tweet from ' + row["Screen Name"],
        'published': row["Date Created"].strftime('%Y-%m-%d %H:%M:%S'),
        'wordCount': len(row["Tweet"]),
    }
    return "Tweet JSON Metadata:\n"+json.dumps(meta)+"\n\n\nText Content:\n" + row["Tweet"] if metadata_only == False else meta
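To illustrate the shape of the documents this script produces, here is a small sketch (not part of the commit) that calls `twitter_meta` on a made-up row. It assumes it is run from a place where the collector's `scripts` package is importable and the requirements above are installed.

```python
# Sketch only: the field values below are invented to show the output of
# twitter_meta(), which twitter() writes (plus pageContent and token_count_estimate)
# to ./outputs/twitter-logs and ../server/storage/documents/twitter-<username>.
from datetime import datetime
from scripts.twitter import twitter_meta

fake_row = {
    "id": 1234567890,
    "Screen Name": "example_user",
    "Date Created": datetime(2023, 6, 1, 12, 0, 0),
    "Number of Likes": 5,
    "Source of Tweet": "Twitter Web App",
    "Tweet": "Hello from the collector!",
}

meta = twitter_meta(fake_row, metadata_only=True)  # dict: url, title, description, published, wordCount
page_content = twitter_meta(fake_row)              # metadata header followed by the raw tweet text
print(meta)
print(page_content)
```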