# Mirror of https://github.com/Mintplex-Labs/anything-llm.git
import json, requests, os, re
from slugify import slugify
from dotenv import load_dotenv

# Load environment variables (e.g. GOOGLE_APIS_KEY) from a local .env file.
load_dotenv()

def is_yt_short(videoId):
    # Probe the /shorts/ URL: only an actual Short serves it with HTTP 200.
    # requests.head does not follow redirects by default, so a regular video,
    # whose /shorts/ URL redirects to /watch, yields a non-200 status.
    url = 'https://www.youtube.com/shorts/' + videoId
    ret = requests.head(url)
    return ret.status_code == 200

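# Illustrative call (an added example; the result is network-dependent and
# relies on YouTube's current redirect behavior):
#
#   is_yt_short("dQw4w9WgXcQ")  # False for a standard /watch video
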
def get_channel_id(channel_link):
    if '@' in channel_link:
        # Handle-style URL (youtube.com/@handle): resolve the handle to a
        # channelId through the lemnoslife mapping API.
        pattern = r'https?://www\.youtube\.com/(@\w+)/?'
        match = re.match(pattern, channel_link)
        if match is None:
            return None
        handle = match.group(1)
        print('Need to map username to channelId - this can take a while sometimes.')
        response = requests.get(f"https://yt.lemnoslife.com/channels?handle={handle}", timeout=20)

        if not response.ok:
            print("Handle => ChannelId mapping endpoint is too slow - use regular youtube.com/channel URL")
            return None

        json_data = response.json()
        return json_data.get('items')[0].get('id')
    else:
        # Channel-style URL (youtube.com/channel/<id>): extract the id directly.
        pattern = r"youtube\.com/channel/([\w-]+)"
        match = re.search(pattern, channel_link)
        return match.group(1) if match else None

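# Illustrative calls (added examples; the @handle form requires the lemnoslife
# endpoint to be reachable, and the handle and channel id are placeholders):
#
#   get_channel_id("https://www.youtube.com/@SomeHandle")        # resolved via handle lookup
#   get_channel_id("https://youtube.com/channel/UCabc123_-xyz")  # "UCabc123_-xyz" via regex
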
def clean_text(text):
    # Remove bracketed transcript annotations such as "[Music]" or "[Applause]".
    return re.sub(r"\[.*?\]", "", text)

def append_meta(video, duration, text):
    # Prepend the video's metadata (as pretty-printed JSON) to its transcript
    # so both travel together in a single document.
    meta = {
        'youtubeURL': f"https://youtube.com/watch?v={video.get('id')}",
        'thumbnail': video.get('thumbnail'),
        'description': video.get('description'),
        'createdAt': video.get('published'),
        'videoDurationInSeconds': duration,
    }
    return "Video JSON Metadata:\n" + json.dumps(meta, indent=4) + "\n\n\nAudio Transcript:\n" + text

def get_duration(json_str):
    # Approximate the video duration with the start timestamp of the last
    # transcript segment (the final segment's own length is not included).
    data = json.loads(json_str)
    return data[-1].get('start')

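# Illustrative input for get_duration, assuming transcript segments shaped like
# youtube-transcript-api output (dicts with "text", "start", "duration"); this
# is an assumption, since the caller lives elsewhere in the repo:
#
#   segments = '[{"text": "hi", "start": 0.0, "duration": 1.4}, {"text": "bye", "start": 61.3, "duration": 2.0}]'
#   get_duration(segments)  # 61.3
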
def fetch_channel_video_information(channel_id, windowSize=50):
    if channel_id is None or len(channel_id) == 0:
        print("No channel id provided!")
        exit(1)

    if not os.path.isdir("./outputs/channel-logs"):
        os.makedirs("./outputs/channel-logs")

    # Reuse a previously saved crawl for this channel if one exists.
    file_path = f"./outputs/channel-logs/channel-{channel_id}.json"
    if os.path.exists(file_path):
        with open(file_path, "r") as file:
            print(f"Returning cached data for channel {channel_id}. If you do not wish to use stored data then delete the file for this channel to allow refetching.")
            return json.load(file)

    if os.getenv('GOOGLE_APIS_KEY') is None:
        print("GOOGLE_APIS_KEY env variable not set!")
        exit(1)

    done = False
    currentPage = None
    pageTokens = []
    items = []
    data = {
        'id': channel_id,
    }

    print("Fetching first page of results...")
    while not done:
        # Page through the channel's videos, newest first, windowSize at a time.
        url = f"https://www.googleapis.com/youtube/v3/search?key={os.getenv('GOOGLE_APIS_KEY')}&channelId={channel_id}&part=snippet,id&order=date&type=video&maxResults={windowSize}"
        if currentPage is not None:
            print(f"Fetching page {currentPage}")
            url += f"&pageToken={currentPage}"

        req = requests.get(url)
        if not req.ok:
            print("Could not fetch channel_id items!")
            exit(1)

        response = req.json()
        currentPage = response.get('nextPageToken')
        # The API omits nextPageToken on the last page; the following request
        # then falls back to the first page and returns an already-seen token,
        # which is the loop's termination signal.
        if currentPage in pageTokens:
            print('All pages iterated and logged!')
            done = True
            break

        for item in response.get('items'):
            if 'id' in item and 'videoId' in item.get('id'):
                if is_yt_short(item.get('id').get('videoId')):
                    print(f"Filtering out YT Short {item.get('id').get('videoId')}")
                    continue

                if data.get('channelTitle') is None:
                    data['channelTitle'] = slugify(item.get('snippet').get('channelTitle'))

                newItem = {
                    'id': item.get('id').get('videoId'),
                    'url': f"https://youtube.com/watch?v={item.get('id').get('videoId')}",
                    'title': item.get('snippet').get('title'),
                    'description': item.get('snippet').get('description'),
                    'thumbnail': item.get('snippet').get('thumbnails').get('high').get('url'),
                    'published': item.get('snippet').get('publishTime'),
                }
                items.append(newItem)

        pageTokens.append(currentPage)

    data['items'] = items
    with open(file_path, 'w+', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=True, indent=2)
    print(f"{len(items)} videos found for channel {data.get('channelTitle')}. Saved to channel-logs/channel-{channel_id}.json")

    return data
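
# Minimal usage sketch (an added, illustrative wiring of the helpers above;
# the repo's real entrypoint lives elsewhere, and the channel URL is a
# placeholder):
if __name__ == "__main__":
    channel_id = get_channel_id("https://youtube.com/channel/UCabc123_-xyz")
    if channel_id:
        channel_data = fetch_channel_video_information(channel_id)
        print(f"Fetched metadata for {len(channel_data.get('items', []))} videos.")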