# anything-llm/collector/scripts/youtube.py

import os, json
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter, JSONFormatter
from .utils import tokenize, ada_v2_cost
from .yt_utils import fetch_channel_video_information, get_channel_id, clean_text, append_meta, get_duration
from alive_progress import alive_it

# Example Channel URL https://www.youtube.com/channel/UCmWbhBB96ynOZuWG7LfKong
# Example Channel URL https://www.youtube.com/@mintplex

def youtube():
    """Interactively download transcripts for every video of a YouTube channel.

    Prompts for a channel URL, resolves it to a channel id, fetches the
    channel's video list and writes one JSON document per video to
    ``../server/storage/documents/youtube-<channelTitle>/video-<id>.json``.
    Each document is the video's metadata dict extended with ``pageContent``
    (metadata header + cleaned transcript) and ``token_count_estimate``.

    Videos whose output file already exists are skipped, so an interrupted
    run can be restarted without refetching. Videos whose transcript cannot
    be processed are reported and skipped. Finally an embedding cost
    estimate for the tokens gathered in this run is printed.

    This function always terminates the process: exit status 1 when the
    channel URL cannot be resolved, 0 after a completed run.
    """
    # Local import: `exit()` is injected by the `site` module and is not
    # guaranteed to exist; `sys.exit` raises the same SystemExit.
    import sys

    channel_link = input("Paste in the URL of a YouTube channel: ")
    channel_id = get_channel_id(channel_link)

    if not channel_id:
        print("Invalid input - must be full YouTube channel URL")
        sys.exit(1)

    channel_data = fetch_channel_video_information(channel_id)
    videos = channel_data.get('items')
    transaction_output_dir = f"../server/storage/documents/youtube-{channel_data.get('channelTitle')}"

    os.makedirs(transaction_output_dir, exist_ok=True)

    print(f"\nFetching transcripts for {len(videos)} videos - please wait.\nStopping and restarting will not refetch known transcripts in case there is an error.\nSaving results to: {transaction_output_dir}.")

    # The formatters do not depend on the video, so build them once.
    formatter = TextFormatter()
    json_formatter = JSONFormatter()

    totalTokenCount = 0
    for video in alive_it(videos):
        video_file_path = transaction_output_dir + f"/video-{video.get('id')}.json"
        if os.path.exists(video_file_path):
            # Already saved by a previous run.
            continue

        try:
            transcript = YouTubeTranscriptApi.get_transcript(video.get('id'))
            raw_text = clean_text(formatter.format_transcript(transcript))
            duration = get_duration(json_formatter.format_transcript(transcript))

            if len(raw_text) > 0:
                fullText = append_meta(video, duration, raw_text)
                tokenCount = len(tokenize(fullText))
                video['pageContent'] = fullText
                video['token_count_estimate'] = tokenCount
                totalTokenCount += tokenCount
                with open(video_file_path, 'w', encoding='utf-8') as file:
                    json.dump(video, file, ensure_ascii=True, indent=4)
        except Exception:
            # Best-effort per video. `Exception` (not a bare except) so that
            # Ctrl-C / SystemExit still stop the run instead of being
            # reported as a missing transcript.
            print("There was an issue getting the transcription of a video in the list - likely because captions are disabled. Skipping")
            continue

    print(f"[Success]: {len(videos)} video transcripts fetched!")
    print("\n\n////////////////////////////")
    print(f"Your estimated cost to embed all of this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokenCount)} using {totalTokenCount} tokens.")
    print("////////////////////////////\n\n")
    sys.exit(0)