YouTube

Resources for using the YouTube API.

Data Cleaning Notes

The channel title can differ from the name used in the channel URL.

Example -

Channel title: JCharisTech
Channel url: https://www.youtube.com/c/JCharisTechJSecur1ty

Channel title: MongoDB
Channel url: https://www.youtube.com/c/MongoDBofficial

There are 4 features that can be used to identify the channel:

  1. forUsername (username)

  2. Channel Title (display name)

  3. channelId

  4. customUrl

References:

Get ChannelID from Youtube Custom URL

Find YouTube channel IDs by custom URLs or user names | GitHub Gist

Obtaining a channel id from a youtube.com/c/xxxx link?

Import Libraries

External Libraries

import os
import decouple # Strict separation of config from code.
from decouple import AutoConfig
# from pygments import highlight
# from pygments.lexers import PythonLexer
# from pygments.formatters import HtmlFormatter
# from IPython.display import display, HTML
from IPython.display import Markdown as md
import json
import pandas as pd
import csv
# import importlib

Internal Libraries

from youtube.youtube_statistics import YTstats

Connect to the YouTube API

# Check if API key is available as an environment variable
api_key_name = 'YOUTUBE_API_vCE'
# First choice: the key exported as an environment variable.
API_KEY = os.getenv(api_key_name)
if API_KEY is None:
    # Fallback: let python-decouple look the key up, searching from the
    # current directory (e.g. a local .env file).
    config = AutoConfig(search_path='.')
    API_KEY = config(api_key_name)

YouTube Channel Statistics

# Maps a channel key (the name used in the channel URL) to its channelId.
# A value of None means no id is recorded for that channel here.
channel_id_dict = {
    'Freecodecamp': 'UC8butISFwT-Wl7EV0hUK0BQ',
    'JuliaComputing': 'UCvZxpJZ6_4j63ZWCbxdFzdA',
    'JCharisTech': 'UC2wMHF4HBkTMGLsvZAIWzRg',
    'HeatonResearch': None,
    'datacouncil': None,
    'realpython': 'UCI0vQvr9aFn27yR6Ej6n5UA',
    # Display name: Two Minute Papers
    'KárolyZsolnai': 'UCbfYPyITQ-7l4upoX8nvctg',
    # Display name: MongoDB
    'MongoDBofficial': 'UCK_m2976Yvbx-TyDLw7n1WA',
}

# Channel that the rest of the notebook works on.
current_channel_key = 'MongoDBofficial'
current_channel_id = channel_id_dict[current_channel_key]

Class: YTstats

file_folder = 'youtube'
file_name = 'youtube_statistics.py'
file_path = os.path.join(file_folder, file_name)

# Read the module's source text so it can be rendered as a code listing.
with open(file_path) as f:
    code = f.read()

# Last expression of the cell: shown as Markdown with Python highlighting.
md(f"```python\n{code}")
# ======================
# Get YouTube Statistics
# ======================

"""
# References

YouTube > Data API > Reference > Channels: list
https://developers.google.com/youtube/v3/docs/channels/list

YouTube > Data API > Reference > Search: list
https://developers.google.com/youtube/v3/docs/search/list

YouTube > Data API > Reference > Video
https://developers.google.com/youtube/v3/docs/videos

"""


## Import Libraries

### External Libraries
import requests
import json
import os.path
# from tqdm import tqdm   # A Fast, Extensible Progress Meter
import inspect

## Data Classes
class YTstats:
    """Collect channel statistics and per-video data from the YouTube Data API v3.

    Typical use: construct with an API key and either a ``channel_id`` or a
    ``for_username``, call ``get_channel_statistics()`` and
    ``get_channel_video_data()``, then ``dump()`` to write both to a JSON file.
    """

    API_ROOT = 'https://www.googleapis.com/youtube/v3/'
    # Seconds to wait for the API before giving up instead of hanging forever.
    REQUEST_TIMEOUT = 30
    # Placeholder stored when a username cannot be resolved to a channel id.
    UNKNOWN_CHANNEL_ID = 'unknown_channel_id'

    def __init__(self, api_key, channel_id=None, for_username=None):
        """Store the credentials and identifiers; no request is made here.

        Args:
            api_key: YouTube Data API key sent with every request.
            channel_id: Channel id, if already known.
            for_username: Username used to look the channel id up when
                ``channel_id`` is not given.
        """
        self.api_key = api_key
        self.channel_id = channel_id
        self.for_username = for_username
        self.channel_statistics = None
        self.video_data = None
        self.file_path = None

    def _with_key(self, **params):
        """Return ``params`` as a dict with the API key added."""
        return dict(params, key=self.api_key)

    def _request_json(self, url, params=None):
        """GET ``url`` and return the decoded JSON body.

        ``params`` is handed to ``requests`` so values are URL-encoded for us.
        """
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        return json.loads(response.text)

    def convert_username_to_channel_id(self, username=None):
        """Resolve a username to a channel id and store it on the instance.

        Args:
            username: Username to look up. Falls back to ``self.for_username``
                when omitted.

        Returns:
            The channel id, or ``'unknown_channel_id'`` when the response
            holds no matching channel. The same value is stored in
            ``self.channel_id``, so callers may either use the return value
            or read the attribute.
        """
        if username is None:
            username = self.for_username
        data = self._request_json(
            self.API_ROOT + 'channels',
            self._with_key(forUsername=username, part='id'),
        )
        try:
            channel_id = data['items'][0]['id']
        except (KeyError, IndexError, TypeError) as e:
            print(f'Error message for func -- {inspect.currentframe().f_code.co_name}:\n\t{e}')
            channel_id = self.UNKNOWN_CHANNEL_ID
        self.channel_id = channel_id
        return channel_id

    def get_channel_statistics(self):
        """Fetch the channel's ``statistics`` part and cache it.

        TODO:
            1.  Check for data['error']['code'] == 403
                data['error']['errors'][0]['reason'] == 'quotaExceeded'

        Returns:
            The ``statistics`` dict of the first returned item, or ``None``
            when the response contains no such item. The value is also stored
            in ``self.channel_statistics``.
        """
        if self.channel_id is None:
            self.convert_username_to_channel_id(self.for_username)
        print(f'Log: getting stats for {self.for_username} (channel id: {self.channel_id})...')
        data = self._request_json(
            self.API_ROOT + 'channels',
            self._with_key(part='statistics', id=self.channel_id),
        )
        try:
            data = data['items'][0]['statistics']
        except (KeyError, IndexError, TypeError) as e:
            print(f'Error message: {e}.\nSet `data` value to None.')
            data = None
        self.channel_statistics = data

        return data

    def get_channel_video_data(self, limit=10, max_extra_pages=2):
        """Fetch snippet, statistics and contentDetails for the channel's videos.

        Videos parts:
        1. snippet
            publishedAt             tags
            channelId               categoryId
            title                   liveBroadcastContent
            description             defaultLanguage
            thumbnails              localized
            channelTitle            defaultAudioLanguage

        2. statistics
            viewCount               favoriteCount
            likeCount               commentCount
            dislikeCount

        3. contentDetails
            duration                regionRestriction
            dimension               contentRating
            definition              projection
            caption                 hasCustomThumbnail
            licensedContent

        Args:
            limit: Results requested per search page.
            max_extra_pages: Number of follow-up pages fetched after the first.

        Returns:
            Dict mapping video id to the merged fields of all three parts.
            The same dict is stored in ``self.video_data``.
        """
        # 1. Collect the video ids.
        channel_videos = self._get_channel_videos(
            limit=limit, max_extra_pages=max_extra_pages)
        print(f'No. of videos = {len(channel_videos)}\nVideo IDs:\n{channel_videos}')
        # 2. Merge every part into the per-video dict.
        parts = ('snippet', 'statistics', 'contentDetails')
        for video_id, video_fields in channel_videos.items():
            for part in parts:
                video_fields.update(self._get_single_video_data(video_id, part))

        self.video_data = channel_videos
        return channel_videos

    def _get_single_video_data(self, video_id, part):
        """Return one ``part`` of one video, or ``{}`` if it is missing."""
        data = self._request_json(
            self.API_ROOT + 'videos',
            self._with_key(part=part, id=video_id),
        )
        try:
            data = data['items'][0][part]
        except (KeyError, IndexError, TypeError) as e:
            print(f'Error message for func -- {inspect.currentframe().f_code.co_name}:\n\t{e}')
            data = {}
        return data

    def _get_channel_videos(self, limit=50, max_extra_pages=2):
        """Search the channel (ordered by date) and return ``{video_id: {}}``.

        Args:
            limit: ``maxResults`` per page; omitted from the request unless
                it is an int.
            max_extra_pages: How many ``nextPageToken`` pages to follow after
                the first one.
        """
        if self.channel_id is None:
            # The lookup stores the id on the instance itself.
            self.convert_username_to_channel_id(self.for_username)
        url = self.API_ROOT + 'search'
        params = self._with_key(channelId=self.channel_id, part='id', order='date')
        if isinstance(limit, int):
            params['maxResults'] = limit

        videos, next_page_token = self._get_channel_videos_per_page(url, params)
        page_index = 0
        while next_page_token is not None and page_index < max_extra_pages:
            page_params = dict(params, pageToken=next_page_token)
            next_videos, next_page_token = \
                self._get_channel_videos_per_page(url, page_params)
            videos.update(next_videos)
            page_index += 1

        return videos

    def _get_channel_videos_per_page(self, url, params=None):
        """Fetch one search page.

        Returns:
            Tuple ``(videos, next_page_token)`` where ``videos`` maps each
            video id on the page to an empty dict and ``next_page_token`` is
            ``None`` when the response has no further page or no items.
        """
        data = self._request_json(url, params)
        channel_videos = {}
        if 'items' not in data:
            return channel_videos, None

        next_page_token = data.get('nextPageToken', None)
        for item in data['items']:
            try:
                # Search results may also contain non-video items; skip those.
                if item['id']['kind'] == 'youtube#video':
                    channel_videos[item['id']['videoId']] = {}
            except KeyError as ke:
                print(f'Key error: {ke}')
            except TypeError as e:
                print(f'Error message: {e}')

        return channel_videos, next_page_token

    def dump(self):
        """Write channel statistics and video data to ``youtube/<title>.json``.

        The file name is the lower-cased ``channelTitle`` of the last video
        (spaces replaced by underscores), falling back to the channel id.
        The path is stored in ``self.file_path``. Nothing is written when
        either the statistics or the video data have not been fetched yet.
        """
        if self.channel_statistics is None or self.video_data is None:
            print('Data is None.')
            return None

        youtube_video_data = {
            self.channel_id: {
                'channel_statistics': self.channel_statistics,
                'video_data': self.video_data
            }
        }

        # Only peek at the last video: removing it (popitem) would drop that
        # video from the data that is about to be saved.
        channel_title = self.channel_id
        if self.video_data:
            last_video = list(self.video_data.values())[-1]
            channel_title = last_video.get('channelTitle', self.channel_id)
        channel_title = channel_title.replace(' ', '_').lower()

        file_folder = 'youtube'
        file_name = channel_title + '.json'
        self.file_path = os.path.join(file_folder, file_name)
        os.makedirs(file_folder, exist_ok=True)
        with open(self.file_path, 'w') as f:
            json.dump(youtube_video_data, f, indent=4)

        print(f'Statistics and video data saved to: {self.file_path}.')

Get Statistics

# Use the recorded channel id when there is one; otherwise resolve it from
# the channel key treated as a username.
if current_channel_id is not None:
    yt = YTstats(API_KEY, channel_id=current_channel_id)
else:
    yt = YTstats(API_KEY, for_username=current_channel_key)
    # The lookup stores the resolved id on `yt.channel_id` itself, so no
    # assignment is needed here.
    yt.convert_username_to_channel_id(current_channel_key)
print(yt.channel_id)
UCK_m2976Yvbx-TyDLw7n1WA
yt.get_channel_statistics()
Log: getting stats for None (channel id: UCK_m2976Yvbx-TyDLw7n1WA)...
{'viewCount': '24070169',
 'subscriberCount': '56500',
 'hiddenSubscriberCount': False,
 'videoCount': '773'}
channel_videos = yt.get_channel_video_data()
No. of videos = 30
Video IDs:
{'IqioEqWvob0': {}, 'EyZtB3hKsnM': {}, '-7r4whMKt1s': {}, 'bALyYC10ABw': {}, 'zgGui1OFBnE': {}, 'jIIw1dDLbd0': {}, 'aW7cnNa82e4': {}, 'SZnzWfZ9PMI': {}, 'UVJJLc0abzc': {}, 'nEMIDCyi3Os': {}, '_Ky7qVGsg44': {}, 'qZYg09Bf67o': {}, 'bMhGWO4ECMo': {}, 'zDEACh1qFVo': {}, '4gjBt3eZNOw': {}, 'kzzQxlk9bBY': {}, '5OEqxqYIZbs': {}, 'Mf-F9UPKB7s': {}, '1Ooc8uFoa4Q': {}, 'Hgo1vXPHY0I': {}, 'k_PH4Bauc4c': {}, 'KAlbALFd9ZQ': {}, '-ttCul_-nns': {}, 'xUdi6OHnXrc': {}, 'YYYVAzJ-Gws': {}, 'lYTZeOP4YQc': {}, 'aW6poxfSPnU': {}, 'dgvfRbu_5Vw': {}, 'fwKBo6SQWcg': {}, 'V-lgTC_6CrU': {}}
yt.dump()
Statistics and video data saved to: youtube\mongodb.json.

View JSON output

file_path = yt.file_path

try:
    with open(file_path) as f:
        data_json = json.load(f)
except FileNotFoundError:
    # No file named after the channel title: fall back to a file named after
    # the channel id.
    print(f'Channel title, {current_channel_key}, could not be found. Search by channel id.')
    file_name = channel_id_dict[current_channel_key] + '.json'
    file_path = os.path.join(file_folder, file_name)
    with open(file_path) as f:
        data_json = json.load(f)
except Exception as e:
    print(f'Error message: {e}')
    # Stop here: continuing without `data_json` would only fail below with an
    # unrelated NameError (or silently reuse a value from an earlier run).
    raise

# The file holds a single top-level entry: {channel_id: {...}}.
channel_id, channel_data = data_json.popitem()
print(f'Channel ID: {channel_id}')
channel_statistics = channel_data['channel_statistics']
video_data = channel_data['video_data']
Channel ID: UCK_m2976Yvbx-TyDLw7n1WA

Transform Data

# print(f"Channel Title: {video_data.popitem()[1].get('channelTitle', channel_id)}")
# The statistics values are strings; convert to int so they can be printed
# with thousands separators.
for stat_label, stat_key in (
    ('View Count', 'viewCount'),
    ('Subscriber Count', 'subscriberCount'),
    ('Video Count', 'videoCount'),
):
    print(f"{stat_label}: {int(channel_statistics[stat_key]):,}")
View Count: 24,070,169
Subscriber Count: 56,500
Video Count: 773
# Iterating a dict yields its keys, i.e. the video ids, in stored order.
video_id_list = [*video_data]
print(len(video_id_list))
29
# Flatten every video's nested dict into one row in a single call, instead of
# growing the frame with one pd.concat per video. This also copes with an
# empty id list and already yields a 0..n-1 index.
video_df = pd.json_normalize(
    [video_data[video_id] for video_id in video_id_list]
)
if len(video_df) != len(video_id_list):
    raise ValueError('Video ID count and Video Data row count do not match.')
# Rows are in the same order as `video_id_list`, so the ids line up.
video_df.insert(0, 'video_id', video_id_list)
video_df.head()
video_id publishedAt channelId title description channelTitle categoryId liveBroadcastContent defaultAudioLanguage viewCount ... thumbnails.high.height localized.title localized.description tags thumbnails.standard.url thumbnails.standard.width thumbnails.standard.height thumbnails.maxres.url thumbnails.maxres.width thumbnails.maxres.height
0 IqioEqWvob0 2022-05-26T17:01:30Z UCK_m2976Yvbx-TyDLw7n1WA Learning with Luce - MongoDB Atlas and Blazor Join Microsoft MVP's Luce Carter, Developer Ad... MongoDB 24 none en-US 176 ... 360 Learning with Luce - MongoDB Atlas and Blazor Join Microsoft MVP's Luce Carter, Developer Ad... NaN NaN NaN NaN NaN NaN NaN
1 EyZtB3hKsnM 2022-05-26T15:02:41Z UCK_m2976Yvbx-TyDLw7n1WA Hackathon Submissions Guidelines & Help With less than 48hrs to go, we’re running a sh... MongoDB 24 none en-US 40 ... 360 Hackathon Submissions Guidelines & Help With less than 48hrs to go, we’re running a sh... NaN NaN NaN NaN NaN NaN NaN
2 -7r4whMKt1s 2022-05-26T13:00:12Z UCK_m2976Yvbx-TyDLw7n1WA Type Safety with Prisma an Object Relational M... ✅ Sign-up for a free cluster at: https://bit.l... MongoDB 24 none en-US 139 ... 360 Type Safety with Prisma an Object Relational M... ✅ Sign-up for a free cluster at: https://bit.l... [MongoDB, TypeScript, DA] NaN NaN NaN NaN NaN NaN
3 bALyYC10ABw 2022-05-24T13:00:14Z UCK_m2976Yvbx-TyDLw7n1WA Getting Started with MongoDB & Mongoose ODM (O... ✅ Sign-up for a free cluster at: https://bit.l... MongoDB 24 none en-US 920 ... 360 Getting Started with MongoDB & Mongoose ODM (O... ✅ Sign-up for a free cluster at: https://bit.l... [MongoDB, JavaScript, DA] NaN NaN NaN NaN NaN NaN
4 zgGui1OFBnE 2022-05-19T16:36:06Z UCK_m2976Yvbx-TyDLw7n1WA Hackathon Office Hours & Demos - USA/EMEA We are back again with Hackathon Office hours!... MongoDB 24 none en-US 221 ... 360 Hackathon Office Hours & Demos - USA/EMEA We are back again with Hackathon Office hours!... NaN NaN NaN NaN NaN NaN NaN

5 rows × 37 columns

video_df.tail()
video_id publishedAt channelId title description channelTitle categoryId liveBroadcastContent defaultAudioLanguage viewCount ... thumbnails.high.height localized.title localized.description tags thumbnails.standard.url thumbnails.standard.width thumbnails.standard.height thumbnails.maxres.url thumbnails.maxres.width thumbnails.maxres.height
24 YYYVAzJ-Gws 2022-04-13T18:53:20Z UCK_m2976Yvbx-TyDLw7n1WA MongoDB World 2022 Hackathon Introduction We're kicking off the MongoDB World hackathon ... MongoDB 24 none en-US 779 ... 360 MongoDB World 2022 Hackathon Introduction We're kicking off the MongoDB World hackathon ... NaN NaN NaN NaN NaN NaN NaN
25 lYTZeOP4YQc 2022-04-01T17:42:57Z UCK_m2976Yvbx-TyDLw7n1WA Using LINQ to Query MongoDB in a .NET Core App... Learn how to do simple queries and complex agg... MongoDB 24 none en-US 778 ... 360 Using LINQ to Query MongoDB in a .NET Core App... Learn how to do simple queries and complex agg... [mongodb data api, mongodb, mongodb atlas, nos... NaN NaN NaN NaN NaN NaN
26 aW6poxfSPnU 2022-03-31T21:14:16Z UCK_m2976Yvbx-TyDLw7n1WA Solutions Architecture at MongoDB MongoDB's team of Solutions Architects work wi... MongoDB 24 none en-US 636 ... 360 Solutions Architecture at MongoDB MongoDB's team of Solutions Architects work wi... [mongodb data api, mongodb, mongodb atlas, nos... https://i.ytimg.com/vi/aW6poxfSPnU/sddefault.jpg 640.0 480.0 https://i.ytimg.com/vi/aW6poxfSPnU/maxresdefau... 1280.0 720.0
27 dgvfRbu_5Vw 2022-03-31T16:57:01Z UCK_m2976Yvbx-TyDLw7n1WA Learning with Luce: MongoDB and Unity Learning with Luce is back! To celebrate GDC 2... MongoDB 24 none en-US 402 ... 360 Learning with Luce: MongoDB and Unity Learning with Luce is back! To celebrate GDC 2... NaN NaN NaN NaN NaN NaN NaN
28 fwKBo6SQWcg 2022-03-30T18:31:10Z UCK_m2976Yvbx-TyDLw7n1WA Connect MongoDB and Kafka in the Cloud In this demo, MongoDB Developer Advocate Maxim... MongoDB 24 none en-US 541 ... 360 Connect MongoDB and Kafka in the Cloud In this demo, MongoDB Developer Advocate Maxim... [mongodb data api, mongodb, mongodb atlas, nos... NaN NaN NaN NaN NaN NaN

5 rows × 37 columns

video_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   video_id                    29 non-null     object 
 1   publishedAt                 29 non-null     object 
 2   channelId                   29 non-null     object 
 3   title                       29 non-null     object 
 4   description                 29 non-null     object 
 5   channelTitle                29 non-null     object 
 6   categoryId                  29 non-null     object 
 7   liveBroadcastContent        29 non-null     object 
 8   defaultAudioLanguage        29 non-null     object 
 9   viewCount                   29 non-null     object 
 10  likeCount                   29 non-null     object 
 11  favoriteCount               29 non-null     object 
 12  commentCount                29 non-null     object 
 13  duration                    29 non-null     object 
 14  dimension                   29 non-null     object 
 15  definition                  29 non-null     object 
 16  caption                     29 non-null     object 
 17  licensedContent             29 non-null     bool   
 18  projection                  29 non-null     object 
 19  thumbnails.default.url      29 non-null     object 
 20  thumbnails.default.width    29 non-null     int64  
 21  thumbnails.default.height   29 non-null     int64  
 22  thumbnails.medium.url       29 non-null     object 
 23  thumbnails.medium.width     29 non-null     int64  
 24  thumbnails.medium.height    29 non-null     int64  
 25  thumbnails.high.url         29 non-null     object 
 26  thumbnails.high.width       29 non-null     int64  
 27  thumbnails.high.height      29 non-null     int64  
 28  localized.title             29 non-null     object 
 29  localized.description       29 non-null     object 
 30  tags                        7 non-null      object 
 31  thumbnails.standard.url     4 non-null      object 
 32  thumbnails.standard.width   4 non-null      float64
 33  thumbnails.standard.height  4 non-null      float64
 34  thumbnails.maxres.url       4 non-null      object 
 35  thumbnails.maxres.width     4 non-null      float64
 36  thumbnails.maxres.height    4 non-null      float64
dtypes: bool(1), float64(4), int64(6), object(26)
memory usage: 8.3+ KB
# Case-insensitive search for 'neural' in the video titles.
title_matches = video_df['title'].str.contains('neural', case=False)
video_df[title_matches]
video_id publishedAt channelId title description channelTitle categoryId liveBroadcastContent defaultAudioLanguage viewCount ... thumbnails.high.height localized.title localized.description tags thumbnails.standard.url thumbnails.standard.width thumbnails.standard.height thumbnails.maxres.url thumbnails.maxres.width thumbnails.maxres.height

0 rows × 37 columns

# Swap only the file extension. A plain str.replace('json', 'csv') would also
# rewrite any 'json' inside the folder or channel name.
csv_path = os.path.splitext(file_path)[0] + '.csv'
video_df.to_csv(
    csv_path,
    index=False,
    quoting=csv.QUOTE_ALL,
    sep='|',
)
file_path
'youtube\\mongodb.json'
yt.for_username