YouTube¶

Resources for using the YouTube API.

About¶

ID	Key	Value
0	Website	YouTube Data API
1	Website	Google Developers Console
2	GitHub	Google API Client: The official Python client library for Google’s discovery based APIs.
3	Tutorial	Playlist: YouTube Data API - Python Tutorials Channel: Python Engineer
4	Reference	Code: How to show (as output cell) the contents of a .py file with syntax highlighting? User: jgosmann
5	Reference	Code: Script display from an external file in Jupyter Notebook with syntax-highlighting User: Miladiouss
6	Reference	Code: How to reset index in a pandas dataframe? User: jezrael
7	Reference	pandas.json_normalize \| pandas >> API reference >> Input/output
8	Reference	pandas.Series.str.contains \| pandas >> API reference >> Series
9	Reference	Code: User:

Data Cleaning Notes¶

Channel title can be different than channel url.¶

Example -

Channel title: JCharisTech
Channel url: https://www.youtube.com/c/JCharisTechJSecur1ty

Channel title: MongoDB
Channel url: https://www.youtube.com/c/MongoDBofficial

There are 4 features that can be used to identify the channel:

forUsername (username)
Channel Title (display name)
channelId
customUrl

References:

Get ChannelID from Youtube Custom URL

Find YouTube channel IDs by custom URLs or user names | GitHub Gist

Obtaining a channel id from a youtube.com/c/xxxx link?

Import Libraries¶

External Libraries¶

import os
import decouple # Strict separation of config from code.
from decouple import AutoConfig
# from pygments import highlight
# from pygments.lexers import PythonLexer
# from pygments.formatters import HtmlFormatter
# from IPython.display import display, HTML
from IPython.display import Markdown as md
import json
import pandas as pd
import csv
# import importlib

Internal Libraries¶

from youtube.youtube_statistics import YTstats

Connect to the YouTube API¶

# Resolve the API key: a value exported in the process environment takes
# precedence; otherwise it is read through decouple's AutoConfig (.env file),
# searching from the current directory.
api_key_name = 'YOUTUBE_API_vCE'
API_KEY = os.getenv(api_key_name)
if API_KEY is None:
    # Not present in the environment -- fall back to the config file lookup.
    config = AutoConfig(search_path = '.')
    API_KEY = config(api_key_name)

YouTube Channel Statistics¶

# Lookup table: channel key -> YouTube channel id.
# A value of None means the id is not recorded here; further down, a None id
# makes the notebook build the client from the key as a username instead.
channel_id_dict = {
    'Freecodecamp': 'UC8butISFwT-Wl7EV0hUK0BQ',
    'JuliaComputing': 'UCvZxpJZ6_4j63ZWCbxdFzdA',
    'JCharisTech': 'UC2wMHF4HBkTMGLsvZAIWzRg',
    'HeatonResearch': None,
    'datacouncil': None,
    'realpython': 'UCI0vQvr9aFn27yR6Ej6n5UA',
    'KárolyZsolnai': 'UCbfYPyITQ-7l4upoX8nvctg', # Two Minute Papers
    'MongoDBofficial': 'UCK_m2976Yvbx-TyDLw7n1WA', # MongoDB
}

# Channel analysed by the rest of the notebook; raises KeyError if the key is
# not in the table above.
current_channel_key = 'MongoDBofficial'
current_channel_id = channel_id_dict[current_channel_key]

Class: YTstats¶

# Location of the module whose source is rendered below.
file_folder = 'youtube'
file_name = 'youtube_statistics.py'
file_path = os.path.join(file_folder, file_name)

# Read the whole module as text.
with open(file_path) as source_file:
    code = source_file.read()

# Render the source as a fenced, syntax-highlighted Markdown block.
md(f"```python\n{code}")

# ======================
# Get YouTube Statistics
# ======================

"""
# References

YouTube > Data API > Reference > Channels: list
https://developers.google.com/youtube/v3/docs/channels/list

YouTube > Data API > Reference > Search: list
https://developers.google.com/youtube/v3/docs/search/list

YouTube > Data API > Reference > Video
https://developers.google.com/youtube/v3/docs/videos

"""


## Import Libraries

### External Libraries
import requests
import json
import os.path
# from tqdm import tqdm   # A Fast, Extensible Progress Meter
import inspect

## Data Classes
class YTstats:
    """Collect channel statistics and per-video data via the YouTube Data API v3.

    A channel is addressed either by ``channel_id`` or by ``for_username``.
    When only the username is given, the channel id is looked up the first
    time it is needed. Fetched results are kept on the instance
    (``channel_statistics``, ``video_data``) and can be written to a JSON
    file with :meth:`dump`.
    """

    def __init__(self, api_key, channel_id = None, for_username = None):
        """Store the credentials and channel identifiers; no request is made.

        Args:
            api_key: API key appended to every request URL.
            channel_id: Channel id, if already known.
            for_username: Username used to look the channel id up when
                ``channel_id`` is None.
        """
        self.api_key = api_key
        self.channel_id = channel_id
        self.for_username = for_username
        self.channel_statistics = None  # filled by get_channel_statistics()
        self.video_data = None          # filled by get_channel_video_data()
        self.file_path = None           # set by dump() once a file is written

    @staticmethod
    def _get_json(url):
        """Send a GET request to ``url`` and return the decoded JSON body."""
        response = requests.get(url)
        return json.loads(response.text)

    def convert_username_to_channel_id(self, username):
        """Look up the channel id for ``username`` and store it on the instance.

        Args:
            username: Value sent as ``forUsername``. When None, the instance's
                ``for_username`` is used instead.

        Returns:
            The resolved channel id, or the placeholder
            ``'unknown_channel_id'`` when the response contains no channel.
            The same value is stored in ``self.channel_id``.
        """
        if username is None:
            username = self.for_username
        get_prefix = 'https://www.googleapis.com/youtube/v3/channels?'
        url = get_prefix + f'key={self.api_key}&forUsername={username}&part=id'
        data = self._get_json(url)
        try:
            self.channel_id = data['items'][0]['id']
        except (KeyError, IndexError, TypeError) as e:
            print(f'Error message for func -- {inspect.currentframe().f_code.co_name}:\n\t{e}')
            self.channel_id = 'unknown_channel_id'
        # Callers assign the result to `channel_id`, so it must be returned.
        return self.channel_id

    def get_channel_statistics(self):
        """Fetch the ``statistics`` part of the channel resource.

        The channel id is resolved from ``for_username`` first if it is not
        known yet.

        TODO:
            1.  Check for data['error']['code'] == 403
                data['error']['errors'][0]['reason'] == 'quotaExceeded'

        Returns:
            The statistics dict of the first returned channel, or None when
            the response holds no such entry. The value is also stored in
            ``self.channel_statistics``.
        """
        if self.channel_id is None:
            self.convert_username_to_channel_id(self.for_username)
        print(f'Log: getting stats for {self.for_username} (channel id: {self.channel_id})...')
        url = f'https://www.googleapis.com/youtube/v3/channels?part=statistics&id={self.channel_id}&key={self.api_key}'
        data = self._get_json(url)
        try:
            data = data['items'][0]['statistics']
        except (KeyError, IndexError, TypeError) as e:
            print(f'Error message: {e}.\nSet `data` value to None.')
            data = None
        self.channel_statistics = data

        return data

    def get_channel_video_data(self):
        """Fetch snippet, statistics and contentDetails for the channel's videos.

        Videos parts:
        1. snippet
            publishedAt             tags
            channelId               categoryId
            title                   liveBroadcastContent
            description             defaultLanguage
            thumbnails              localized
            channelTitle            defaultAudioLanguage

        2. statistics
            viewCount               favoriteCount
            likeCount               commentCount
            dislikeCount

        3. contentDetails
            duration                regionRestriction
            dimension               contentRating
            definition              projection
            caption                 hasCustomThumbnail
            licensedContent

        Returns:
            dict mapping video id -> merged fields of the three parts. The
            same dict is stored in ``self.video_data``.
        """
        # 1. return video ids
        channel_videos = self._get_channel_videos(limit = 10)
        print(f'No. of videos = {len(channel_videos)}\nVideo IDs:\n{channel_videos}')
        # 2. return video statistics
        parts = [
            'snippet',
            'statistics',
            'contentDetails'
        ]
        for video_id, video_fields in channel_videos.items():
            for part in parts:
                # A failed lookup yields {} and leaves the entry unchanged.
                video_fields.update(self._get_single_video_data(video_id, part))

        self.video_data = channel_videos
        return channel_videos

    def _get_single_video_data(self, video_id, part):
        """Return one ``part`` of a video resource, or {} if it is missing."""
        url_prefix = 'https://www.googleapis.com/youtube/v3/videos?'
        url = url_prefix + f'part={part}&id={video_id}&key={self.api_key}'
        data = self._get_json(url)
        try:
            data = data['items'][0][part]
        except (KeyError, IndexError, TypeError) as e:
            print(f'Error message for func -- {inspect.currentframe().f_code.co_name}:\n\t{e}')
            data = {}
        return data

    def _get_channel_videos(self, limit = 50, max_pages = 3):
        """Collect video ids from the channel's search results, newest first.

        Args:
            limit: Sent as ``maxResults`` (results per page) when it is an
                int; omitted from the request otherwise.
            max_pages: Upper bound on the number of result pages requested.
                The default of 3 is the first page plus two follow-up pages.

        Returns:
            dict mapping each video id to an empty dict, to be filled later.
        """
        if self.channel_id is None:
            # The lookup stores the id on the instance.
            self.convert_username_to_channel_id(self.for_username)
        get_prefix = 'https://www.googleapis.com/youtube/v3/search?'
        url = get_prefix + f'key={self.api_key}&channelId={self.channel_id}&part=id&order=date'
        if limit is not None and isinstance(limit, int):
            url += '&maxResults=' + str(limit)

        videos, next_page_token = self._get_channel_videos_per_page(url)
        pages_fetched = 1
        while next_page_token is not None and pages_fetched < max_pages:
            next_url = url + '&pageToken=' + next_page_token
            next_videos, next_page_token = \
                self._get_channel_videos_per_page(next_url)
            videos.update(next_videos)
            pages_fetched += 1

        return videos

    def _get_channel_videos_per_page(self, url):
        """Fetch one search result page; return ``(videos, next_page_token)``."""
        return self._parse_search_page(self._get_json(url))

    @staticmethod
    def _parse_search_page(data):
        """Extract video ids and the next-page token from a decoded search page.

        Items whose kind is not ``youtube#video`` are skipped. A response
        without ``items`` yields ``({}, None)``.
        """
        channel_videos = {}
        if 'items' not in data:
            return channel_videos, None

        next_page_token = data.get('nextPageToken', None)
        for item in data['items']:
            try:
                if item['id']['kind'] == 'youtube#video':
                    channel_videos[item['id']['videoId']] = {}
            except KeyError as ke:
                print(f'Key error: {ke}')
            except TypeError as e:
                print(f'Error message: {e}')

        return channel_videos, next_page_token

    def dump(self):
        """Write the channel statistics and video data to a JSON file.

        The file is ``youtube/<channel title>.json``: the title comes from the
        fetched video data, with spaces replaced by underscores and lowercased.
        The channel id is used when no video carries a ``channelTitle``. The
        path is stored in ``self.file_path``.

        Returns:
            None. Nothing is written if statistics or video data are missing.
        """
        if self.channel_statistics is None or self.video_data is None:
            print('Data is None.')
            return None

        youtube_video_data = {
            self.channel_id: {
                'channel_statistics': self.channel_statistics,
                'video_data': self.video_data
            }
        }

        # Read the title without removing anything: `video_data` is the same
        # dict referenced by the payload above, so popping an entry would drop
        # that video from the saved file.
        channel_title = next(
            (video['channelTitle'] for video in self.video_data.values()
             if 'channelTitle' in video),
            self.channel_id,
        )
        channel_title = str(channel_title).replace(' ', '_').lower()
        file_folder = 'youtube'
        file_name = channel_title + '.json'
        os.makedirs(file_folder, exist_ok = True)
        self.file_path = os.path.join(file_folder, file_name)
        with open(self.file_path, 'w') as f:
            json.dump(youtube_video_data, f, indent = 4)

        print(f'Statistics and video data saved to: {self.file_path}.')

Get Statistics¶

# Build the API client from the known channel id when there is one;
# otherwise start from the username and resolve the id through the API.
if current_channel_id is not None:
    yt = YTstats(API_KEY, channel_id = current_channel_id)
else:
    yt = YTstats(API_KEY, for_username = current_channel_key)
    # The method stores the resolved id on `yt.channel_id` itself, so its
    # return value is not assigned here.
    yt.convert_username_to_channel_id(current_channel_key)

print(yt.channel_id)

UCK_m2976Yvbx-TyDLw7n1WA

# Fetch the channel-level statistics; the result is also kept on
# yt.channel_statistics.
yt.get_channel_statistics()

Log: getting stats for None (channel id: UCK_m2976Yvbx-TyDLw7n1WA)...

{'viewCount': '24070169',
 'subscriberCount': '56500',
 'hiddenSubscriberCount': False,
 'videoCount': '773'}

# Collect snippet, statistics and contentDetails for the channel's most recent
# videos (ids come from up to three search pages of 10 results each).
channel_videos = yt.get_channel_video_data()

No. of videos = 30
Video IDs:
{'IqioEqWvob0': {}, 'EyZtB3hKsnM': {}, '-7r4whMKt1s': {}, 'bALyYC10ABw': {}, 'zgGui1OFBnE': {}, 'jIIw1dDLbd0': {}, 'aW7cnNa82e4': {}, 'SZnzWfZ9PMI': {}, 'UVJJLc0abzc': {}, 'nEMIDCyi3Os': {}, '_Ky7qVGsg44': {}, 'qZYg09Bf67o': {}, 'bMhGWO4ECMo': {}, 'zDEACh1qFVo': {}, '4gjBt3eZNOw': {}, 'kzzQxlk9bBY': {}, '5OEqxqYIZbs': {}, 'Mf-F9UPKB7s': {}, '1Ooc8uFoa4Q': {}, 'Hgo1vXPHY0I': {}, 'k_PH4Bauc4c': {}, 'KAlbALFd9ZQ': {}, '-ttCul_-nns': {}, 'xUdi6OHnXrc': {}, 'YYYVAzJ-Gws': {}, 'lYTZeOP4YQc': {}, 'aW6poxfSPnU': {}, 'dgvfRbu_5Vw': {}, 'fwKBo6SQWcg': {}, 'V-lgTC_6CrU': {}}

# Write the collected statistics and video data to a JSON file; the path is
# stored on yt.file_path.
yt.dump()

Statistics and video data saved to: youtube\mongodb.json.

View JSON output¶

# Path of the JSON file written by yt.dump().
file_path = yt.file_path

try:
    with open(file_path) as f:
        # code = f.read()
        data_json = json.load(f)
except FileNotFoundError:
    # Title-based file is missing: try a file named after the channel id.
    print(f'Channel title, {current_channel_key}, could not be found. Search by channel id.')
    file_name = channel_id_dict[current_channel_key] + '.json'
    file_path = os.path.join(file_folder, file_name)
    with open(file_path) as f:
        # code = f.read()
        data_json = json.load(f)
# Any other error is left to propagate: swallowing it would leave `data_json`
# undefined and only surface later as an unrelated NameError.

# md("```json\n" + code)

# The file holds a single top-level entry keyed by channel id. Read it without
# popping so that `data_json` stays intact and this code can be re-run.
channel_id, channel_data = list(data_json.items())[-1]
print(f'Channel ID: {channel_id}')
channel_statistics = channel_data['channel_statistics']
video_data = channel_data['video_data']

Channel ID: UCK_m2976Yvbx-TyDLw7n1WA

Transform Data¶

# print(f"Channel Title: {video_data.popitem()[1].get('channelTitle', channel_id)}")
# The API returns the counts as strings; convert to int so they can be printed
# with thousands separators.
for stat_label, stat_key in (
    ('View Count', 'viewCount'),
    ('Subscriber Count', 'subscriberCount'),
    ('Video Count', 'videoCount'),
):
    print(f"{stat_label}: {int(channel_statistics[stat_key]):,}")

View Count: 24,070,169
Subscriber Count: 56,500
Video Count: 773

# Video ids in the order they appear in the loaded JSON.
video_id_list = list(video_data)

print(len(video_id_list))

# Flatten all video records in one call: json_normalize accepts a list of
# dicts, which avoids re-copying the frame on every pd.concat and also copes
# with an empty `video_data`.
video_df = pd.json_normalize([video_data[video_id] for video_id in video_id_list])

assert len(video_id_list) == len(video_df), f'Video ID count and ' \
    f'Video Data row count do not match.'

# Keep a plain 0..n-1 index so rows line up with `video_id_list`.
video_df.index = pd.RangeIndex(len(video_df.index))

# Put the id first; row order matches `video_id_list` by construction.
video_df.insert(0, 'video_id', video_id_list)

video_df.head()

	video_id	publishedAt	channelId	title	description	channelTitle	categoryId	liveBroadcastContent	defaultAudioLanguage	viewCount	...	thumbnails.high.height	localized.title	localized.description	tags	thumbnails.standard.url	thumbnails.standard.width	thumbnails.standard.height	thumbnails.maxres.url	thumbnails.maxres.width	thumbnails.maxres.height
0	IqioEqWvob0	2022-05-26T17:01:30Z	UCK_m2976Yvbx-TyDLw7n1WA	Learning with Luce - MongoDB Atlas and Blazor	Join Microsoft MVP's Luce Carter, Developer Ad...	MongoDB	24	none	en-US	176	...	360	Learning with Luce - MongoDB Atlas and Blazor	Join Microsoft MVP's Luce Carter, Developer Ad...	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	EyZtB3hKsnM	2022-05-26T15:02:41Z	UCK_m2976Yvbx-TyDLw7n1WA	Hackathon Submissions Guidelines & Help	With less than 48hrs to go, we’re running a sh...	MongoDB	24	none	en-US	40	...	360	Hackathon Submissions Guidelines & Help	With less than 48hrs to go, we’re running a sh...	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	-7r4whMKt1s	2022-05-26T13:00:12Z	UCK_m2976Yvbx-TyDLw7n1WA	Type Safety with Prisma an Object Relational M...	✅ Sign-up for a free cluster at: https://bit.l...	MongoDB	24	none	en-US	139	...	360	Type Safety with Prisma an Object Relational M...	✅ Sign-up for a free cluster at: https://bit.l...	[MongoDB, TypeScript, DA]	NaN	NaN	NaN	NaN	NaN	NaN
3	bALyYC10ABw	2022-05-24T13:00:14Z	UCK_m2976Yvbx-TyDLw7n1WA	Getting Started with MongoDB & Mongoose ODM (O...	✅ Sign-up for a free cluster at: https://bit.l...	MongoDB	24	none	en-US	920	...	360	Getting Started with MongoDB & Mongoose ODM (O...	✅ Sign-up for a free cluster at: https://bit.l...	[MongoDB, JavaScript, DA]	NaN	NaN	NaN	NaN	NaN	NaN
4	zgGui1OFBnE	2022-05-19T16:36:06Z	UCK_m2976Yvbx-TyDLw7n1WA	Hackathon Office Hours & Demos - USA/EMEA	We are back again with Hackathon Office hours!...	MongoDB	24	none	en-US	221	...	360	Hackathon Office Hours & Demos - USA/EMEA	We are back again with Hackathon Office hours!...	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 37 columns

# Show the last rows of the frame.
video_df.tail()

	video_id	publishedAt	channelId	title	description	channelTitle	categoryId	liveBroadcastContent	defaultAudioLanguage	viewCount	...	thumbnails.high.height	localized.title	localized.description	tags	thumbnails.standard.url	thumbnails.standard.width	thumbnails.standard.height	thumbnails.maxres.url	thumbnails.maxres.width	thumbnails.maxres.height
24	YYYVAzJ-Gws	2022-04-13T18:53:20Z	UCK_m2976Yvbx-TyDLw7n1WA	MongoDB World 2022 Hackathon Introduction	We're kicking off the MongoDB World hackathon ...	MongoDB	24	none	en-US	779	...	360	MongoDB World 2022 Hackathon Introduction	We're kicking off the MongoDB World hackathon ...	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25	lYTZeOP4YQc	2022-04-01T17:42:57Z	UCK_m2976Yvbx-TyDLw7n1WA	Using LINQ to Query MongoDB in a .NET Core App...	Learn how to do simple queries and complex agg...	MongoDB	24	none	en-US	778	...	360	Using LINQ to Query MongoDB in a .NET Core App...	Learn how to do simple queries and complex agg...	[mongodb data api, mongodb, mongodb atlas, nos...	NaN	NaN	NaN	NaN	NaN	NaN
26	aW6poxfSPnU	2022-03-31T21:14:16Z	UCK_m2976Yvbx-TyDLw7n1WA	Solutions Architecture at MongoDB	MongoDB's team of Solutions Architects work wi...	MongoDB	24	none	en-US	636	...	360	Solutions Architecture at MongoDB	MongoDB's team of Solutions Architects work wi...	[mongodb data api, mongodb, mongodb atlas, nos...	https://i.ytimg.com/vi/aW6poxfSPnU/sddefault.jpg	640.0	480.0	https://i.ytimg.com/vi/aW6poxfSPnU/maxresdefau...	1280.0	720.0
27	dgvfRbu_5Vw	2022-03-31T16:57:01Z	UCK_m2976Yvbx-TyDLw7n1WA	Learning with Luce: MongoDB and Unity	Learning with Luce is back! To celebrate GDC 2...	MongoDB	24	none	en-US	402	...	360	Learning with Luce: MongoDB and Unity	Learning with Luce is back! To celebrate GDC 2...	NaN	NaN	NaN	NaN	NaN	NaN	NaN
28	fwKBo6SQWcg	2022-03-30T18:31:10Z	UCK_m2976Yvbx-TyDLw7n1WA	Connect MongoDB and Kafka in the Cloud	In this demo, MongoDB Developer Advocate Maxim...	MongoDB	24	none	en-US	541	...	360	Connect MongoDB and Kafka in the Cloud	In this demo, MongoDB Developer Advocate Maxim...	[mongodb data api, mongodb, mongodb atlas, nos...	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 37 columns

# Summarise columns, non-null counts and dtypes.
video_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 video_id                    29 non-null     object 
 publishedAt                 29 non-null     object 
 channelId                   29 non-null     object 
 title                       29 non-null     object 
 description                 29 non-null     object 
 channelTitle                29 non-null     object 
 categoryId                  29 non-null     object 
 liveBroadcastContent        29 non-null     object 
 defaultAudioLanguage        29 non-null     object 
 viewCount                   29 non-null     object 
likeCount                   29 non-null     object 
favoriteCount               29 non-null     object 
commentCount                29 non-null     object 
duration                    29 non-null     object 
dimension                   29 non-null     object 
definition                  29 non-null     object 
caption                     29 non-null     object 
licensedContent             29 non-null     bool   
projection                  29 non-null     object 
thumbnails.default.url      29 non-null     object 
thumbnails.default.width    29 non-null     int64  
thumbnails.default.height   29 non-null     int64  
thumbnails.medium.url       29 non-null     object 
thumbnails.medium.width     29 non-null     int64  
thumbnails.medium.height    29 non-null     int64  
thumbnails.high.url         29 non-null     object 
thumbnails.high.width       29 non-null     int64  
thumbnails.high.height      29 non-null     int64  
localized.title             29 non-null     object 
localized.description       29 non-null     object 
tags                        7 non-null      object 
thumbnails.standard.url     4 non-null      object 
thumbnails.standard.width   4 non-null      float64
thumbnails.standard.height  4 non-null      float64
thumbnails.maxres.url       4 non-null      object 
thumbnails.maxres.width     4 non-null      float64
thumbnails.maxres.height    4 non-null      float64
dtypes: bool(1), float64(4), int64(6), object(26)
memory usage: 8.3+ KB

# Case-insensitive search of video titles. `na = False` treats a missing title
# as "no match"; without it a NaN title puts NaN in the mask and the boolean
# indexing raises.
video_df[video_df['title'].str.contains(
    'neural',
    case = False,
    na = False,
)]

	video_id	publishedAt	channelId	title	description	channelTitle	categoryId	liveBroadcastContent	defaultAudioLanguage	viewCount	...	thumbnails.high.height	localized.title	localized.description	tags	thumbnails.standard.url	thumbnails.standard.width	thumbnails.standard.height	thumbnails.maxres.url	thumbnails.maxres.width	thumbnails.maxres.height

0 rows × 37 columns

# Write the table next to the JSON file, swapping only the extension.
# str.replace('json', 'csv') would also rewrite any "json" inside the folder
# or channel name.
csv_path = os.path.splitext(file_path)[0] + '.csv'
video_df.to_csv(csv_path,
    index = False,
    quoting = csv.QUOTE_ALL,
    sep = '|')

# Show the JSON path used above.
file_path

'youtube\\mongodb.json'

# Inspect the username stored on the client (None when the client was built
# from a channel id).
yt.for_username

Simple API Development: Resource for learning about APIs.

YouTube

Contents

YouTube¶

About¶

Data Cleaning Notes¶

Channel title can be different than channel url.¶

Import Libraries¶

External Libraries¶

Internal Libraries¶

Connect to the YouTube API¶

YouTube Channel Statistics¶

Class: YTstats¶

Get Statistics¶

View JSON output¶

Transform Data¶