YouTube
Contents
YouTube¶
Resources for using the YouTube API.
About¶
ID |
Key |
Value |
|---|---|---|
0 |
Website |
|
1 |
Website |
|
2 |
GitHub |
Google API Client: The official Python client library for Google’s discovery based APIs. |
3 |
Tutorial |
Playlist: YouTube Data API - Python Tutorials |
4 |
Reference |
Code: How to show (as output cell) the contents of a .py file with syntax highlighting? |
5 |
Reference |
Code: Script display from an external file in Jupyter Notebook with syntax-highlighting |
6 |
Reference |
Code: How to reset index in a pandas dataframe? |
7 |
Reference |
pandas.json_normalize | pandas >> API reference >> Input/output |
8 |
Reference |
pandas.Series.str.contains | pandas >> API reference >> Series |
9 |
Reference |
Code: |
Data Cleaning Notes¶
The channel title can differ from the channel URL.¶
Example -
Channel title: JCharisTech
Channel url: https://www.youtube.com/c/JCharisTechJSecur1ty
Channel title: MongoDB
Channel url: https://www.youtube.com/c/MongoDBofficial
There are 4 features that can be used to identify the channel:
forUsername (username)
Channel Title (display name)
channelId
customUrl
References:
Get ChannelID from Youtube Custom URL
Find YouTube channel IDs by custom URLs or user names | GitHub Gist
Import Libraries¶
External Libraries¶
import os
import decouple # Strict separation of config from code.
from decouple import AutoConfig
# from pygments import highlight
# from pygments.lexers import PythonLexer
# from pygments.formatters import HtmlFormatter
# from IPython.display import display, HTML
from IPython.display import Markdown as md
import json
import pandas as pd
import csv
# import importlib
Internal Libraries¶
from youtube.youtube_statistics import YTstats
Connect to the YouTube API¶
# Resolve the YouTube API key: an exported environment variable wins,
# otherwise the value is looked up through python-decouple.
api_key_name = 'YOUTUBE_API_vCE'
try:
    API_KEY = os.environ[api_key_name]
except KeyError:
    # Not exported in the environment: read it via AutoConfig instead
    # (search_path '.' as configured below).
    config = AutoConfig(search_path = '.')
    API_KEY = config(api_key_name)
YouTube Channel Statistics¶
# Known channels, keyed by the name used to refer to them in this notebook.
# A value of None means the channel id has not been recorded here; the
# "Get Statistics" cell then resolves it through the API by username.
channel_id_dict = {
    'Freecodecamp': 'UC8butISFwT-Wl7EV0hUK0BQ',
    'JuliaComputing': 'UCvZxpJZ6_4j63ZWCbxdFzdA',
    'JCharisTech': 'UC2wMHF4HBkTMGLsvZAIWzRg',
    'HeatonResearch': None,
    'datacouncil': None,
    'realpython': 'UCI0vQvr9aFn27yR6Ej6n5UA',
    'KárolyZsolnai': 'UCbfYPyITQ-7l4upoX8nvctg', # Two Minute Papers
    'MongoDBofficial': 'UCK_m2976Yvbx-TyDLw7n1WA', # MongoDB
}
# Channel processed by the rest of the notebook; a missing key raises KeyError.
current_channel_key = 'MongoDBofficial'
current_channel_id = channel_id_dict[current_channel_key]
Class: YTstats¶
# Render the helper module's source as a syntax-highlighted markdown cell.
file_folder = 'youtube'
file_name = 'youtube_statistics' + '.py'
file_path = os.path.join(file_folder, file_name)
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(file_path, encoding = 'utf-8') as f:
    code = f.read()
# Close the fenced block so the markdown is well-formed.
md("```python\n" + code + "\n```")
# ======================
# Get YouTube Statistics
# ======================
"""
# References
YouTube > Data API > Reference > Channels: list
https://developers.google.com/youtube/v3/docs/channels/list
YouTube > Data API > Reference > Search: list
https://developers.google.com/youtube/v3/docs/search/list
YouTube > Data API > Reference > Video
https://developers.google.com/youtube/v3/docs/videos
"""
## Import Libraries
### External Libraries
import requests
import json
import os.path
# from tqdm import tqdm # A Fast, Extensible Progress Meter
import inspect
## Data Classes
class YTstats:
    """Collect channel statistics and per-video data from the YouTube Data API v3.

    A channel is addressed either by ``channel_id`` or, when that is unknown,
    by ``for_username`` (resolved to a channel id on first use).

    Attributes:
        api_key: API key sent with every request.
        channel_id: Channel id; ``None`` until known or resolved.
        for_username: Username used to look the channel id up.
        channel_statistics: Result of the last ``get_channel_statistics`` call.
        video_data: Result of the last ``get_channel_video_data`` call.
        file_path: Path written by the last successful ``dump`` call.
    """

    # requests waits indefinitely unless a timeout is given; cap each call.
    REQUEST_TIMEOUT = 30
    # Pages fetched in addition to the first search page (three pages total).
    MAX_EXTRA_PAGES = 2

    def __init__(self, api_key, channel_id = None, for_username = None):
        self.api_key = api_key
        self.channel_id = channel_id
        self.for_username = for_username
        self.channel_statistics = None
        self.video_data = None
        self.file_path = None

    def _get_json(self, url):
        """Send a GET request to ``url`` and return the decoded JSON body."""
        response = requests.get(url, timeout = self.REQUEST_TIMEOUT)
        return json.loads(response.text)

    def convert_username_to_channel_id(self, username):
        """Resolve ``username`` to a channel id, store it and return it.

        Args:
            username: Value for the API's ``forUsername`` filter. When it is
                ``None`` the instance's ``for_username`` is used instead.

        Returns:
            The channel id, or the placeholder ``'unknown_channel_id'`` when
            the response contains no matching channel. The same value is
            stored in ``self.channel_id``.
        """
        if username is None:
            username = self.for_username
        get_prefix = 'https://www.googleapis.com/youtube/v3/channels?'
        url = get_prefix + f'key={self.api_key}&forUsername={username}&part=id'
        data = self._get_json(url)
        try:
            channel_id = data['items'][0]['id']
        except (KeyError, IndexError, TypeError) as e:
            print(f'Error message for func -- {inspect.currentframe().f_code.co_name}:\n\t{e}')
            channel_id = 'unknown_channel_id'
        self.channel_id = channel_id
        # Returned as well as stored: callers assign the result to channel_id.
        return channel_id

    def get_channel_statistics(self):
        """Fetch the ``statistics`` part of the channel resource.

        Resolves the channel id from ``for_username`` first when it is not
        known yet.

        TODO:
            Check for data['error']['code'] == 403 with
            data['error']['errors'][0]['reason'] == 'quotaExceeded'.

        Returns:
            The statistics dict from the response, or ``None`` when the
            response holds no channel item. The value is also stored in
            ``self.channel_statistics``.
        """
        if self.channel_id is None:
            self.convert_username_to_channel_id(self.for_username)
        print(f'Log: getting stats for {self.for_username} (channel id: {self.channel_id})...')
        url = f'https://www.googleapis.com/youtube/v3/channels?part=statistics&id={self.channel_id}&key={self.api_key}'
        data = self._get_json(url)
        try:
            data = data['items'][0]['statistics']
        except (KeyError, IndexError, TypeError) as e:
            print(f'Error message: {e}.\nSet `data` value to None.')
            data = None
        self.channel_statistics = data
        return data

    def get_channel_video_data(self, limit = 10):
        """Fetch snippet, statistics and contentDetails for the channel's videos.

        Videos parts:
            1. snippet
                publishedAt     tags
                channelId       categoryId
                title           liveBroadcastContent
                description     defaultLanguage
                thumbnails      localized
                channelTitle    defaultAudioLanguage
            2. statistics
                viewCount       favoriteCount
                likeCount       commentCount
                dislikeCount
            3. contentDetails
                duration        regionRestriction
                dimension       contentRating
                definition      projection
                caption         hasCustomThumbnail
                licensedContent

        Args:
            limit: ``maxResults`` value used for each search page.

        Returns:
            Dict mapping video id to the merged fields of the three parts.
            The same dict is stored in ``self.video_data``.
        """
        # 1. Collect the video ids.
        channel_videos = self._get_channel_videos(limit = limit)
        print(f'No. of videos = {len(channel_videos)}\nVideo IDs:\n{channel_videos}')

        # 2. Merge every requested part into the per-video dict.
        parts = [
            'snippet',
            'statistics',
            'contentDetails'
        ]
        for video_id in channel_videos:
            for part in parts:
                data = self._get_single_video_data(video_id, part)
                channel_videos[video_id].update(data)

        self.video_data = channel_videos
        return channel_videos

    def _get_single_video_data(self, video_id, part):
        """Return one ``part`` of a video resource, or ``{}`` when it is unavailable."""
        url_prefix = 'https://www.googleapis.com/youtube/v3/videos?'
        url = url_prefix + f'part={part}&id={video_id}&key={self.api_key}'
        data = self._get_json(url)
        try:
            data = data['items'][0][part]
        except (KeyError, IndexError, TypeError) as e:
            print(f'Error message for func -- {inspect.currentframe().f_code.co_name}:\n\t{e}')
            data = {}
        return data

    def _get_channel_videos(self, limit = 50):
        """Return ``{video_id: {}}`` for the channel's videos, ordered by date.

        Follows ``nextPageToken`` for at most ``MAX_EXTRA_PAGES`` further pages.
        """
        if self.channel_id is None:
            # The lookup stores the resolved id on self.channel_id.
            self.convert_username_to_channel_id(self.for_username)
        get_prefix = 'https://www.googleapis.com/youtube/v3/search?'
        url = get_prefix + f'key={self.api_key}&channelId={self.channel_id}&part=id&order=date'
        if limit is not None and isinstance(limit, int):
            url += '&maxResults=' + str(limit)

        videos, next_page_token = self._get_channel_videos_per_page(url)
        page_index = 0
        while next_page_token is not None and page_index < self.MAX_EXTRA_PAGES:
            next_url = url + '&pageToken=' + next_page_token
            next_videos, next_page_token = \
                self._get_channel_videos_per_page(next_url)
            videos.update(next_videos)
            page_index += 1
        return videos

    def _get_channel_videos_per_page(self, url):
        """Fetch one search page.

        Returns:
            Tuple ``(videos, next_page_token)`` where ``videos`` maps each
            video id on the page to an empty dict and ``next_page_token`` is
            ``None`` when the response has no further page (or no items).
        """
        data = self._get_json(url)
        channel_videos = {}
        if 'items' not in data:
            return channel_videos, None

        next_page_token = data.get('nextPageToken', None)
        for item in data['items']:
            try:
                # Search results may be of other kinds; keep videos only.
                if item['id']['kind'] == 'youtube#video':
                    channel_videos[item['id']['videoId']] = {}
            except KeyError as ke:
                print(f'Key error: {ke}')
            except Exception as e:
                print(f'Error message: {e}')
        return channel_videos, next_page_token

    def dump(self):
        """Write channel statistics and video data to ``youtube/<title>.json``.

        The file name is the ``channelTitle`` of the last video (spaces
        replaced by underscores, lower-cased); the channel id is used when no
        title is available. Does nothing but print a notice when either
        ``channel_statistics`` or ``video_data`` is still ``None``.

        Returns:
            None. The written path is stored in ``self.file_path``.
        """
        if self.channel_statistics is None or self.video_data is None:
            print('Data is None.')
            return None

        youtube_video_data = {
            self.channel_id: {
                'channel_statistics': self.channel_statistics,
                'video_data': self.video_data
            }
        }

        # Read the title without popitem(): popping removed that video from
        # self.video_data, and therefore from the JSON written below.
        videos = list(self.video_data.values())
        if videos:
            channel_title = videos[-1].get('channelTitle', self.channel_id)
        else:
            channel_title = self.channel_id
        channel_title = str(channel_title).replace(' ', '_').lower()

        file_folder = 'youtube'
        os.makedirs(file_folder, exist_ok = True)
        self.file_path = os.path.join(file_folder, channel_title + '.json')
        with open(self.file_path, 'w', encoding = 'utf-8') as f:
            json.dump(youtube_video_data, f, indent = 4)
        print(f'Statistics and video data saved to: {self.file_path}.')
Get Statistics¶
# Build the client from the channel id when it is known, otherwise from the
# username and let the API resolve the id.
if current_channel_id is not None:
    yt = YTstats(API_KEY, channel_id = current_channel_id)
else:
    yt = YTstats(API_KEY, for_username = current_channel_key)
    # The lookup stores the resolved id on yt.channel_id itself, so its
    # return value must not be assigned back over that attribute.
    yt.convert_username_to_channel_id(current_channel_key)
print(yt.channel_id)
UCK_m2976Yvbx-TyDLw7n1WA
# Fetch the channel-level counters; the result is also kept on
# yt.channel_statistics.
yt.get_channel_statistics()
Log: getting stats for None (channel id: UCK_m2976Yvbx-TyDLw7n1WA)...
{'viewCount': '24070169',
'subscriberCount': '56500',
'hiddenSubscriberCount': False,
'videoCount': '773'}
# Collect per-video details; the same dict is kept on yt.video_data.
channel_videos = yt.get_channel_video_data()
No. of videos = 30
Video IDs:
{'IqioEqWvob0': {}, 'EyZtB3hKsnM': {}, '-7r4whMKt1s': {}, 'bALyYC10ABw': {}, 'zgGui1OFBnE': {}, 'jIIw1dDLbd0': {}, 'aW7cnNa82e4': {}, 'SZnzWfZ9PMI': {}, 'UVJJLc0abzc': {}, 'nEMIDCyi3Os': {}, '_Ky7qVGsg44': {}, 'qZYg09Bf67o': {}, 'bMhGWO4ECMo': {}, 'zDEACh1qFVo': {}, '4gjBt3eZNOw': {}, 'kzzQxlk9bBY': {}, '5OEqxqYIZbs': {}, 'Mf-F9UPKB7s': {}, '1Ooc8uFoa4Q': {}, 'Hgo1vXPHY0I': {}, 'k_PH4Bauc4c': {}, 'KAlbALFd9ZQ': {}, '-ttCul_-nns': {}, 'xUdi6OHnXrc': {}, 'YYYVAzJ-Gws': {}, 'lYTZeOP4YQc': {}, 'aW6poxfSPnU': {}, 'dgvfRbu_5Vw': {}, 'fwKBo6SQWcg': {}, 'V-lgTC_6CrU': {}}
# Save statistics and video data as JSON; the written path is recorded in
# yt.file_path. Both previous calls must have produced data first.
yt.dump()
Statistics and video data saved to: youtube\mongodb.json.
View JSON output¶
# Load the JSON written by yt.dump() and split it into its two sections.
file_path = yt.file_path
try:
    with open(file_path, encoding = 'utf-8') as f:
        data_json = json.load(f)
except FileNotFoundError:
    # No file named after the channel title: try one named after the
    # channel id instead.
    print(f'Channel title, {current_channel_key}, could not be found. Search by channel id.')
    file_name = channel_id_dict[current_channel_key] + '.json'
    file_path = os.path.join(file_folder, file_name)
    with open(file_path, encoding = 'utf-8') as f:
        data_json = json.load(f)
except Exception as e:
    print(f'Error message: {e}')
    # Re-raise: continuing would only fail below with a NameError on
    # data_json and hide the real cause.
    raise

# The file holds a single top-level entry: {channel_id: {...}}.
channel_id, channel_data = next(iter(data_json.items()))
print(f'Channel ID: {channel_id}')
channel_statistics = channel_data['channel_statistics']
video_data = channel_data['video_data']
Channel ID: UCK_m2976Yvbx-TyDLw7n1WA
Transform Data¶
# Print the headline channel counters with thousands separators. The API
# returns them as strings, hence the int() conversion.
for label, stat_key in (
    ('View', 'viewCount'),
    ('Subscriber', 'subscriberCount'),
    ('Video', 'videoCount'),
):
    print(f"{label} Count: {int(channel_statistics[stat_key]):,}")
View Count: 24,070,169
Subscriber Count: 56,500
Video Count: 773
# Video ids in the order they appear in the loaded JSON.
video_id_list = [*video_data]
print(len(video_id_list))
29
# Flatten every video's nested payload into one row per video in a single
# call. This avoids re-copying the frame with pd.concat on each iteration,
# also works when there are no videos, and yields a 0..n-1 index directly.
video_df = pd.json_normalize(
    [video_data[video_id] for video_id in video_id_list]
)
if len(video_id_list) != len(video_df):
    raise ValueError('Video ID count and Video Data row count do not match.')
video_df.insert(0, 'video_id', video_id_list)
video_df.head()
| video_id | publishedAt | channelId | title | description | channelTitle | categoryId | liveBroadcastContent | defaultAudioLanguage | viewCount | ... | thumbnails.high.height | localized.title | localized.description | tags | thumbnails.standard.url | thumbnails.standard.width | thumbnails.standard.height | thumbnails.maxres.url | thumbnails.maxres.width | thumbnails.maxres.height | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IqioEqWvob0 | 2022-05-26T17:01:30Z | UCK_m2976Yvbx-TyDLw7n1WA | Learning with Luce - MongoDB Atlas and Blazor | Join Microsoft MVP's Luce Carter, Developer Ad... | MongoDB | 24 | none | en-US | 176 | ... | 360 | Learning with Luce - MongoDB Atlas and Blazor | Join Microsoft MVP's Luce Carter, Developer Ad... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | EyZtB3hKsnM | 2022-05-26T15:02:41Z | UCK_m2976Yvbx-TyDLw7n1WA | Hackathon Submissions Guidelines & Help | With less than 48hrs to go, we’re running a sh... | MongoDB | 24 | none | en-US | 40 | ... | 360 | Hackathon Submissions Guidelines & Help | With less than 48hrs to go, we’re running a sh... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | -7r4whMKt1s | 2022-05-26T13:00:12Z | UCK_m2976Yvbx-TyDLw7n1WA | Type Safety with Prisma an Object Relational M... | ✅ Sign-up for a free cluster at: https://bit.l... | MongoDB | 24 | none | en-US | 139 | ... | 360 | Type Safety with Prisma an Object Relational M... | ✅ Sign-up for a free cluster at: https://bit.l... | [MongoDB, TypeScript, DA] | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | bALyYC10ABw | 2022-05-24T13:00:14Z | UCK_m2976Yvbx-TyDLw7n1WA | Getting Started with MongoDB & Mongoose ODM (O... | ✅ Sign-up for a free cluster at: https://bit.l... | MongoDB | 24 | none | en-US | 920 | ... | 360 | Getting Started with MongoDB & Mongoose ODM (O... | ✅ Sign-up for a free cluster at: https://bit.l... | [MongoDB, JavaScript, DA] | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | zgGui1OFBnE | 2022-05-19T16:36:06Z | UCK_m2976Yvbx-TyDLw7n1WA | Hackathon Office Hours & Demos - USA/EMEA | We are back again with Hackathon Office hours!... | MongoDB | 24 | none | en-US | 221 | ... | 360 | Hackathon Office Hours & Demos - USA/EMEA | We are back again with Hackathon Office hours!... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 37 columns
video_df.tail()
| video_id | publishedAt | channelId | title | description | channelTitle | categoryId | liveBroadcastContent | defaultAudioLanguage | viewCount | ... | thumbnails.high.height | localized.title | localized.description | tags | thumbnails.standard.url | thumbnails.standard.width | thumbnails.standard.height | thumbnails.maxres.url | thumbnails.maxres.width | thumbnails.maxres.height | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 24 | YYYVAzJ-Gws | 2022-04-13T18:53:20Z | UCK_m2976Yvbx-TyDLw7n1WA | MongoDB World 2022 Hackathon Introduction | We're kicking off the MongoDB World hackathon ... | MongoDB | 24 | none | en-US | 779 | ... | 360 | MongoDB World 2022 Hackathon Introduction | We're kicking off the MongoDB World hackathon ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 25 | lYTZeOP4YQc | 2022-04-01T17:42:57Z | UCK_m2976Yvbx-TyDLw7n1WA | Using LINQ to Query MongoDB in a .NET Core App... | Learn how to do simple queries and complex agg... | MongoDB | 24 | none | en-US | 778 | ... | 360 | Using LINQ to Query MongoDB in a .NET Core App... | Learn how to do simple queries and complex agg... | [mongodb data api, mongodb, mongodb atlas, nos... | NaN | NaN | NaN | NaN | NaN | NaN |
| 26 | aW6poxfSPnU | 2022-03-31T21:14:16Z | UCK_m2976Yvbx-TyDLw7n1WA | Solutions Architecture at MongoDB | MongoDB's team of Solutions Architects work wi... | MongoDB | 24 | none | en-US | 636 | ... | 360 | Solutions Architecture at MongoDB | MongoDB's team of Solutions Architects work wi... | [mongodb data api, mongodb, mongodb atlas, nos... | https://i.ytimg.com/vi/aW6poxfSPnU/sddefault.jpg | 640.0 | 480.0 | https://i.ytimg.com/vi/aW6poxfSPnU/maxresdefau... | 1280.0 | 720.0 |
| 27 | dgvfRbu_5Vw | 2022-03-31T16:57:01Z | UCK_m2976Yvbx-TyDLw7n1WA | Learning with Luce: MongoDB and Unity | Learning with Luce is back! To celebrate GDC 2... | MongoDB | 24 | none | en-US | 402 | ... | 360 | Learning with Luce: MongoDB and Unity | Learning with Luce is back! To celebrate GDC 2... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 28 | fwKBo6SQWcg | 2022-03-30T18:31:10Z | UCK_m2976Yvbx-TyDLw7n1WA | Connect MongoDB and Kafka in the Cloud | In this demo, MongoDB Developer Advocate Maxim... | MongoDB | 24 | none | en-US | 541 | ... | 360 | Connect MongoDB and Kafka in the Cloud | In this demo, MongoDB Developer Advocate Maxim... | [mongodb data api, mongodb, mongodb atlas, nos... | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 37 columns
video_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 37 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 video_id 29 non-null object
1 publishedAt 29 non-null object
2 channelId 29 non-null object
3 title 29 non-null object
4 description 29 non-null object
5 channelTitle 29 non-null object
6 categoryId 29 non-null object
7 liveBroadcastContent 29 non-null object
8 defaultAudioLanguage 29 non-null object
9 viewCount 29 non-null object
10 likeCount 29 non-null object
11 favoriteCount 29 non-null object
12 commentCount 29 non-null object
13 duration 29 non-null object
14 dimension 29 non-null object
15 definition 29 non-null object
16 caption 29 non-null object
17 licensedContent 29 non-null bool
18 projection 29 non-null object
19 thumbnails.default.url 29 non-null object
20 thumbnails.default.width 29 non-null int64
21 thumbnails.default.height 29 non-null int64
22 thumbnails.medium.url 29 non-null object
23 thumbnails.medium.width 29 non-null int64
24 thumbnails.medium.height 29 non-null int64
25 thumbnails.high.url 29 non-null object
26 thumbnails.high.width 29 non-null int64
27 thumbnails.high.height 29 non-null int64
28 localized.title 29 non-null object
29 localized.description 29 non-null object
30 tags 7 non-null object
31 thumbnails.standard.url 4 non-null object
32 thumbnails.standard.width 4 non-null float64
33 thumbnails.standard.height 4 non-null float64
34 thumbnails.maxres.url 4 non-null object
35 thumbnails.maxres.width 4 non-null float64
36 thumbnails.maxres.height 4 non-null float64
dtypes: bool(1), float64(4), int64(6), object(26)
memory usage: 8.3+ KB
# Case-insensitive title search. na = False counts a missing title as
# "no match"; otherwise a NaN in the mask makes the row selection raise.
video_df[video_df['title'].str.contains(
    'neural',
    case = False,
    na = False,
)]
| video_id | publishedAt | channelId | title | description | channelTitle | categoryId | liveBroadcastContent | defaultAudioLanguage | viewCount | ... | thumbnails.high.height | localized.title | localized.description | tags | thumbnails.standard.url | thumbnails.standard.width | thumbnails.standard.height | thumbnails.maxres.url | thumbnails.maxres.width | thumbnails.maxres.height |
|---|
0 rows × 37 columns
# Swap only the file extension. str.replace('json', 'csv') would also
# rewrite any "json" found in the folder or channel name.
csv_path = os.path.splitext(file_path)[0] + '.csv'
video_df.to_csv(csv_path,
                index = False,
                quoting = csv.QUOTE_ALL,
                sep = '|')
file_path
'youtube\\mongodb.json'
yt.for_username