From 7666b93604b97e9ada981c6b04ccf5605dd1bd44 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Fri, 14 Apr 2023 07:58:36 +0000 Subject: [PATCH] [extractor/youtube] Define strict uploader metadata mapping (#6384) New mapping: ``` channel -> channel name channel_id -> UCID channel_url -> UCID channel url uploader -> channel name (same as channel field) uploader_id -> @handle uploader_url -> @handle channel url ``` Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 709 +++++++++++++++++++----------------- 1 file changed, 371 insertions(+), 338 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index d6a55e9532..2b17751e5e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -458,6 +458,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _IGNORED_WARNINGS = {'Unavailable videos will be hidden during playback'} + _YT_HANDLE_RE = r'@[\w.-]{3,30}' # https://support.google.com/youtube/answer/11585688?hl=en + _YT_CHANNEL_UCID_RE = r'UC[\w-]{22}' + + def ucid_or_none(self, ucid): + return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None) + + def handle_or_none(self, handle): + return self._search_regex(rf'^({self._YT_HANDLE_RE})$', handle, '@-handle', default=None) + + def handle_from_url(self, url): + return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})', + url, 'channel handle', default=None) + + def ucid_from_url(self, url): + return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})', + url, 'channel id', default=None) + @functools.cached_property def _preferred_lang(self): """ @@ -992,6 +1009,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if not channel_id: channel_id = traverse_obj(reel_header_renderer, ('channelNavigationEndpoint', 'browseEndpoint', 'browseId')) + channel_id = self.ucid_or_none(channel_id) + overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) @@ -1233,9 +1252,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'channel': 'Philipp Hagemeister', 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', @@ -1254,7 +1270,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'start_time': 1, 'end_time': 9, 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Philipp Hagemeister', + 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', + 'uploader_id': '@PhilippHagemeister', } }, { @@ -1266,9 +1285,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20120608', 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', - 'uploader': 'SET India', - 'uploader_id': 'setindia', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', 'age_limit': 18, }, 'skip': 'Private video', @@ -1280,9 +1296,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'channel': 'Philipp Hagemeister', 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', @@ -1299,7 +1312,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'not_live', 'age_limit': 0, 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Philipp Hagemeister', + 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', + 'uploader_id': '@PhilippHagemeister', }, 'params': { 'skip_download': True, @@ -1312,10 +1328,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'a9LDPn-MO4I', 'ext': 'm4a', 'upload_date': '20121002', - 'uploader_id': '8KVIDEO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', - 'uploader': '8KVIDEO', 'title': 'UHDTV TEST 8K VIDEO.mp4' }, 'params': { @@ -1333,8 +1346,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', 'duration': 244, - 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', 'abr': 129.495, 'like_count': int, @@ -1346,13 +1357,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'not_live', 'thumbnail': 'https://i.ytimg.com/vi_webp/IB3lcPjvWLA/maxresdefault.webp', 'channel': 'Afrojack', - 'uploader_url': 'http://www.youtube.com/user/AfrojackVEVO', 'tags': 'count:19', 'availability': 'public', 'categories': ['Music'], 'age_limit': 0, 'alt_title': 'The Spark', - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Afrojack', + 'uploader_url': 'https://www.youtube.com/@Afrojack', + 'uploader_id': '@Afrojack', }, 'params': { 'youtube_include_dash_manifest': True, @@ -1369,9 +1382,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'duration': 142, - 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', 'age_limit': 18, 'categories': ['Gaming'], @@ -1385,7 +1395,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_id': 'UCzybXLxv08IApdjdN0mJhEg', 'playable_in_embed': True, 'view_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'The Witcher', + 'uploader_url': 'https://www.youtube.com/@thewitcher', + 'uploader_id': '@thewitcher', }, }, { @@ -1397,12 +1410,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Godzilla 2 (Official Video)', 'description': 'md5:bf77e03fcae5529475e500129b05668a', 'upload_date': '20200408', - 'uploader_id': 'FlyingKitty900', - 'uploader': 'FlyingKitty', 'age_limit': 18, 'availability': 'needs_auth', 'channel_id': 'UCYQT13AtrJC0gsM1far_zJg', - 'uploader_url': 'http://www.youtube.com/user/FlyingKitty900', 'channel': 'FlyingKitty', 'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg', 'view_count': int, @@ -1413,7 +1423,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'duration': 177, 'playable_in_embed': True, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'FlyingKitty', + 'uploader_url': 'https://www.youtube.com/@FlyingKitty900', + 'uploader_id': '@FlyingKitty900', }, }, { @@ -1424,13 +1437,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': '[MMD] Adios - EVERGLOW [+Motion DL]', 'ext': 'mp4', 'upload_date': '20191228', - 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', - 'uploader': 'Projekt Melody', 'description': 'md5:17eccca93a786d51bc67646756894066', 'age_limit': 18, 'like_count': int, 'availability': 'needs_auth', - 'uploader_url': 'http://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', 'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', 'view_count': int, 'thumbnail': 'https://i.ytimg.com/vi_webp/Tq92D6wQ1mg/sddefault.webp', @@ -1442,7 +1452,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 106, 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Projekt Melody', + 'uploader_url': 'https://www.youtube.com/@ProjektMelody', + 'uploader_id': '@ProjektMelody', }, }, { @@ -1452,8 +1465,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'MeJVWBSsPAY', 'ext': 'mp4', 'title': 'OOMPH! - Such Mich Find Mich (Lyrics)', - 'uploader': 'Herr Lurik', - 'uploader_id': 'st3in234', 'description': 'Fan Video. Music & Lyrics by OOMPH!.', 'upload_date': '20130730', 'track': 'Such mich find mich', @@ -1470,11 +1481,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA', 'categories': ['Music'], 'availability': 'public', - 'uploader_url': 'http://www.youtube.com/user/st3in234', 'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA', 'live_status': 'not_live', 'artist': 'OOMPH!', - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Herr Lurik', + 'uploader_url': 'https://www.youtube.com/@HerrLurik', + 'uploader_id': '@HerrLurik', }, }, { @@ -1491,11 +1504,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 266, 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', 'creator': 'deadmau5', 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', - 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', 'alt_title': 'Some Chords', 'availability': 'public', @@ -1513,7 +1523,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UCYEK6xds6eo-3tr4xRdflmQ', 'categories': ['Music'], 'album': 'Some Chords', - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'deadmau5', + 'uploader_url': 'https://www.youtube.com/@deadmau5', + 'uploader_id': '@deadmau5', }, 'expected_warnings': [ 'DASH manifest missing', @@ -1527,10 +1540,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 6085, 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', 'description': 'md5:04bbbf3ccceb6795947572ca36f45904', - 'uploader': 'Olympics', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', 'like_count': int, 'release_timestamp': 1343767800, @@ -1546,7 +1556,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'was_live', 'view_count': int, 'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q', - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Olympics', + 'uploader_url': 'https://www.youtube.com/@Olympics', + 'uploader_id': '@Olympics', }, 'params': { 'skip_download': 'requires avconv', @@ -1561,10 +1574,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'stretched_ratio': 16 / 9., 'duration': 85, 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫ᄋᄅ', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', 'playable_in_embed': True, 'channel': '孫ᄋᄅ', @@ -1579,7 +1589,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'not_live', 'availability': 'unlisted', 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': '孫ᄋᄅ', + 'uploader_url': 'https://www.youtube.com/@AllenMeow', + 'uploader_id': '@AllenMeow', }, }, # url_encoded_fmt_stream_map is empty string @@ -1591,8 +1604,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', 'description': '', 'upload_date': '20150404', - 'uploader_id': 'spbelect', - 'uploader': 'Наблюдатели Петербурга', }, 'params': { 'skip_download': 'requires avconv', @@ -1609,9 +1620,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'duration': 220, 'upload_date': '20150625', - 'uploader_id': 'dorappi2000', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', - 'uploader': 'dorappi2000', 'formats': 'mincount:31', }, 'skip': 'not actual anymore', @@ -1624,9 +1632,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'CsmdDsKjzN8', 'ext': 'mp4', 'upload_date': '20150501', # According to ' 1 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', - 'thumbnails': list + 'thumbnails': list, + 'uploader_id': '@kurzgesagt', + 'uploader_url': 'https://www.youtube.com/@kurzgesagt', + 'uploader': 'Kurzgesagt – In a Nutshell', } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -6989,11 +7021,12 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'title': 'Mobile Games on Console - Scott The Woz', 'upload_date': '20210920', 'uploader': 'Scott The Woz', - 'uploader_id': 'scottthewoz', - 'uploader_url': 'http://www.youtube.com/user/scottthewoz', + 'uploader_id': '@ScottTheWoz', + 'uploader_url': 'https://www.youtube.com/@ScottTheWoz', 'view_count': int, 'live_status': 'not_live', - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': 'count:20', } }] @@ -7031,13 +7064,13 @@ class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): 'id': 'qVv6vCqciTM', 'ext': 'mp4', 'age_limit': 0, - 'uploader_id': 'UCIdEIHpS0TdkqRkHL5OkLtA', + 'uploader_id': '@sana_natori', 'comment_count': int, 'chapters': 'count:13', 'upload_date': '20221223', 'thumbnail': 'https://i.ytimg.com/vi/qVv6vCqciTM/maxresdefault.jpg', 'channel_url': 'https://www.youtube.com/channel/UCIdEIHpS0TdkqRkHL5OkLtA', - 'uploader_url': 'http://www.youtube.com/channel/UCIdEIHpS0TdkqRkHL5OkLtA', + 'uploader_url': 'https://www.youtube.com/@sana_natori', 'like_count': int, 'release_date': '20221223', 'tags': ['Vtuber', '月ノ美兎', '名取さな', 'にじさんじ', 'クリスマス', '3D配信'],