From 54579be4364e148277c32e20a5c3efc2c3f52f5b Mon Sep 17 00:00:00 2001 From: garret Date: Sat, 11 Nov 2023 19:59:01 +0000 Subject: [PATCH] [ie/nhk] Improve metadata extraction (#8388) Authored by: garret1317 --- yt_dlp/extractor/nhk.py | 102 ++++++++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 20 deletions(-) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index f6b5c501bb..cc3c791741 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -3,6 +3,8 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, + clean_html, + get_element_by_class, int_or_none, join_nonempty, parse_duration, @@ -45,25 +47,36 @@ class NhkBaseIE(InfoExtractor): self.cache.store('nhk', 'api_info', api_info) return api_info - def _extract_formats_and_subtitles(self, vod_id): + def _extract_stream_info(self, vod_id): for refresh in (False, True): api_info = self._get_api_info(refresh) if not api_info: continue api_url = api_info.pop('url') - stream_url = traverse_obj( + meta = traverse_obj( self._download_json( api_url, vod_id, 'Downloading stream url info', fatal=False, query={ **api_info, 'type': 'json', 'optional_id': vod_id, 'active_flg': 1, - }), - ('meta', 0, 'movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False) - if stream_url: - return self._extract_m3u8_formats_and_subtitles(stream_url, vod_id) + }), ('meta', 0)) + stream_url = traverse_obj( + meta, ('movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False) + if stream_url: + formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, vod_id) + return { + **traverse_obj(meta, { + 'duration': ('duration', {int_or_none}), + 'timestamp': ('publication_date', {unified_timestamp}), + 'release_timestamp': ('insert_date', {unified_timestamp}), + 'modified_timestamp': ('update_date', {unified_timestamp}), + }), + 'formats': formats, + 'subtitles': subtitles, + } raise ExtractorError('Unable to extract 
stream url') def _extract_episode_info(self, url, episode=None): @@ -77,11 +90,11 @@ class NhkBaseIE(InfoExtractor): if fetch_episode: episode = self._call_api( episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] - title = episode.get('sub_title_clean') or episode['sub_title'] def get_clean_field(key): - return episode.get(key + '_clean') or episode.get(key) + return clean_html(episode.get(key + '_clean') or episode.get(key)) + title = get_clean_field('sub_title') series = get_clean_field('title') thumbnails = [] @@ -96,22 +109,30 @@ class NhkBaseIE(InfoExtractor): 'url': 'https://www3.nhk.or.jp' + img_path, }) + episode_name = title + if series and title: + title = f'{series} - {title}' + elif series and not title: + title = series + series = None + episode_name = None + else: # title, no series + episode_name = None + info = { 'id': episode_id + '-' + lang, - 'title': '%s - %s' % (series, title) if series and title else title, + 'title': title, 'description': get_clean_field('description'), 'thumbnails': thumbnails, 'series': series, - 'episode': title, + 'episode': episode_name, } + if is_video: vod_id = episode['vod_id'] - formats, subs = self._extract_formats_and_subtitles(vod_id) - info.update({ + **self._extract_stream_info(vod_id), 'id': vod_id, - 'formats': formats, - 'subtitles': subs, }) else: @@ -148,6 +169,14 @@ class NhkVodIE(NhkBaseIE): 'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463', 'episode': 'The Tohoku Shinkansen: Full Speed Ahead', 'series': 'Japan Railway Journal', + 'modified_timestamp': 1694243656, + 'timestamp': 1681428600, + 'release_timestamp': 1693883728, + 'duration': 1679, + 'upload_date': '20230413', + 'modified_date': '20230909', + 'release_date': '20230905', + }, }, { # video clip @@ -161,6 +190,13 @@ class NhkVodIE(NhkBaseIE): 'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed', 'series': 'Dining with the Chef', 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', + 'duration': 148, + 'upload_date': '20190816', + 
'release_date': '20230902', + 'release_timestamp': 1693619292, + 'modified_timestamp': 1694168033, + 'modified_date': '20230908', + 'timestamp': 1565997540, }, }, { # radio @@ -170,7 +206,7 @@ class NhkVodIE(NhkBaseIE): 'ext': 'm4a', 'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines', 'series': 'Living in Japan', - 'description': 'md5:850611969932874b4a3309e0cae06c2f', + 'description': 'md5:0a0e2077d8f07a03071e990a6f51bfab', 'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545', 'episode': 'Tips for Travelers to Japan / Ramen Vending Machines' }, @@ -212,6 +248,23 @@ class NhkVodIE(NhkBaseIE): 'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0', }, 'skip': 'expires 2023-10-15', + }, { + # a one-off (single-episode series). title from the api is just '<p></p>
' + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/3004952/', + 'info_dict': { + 'id': 'nw_vod_v_en_3004_952_20230723091000_01_1690074552', + 'ext': 'mp4', + 'title': 'Barakan Discovers AMAMI OSHIMA: Isson\'s Treasure Island', + 'description': 'md5:5db620c46a0698451cc59add8816b797', + 'thumbnail': 'md5:67d9ff28009ba379bfa85ad1aaa0e2bd', + 'release_date': '20230905', + 'timestamp': 1690103400, + 'duration': 2939, + 'release_timestamp': 1693898699, + 'modified_timestamp': 1698057495, + 'modified_date': '20231023', + 'upload_date': '20230723', + }, }] def _real_extract(self, url): @@ -226,13 +279,15 @@ class NhkVodProgramIE(NhkBaseIE): 'info_dict': { 'id': 'sumo', 'title': 'GRAND SUMO Highlights', + 'description': 'md5:fc20d02dc6ce85e4b72e0273aa52fdbf', }, - 'playlist_mincount': 12, + 'playlist_mincount': 0, }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', + 'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f', }, 'playlist_mincount': 12, }, { @@ -241,6 +296,7 @@ class NhkVodProgramIE(NhkBaseIE): 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', + 'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f', }, 'playlist_mincount': 5, }, { @@ -265,11 +321,11 @@ class NhkVodProgramIE(NhkBaseIE): entries.append(self._extract_episode_info( urljoin(url, episode_path), episode)) - program_title = None - if entries: - program_title = entries[0].get('series') + html = self._download_webpage(url, program_id) + program_title = clean_html(get_element_by_class('p-programDetail__title', html)) + program_description = clean_html(get_element_by_class('p-programDetail__text', html)) - return self.playlist_result(entries, program_id, program_title) + return self.playlist_result(entries, program_id, program_title, program_description) class NhkForSchoolBangumiIE(InfoExtractor): @@ -421,6 +477,7 @@ class NhkRadiruIE(InfoExtractor): 'skip': 'Episode 
expired on 2023-04-16', 'info_dict': { 'channel': 'NHK-FM', + 'uploader': 'NHK-FM', 'description': 'md5:94b08bdeadde81a97df4ec882acce3e9', 'ext': 'm4a', 'id': '0449_01_3853544', @@ -441,6 +498,7 @@ class NhkRadiruIE(InfoExtractor): 'title': 'ベストオブクラシック', 'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。', 'channel': 'NHK-FM', + 'uploader': 'NHK-FM', 'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg', }, 'playlist_mincount': 3, @@ -454,6 +512,7 @@ class NhkRadiruIE(InfoExtractor): 'title': '有島武郎「一房のぶどう」', 'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)', 'channel': 'NHKラジオ第1、NHK-FM', + 'uploader': 'NHKラジオ第1、NHK-FM', 'timestamp': 1635757200, 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg', 'release_date': '20161207', @@ -469,6 +528,7 @@ class NhkRadiruIE(InfoExtractor): 'id': 'F261_01_3855109', 'ext': 'm4a', 'channel': 'NHKラジオ第1', + 'uploader': 'NHKラジオ第1', 'timestamp': 1681635900, 'release_date': '20230416', 'series': 'NHKラジオニュース', @@ -513,6 +573,7 @@ class NhkRadiruIE(InfoExtractor): series_meta = traverse_obj(meta, { 'title': 'program_name', 'channel': 'media_name', + 'uploader': 'media_name', 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}), }, get_all=False) @@ -541,6 +602,7 @@ class NhkRadioNewsPageIE(InfoExtractor): 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', 'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d', 'channel': 'NHKラジオ第1', + 'uploader': 'NHKラジオ第1', 'title': 'NHKラジオニュース', } }]