mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-12-14 23:32:33 +00:00
429 lines
16 KiB
Python
429 lines
16 KiB
Python
|
import itertools
|
||
|
from .common import InfoExtractor, SearchInfoExtractor
|
||
|
from ..utils import (
|
||
|
urljoin,
|
||
|
traverse_obj,
|
||
|
int_or_none,
|
||
|
mimetype2ext,
|
||
|
clean_html,
|
||
|
url_or_none,
|
||
|
unified_timestamp,
|
||
|
str_or_none,
|
||
|
)
|
||
|
|
||
|
|
||
|
class PRXBaseIE(InfoExtractor):
    """Shared API and metadata helpers for the PRX extractors in this module."""

    # Template for subclasses' _VALID_URL; '%s' is replaced with the path pattern.
    # The dot in the host name is escaped so that it only matches a literal '.'.
    PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'

    def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
        """Download JSON from `path`, resolved against https://cms.prx.org/api/v1/."""
        return self._download_json(
            urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)

    @staticmethod
    def _get_prx_embed_response(response, section):
        """Return response['_embedded']['prx:<section>'], or None if it is absent."""
        return traverse_obj(response, ('_embedded', f'prx:{section}'))

    @staticmethod
    def _extract_file_link(response):
        """Return response['_links']['enclosure']['href'] if it is a valid URL, else None."""
        return url_or_none(traverse_obj(
            response, ('_links', 'enclosure', 'href'), expected_type=str))

    @classmethod
    def _extract_image(cls, image_response):
        """
        Build a thumbnail dict from an image response

        Returns None if the response is not a dict or carries no usable URL,
        since a thumbnail entry without a URL cannot be downloaded.
        """
        if not isinstance(image_response, dict):
            return
        image_url = cls._extract_file_link(image_response)
        if not image_url:
            return
        return {
            'id': str_or_none(image_response.get('id')),
            'filesize': int_or_none(image_response.get('size')),
            'width': int_or_none(image_response.get('width')),
            'height': int_or_none(image_response.get('height')),
            'url': image_url,
        }

    @classmethod
    def _extract_base_info(cls, response):
        """
        Extract the metadata fields common to story, series and account responses

        Returns None if the response is not a dict or has no id.
        """
        if not isinstance(response, dict):
            return
        item_id = str_or_none(response.get('id'))
        if not item_id:
            return
        thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
        description = (
            clean_html(response.get('description'))
            or response.get('shortDescription'))
        return {
            'id': item_id,
            # Fall back to the id so that a title is always present
            'title': response.get('title') or item_id,
            'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
            'description': description,
            'release_timestamp': unified_timestamp(response.get('releasedAt')),
            'timestamp': unified_timestamp(response.get('createdAt')),
            'modified_timestamp': unified_timestamp(response.get('updatedAt')),
            'duration': int_or_none(response.get('duration')),
            'tags': response.get('tags'),
            'episode_number': int_or_none(response.get('episodeIdentifier')),
            'season_number': int_or_none(response.get('seasonIdentifier')),
        }

    @classmethod
    def _extract_series_info(cls, series_response):
        """Extract base info for a series plus series and channel fields; None if base info is unavailable."""
        base_info = cls._extract_base_info(series_response)
        if not base_info:
            return
        account_info = cls._extract_account_info(
            cls._get_prx_embed_response(series_response, 'account')) or {}
        return {
            **base_info,
            'channel_id': account_info.get('channel_id'),
            'channel_url': account_info.get('channel_url'),
            'channel': account_info.get('channel'),
            'series': base_info.get('title'),
            'series_id': base_info.get('id'),
        }

    @classmethod
    def _extract_account_info(cls, account_response):
        """Extract base info for an account plus channel fields; None if base info is unavailable."""
        base_info = cls._extract_base_info(account_response)
        if not base_info:
            return
        name = account_response.get('name')
        return {
            **base_info,
            # Keep the base title fallback instead of overwriting it with None
            # when the account has no name
            'title': name or base_info.get('title'),
            'channel_id': base_info.get('id'),
            'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
            'channel': name,
        }

    @classmethod
    def _extract_story_info(cls, story_response):
        """Extract base info for a story plus the series and channel fields of its embedded series/account."""
        base_info = cls._extract_base_info(story_response)
        if not base_info:
            return
        series = cls._extract_series_info(
            cls._get_prx_embed_response(story_response, 'series')) or {}
        account = cls._extract_account_info(
            cls._get_prx_embed_response(story_response, 'account')) or {}
        return {
            **base_info,
            'series': series.get('series'),
            'series_id': series.get('series_id'),
            'channel_id': account.get('channel_id'),
            'channel_url': account.get('channel_url'),
            'channel': account.get('channel'),
        }

    def _entries(self, item_id, endpoint, entry_func, query=None):
        """
        Extract entries from paginated list API

        Pages are requested (100 items each) until a page has no items or the
        number of items seen reaches the total reported by the response.

        @param item_id: Identifier used in the download messages
        @param endpoint: API path of the list endpoint
        @param entry_func: Function to generate entry from response item;
                           falsy results are skipped
        @param query: Extra query parameters sent with every page request
        """
        total = 0
        for page in itertools.count(1):
            response = self._call_api(f'{item_id}: page {page}', endpoint, query={
                **(query or {}),
                'page': page,
                'per': 100,
            })
            items = self._get_prx_embed_response(response, 'items')
            if not response or not items:
                break

            yield from filter(None, map(entry_func, items))

            # Do not crash if the page omits its counters: count the items
            # actually received, and without a total keep paging until an
            # empty page is returned
            total += int_or_none(response.get('count')) or len(items)
            expected_total = int_or_none(response.get('total'))
            if expected_total is not None and total >= expected_total:
                break

    def _story_playlist_entry(self, response):
        """Turn a story response into a url-type entry handled by PRXStoryIE; None if it has no info."""
        story = self._extract_story_info(response)
        if not story:
            return
        story.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/stories/%s' % story['id'],
            'ie_key': PRXStoryIE.ie_key(),
        })
        return story

    def _series_playlist_entry(self, response):
        """Turn a series response into a url-type entry handled by PRXSeriesIE; None if it has no info."""
        series = self._extract_series_info(response)
        if not series:
            return
        series.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/series/%s' % series['id'],
            'ie_key': PRXSeriesIE.ie_key(),
        })
        return series
|
||
|
|
||
|
|
||
|
class PRXStoryIE(PRXBaseIE):
    """Extractor for single PRX stories (`/stories/<id>` URLs)."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'

    _TESTS = [
        {
            # Story with season and episode details
            'url': 'https://beta.prx.org/stories/399200',
            'info_dict': {
                'id': '399200',
                'title': 'Fly Me To The Moon',
                'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                'release_timestamp': 1640250000,
                'timestamp': 1640208972,
                'modified_timestamp': 1641318202,
                'duration': 1004,
                'tags': 'count:7',
                'episode_number': 8,
                'season_number': 5,
                'series': 'AirSpace',
                'series_id': '38057',
                'channel_id': '220986',
                'channel_url': 'https://beta.prx.org/accounts/220986',
                'channel': 'Air and Space Museum',
            },
            'playlist': [{
                'info_dict': {
                    'id': '399200_part1',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 530,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }, {
                'info_dict': {
                    'id': '399200_part2',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 474,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }

            ]
        }, {
            # Story with only split audio
            'url': 'https://beta.prx.org/stories/326414',
            'info_dict': {
                'id': '326414',
                'title': 'Massachusetts v EPA',
                'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
                'timestamp': 1592509124,
                'modified_timestamp': 1592510457,
                'duration': 3088,
                'tags': 'count:0',
                'series': 'Outside/In',
                'series_id': '36252',
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
            },
            'playlist_count': 4
        }, {
            # Story with single combined audio
            'url': 'https://beta.prx.org/stories/400404',
            'info_dict': {
                'id': '400404',
                'title': 'Cafe Chill (Episode 2022-01)',
                'thumbnails': 'count:1',
                'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
                'timestamp': 1641233952,
                'modified_timestamp': 1641234248,
                'duration': 3540,
                'series': 'Café Chill',
                'series_id': '37762',
                'channel_id': '5767',
                'channel_url': 'https://beta.prx.org/accounts/5767',
                'channel': 'C89.5 - KNHC Seattle',
                'ext': 'mp3',
                'tags': 'count:0',
                'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
                'upload_date': '20220103',
                'modified_date': '20220103'
            }
        }, {
            'url': 'https://listen.prx.org/stories/399200',
            'only_matching': True
        }
    ]

    def _extract_audio_pieces(self, audio_response):
        """
        Build one format dict per audio piece embedded in `audio_response`

        Pieces are ordered by their 'position'; an empty list is returned if
        the response has no embedded items.
        """
        def position_key(piece):
            # A missing/invalid position would yield None, which cannot be
            # compared with the ints of other pieces (TypeError). Sort such
            # pieces after all positioned ones; the sort is stable, so they
            # keep their original relative order.
            position = int_or_none(piece.get('position'))
            return position is None, position or 0

        return [{
            'format_id': str_or_none(piece_response.get('id')),
            'format_note': str_or_none(piece_response.get('label')),
            'filesize': int_or_none(piece_response.get('size')),
            'duration': int_or_none(piece_response.get('duration')),
            'ext': mimetype2ext(piece_response.get('contentType')),
            'asr': int_or_none(piece_response.get('frequency'), scale=1000),
            'abr': int_or_none(piece_response.get('bitRate')),
            'url': self._extract_file_link(piece_response),
            'vcodec': 'none'
        } for piece_response in sorted(
            self._get_prx_embed_response(audio_response, 'items') or [],
            key=position_key)]

    def _extract_story(self, story_response):
        """
        Build the info dict for a story response

        A story with exactly one audio piece is returned as a single item
        with that piece as its only format. Otherwise a multi_video result
        is returned with one '<id>_part<N>' entry per piece.
        Returns None if no story info could be extracted.
        """
        info = self._extract_story_info(story_response)
        if not info:
            return
        audio_pieces = self._extract_audio_pieces(
            self._get_prx_embed_response(story_response, 'audio'))
        if len(audio_pieces) == 1:
            return {
                'formats': audio_pieces,
                **info
            }

        entries = [{
            **info,
            'id': '%s_part%d' % (info['id'], (idx + 1)),
            'formats': [fmt],
        } for idx, fmt in enumerate(audio_pieces)]
        return {
            '_type': 'multi_video',
            'entries': entries,
            **info
        }

    def _real_extract(self, url):
        story_id = self._match_id(url)
        response = self._call_api(story_id, f'stories/{story_id}')
        return self._extract_story(response)
|
||
|
|
||
|
|
||
|
class PRXSeriesIE(PRXBaseIE):
    """Extractor for PRX series (`/series/<id>` URLs), returned as a playlist of stories."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://beta.prx.org/series/36252',
            'info_dict': {
                'id': '36252',
                'title': 'Outside/In',
                'thumbnails': 'count:1',
                'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
                'timestamp': 1470684964,
                'modified_timestamp': 1582308830,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': 'Outside/In',
                'series_id': '36252'
            },
            'playlist_mincount': 39
        }, {
            # Blank series
            'url': 'https://beta.prx.org/series/25038',
            'info_dict': {
                'id': '25038',
                'title': '25038',
                'timestamp': 1207612800,
                'modified_timestamp': 1207612800,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': '25038',
                'series_id': '25038'
            },
            'playlist_count': 0
        }
    ]

    def _extract_series(self, series_response):
        """
        Build a playlist result for a series response

        Entries are generated lazily from the 'series/<id>/stories' endpoint.
        Returns None if no series info could be extracted, mirroring
        PRXStoryIE._extract_story.
        """
        info = self._extract_series_info(series_response)
        if not info:
            # _extract_series_info returns None when the response has no
            # usable id; subscripting it below would raise TypeError
            return
        return {
            '_type': 'playlist',
            'entries': self._entries(info['id'], 'series/%s/stories' % info['id'], self._story_playlist_entry),
            **info
        }

    def _real_extract(self, url):
        series_id = self._match_id(url)
        response = self._call_api(series_id, f'series/{series_id}')
        return self._extract_series(response)
|
||
|
|
||
|
|
||
|
class PRXAccountIE(PRXBaseIE):
    """Extractor for PRX accounts (`/accounts/<id>` URLs), returned as a playlist of series and stories."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://beta.prx.org/accounts/206',
        'info_dict': {
            'id': '206',
            'title': 'New Hampshire Public Radio',
            'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'thumbnails': 'count:1'
        },
        'playlist_mincount': 380
    }]

    def _extract_account(self, account_response):
        """
        Build a playlist result for an account response

        The entries are the account's series followed by its stories, both
        generated lazily from the paginated API.
        Returns None if no account info could be extracted, mirroring
        PRXStoryIE._extract_story.
        """
        info = self._extract_account_info(account_response)
        if not info:
            # _extract_account_info returns None when the response has no
            # usable id; subscripting it below would raise TypeError
            return
        series = self._entries(
            info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
        stories = self._entries(
            info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
        return {
            '_type': 'playlist',
            'entries': itertools.chain(series, stories),
            **info
        }

    def _real_extract(self, url):
        account_id = self._match_id(url)
        response = self._call_api(account_id, f'accounts/{account_id}')
        return self._extract_account(response)
|
||
|
|
||
|
|
||
|
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor for PRX stories, registered under the 'prxstories' search key."""

    IE_DESC = 'PRX Stories Search'
    IE_NAME = 'prxstories:search'
    _SEARCH_KEY = 'prxstories'

    def _search_results(self, query):
        """Yield a story url entry for every item of the paginated 'stories/search' endpoint."""
        display_id = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            display_id, 'stories/search', self._story_playlist_entry, query=search_params)
|
||
|
|
||
|
|
||
|
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor for PRX series, registered under the 'prxseries' search key."""

    IE_DESC = 'PRX Series Search'
    IE_NAME = 'prxseries:search'
    _SEARCH_KEY = 'prxseries'

    def _search_results(self, query):
        """Yield a series url entry for every item of the paginated 'series/search' endpoint."""
        display_id = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            display_id, 'series/search', self._series_playlist_entry, query=search_params)
|