[callin] Add extractor (#2000)

Authored by: foghawk
This commit is contained in:
foghawk 2022-01-07 04:19:15 -06:00 committed by GitHub
parent 6f32a0b5b7
commit 97a6b117d9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 115 additions and 0 deletions

114
yt_dlp/extractor/callin.py Normal file
View file

@ -0,0 +1,114 @@
# coding: utf-8
from .common import InfoExtractor
from ..utils import (
traverse_obj,
float_or_none,
int_or_none
)
class CallinIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)'
_TESTS = [{
'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
'info_dict': {
'id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd',
'title': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
'ext': 'ts',
'display_id': 'the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
'thumbnail': 're:https://.+\\.png',
'description': 'First episode',
'uploader': 'Wesley Yang',
'timestamp': 1639404128.65,
'upload_date': '20211213',
'uploader_id': 'wesyang',
'uploader_url': 'http://wesleyyang.substack.com',
'channel': 'Conversations in Year Zero',
'channel_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
'channel_url': 'https://callin.com/show/conversations-in-year-zero-oJNllRFSfx',
'duration': 9951.936,
'view_count': int,
'categories': ['News & Politics', 'History', 'Technology'],
'cast': ['Wesley Yang', 'KC Johnson', 'Gabi Abramovich'],
'series': 'Conversations in Year Zero',
'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
'episode_number': 1,
'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
}
}]
def try_get_user_name(self, d):
names = [d.get(n) for n in ('first', 'last')]
if None in names:
return next((n for n in names if n), default=None)
return ' '.join(names)
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
next_data = self._search_nextjs_data(webpage, display_id)
episode = next_data['props']['pageProps']['episode']
id = episode['id']
title = (episode.get('title')
or self._og_search_title(webpage, fatal=False)
or self._html_search_regex('<title>(.*?)</title>', webpage, 'title'))
url = episode['m3u8']
formats = self._extract_m3u8_formats(url, display_id, ext='ts')
self._sort_formats(formats)
show = traverse_obj(episode, ('show', 'title'))
show_id = traverse_obj(episode, ('show', 'id'))
show_json = None
app_slug = (self._html_search_regex(
'<script\\s+src=["\']/_next/static/([-_a-zA-Z0-9]+)/_',
webpage, 'app slug', fatal=False) or next_data.get('buildId'))
show_slug = traverse_obj(episode, ('show', 'linkObj', 'resourceUrl'))
if app_slug and show_slug and '/' in show_slug:
show_slug = show_slug.rsplit('/', 1)[1]
show_json_url = f'https://www.callin.com/_next/data/{app_slug}/show/{show_slug}.json'
show_json = self._download_json(show_json_url, display_id, fatal=False)
host = (traverse_obj(show_json, ('pageProps', 'show', 'hosts', 0))
or traverse_obj(episode, ('speakers', 0)))
host_nick = traverse_obj(host, ('linkObj', 'resourceUrl'))
host_nick = host_nick.rsplit('/', 1)[1] if (host_nick and '/' in host_nick) else None
cast = list(filter(None, [
self.try_get_user_name(u) for u in
traverse_obj(episode, (('speakers', 'callerTags'), ...)) or []
]))
episode_list = traverse_obj(show_json, ('pageProps', 'show', 'episodes')) or []
episode_number = next(
(len(episode_list) - i for (i, e) in enumerate(episode_list) if e.get('id') == id),
None)
return {
'id': id,
'display_id': display_id,
'title': title,
'formats': formats,
'thumbnail': traverse_obj(episode, ('show', 'photo')),
'description': episode.get('description'),
'uploader': self.try_get_user_name(host) if host else None,
'timestamp': episode.get('publishedAt'),
'uploader_id': host_nick,
'uploader_url': traverse_obj(show_json, ('pageProps', 'show', 'url')),
'channel': show,
'channel_id': show_id,
'channel_url': traverse_obj(episode, ('show', 'linkObj', 'resourceUrl')),
'duration': float_or_none(episode.get('runtime')),
'view_count': int_or_none(episode.get('plays')),
'categories': traverse_obj(episode, ('show', 'categorizations', ..., 'name')),
'cast': cast if cast else None,
'series': show,
'series_id': show_id,
'episode': title,
'episode_number': episode_number,
'episode_id': id
}

View file

@ -190,6 +190,7 @@ from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE from .byutv import BYUtvIE
from .c56 import C56IE from .c56 import C56IE
from .cableav import CableAVIE from .cableav import CableAVIE
from .callin import CallinIE
from .cam4 import CAM4IE from .cam4 import CAM4IE
from .camdemy import ( from .camdemy import (
CamdemyIE, CamdemyIE,