[niconico] Rewrite NiconicoIE (#3018)

Closes https://github.com/yt-dlp/yt-dlp/issues/2636, partially fixes https://github.com/yt-dlp/yt-dlp/issues/367
Authored by: Lesmiscore
This commit is contained in:
Lesmiscore (Naoya Ozaki) 2022-03-17 21:22:14 +09:00 committed by GitHub
parent 497a6c5f57
commit 7bdcb4a40e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 236 additions and 306 deletions

View file

@ -1683,6 +1683,9 @@ The following extractors use this feature:
#### vikichannel #### vikichannel
* `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers` * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers`
#### niconico
* `segment_duration`: Segment duration in milliseconds for HLS-DMC formats. Use it at your own risk since this feature **may result in your account termination.**
#### youtubewebarchive #### youtubewebarchive
* `check_all`: Try to check more at the cost of more requests. One or more of `thumbnails`, `captures` * `check_all`: Try to check more at the cost of more requests. One or more of `thumbnails`, `captures`

View file

@ -2,36 +2,36 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import datetime import datetime
import itertools
import functools import functools
import itertools
import json import json
import re import re
import time
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from ..postprocessor.ffmpeg import FFmpegPostProcessor
from ..compat import ( from ..compat import (
compat_str,
compat_parse_qs, compat_parse_qs,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
compat_HTTPError, compat_HTTPError,
) )
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
dict_get, OnDemandPagedList,
bug_reports_message,
clean_html,
float_or_none, float_or_none,
int_or_none, int_or_none,
OnDemandPagedList, join_nonempty,
parse_duration, parse_duration,
parse_filesize,
parse_iso8601, parse_iso8601,
PostProcessingError,
remove_start, remove_start,
str_or_none,
traverse_obj, traverse_obj,
try_get, try_get,
unescapeHTML, unescapeHTML,
unified_timestamp, update_url_query,
url_or_none,
urlencode_postdata, urlencode_postdata,
xpath_text,
) )
@ -41,7 +41,7 @@ class NiconicoIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'http://www.nicovideo.jp/watch/sm22312215', 'url': 'http://www.nicovideo.jp/watch/sm22312215',
'md5': 'a5bad06f1347452102953f323c69da34s', 'md5': 'd1a75c0823e2f629128c43e1212760f9',
'info_dict': { 'info_dict': {
'id': 'sm22312215', 'id': 'sm22312215',
'ext': 'mp4', 'ext': 'mp4',
@ -164,14 +164,23 @@ class NiconicoIE(InfoExtractor):
}, { }, {
'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
'only_matching': True, 'only_matching': True,
}, {
'note': 'a video that is only served as an ENCRYPTED HLS.',
'url': 'https://www.nicovideo.jp/watch/so38016254',
'only_matching': True,
}] }]
_VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' _VALID_URL = r'https?://(?:(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch|nico\.ms)/(?P<id>(?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico' _NETRC_MACHINE = 'niconico'
_COMMENT_API_ENDPOINTS = (
'https://nvcomment.nicovideo.jp/legacy/api.json',
'https://nmsg.nicovideo.jp/api.json',)
_API_HEADERS = { _API_HEADERS = {
'X-Frontend-ID': '6', 'X-Frontend-ID': '6',
'X-Frontend-Version': '0' 'X-Frontend-Version': '0',
'X-Niconico-Language': 'en-us',
'Referer': 'https://www.nicovideo.jp/',
'Origin': 'https://www.nicovideo.jp',
} }
def _real_initialize(self): def _real_initialize(self):
@ -189,10 +198,17 @@ class NiconicoIE(InfoExtractor):
'mail_tel': username, 'mail_tel': username,
'password': password, 'password': password,
} }
self._request_webpage(
'https://account.nicovideo.jp/login', None,
note='Acquiring Login session')
urlh = self._request_webpage( urlh = self._request_webpage(
'https://account.nicovideo.jp/api/v1/login', None, 'https://account.nicovideo.jp/login/redirector?show_button_twitter=1&site=niconico&show_button_facebook=1', None,
note='Logging in', errnote='Unable to log in', note='Logging in', errnote='Unable to log in',
data=urlencode_postdata(login_form_strs)) data=urlencode_postdata(login_form_strs),
headers={
'Referer': 'https://account.nicovideo.jp/login',
'Content-Type': 'application/x-www-form-urlencoded',
})
if urlh is False: if urlh is False:
login_ok = False login_ok = False
else: else:
@ -204,8 +220,8 @@ class NiconicoIE(InfoExtractor):
return login_ok return login_ok
def _get_heartbeat_info(self, info_dict): def _get_heartbeat_info(self, info_dict):
video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/') video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/')
dmc_protocol = info_dict['_expected_protocol']
api_data = ( api_data = (
info_dict.get('_api_data') info_dict.get('_api_data')
@ -220,49 +236,50 @@ class NiconicoIE(InfoExtractor):
session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0]) session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0])
def ping(): def ping():
status = try_get( tracking_id = traverse_obj(api_data, ('media', 'delivery', 'trackingId'))
self._download_json( if tracking_id:
'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id, tracking_url = update_url_query('https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', {'t': tracking_id})
query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])}, watch_request_response = self._download_json(
note='Acquiring permission for downloading video', tracking_url, video_id,
headers=self._API_HEADERS), note='Acquiring permission for downloading video', fatal=False,
lambda x: x['meta']['status']) headers=self._API_HEADERS)
if status != 200: if traverse_obj(watch_request_response, ('meta', 'status')) != 200:
self.report_warning('Failed to acquire permission for playing video. The video may not download.') self.report_warning('Failed to acquire permission for playing video. Video download may fail.')
yesno = lambda x: 'yes' if x else 'no' yesno = lambda x: 'yes' if x else 'no'
# m3u8 (encryption) if dmc_protocol == 'http':
if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None:
protocol = 'm3u8'
encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption']
session_api_http_parameters = {
'parameters': {
'hls_parameters': {
'encryption': {
encryption: {
'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']),
'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri'])
}
},
'transfer_preset': '',
'use_ssl': yesno(session_api_endpoint['isSsl']),
'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
'segment_duration': 6000,
}
}
}
# http
else:
protocol = 'http' protocol = 'http'
session_api_http_parameters = { protocol_parameters = {
'parameters': { 'http_output_download_parameters': {
'http_output_download_parameters': { 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']),
'use_ssl': yesno(session_api_endpoint['isSsl']), 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']),
'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
}
} }
} }
elif dmc_protocol == 'hls':
protocol = 'm3u8'
segment_duration = try_get(self._configuration_arg('segment_duration'), lambda x: int(x[0])) or 6000
parsed_token = self._parse_json(session_api_data['token'], video_id)
encryption = traverse_obj(api_data, ('media', 'delivery', 'encryption'))
protocol_parameters = {
'hls_parameters': {
'segment_duration': segment_duration,
'transfer_preset': '',
'use_ssl': yesno(session_api_data['urls'][0]['isSsl']),
'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']),
}
}
if 'hls_encryption' in parsed_token and encryption:
protocol_parameters['hls_parameters']['encryption'] = {
parsed_token['hls_encryption']: {
'encrypted_key': encryption['encryptedKey'],
'key_uri': encryption['keyUri'],
}
}
else:
protocol = 'm3u8_native'
else:
raise ExtractorError(f'Unsupported DMC protocol: {dmc_protocol}')
session_response = self._download_json( session_response = self._download_json(
session_api_endpoint['url'], video_id, session_api_endpoint['url'], video_id,
@ -296,11 +313,13 @@ class NiconicoIE(InfoExtractor):
'lifetime': session_api_data.get('heartbeatLifetime') 'lifetime': session_api_data.get('heartbeatLifetime')
} }
}, },
'priority': session_api_data.get('priority'), 'priority': session_api_data['priority'],
'protocol': { 'protocol': {
'name': 'http', 'name': 'http',
'parameters': { 'parameters': {
'http_parameters': session_api_http_parameters 'http_parameters': {
'parameters': protocol_parameters
}
} }
}, },
'recipe_id': session_api_data.get('recipeId'), 'recipe_id': session_api_data.get('recipeId'),
@ -328,36 +347,35 @@ class NiconicoIE(InfoExtractor):
return info_dict, heartbeat_info_dict return info_dict, heartbeat_info_dict
def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): def _extract_format_for_quality(self, video_id, audio_quality, video_quality, dmc_protocol):
def parse_format_id(id_code):
mobj = re.match(r'''(?x)
(?:archive_)?
(?:(?P<codec>[^_]+)_)?
(?:(?P<br>[\d]+)kbps_)?
(?:(?P<res>[\d+]+)p_)?
''', '%s_' % id_code)
return mobj.groupdict() if mobj else {}
protocol = 'niconico_dmc' if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'):
format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) return None
vdict = parse_format_id(video_quality['id'])
adict = parse_format_id(audio_quality['id']) def extract_video_quality(video_quality):
resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')} return parse_filesize('%sB' % self._search_regex(
vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float) r'\| ([0-9]*\.?[0-9]*[MK])', video_quality, 'vbr', default=''))
format_id = '-'.join(
[remove_start(s['id'], 'archive_') for s in (video_quality, audio_quality)] + [dmc_protocol])
vid_qual_label = traverse_obj(video_quality, ('metadata', 'label'))
vid_quality = traverse_obj(video_quality, ('metadata', 'bitrate'))
return { return {
'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']), 'url': 'niconico_dmc:%s/%s/%s' % (video_id, video_quality['id'], audio_quality['id']),
'format_id': format_id, 'format_id': format_id,
'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str), 'format_note': join_nonempty('DMC', vid_qual_label, dmc_protocol.upper(), delim=' '),
'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4
'vcodec': vdict.get('codec'), 'acodec': 'aac',
'acodec': adict.get('codec'), 'vcodec': 'h264',
'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')), 'abr': float_or_none(traverse_obj(audio_quality, ('metadata', 'bitrate')), 1000),
'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')), 'vbr': float_or_none(vid_quality if vid_quality > 0 else extract_video_quality(vid_qual_label), 1000),
'height': int_or_none(resolution.get('height', vdict.get('res'))), 'height': traverse_obj(video_quality, ('metadata', 'resolution', 'height')),
'width': int_or_none(resolution.get('width')), 'width': traverse_obj(video_quality, ('metadata', 'resolution', 'width')),
'quality': -2 if 'low' in format_id else -1, # Default quality value is -1 'quality': -2 if 'low' in video_quality['id'] else None,
'protocol': protocol, 'protocol': 'niconico_dmc',
'_expected_protocol': dmc_protocol,
'http_headers': { 'http_headers': {
'Origin': 'https://www.nicovideo.jp', 'Origin': 'https://www.nicovideo.jp',
'Referer': 'https://www.nicovideo.jp/watch/' + video_id, 'Referer': 'https://www.nicovideo.jp/watch/' + video_id,
@ -367,248 +385,157 @@ class NiconicoIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
# Get video webpage for API data. try:
webpage, handle = self._download_webpage_handle( webpage, handle = self._download_webpage_handle(
'http://www.nicovideo.jp/watch/' + video_id, video_id) 'http://www.nicovideo.jp/watch/' + video_id, video_id)
if video_id.startswith('so'): if video_id.startswith('so'):
video_id = self._match_id(handle.geturl()) video_id = self._match_id(handle.geturl())
api_data = self._parse_json(self._html_search_regex( api_data = self._parse_json(self._html_search_regex(
'data-api-data="([^"]+)"', webpage, 'data-api-data="([^"]+)"', webpage,
'API data', default='{}'), video_id) 'API data', default='{}'), video_id)
except ExtractorError as e:
try:
api_data = self._download_json(
'https://www.nicovideo.jp/api/watch/v3/%s?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_%d' % (video_id, round(time.time() * 1000)), video_id,
note='Downloading API JSON', errnote='Unable to fetch data')['data']
except ExtractorError:
if not isinstance(e.cause, compat_HTTPError):
raise
webpage = e.cause.read().decode('utf-8', 'replace')
error_msg = self._html_search_regex(
r'(?s)<section\s+class="(?:(?:ErrorMessage|WatchExceptionPage-message)\s*)+">(.+?)</section>',
webpage, 'error reason', default=None)
if not error_msg:
raise
raise ExtractorError(re.sub(r'\s+', ' ', error_msg), expected=True)
def get_video_info_web(items):
return dict_get(api_data['video'], items)
# Get video info
video_info_xml = self._download_xml(
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
video_id, note='Downloading video info page')
def get_video_info_xml(items):
if not isinstance(items, list):
items = [items]
for item in items:
ret = xpath_text(video_info_xml, './/' + item)
if ret:
return ret
if get_video_info_xml('error'):
error_code = get_video_info_xml('code')
if error_code == 'DELETED':
raise ExtractorError('The video has been deleted.',
expected=True)
elif error_code == 'NOT_FOUND':
raise ExtractorError('The video is not found.',
expected=True)
elif error_code == 'COMMUNITY':
self.to_screen('%s: The video is community members only.' % video_id)
else:
raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code))
# Start extracting video formats
formats = [] formats = []
# Get HTML5 videos info def get_video_info(*items, get_first=True, **kwargs):
quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie']) return traverse_obj(api_data, ('video', *items), get_all=not get_first, **kwargs)
if not quality_info:
raise ExtractorError('The video can\'t be downloaded', expected=True)
for audio_quality in quality_info.get('audios') or {}: quality_info = api_data['media']['delivery']['movie']
for video_quality in quality_info.get('videos') or {}: session_api_data = quality_info['session']
if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): for (audio_quality, video_quality, protocol) in itertools.product(quality_info['audios'], quality_info['videos'], session_api_data['protocols']):
continue fmt = self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol)
formats.append(self._extract_format_for_quality( if fmt:
api_data, video_id, audio_quality, video_quality)) formats.append(fmt)
# Get flv/swf info
timestamp = None
video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url'])
if video_real_url:
is_economy = video_real_url.endswith('low')
if is_economy:
self.report_warning('Site is currently in economy mode! You will only have access to lower quality streams')
# Invoking ffprobe to determine resolution
pp = FFmpegPostProcessor(self._downloader)
cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n')
self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe'))
try:
metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies])
except PostProcessingError as err:
raise ExtractorError(err.msg, expected=True)
v_stream = a_stream = {}
# Some complex swf files doesn't have video stream (e.g. nm4809023)
for stream in metadata['streams']:
if stream['codec_type'] == 'video':
v_stream = stream
elif stream['codec_type'] == 'audio':
a_stream = stream
# Community restricted videos seem to have issues with the thumb API not returning anything at all
filesize = int(
(get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low'))
or metadata['format']['size']
)
extension = (
get_video_info_xml('movie_type')
or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name']
)
# 'creation_time' tag on video stream of re-encoded SMILEVIDEO mp4 files are '1970-01-01T00:00:00.000000Z'.
timestamp = (
parse_iso8601(get_video_info_web('first_retrieve'))
or unified_timestamp(get_video_info_web('postedDateTime'))
)
metadata_timestamp = (
parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time']))
or timestamp if extension != 'mp4' else 0
)
# According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts
smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00')
is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0
# If movie file size is unstable, old server movie is not source movie.
if filesize > 1:
formats.append({
'url': video_real_url,
'format_id': 'smile' if not is_economy else 'smile_low',
'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality',
'ext': extension,
'container': extension,
'vcodec': v_stream.get('codec_name'),
'acodec': a_stream.get('codec_name'),
# Some complex swf files doesn't have total bit rate metadata (e.g. nm6049209)
'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000),
'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000),
'abr': int_or_none(a_stream.get('bit_rate'), scale=1000),
'height': int_or_none(v_stream.get('height')),
'width': int_or_none(v_stream.get('width')),
'source_preference': 5 if not is_economy else -2,
'quality': 5 if is_source and not is_economy else None,
'filesize': filesize
})
self._sort_formats(formats) self._sort_formats(formats)
# Start extracting information # Start extracting information
title = ( tags = None
get_video_info_xml('title') # prefer to get the untranslated original title if webpage:
or get_video_info_web(['originalTitle', 'title']) # use og:video:tag (not logged in)
or self._og_search_title(webpage, default=None) og_video_tags = re.finditer(r'<meta\s+property="og:video:tag"\s*content="(.*?)">', webpage)
or self._html_search_regex( tags = list(filter(None, (clean_html(x.group(1)) for x in og_video_tags)))
r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>', if not tags:
webpage, 'video title')) # use keywords and split with comma (not logged in)
kwds = self._html_search_meta('keywords', webpage, default=None)
watch_api_data_string = self._html_search_regex( if kwds:
r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>', tags = [x for x in kwds.split(',') if x]
webpage, 'watch api data', default=None) if not tags:
watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {} # find in json (logged in)
video_detail = watch_api_data.get('videoDetail', {}) tags = traverse_obj(api_data, ('tag', 'items', ..., 'name'))
thumbnail = (
self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None)
or dict_get( # choose highest from 720p to 240p
get_video_info_web('thumbnail'),
['ogp', 'player', 'largeUrl', 'middleUrl', 'url'])
or self._html_search_meta('image', webpage, 'thumbnail', default=None)
or video_detail.get('thumbnail'))
description = get_video_info_web('description')
if not timestamp:
match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
if match:
timestamp = parse_iso8601(match.replace('+', ':00+'))
if not timestamp and video_detail.get('postedAt'):
timestamp = parse_iso8601(
video_detail['postedAt'].replace('/', '-'),
delimiter=' ', timezone=datetime.timedelta(hours=9))
timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt']))
view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount']))
if not view_count:
match = self._html_search_regex(
r'>Views: <strong[^>]*>([^<]+)</strong>',
webpage, 'view count', default=None)
if match:
view_count = int_or_none(match.replace(',', ''))
view_count = (
view_count
or video_detail.get('viewCount')
or try_get(api_data, lambda x: x['video']['count']['view']))
comment_count = (
int_or_none(get_video_info_web('comment_num'))
or video_detail.get('commentCount')
or try_get(api_data, lambda x: x['video']['count']['comment']))
if not comment_count:
match = self._html_search_regex(
r'>Comments: <strong[^>]*>([^<]+)</strong>',
webpage, 'comment count', default=None)
if match:
comment_count = int_or_none(match.replace(',', ''))
duration = (parse_duration(
get_video_info_web('length')
or self._html_search_meta(
'video:duration', webpage, 'video duration', default=None))
or video_detail.get('length')
or get_video_info_web('duration'))
webpage_url = get_video_info_web('watch_url') or url
# for channel movie and community movie
channel_id = try_get(
api_data,
(lambda x: x['channel']['globalId'],
lambda x: x['community']['globalId']))
channel = try_get(
api_data,
(lambda x: x['channel']['name'],
lambda x: x['community']['name']))
# Note: cannot use api_data.get('owner', {}) because owner may be set to "null"
# in the JSON, which will cause None to be returned instead of {}.
owner = try_get(api_data, lambda x: x.get('owner'), dict) or {}
uploader_id = str_or_none(
get_video_info_web(['ch_id', 'user_id'])
or owner.get('id')
or channel_id
)
uploader = (
get_video_info_web(['ch_name', 'user_nickname'])
or owner.get('nickname')
or channel
)
return { return {
'id': video_id, 'id': video_id,
'_api_data': api_data, '_api_data': api_data,
'title': title, 'title': get_video_info(('originalTitle', 'title')) or self._og_search_title(webpage, default=None),
'formats': formats, 'formats': formats,
'thumbnail': thumbnail, 'thumbnail': get_video_info('thumbnail', 'url') or self._html_search_meta(
'description': description, ('image', 'og:image'), webpage, 'thumbnail', default=None),
'uploader': uploader, 'description': clean_html(get_video_info('description')),
'timestamp': timestamp, 'uploader': traverse_obj(api_data, ('owner', 'nickname')),
'uploader_id': uploader_id, 'timestamp': parse_iso8601(get_video_info('registeredAt')) or parse_iso8601(
'channel': channel, self._html_search_meta('video:release_date', webpage, 'date published', default=None)),
'channel_id': channel_id, 'uploader_id': traverse_obj(api_data, ('owner', 'id')),
'view_count': view_count, 'channel': traverse_obj(api_data, ('channel', 'name'), ('community', 'name')),
'comment_count': comment_count, 'channel_id': traverse_obj(api_data, ('channel', 'id'), ('community', 'id')),
'duration': duration, 'view_count': int_or_none(get_video_info('count', 'view')),
'webpage_url': webpage_url, 'tags': tags,
'genre': traverse_obj(api_data, ('genre', 'label'), ('genre', 'key')),
'comment_count': get_video_info('count', 'comment', expected_type=int),
'duration': (
parse_duration(self._html_search_meta('video:duration', webpage, 'video duration', default=None))
or get_video_info('duration')),
'webpage_url': url_or_none(url) or f'https://www.nicovideo.jp/watch/{video_id}',
'subtitles': self.extract_subtitles(video_id, api_data, session_api_data),
} }
def _get_subtitles(self, video_id, api_data, session_api_data):
comment_user_key = traverse_obj(api_data, ('comment', 'keys', 'userKey'))
user_id_str = session_api_data.get('serviceUserId')
thread_ids = [x for x in traverse_obj(api_data, ('comment', 'threads')) or [] if x['isActive']]
raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key)
if not raw_danmaku:
self.report_warning(f'Failed to get comments. {bug_reports_message()}')
return
return {
'comments': [{
'ext': 'json',
'data': json.dumps(raw_danmaku),
}],
}
def _extract_all_comments(self, video_id, threads, user_id, user_key):
auth_data = {
'user_id': user_id,
'userkey': user_key,
} if user_id and user_key else {'user_id': ''}
# Request Start
post_data = [{'ping': {'content': 'rs:0'}}]
for i, thread in enumerate(threads):
thread_id = thread['id']
thread_fork = thread['fork']
# Post Start (2N)
post_data.append({'ping': {'content': f'ps:{i * 2}'}})
post_data.append({'thread': {
'fork': thread_fork,
'language': 0,
'nicoru': 3,
'scores': 1,
'thread': thread_id,
'version': '20090904',
'with_global': 1,
**auth_data,
}})
# Post Final (2N)
post_data.append({'ping': {'content': f'pf:{i * 2}'}})
# Post Start (2N+1)
post_data.append({'ping': {'content': f'ps:{i * 2 + 1}'}})
post_data.append({'thread_leaves': {
# format is '<bottom of minute range>-<top of minute range>:<comments per minute>,<total last comments'
# unfortunately NND limits (deletes?) comment returns this way, so you're only able to grab the last 1000 per language
'content': '0-999999:999999,999999,nicoru:999999',
'fork': thread_fork,
'language': 0,
'nicoru': 3,
'scores': 1,
'thread': thread_id,
**auth_data,
}})
# Post Final (2N+1)
post_data.append({'ping': {'content': f'pf:{i * 2 + 1}'}})
# Request Final
post_data.append({'ping': {'content': 'rf:0'}})
for api_url in self._COMMENT_API_ENDPOINTS:
comments = self._download_json(
api_url, video_id, data=json.dumps(post_data).encode(), fatal=False,
headers={
'Referer': 'https://www.nicovideo.jp/watch/%s' % video_id,
'Origin': 'https://www.nicovideo.jp',
'Content-Type': 'text/plain;charset=UTF-8',
},
note='Downloading comments', errnote=f'Failed to access endpoint {api_url}')
if comments:
return comments
class NiconicoPlaylistBaseIE(InfoExtractor): class NiconicoPlaylistBaseIE(InfoExtractor):
_PAGE_SIZE = 100 _PAGE_SIZE = 100