[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz
This commit is contained in:
felix 2021-05-23 18:34:49 +02:00 committed by pukkandan
parent 4d85fbbdbb
commit cdb19aa4c2
No known key found for this signature in database
GPG key ID: 0F00D95A001F4698
Notes: pukkandan 2021-06-13 22:41:29 +05:30
This also adds extracting storyboards from DASH manifest as mhtml
6 changed files with 248 additions and 16 deletions

View file

@ -22,6 +22,7 @@ from .http import HttpFD
from .rtmp import RtmpFD from .rtmp import RtmpFD
from .rtsp import RtspFD from .rtsp import RtspFD
from .ism import IsmFD from .ism import IsmFD
from .mhtml import MhtmlFD
from .niconico import NiconicoDmcFD from .niconico import NiconicoDmcFD
from .youtube_live_chat import YoutubeLiveChatReplayFD from .youtube_live_chat import YoutubeLiveChatReplayFD
from .external import ( from .external import (
@ -39,6 +40,7 @@ PROTOCOL_MAP = {
'f4m': F4mFD, 'f4m': F4mFD,
'http_dash_segments': DashSegmentsFD, 'http_dash_segments': DashSegmentsFD,
'ism': IsmFD, 'ism': IsmFD,
'mhtml': MhtmlFD,
'niconico_dmc': NiconicoDmcFD, 'niconico_dmc': NiconicoDmcFD,
'youtube_live_chat_replay': YoutubeLiveChatReplayFD, 'youtube_live_chat_replay': YoutubeLiveChatReplayFD,
} }

202
yt_dlp/downloader/mhtml.py Normal file
View file

@ -0,0 +1,202 @@
# coding: utf-8
from __future__ import unicode_literals
import io
import quopri
import re
import uuid
from .fragment import FragmentFD
from ..utils import (
escapeHTML,
formatSeconds,
srt_subtitles_timecode,
urljoin,
)
from ..version import __version__ as YT_DLP_VERSION
class MhtmlFD(FragmentFD):
FD_NAME = 'mhtml'
_STYLESHEET = """\
html, body {
margin: 0;
padding: 0;
height: 100vh;
}
html {
overflow-y: scroll;
scroll-snap-type: y mandatory;
}
body {
scroll-snap-type: y mandatory;
display: flex;
flex-flow: column;
}
body > figure {
max-width: 100vw;
max-height: 100vh;
scroll-snap-align: center;
}
body > figure > figcaption {
text-align: center;
height: 2.5em;
}
body > figure > img {
display: block;
margin: auto;
max-width: 100%;
max-height: calc(100vh - 5em);
}
"""
_STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
_STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)
@staticmethod
def _escape_mime(s):
return '=?utf-8?Q?' + (b''.join(
bytes((b,)) if b >= 0x20 else b'=%02X' % b
for b in quopri.encodestring(s.encode('utf-8'), header=True)
)).decode('us-ascii') + '?='
def _gen_cid(self, i, fragment, frag_boundary):
return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)
def _gen_stub(self, *, fragments, frag_boundary, title):
output = io.StringIO()
output.write((
'<!DOCTYPE html>'
'<html>'
'<head>'
'' '<meta name="generator" content="yt-dlp {version}">'
'' '<title>{title}</title>'
'' '<style>{styles}</style>'
'<body>'
).format(
version=escapeHTML(YT_DLP_VERSION),
styles=self._STYLESHEET,
title=escapeHTML(title)
))
t0 = 0
for i, frag in enumerate(fragments):
output.write('<figure>')
try:
t1 = t0 + frag['duration']
output.write((
'<figcaption>Slide #{num}: {t0} {t1} (duration: {duration})</figcaption>'
).format(
num=i + 1,
t0=srt_subtitles_timecode(t0),
t1=srt_subtitles_timecode(t1),
duration=formatSeconds(frag['duration'], msec=True)
))
except (KeyError, ValueError, TypeError):
t1 = None
output.write((
'<figcaption>Slide #{num}</figcaption>'
).format(num=i + 1))
output.write('<img src="cid:{cid}">'.format(
cid=self._gen_cid(i, frag, frag_boundary)))
output.write('</figure>')
t0 = t1
return output.getvalue()
def real_download(self, filename, info_dict):
fragment_base_url = info_dict.get('fragment_base_url')
fragments = info_dict['fragments'][:1] if self.params.get(
'test', False) else info_dict['fragments']
title = info_dict['title']
origin = info_dict['webpage_url']
ctx = {
'filename': filename,
'total_frags': len(fragments),
}
self._prepare_and_start_frag_download(ctx)
extra_state = ctx.setdefault('extra_state', {
'header_written': False,
'mime_boundary': str(uuid.uuid4()).replace('-', ''),
})
frag_boundary = extra_state['mime_boundary']
if not extra_state['header_written']:
stub = self._gen_stub(
fragments=fragments,
frag_boundary=frag_boundary,
title=title
)
ctx['dest_stream'].write((
'MIME-Version: 1.0\r\n'
'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
'Subject: {title}\r\n'
'Content-type: multipart/related; '
'' 'boundary="{boundary}"; '
'' 'type="text/html"\r\n'
'X.yt-dlp.Origin: {origin}\r\n'
'\r\n'
'--{boundary}\r\n'
'Content-Type: text/html; charset=utf-8\r\n'
'Content-Length: {length}\r\n'
'\r\n'
'{stub}\r\n'
).format(
origin=origin,
boundary=frag_boundary,
length=len(stub),
title=self._escape_mime(title),
stub=stub
).encode('utf-8'))
extra_state['header_written'] = True
for i, fragment in enumerate(fragments):
if (i + 1) <= ctx['fragment_index']:
continue
fragment_url = urljoin(fragment_base_url, fragment['path'])
success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
if not success:
continue
mime_type = b'image/jpeg'
if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
mime_type = b'image/png'
if frag_content.startswith((b'GIF87a', b'GIF89a')):
mime_type = b'image/gif'
if frag_content.startswith(b'RIFF') and frag_content[8:12] == 'WEBP':
mime_type = b'image/webp'
frag_header = io.BytesIO()
frag_header.write(
b'--%b\r\n' % frag_boundary.encode('us-ascii'))
frag_header.write(
b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
frag_header.write(
b'Content-type: %b\r\n' % mime_type)
frag_header.write(
b'Content-length: %u\r\n' % len(frag_content))
frag_header.write(
b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
frag_header.write(
b'X.yt-dlp.Duration: %f\r\n' % fragment['duration'])
frag_header.write(b'\r\n')
self._append_fragment(
ctx, frag_header.getvalue() + frag_content + b'\r\n')
ctx['dest_stream'].write(
b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
self._finish_frag_download(ctx)
return True

View file

@ -24,7 +24,7 @@ class CanvasIE(InfoExtractor):
_VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'md5': '68993eda72ef62386a15ea2cf3c93107', 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9',
'info_dict': { 'info_dict': {
'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
@ -32,9 +32,9 @@ class CanvasIE(InfoExtractor):
'title': 'Nachtwacht: De Greystook', 'title': 'Nachtwacht: De Greystook',
'description': 'Nachtwacht: De Greystook', 'description': 'Nachtwacht: De Greystook',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1468.04, 'duration': 1468.02,
}, },
'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], 'expected_warnings': ['is not a supported codec'],
}, { }, {
'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
'only_matching': True, 'only_matching': True,

View file

@ -2126,6 +2126,7 @@ class InfoExtractor(object):
format_id.append(str(format_index)) format_id.append(str(format_index))
f = { f = {
'format_id': '-'.join(format_id), 'format_id': '-'.join(format_id),
'format_note': name,
'format_index': format_index, 'format_index': format_index,
'url': manifest_url, 'url': manifest_url,
'manifest_url': m3u8_url, 'manifest_url': m3u8_url,
@ -2637,7 +2638,7 @@ class InfoExtractor(object):
mime_type = representation_attrib['mimeType'] mime_type = representation_attrib['mimeType']
content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
if content_type in ('video', 'audio', 'text'): if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg':
base_url = '' base_url = ''
for element in (representation, adaptation_set, period, mpd_doc): for element in (representation, adaptation_set, period, mpd_doc):
base_url_e = element.find(_add_ns('BaseURL')) base_url_e = element.find(_add_ns('BaseURL'))
@ -2654,9 +2655,15 @@ class InfoExtractor(object):
url_el = representation.find(_add_ns('BaseURL')) url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth')) bandwidth = int_or_none(representation_attrib.get('bandwidth'))
if representation_id is not None:
format_id = representation_id
else:
format_id = content_type
if mpd_id:
format_id = mpd_id + '-' + format_id
if content_type in ('video', 'audio'): if content_type in ('video', 'audio'):
f = { f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'format_id': format_id,
'manifest_url': mpd_url, 'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type), 'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')), 'width': int_or_none(representation_attrib.get('width')),
@ -2676,6 +2683,17 @@ class InfoExtractor(object):
'manifest_url': mpd_url, 'manifest_url': mpd_url,
'filesize': filesize, 'filesize': filesize,
} }
elif mime_type == 'image/jpeg':
# See test case in VikiIE
# https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
f = {
'format_id': format_id,
'ext': 'mhtml',
'manifest_url': mpd_url,
'format_note': 'DASH storyboards (jpeg)',
'acodec': 'none',
'vcodec': 'none',
}
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers): def prepare_template(template_name, identifiers):
@ -2694,6 +2712,7 @@ class InfoExtractor(object):
t += c t += c
# Next, $...$ templates are translated to their # Next, $...$ templates are translated to their
# %(...) counterparts to be used with % operator # %(...) counterparts to be used with % operator
if representation_id is not None:
t = t.replace('$RepresentationID$', representation_id) t = t.replace('$RepresentationID$', representation_id)
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
@ -2811,7 +2830,7 @@ class InfoExtractor(object):
'url': mpd_url or base_url, 'url': mpd_url or base_url,
'fragment_base_url': base_url, 'fragment_base_url': base_url,
'fragments': [], 'fragments': [],
'protocol': 'http_dash_segments', 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
}) })
if 'initialization_url' in representation_ms_info: if 'initialization_url' in representation_ms_info:
initialization_url = representation_ms_info['initialization_url'] initialization_url = representation_ms_info['initialization_url']
@ -2822,7 +2841,7 @@ class InfoExtractor(object):
else: else:
# Assuming direct URL to unfragmented media. # Assuming direct URL to unfragmented media.
f['url'] = base_url f['url'] = base_url
if content_type in ('video', 'audio'): if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
formats.append(f) formats.append(f)
elif content_type == 'text': elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f) subtitles.setdefault(lang or 'und', []).append(f)

View file

@ -142,6 +142,7 @@ class VikiIE(VikiBaseIE):
IE_NAME = 'viki' IE_NAME = 'viki'
_VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
_TESTS = [{ _TESTS = [{
'note': 'Free non-DRM video with storyboards in MPD',
'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1',
'info_dict': { 'info_dict': {
'id': '1175236v', 'id': '1175236v',
@ -155,7 +156,6 @@ class VikiIE(VikiBaseIE):
'params': { 'params': {
'format': 'bestvideo', 'format': 'bestvideo',
}, },
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, { }, {
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
'info_dict': { 'info_dict': {
@ -173,7 +173,6 @@ class VikiIE(VikiBaseIE):
'format': 'bestvideo', 'format': 'bestvideo',
}, },
'skip': 'Blocked in the US', 'skip': 'Blocked in the US',
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, { }, {
# clip # clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
@ -225,7 +224,6 @@ class VikiIE(VikiBaseIE):
'params': { 'params': {
'format': 'bestvideo', 'format': 'bestvideo',
}, },
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, { }, {
# youtube external # youtube external
'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@ -264,7 +262,6 @@ class VikiIE(VikiBaseIE):
'params': { 'params': {
'format': 'bestvideo', 'format': 'bestvideo',
}, },
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View file

@ -2244,6 +2244,17 @@ def unescapeHTML(s):
r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def escapeHTML(text):
return (
text
.replace('&', '&amp;')
.replace('<', '&lt;')
.replace('>', '&gt;')
.replace('"', '&quot;')
.replace("'", '&#39;')
)
def process_communicate_or_kill(p, *args, **kwargs): def process_communicate_or_kill(p, *args, **kwargs):
try: try:
return p.communicate(*args, **kwargs) return p.communicate(*args, **kwargs)
@ -2323,13 +2334,14 @@ def decodeOption(optval):
return optval return optval
def formatSeconds(secs, delim=':'): def formatSeconds(secs, delim=':', msec=False):
if secs > 3600: if secs > 3600:
return '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60) ret = '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60)
elif secs > 60: elif secs > 60:
return '%d%s%02d' % (secs // 60, delim, secs % 60) ret = '%d%s%02d' % (secs // 60, delim, secs % 60)
else: else:
return '%d' % secs ret = '%d' % secs
return '%s.%03d' % (ret, secs % 1) if msec else ret
def make_HTTPS_handler(params, **kwargs): def make_HTTPS_handler(params, **kwargs):