diff --git a/AUTHORS b/AUTHORS index 3d6985ab6f..47f12a9eef 100644 --- a/AUTHORS +++ b/AUTHORS @@ -110,3 +110,4 @@ Shaya Goldberg Paul Hartmann Frans de Jonge Robin de Rooij +Ryan Schmidt diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 6336dd317c..bcc69a7783 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -138,7 +138,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles.keys()), 5) + self.assertTrue(len(subtitles.keys()) >= 6) def test_list_subtitles(self): self.DL.expect_warning('Automatic Captions not supported by this server') @@ -247,7 +247,7 @@ class TestVimeoSubtitles(BaseTestSubtitles): def test_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '26399116d23ae3cf2c087cea94bc43b4') + self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') def test_subtitles_lang(self): self.DL.params['writesubtitles'] = True @@ -334,7 +334,7 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['cs'])) - self.assertEqual(md5(subtitles['cs']), '9bf52d9549533c32c427e264bf0847d4') + self.assertTrue(len(subtitles['cs']) > 20000) def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a4fab540ba..13292073c2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -189,6 +189,7 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .historicfilms import HistoricFilmsIE +from .history import HistoryIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE from .hostingbulk import HostingBulkIE diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 98e1443ab0..c193e66cad 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -50,7 +50,7 @@ class BambuserIE(InfoExtractor): 'duration': int(info['length']), 'view_count': int(info['views_total']), 'uploader': info['username'], - 'uploader_id': info['uid'], + 'uploader_id': info['owner']['uid'], } diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 126c8824cc..f23e395451 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -273,7 +273,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor): formats, subtitles = self._download_media_selector(programme_id) return programme_id, title, description, duration, formats, subtitles except ExtractorError as ee: - if not isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): raise # fallback to legacy playlist diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index 4e79fea8f0..b38057f2f5 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -9,7 +9,7 @@ class BeegIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P\d+)' _TEST = { 'url': 'http://beeg.com/5416503', - 'md5': '634526ae978711f6b748fe0dd6c11f57', + 'md5': '1bff67111adb785c51d1b42959ec10e5', 'info_dict': { 'id': '5416503', 'ext': 'mp4', diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 5de5879b44..897f3a104c 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -16,7 +16,7 @@ from ..utils import ( class CamdemyIE(InfoExtractor): - _VALID_URL = r'http://www.camdemy.com/media/(?P\d+)' + _VALID_URL = r'http://(?:www\.)?camdemy\.com/media/(?P\d+)' _TESTS = [{ # single file 'url': 'http://www.camdemy.com/media/5181/', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 48742189a1..c784eedb96 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -665,7 +665,7 @@ class InfoExtractor(object): return RATING_TABLE.get(rating.lower(), None) def _family_friendly_search(self, html): - # See http://schema.org/VideoObj + # See http://schema.org/VideoObject family_friendly = self._html_search_meta('isFamilyFriendly', html) if not family_friendly: diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index ca274dff69..37c5c181f7 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -15,7 +15,7 @@ class DrTuberIE(InfoExtractor): 'id': '1740434', 'display_id': 'hot-perky-blonde-naked-golf', 'ext': 'mp4', - 'title': 'Hot Perky Blonde Naked Golf', + 'title': 'hot perky blonde naked golf', 'like_count': int, 'dislike_count': int, 'comment_count': int, @@ -36,7 +36,8 @@ class DrTuberIE(InfoExtractor): r'([^<]+)\s*-\s*Free', webpage, 'title') + [r'class="hd_title" style="[^"]+">([^<]+)', r'([^<]+) - \d+'], + webpage, 'title') thumbnail = self._html_search_regex( r'poster="([^"]+)"', diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 08ceee4ed7..510d4b1089 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -1,52 +1,71 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import int_or_none class FirstTVIE(InfoExtractor): - IE_NAME = 'firsttv' - IE_DESC = 'Видеоархив - Первый канал' - _VALID_URL = r'http://(?:www\.)?1tv\.ru/videoarchive/(?P<id>\d+)' + IE_NAME = '1tv' + IE_DESC = 'Первый канал' + _VALID_URL = r'http://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>.+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.1tv.ru/videoarchive/73390', - 'md5': '3de6390cf0cca4a5eae1d1d83895e5ad', + 'md5': '777f525feeec4806130f4f764bc18a4f', 'info_dict': { 'id': '73390', 'ext': 'mp4', 'title': 'Олимпийские канатные дороги', - 'description': 'md5:cc730d2bf4215463e37fff6a1e277b13', - 'thumbnail': 'http://img1.1tv.ru/imgsize640x360/PR20140210114657.JPG', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', 'duration': 149, + 'like_count': int, + 'dislike_count': int, }, 'skip': 'Only works from Russia', - } + }, { + 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930', + 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', + 'info_dict': { + 'id': '35930', + 'ext': 'mp4', + 'title': 'Наедине со всеми. Людмила Сенчина', + 'description': 'md5:89553aed1d641416001fe8d450f06cb9', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'duration': 2694, + }, + 'skip': 'Only works from Russia', + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id, 'Downloading page') video_url = self._html_search_regex( - r'''(?s)jwplayer\('flashvideoportal_1'\)\.setup\({.*?'file': '([^']+)'.*?}\);''', webpage, 'video URL') + r'''(?s)(?:jwplayer\('flashvideoportal_1'\)\.setup\({|var\s+playlistObj\s*=).*?'file'\s*:\s*'([^']+)'.*?}\);''', + webpage, 'video URL') title = self._html_search_regex( - r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', webpage, 'title') + [r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', + r"'title'\s*:\s*'([^']+)'"], webpage, 'title') description = self._html_search_regex( - r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', webpage, 'description', fatal=False) + r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', + webpage, 'description', default=None) or self._html_search_meta( + 'description', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) - duration = self._og_search_property('video:duration', webpage, 'video duration', fatal=False) + duration = self._og_search_property( + 'video:duration', webpage, + 'video duration', fatal=False) - like_count = self._html_search_regex(r'title="Понравилось".*?/></label> \[(\d+)\]', - webpage, 'like count', fatal=False) - dislike_count = self._html_search_regex(r'title="Не понравилось".*?/></label> \[(\d+)\]', - webpage, 'dislike count', fatal=False) + like_count = self._html_search_regex( + r'title="Понравилось".*?/></label> \[(\d+)\]', + webpage, 'like count', default=None) + dislike_count = self._html_search_regex( + r'title="Не понравилось".*?/></label> \[(\d+)\]', + webpage, 'dislike count', default=None) return { 'id': video_id, diff --git a/youtube_dl/extractor/history.py b/youtube_dl/extractor/history.py new file mode 100644 index 0000000000..f86164afe7 --- /dev/null +++ b/youtube_dl/extractor/history.py @@ -0,0 +1,31 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class HistoryIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?history\.com/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])' + + _TESTS = [{ + 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', + 'md5': '6fe632d033c92aa10b8d4a9be047a7c5', + 'info_dict': { + 'id': 'bLx5Dv5Aka1G', + 'ext': 'mp4', + 'title': "Bet You Didn't Know: Valentine's Day", + 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', + }, + 'add_ie': ['ThePlatform'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, + webpage, 'video url') + + return self.url_result(smuggle_url(video_url, {'sig': {'key': 'crazyjava', 'secret': 's3cr3t'}})) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index f840f65321..89a2845fe2 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..compat import ( @@ -52,9 +51,9 @@ class NBCIE(InfoExtractor): class NBCNewsIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://www\.nbcnews\.com/ - ((video/.+?/(?P<id>\d+))| - (feature/[^/]+/(?P<title>.+))) + _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ + (?:video/.+?/(?P<id>\d+)| + (?:feature|nightly-news)/[^/]+/(?P<title>.+)) ''' _TESTS = [ @@ -89,6 +88,16 @@ class NBCNewsIE(InfoExtractor): 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', }, }, + { + 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', + 'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d', + 'info_dict': { + 'id': 'sekXqyTVnmN3', + 'ext': 'mp4', + 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', + 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', + }, + }, ] def _real_extract(self, url): @@ -107,13 +116,13 @@ class NBCNewsIE(InfoExtractor): 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, } else: - # "feature" pages use theplatform.com + # "feature" and "nightly-news" pages use theplatform.com title = mobj.group('title') webpage = self._download_webpage(url, title) bootstrap_json = self._search_regex( - r'var bootstrapJson = ({.+})\s*$', webpage, 'bootstrap json', - flags=re.MULTILINE) - bootstrap = json.loads(bootstrap_json) + r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', + webpage, 'bootstrap json', flags=re.MULTILINE) + bootstrap = self._parse_json(bootstrap_json, video_id) info = bootstrap['results'][0]['video'] mpxid = info['mpxId'] diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index c3ceb5f76d..e92b93285c 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -1,14 +1,30 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +import hashlib +import time + from .common import InfoExtractor +from ..compat import ( + compat_urllib_request, +) from ..utils import ( int_or_none, ) +def _get_api_key(api_path): + if api_path.endswith('?'): + api_path = api_path[:-1] + + api_key = 'fb5f58a820353bd7095de526253c14fd' + a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600))) + return hashlib.md5(a.encode('ascii')).hexdigest() + + class StreamCZIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)' + _API_URL = 'http://www.stream.cz/API' _TESTS = [{ 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', @@ -36,8 +52,11 @@ class StreamCZIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://www.stream.cz/API/episode/%s' % video_id, video_id) + api_path = '/episode/%s' % video_id + + req = compat_urllib_request.Request(self._API_URL + api_path) + req.add_header('Api-Password', _get_api_key(api_path)) + data = self._download_json(req, video_id) formats = [] for quality, video in enumerate(data['video_qualities']): diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index 8a333f1d24..854d01beeb 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -52,7 +52,7 @@ class SunPornoIE(InfoExtractor): formats = [] quality = qualities(['mp4', 'flv']) - for video_url in re.findall(r'<source src="([^"]+)"', webpage): + for video_url in re.findall(r'<(?:source|video) src="([^"]+)"', webpage): video_ext = determine_ext(video_url) formats.append({ 'url': video_url, diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 110ed976de..1579822f2c 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -2,6 +2,11 @@ from __future__ import unicode_literals import re import json +import time +import hmac +import binascii +import hashlib + from .subtitles import SubtitlesInfoExtractor from ..compat import ( @@ -11,6 +16,7 @@ from ..utils import ( determine_ext, ExtractorError, xpath_with_ns, + unsmuggle_url, ) _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) @@ -18,7 +24,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language class ThePlatformIE(SubtitlesInfoExtractor): _VALID_URL = r'''(?x) - (?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/ + (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)? |theplatform:)(?P<id>[^/\?&]+)''' @@ -38,9 +44,33 @@ class ThePlatformIE(SubtitlesInfoExtractor): }, } + @staticmethod + def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False): + flags = '10' if include_qs else '00' + expiration_date = '%x' % (int(time.time()) + life) + + def str_to_hex(str): + return binascii.b2a_hex(str.encode('ascii')).decode('ascii') + + def hex_to_str(hex): + return binascii.a2b_hex(hex) + + relative_path = url.split('http://link.theplatform.com/s/')[1].split('?')[0] + clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path)) + checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest() + sig = flags + expiration_date + checksum + str_to_hex(sig_secret) + return '%s&sig=%s' % (url, sig) + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + mobj = re.match(self._VALID_URL, url) + provider_id = mobj.group('provider_id') video_id = mobj.group('id') + + if not provider_id: + provider_id = 'dJ5BDC' + if mobj.group('config'): config_url = url + '&form=json' config_url = config_url.replace('swf/', 'config/') @@ -48,8 +78,12 @@ class ThePlatformIE(SubtitlesInfoExtractor): config = self._download_json(config_url, video_id, 'Downloading config') smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m' else: - smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' - 'format=smil&mbr=true'.format(video_id)) + smil_url = ('http://link.theplatform.com/s/{0}/{1}/meta.smil?' + 'format=smil&mbr=true'.format(provider_id, video_id)) + + sig = smuggled_data.get('sig') + if sig: + smil_url = self._sign_url(smil_url, sig['key'], sig['secret']) meta = self._download_xml(smil_url, video_id) try: @@ -62,7 +96,7 @@ class ThePlatformIE(SubtitlesInfoExtractor): else: raise ExtractorError(error_msg, expected=True) - info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id) + info_url = 'http://link.theplatform.com/s/{0}/{1}?format=preview'.format(provider_id, video_id) info_json = self._download_webpage(info_url, video_id) info = json.loads(info_json) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 504a711936..16babf6a58 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -138,7 +138,7 @@ class FFmpegPostProcessor(PostProcessor): if self._downloader.params.get('verbose', False): self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd)) - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) stdout, stderr = p.communicate() if p.returncode != 0: stderr = stderr.decode('utf-8', 'replace') @@ -178,8 +178,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): encodeArgument('-show_streams'), encodeFilename(self._ffmpeg_filename_argument(path), True)] if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] ffprobe command line: %s' % shell_quote(cmd)) - handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE) + self._downloader.to_screen('[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) + handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE, stdin=subprocess.PIPE) output = handle.communicate()[0] if handle.wait() != 0: return None