[youtube] Add support for downloading top lists (fixes #1868)

The extractor needs to be given the channel and the title of the list, because the list ids change every time you browse the channels and are tied to the 'VISITOR_INFO1_LIVE' cookie.
This commit is contained in:
Jaime Marquínez Ferrándiz 2013-11-30 14:56:51 +01:00
parent b138de72f2
commit 0a688bc0b2
3 changed files with 44 additions and 0 deletions

View file

@ -15,6 +15,7 @@ from youtube_dl.extractor import (
YoutubeIE,
YoutubeChannelIE,
YoutubeShowIE,
YoutubeTopListIE,
)
@ -116,5 +117,12 @@ class TestYoutubeLists(unittest.TestCase):
original_video = entries[0]
self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
def test_youtube_toplist(self):
    # Extract the "Top Tracks" list of the "music" channel through the
    # yttoplist pseudo-URL and check that at least 9 entries come back.
    extractor = YoutubeTopListIE(FakeYDL())
    playlist = extractor.extract('yttoplist:music:Top Tracks')
    self.assertTrue(len(playlist['entries']) >= 9)
if __name__ == '__main__':
unittest.main()

View file

@ -194,6 +194,7 @@ from .youtube import (
YoutubeWatchLaterIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
YoutubeTopListIE,
)
from .zdf import ZDFIE

View file

@ -1576,6 +1576,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
if len(playlist_id) == 13: # 'RD' + 11 characters for the video id
# Mixes require a custom extraction process
return self._extract_mix(playlist_id)
if playlist_id.startswith('TL'):
raise ExtractorError(u'For downloading YouTube.com top lists, use '
u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
# Extract the video ids from the playlist pages
ids = []
@ -1598,6 +1601,38 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
return self.playlist_result(url_results, playlist_id, playlist_title)
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for YouTube top lists, addressed by channel and list title.

    Accepts pseudo-URLs of the form "yttoplist:{channel}:{list title}".
    The list is looked up by its title on the channel page instead of by
    playlist id, because the ids change every time the channels are
    browsed and are tied to the 'VISITOR_INFO1_LIVE' cookie.
    """
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # Maximum number of times the list page is downloaded again when it
    # comes back without any video entries (the first attempt is not counted).
    _MAX_RETRIES = 10

    def _real_extract(self, url):
        """Return a playlist result for the top list named in *url*.

        *url* is expected to match ``_VALID_URL``; its ``chann`` and
        ``title`` groups select the channel page and the list on it.

        Raises ExtractorError if the list page still contains no video ids
        after ``_MAX_RETRIES`` retries.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')

        # The link to the list carries the title as a url-encoded query
        # parameter, so search the channel page for an href containing it.
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        # The matched href may be relative; resolve it against the site root.
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # Sometimes the webpage doesn't contain the videos; retry a bounded
        # number of times instead of looping forever.
        for attempt in range(self._MAX_RETRIES + 1):
            msg = u'Downloading Youtube top list'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            # Every attempt returned a page without video ids.
            raise ExtractorError(
                u'Unable to find any videos in the top list page after %d retries'
                % self._MAX_RETRIES)

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
class YoutubeChannelIE(InfoExtractor):
IE_DESC = u'YouTube.com channels'
_VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"