# coding: utf-8
#
# Extractor module youtube_dl/extractor/orf.py, restored from a collapsed patch.
#
# Companion changes carried by the same patch (they live in other files):
#   * youtube_dl/extractor/__init__.py gains `from .orf import ORFIE`
#     (inserted between the OoyalaIE and PBSIE imports).
#   * youtube_dl/utils.py: in find_xpath_attr the assertion on `val` is widened
#     from r'^[a-zA-Z@\s]*$' to r'^[a-zA-Z0-9@\s]*$', so attribute values that
#     contain digits (such as the 'Q6A' quality code used below) are accepted.

import re
import xml.etree.ElementTree
import json

from .common import InfoExtractor
from ..utils import (
    compat_urlparse,
    ExtractorError,
    find_xpath_attr,
)


class ORFIE(InfoExtractor):
    """Information extractor for tvthek.orf.at episode and topic pages."""

    # The numeric part is exposed as the named group 'id' because
    # _real_extract reads it with mobj.group('id').  The dots of the host name
    # are escaped so they only match a literal '.'.
    _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'

    _TEST = {
        u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter',
        u'file': u'6566957.flv',
        u'info_dict': {
            u'title': u'Wetter',
            u'description': u'Christa Kummer, Marcus Wadsak und Kollegen präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at',
        },
        u'params': {
            # It uses rtmp
            u'skip_download': True,
        }
    }

    def _real_extract(self, url):
        """Return one info dict per playlist item found on the page at *url*.

        The page is expected to embed two things: a URL-encoded XML document
        assigned to ``ORF.flashXML`` (holding the stream URLs) and a JSON
        ``playlist`` array (holding id and title of each item).  Both
        sequences are walked in lockstep.

        Raises ExtractorError if an item offers none of the qualities
        'Q6A', 'Q4A' or 'Q1A'.
        """
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(url, playlist_id)

        flash_xml = self._search_regex(
            r"ORF\.flashXML = '(.+?)'", webpage, u'flash xml')
        # parse_qs is used only as a way to URL-decode the embedded XML
        flash_xml = compat_urlparse.parse_qs('xml=' + flash_xml)['xml'][0]
        flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))

        playlist_json = self._search_regex(
            r'playlist\': \'(\[.*?\])\'', webpage, u'playlist')
        # The JSON sits inside a quoted string, so its double quotes are
        # backslash-escaped in the page source
        playlist = json.loads(playlist_json.replace(r'\"', '"'))

        videos = []
        ns = '{http://tempuri.org/XMLSchema.xsd}'
        xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}
        # Used whenever an entry has no description of its own
        webpage_description = self._og_search_description(webpage)

        items = zip(flash_config.findall(xpath), playlist)
        # Entries are numbered from 1 in the page's "playlist_entry_N" ids
        for (i, (item, info)) in enumerate(items, 1):
            # Get best quality url: candidates are tried in this order and
            # the first one present wins
            rtmp_url = None
            for quality in ['Q6A', 'Q4A', 'Q1A']:
                video_url = find_xpath_attr(
                    item, '%sVideoUrl' % ns, 'quality', quality)
                if video_url is not None:
                    rtmp_url = video_url.text
                    break
            if rtmp_url is None:
                raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])

            # NOTE(review): the markup delimiters of this pattern were lost in
            # the patch text (the literal was split over several lines, which
            # is not valid Python).  '<p>...</p>' is a reconstruction - confirm
            # against the page markup.
            description = self._html_search_regex(
                r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
                u'description', default=webpage_description, flags=re.DOTALL)

            videos.append({
                '_type': 'video',
                'id': info['id'],
                'title': info['title'],
                'url': rtmp_url,
                'ext': 'flv',
                'description': description,
            })

        return videos