[extractor/rtl.lu] Add extractor (#4222)

Closes #1721
Authored by: HobbyistDev
This commit is contained in:
HobbyistDev 2022-07-14 15:24:27 +09:00 committed by GitHub
parent eb2333bce1
commit 5f2da312fa
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 159 additions and 1 deletions

View file

@ -1444,7 +1444,13 @@ from .rottentomatoes import RottenTomatoesIE
from .rozhlas import RozhlasIE from .rozhlas import RozhlasIE
from .rtbf import RTBFIE from .rtbf import RTBFIE
from .rte import RteIE, RteRadioIE from .rte import RteIE, RteRadioIE
from .rtlnl import RtlNlIE from .rtlnl import (
RtlNlIE,
RTLLuTeleVODIE,
RTLLuArticleIE,
RTLLuLiveIE,
RTLLuRadioIE,
)
from .rtl2 import ( from .rtl2 import (
RTL2IE, RTL2IE,
RTL2YouIE, RTL2YouIE,

View file

@ -141,3 +141,155 @@ class RtlNlIE(InfoExtractor):
'duration': parse_duration(material.get('duration')), 'duration': parse_duration(material.get('duration')),
'thumbnails': thumbnails, 'thumbnails': thumbnails,
} }
class RTLLuBaseIE(InfoExtractor):
    """Shared extraction logic for rtl.lu pages embedding <rtl-player>/<rtl-audioplayer>.

    Subclasses only provide ``_VALID_URL`` (with an ``id`` group) and ``_TESTS``;
    the page scraping itself lives here.
    """

    # One pattern per kind of URL that can be read from the player tag attributes.
    _MEDIA_REGEX = {
        'video': r'<rtl-player\s[^>]*\bhls\s*=\s*"([^"]+)',
        'audio': r'<rtl-audioplayer\s[^>]*\bsrc\s*=\s*"([^"]+)',
        'thumbnail': r'<rtl-player\s[^>]*\bposter\s*=\s*"([^"]+)',
    }

    def get_media_url(self, webpage, video_id, media_type):
        """Search *webpage* with ``_MEDIA_REGEX[media_type]`` and return the match.

        ``default=None`` is passed to ``_search_regex``, so a page without the
        corresponding tag yields that default instead of an error.
        *media_type* must be a key of ``_MEDIA_REGEX`` (KeyError otherwise).
        *video_id* is not used by this method.
        """
        return self._search_regex(
            self._MEDIA_REGEX[media_type], webpage, f'{media_type} url', default=None)

    def get_formats_and_subtitles(self, webpage, video_id):
        """Build ``(formats, subtitles)`` from the players found in *webpage*.

        The <rtl-player> HLS URL, when present, is expanded with
        ``_extract_m3u8_formats_and_subtitles``. The <rtl-audioplayer> source,
        when present, is appended as a single audio-only mp3 format.
        When neither URL is found the result is ``([], {})``.
        """
        video_url = self.get_media_url(webpage, video_id, 'video')
        audio_url = self.get_media_url(webpage, video_id, 'audio')

        formats, subtitles = [], {}
        if video_url is not None:
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id)
        if audio_url is not None:
            formats.append({'url': audio_url, 'ext': 'mp3', 'vcodec': 'none'})

        return formats, subtitles

    def _real_extract(self, url):
        """Download the page behind *url* and return its info dict."""
        video_id = self._match_id(url)
        # Live pages are addressed by a slug rather than a numeric id. Accept
        # every numbered channel ('live-<n>') that RTLLuLiveIE._VALID_URL lets
        # through, not only 'live-2', so new channels are still flagged live.
        is_live = video_id in ('live', 'lauschteren') or video_id.startswith('live-')

        # TODO: extract comments from
        # https://www.rtl.lu/comments?status=1&order=desc&context=news|article|<video_id>
        # The context value can be read from the <rtl-comments context=...> tag in the webpage.
        webpage = self._download_webpage(url, video_id)

        formats, subtitles = self.get_formats_and_subtitles(webpage, video_id)
        self._sort_formats(formats)

        # Prefer the player's poster image; fall back to the OpenGraph thumbnail.
        thumbnail = (
            self.get_media_url(webpage, video_id, 'thumbnail')
            or self._og_search_thumbnail(webpage, default=None))

        return {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage, default=None),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnail': thumbnail,
            'is_live': is_live,
        }
class RTLLuTeleVODIE(RTLLuBaseIE):
    """On-demand TV pages: ``/tele/<slug>/v/<id>`` and ``/video/<id>`` on rtl.lu."""

    IE_NAME = 'rtl.lu:tele-vod'
    _VALID_URL = r'https?://(?:www\.)?rtl\.lu/(tele/(?P<slug>[\w-]+)/v/|video/)(?P<id>\d+)(\.html)?'
    _TESTS = [
        {
            # Programme page with a slug
            'url': 'https://www.rtl.lu/tele/de-journal-vun-der-tele/v/3266757.html',
            'info_dict': {
                'id': '3266757',
                'title': 'Informatiounsversammlung Héichwaasser',
                'ext': 'mp4',
                'thumbnail': 'https://replay-assets.rtl.lu/2021/11/16/d3647fc4-470d-11ec-adc2-3a00abd6e90f_00008.jpg',
                'description': 'md5:b1db974408cc858c9fd241812e4a2a14',
            },
        },
        {
            # Short /video/<id> form
            'url': 'https://www.rtl.lu/video/3295215',
            'info_dict': {
                'id': '3295215',
                'title': 'Kulturassisen iwwer d\'Bestandsopnam vum Lëtzebuerger Konscht',
                'ext': 'mp4',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/06/28/0000_3295215_0000.jpg',
                'description': 'md5:85bcd4e0490aa6ec969d9bf16927437b',
            },
        },
    ]
class RTLLuArticleIE(RTLLuBaseIE):
    """Article pages (``/<section>/<subsection>/a/<id>.html``) on the www, 5minutes and today subdomains."""

    IE_NAME = 'rtl.lu:article'
    _VALID_URL = r'https?://(?:(www|5minutes|today)\.)rtl\.lu/(?:[\w-]+)/(?:[\w-]+)/a/(?P<id>\d+)\.html'
    _TESTS = [
        {
            # Audio-only
            'url': 'https://www.rtl.lu/sport/news/a/1934360.html',
            'info_dict': {
                'id': '1934360',
                'ext': 'mp3',
                'thumbnail': 'https://static.rtl.lu/rtl2008.lu/nt/p/2022/06/28/19/e4b37d66ddf00bab4c45617b91a5bb9b.jpeg',
                'description': 'md5:5eab4a2a911c1fff7efc1682a38f9ef7',
                'title': 'md5:40aa85f135578fbd549d3c9370321f99',
            },
        },
        {
            # 5minutes
            'url': 'https://5minutes.rtl.lu/espace-frontaliers/frontaliers-en-questions/a/1853173.html',
            'info_dict': {
                'id': '1853173',
                'ext': 'mp4',
                'description': 'md5:ac031da0740e997a5cf4633173634fee',
                'title': 'md5:87e17722ed21af0f24be3243f4ec0c46',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/01/26/screenshot_20220126104933_3274749_12b249833469b0d6e4440a1dec83cdfa.jpg',
            },
        },
        {
            # today.lu
            'url': 'https://today.rtl.lu/entertainment/news/a/1936203.html',
            'info_dict': {
                'id': '1936203',
                'ext': 'mp4',
                'title': 'Once Upon A Time...zu Lëtzebuerg: The Three Witches\' Tower',
                'description': 'The witchy theme continues in the latest episode of Once Upon A Time...',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/07/02/screenshot_20220702122859_3290019_412dc5185951b7f6545a4039c8be9235.jpg',
            },
        },
    ]
class RTLLuLiveIE(RTLLuBaseIE):
    """Live pages under ``/tele/`` or ``/radio/`` whose id is ``live``, ``live-<n>`` or ``lauschteren``."""

    _VALID_URL = r'https?://www\.rtl\.lu/(?:tele|radio)/(?P<id>live(?:-\d+)?|lauschteren)'
    _TESTS = [
        {
            # Tele:live
            'url': 'https://www.rtl.lu/tele/live',
            'info_dict': {
                'id': 'live',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/channel1.jpg',
            },
        },
        {
            # Tele:live-2
            'url': 'https://www.rtl.lu/tele/live-2',
            'info_dict': {
                'id': 'live-2',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/channel2.jpg',
            },
        },
        {
            # Radio:lauschteren
            'url': 'https://www.rtl.lu/radio/lauschteren',
            'info_dict': {
                'id': 'lauschteren',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Radio LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/rtlradiowebtv.jpg',
            },
        },
    ]
class RTLLuRadioIE(RTLLuBaseIE):
    """Radio show pages of the form ``/radio/<slug>/s/<id>`` (optional ``.html`` suffix)."""

    _VALID_URL = r'https?://www\.rtl\.lu/radio/(?:[\w-]+)/s/(?P<id>\d+)(\.html)?'
    _TESTS = [
        {
            'url': 'https://www.rtl.lu/radio/5-vir-12/s/4033058.html',
            'info_dict': {
                'id': '4033058',
                'ext': 'mp3',
                'description': 'md5:f855a4f3e3235393ae47ed1db5d934b9',
                'title': '5 vir 12 - Stau um Stau',
                'thumbnail': 'https://static.rtl.lu/rtlg//2022/06/24/c9c19e5694a14be46a3647a3760e1f62.jpg',
            },
        },
    ]