[ant1newsgr] Add extractor (#1982)

Authored by: zmousm
This commit is contained in:
Zenon Mousmoulas 2022-03-04 23:52:48 +02:00 committed by GitHub
parent 50e93e03a7
commit 27231526ae
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 181 additions and 19 deletions

View file

@ -0,0 +1,143 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import (
HEADRequest,
ExtractorError,
determine_ext,
scale_thumbnails_to_max_format_width,
unescapeHTML,
)
class Ant1NewsGrBaseIE(InfoExtractor):
    """Shared player-API logic for the ant1news.gr extractors.

    Subclasses are expected to define ``_API_PATH``, the path of the JSON
    player endpoint that is appended to the site's host.
    """

    def _download_and_extract_api_data(self, video_id, netloc, cid=None):
        """Query the JSON player API and build the info dict for one video.

        video_id -- identifier used for the download and as the resulting 'id'
        netloc   -- host part of the API URL
        cid      -- value sent as the 'cid' query parameter; video_id is used
                    when this is empty

        Raises ExtractorError when the API response has no usable source URL.
        """
        url = f'{self.http_scheme()}//{netloc}{self._API_PATH}'
        info = self._download_json(url, video_id, query={'cid': cid or video_id})

        # Treat a missing, null or empty 'url' alike: there is nothing to play
        source = info.get('url')
        if not source:
            raise ExtractorError('no source found for %s' % video_id)

        if determine_ext(source) == 'm3u8':
            formats, subs = self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
        else:
            # Anything that is not an HLS manifest is used as a direct media URL
            formats, subs = [{'url': source}], {}
        self._sort_formats(formats)

        # The thumbnail is optional metadata; its absence must not abort extraction
        thumb_url = info.get('thumb')
        thumbnails = scale_thumbnails_to_max_format_width(
            formats, [{'url': thumb_url}] if thumb_url else [], r'(?<=/imgHandler/)\d+')

        return {
            'id': video_id,
            'title': info.get('title'),
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subs,
        }
class Ant1NewsGrWatchIE(Ant1NewsGrBaseIE):
    """Extractor for ant1news.gr ``/watch/<id>/`` pages."""

    IE_NAME = 'ant1newsgr:watch'
    IE_DESC = 'ant1news.gr videos'
    _VALID_URL = r'https?://(?P<netloc>(?:www\.)?ant1news\.gr)/watch/(?P<id>\d+)/'
    _API_PATH = '/templates/data/player'

    _TESTS = [{
        'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45',
        'md5': '95925e6b32106754235f2417e0d2dfab',
        'info_dict': {
            'id': '1506168',
            'ext': 'mp4',
            'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a',
            'description': 'md5:18665af715a6dcfeac1d6153a44f16b0',
            'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/26d46bf6-8158-4f02-b197-7096c714b2de.jpg',
        },
    }]

    def _real_extract(self, url):
        """Combine the player API data with the page's OpenGraph description."""
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        netloc = mobj.group('netloc')

        # The page is fetched first; it is only needed for the description
        webpage = self._download_webpage(url, video_id)
        api_data = self._download_and_extract_api_data(video_id, netloc)

        return {
            **api_data,
            'description': self._og_search_description(webpage),
        }
class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE):
    """Extractor for ant1news.gr article pages that embed one or more player iframes."""

    IE_NAME = 'ant1newsgr:article'
    IE_DESC = 'ant1news.gr articles'
    _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/'

    _TESTS = [{
        'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron',
        'md5': '294f18331bb516539d72d85a82887dcc',
        'info_dict': {
            'id': '_xvg/m_cmbatw=',
            'ext': 'mp4',
            'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411',
            'timestamp': 1603092840,
            'upload_date': '20201019',
            'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg',
        },
    }, {
        'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn',
        'info_dict': {
            'id': '620286',
            'title': 'md5:91fe569e952e4d146485740ae927662b',
        },
        'playlist_mincount': 2,
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Delegate every embedded player found in the article to Ant1NewsGrEmbedIE.

        Raises an expected ExtractorError when the article embeds no player.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')

        embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
        if not embed_urls:
            raise ExtractorError('no videos found for %s' % video_id, expected=True)

        # Title and timestamp are both optional article metadata, so neither
        # lookup may raise when the JSON-LD record lacks the field.
        # NOTE(review): url_result_or_playlist_from_matches is not defined in the
        # code visible here; confirm it exists on InfoExtractor and accepts
        # video_kwargs like playlist_from_matches does.
        return self.url_result_or_playlist_from_matches(
            embed_urls, video_id, info.get('title'), ie=Ant1NewsGrEmbedIE.ie_key(),
            video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')})
class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
    """Extractor for the antenna.gr / ant1news.gr embeddable player page."""

    IE_NAME = 'ant1newsgr:embed'
    IE_DESC = 'ant1news.gr embedded videos'
    _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
    _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
    _API_PATH = '/news/templates/data/jsonPlayer'

    _TESTS = [{
        'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377',
        'md5': 'dfc58c3a11a5a9aad2ba316ed447def3',
        'info_dict': {
            'id': '3f_li_c_az_jw_y_u=',
            'ext': 'mp4',
            'title': 'md5:a30c93332455f53e1e84ae0724f0adf7',
            'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/bbe31201-3f09-4a4e-87f5-8ad2159fffe2.jpg',
        },
    }]

    @classmethod
    def _extract_urls(cls, webpage):
        """Yield the player URLs of all matching <iframe src=...> tags in webpage.

        Only URLs that this extractor reports as suitable are yielded.
        """
        # The URL runs up to (not including) the quote character that opened
        # the src attribute, captured as group _q1 in the iframe pattern.
        embed_url_re = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
        embed_re = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{embed_url_re})(?P=_q1)'
        for mobj in re.finditer(embed_re, webpage):
            url = unescapeHTML(mobj.group('url'))
            if cls.suitable(url):
                yield url

    def _real_extract(self, url):
        """Resolve the player URL with a HEAD request, then query the player API.

        The host and 'cid' of the resolved URL are used for the API call.
        """
        video_id = self._match_id(url)

        canonical_url = self._request_webpage(
            HEADRequest(url), video_id,
            note='Resolve canonical player URL',
            errnote='Could not resolve canonical player URL').geturl()
        parsed = urllib.parse.urlparse(canonical_url)

        # parse_qs omits blank values, so 'cid' may be absent from the resolved
        # URL; passing None lets the API helper fall back to video_id.
        cid = urllib.parse.parse_qs(parsed.query).get('cid', [None])[0]
        return self._download_and_extract_api_data(video_id, parsed.netloc, cid=cid)

View file

@ -1140,8 +1140,8 @@ class InfoExtractor(object):
'url': url,
}
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, **kwargs):
urls = (self.url_result(self._proto_relative_url(m), ie)
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
    """Build a playlist result with one url_result entry per match.

    matches       -- iterable of matches; each is passed through getter first
                     when a getter is given
    getter        -- optional callable applied to every match
    ie            -- forwarded to url_result for every entry
    video_kwargs  -- optional dict of extra keyword arguments for url_result
    kwargs        -- forwarded to playlist_result

    Entries are produced lazily by a generator.
    """
    candidates = map(getter, matches) if getter else matches
    # presumably orderedSet drops duplicates while keeping order -- verify
    entries = (
        self.url_result(self._proto_relative_url(match), ie, **(video_kwargs or {}))
        for match in orderedSet(candidates))
    return self.playlist_result(entries, playlist_id, playlist_title, **kwargs)

View file

@ -1401,6 +1401,11 @@ from .megatvcom import (
MegaTVComIE,
MegaTVComEmbedIE,
)
from .ant1newsgr import (
Ant1NewsGrWatchIE,
Ant1NewsGrArticleIE,
Ant1NewsGrEmbedIE,
)
from .rutv import RUTVIE
from .ruutu import RuutuIE
from .ruv import (

View file

@ -103,6 +103,7 @@ from .videopress import VideoPressIE
from .rutube import RutubeIE
from .glomex import GlomexEmbedIE
from .megatvcom import MegaTVComEmbedIE
from .ant1newsgr import Ant1NewsGrEmbedIE
from .limelight import LimelightBaseIE
from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE
@ -3544,6 +3545,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
# Look for ant1news.gr embeds
ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
if ant1newsgr_urls:
return self.playlist_from_matches(
ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key())
# Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage)
if wapo_urls:

View file

@ -7,7 +7,7 @@ from .common import InfoExtractor
from ..utils import (
determine_ext,
get_elements_text_and_html_by_attribute,
merge_dicts,
scale_thumbnails_to_max_format_width,
unescapeHTML,
)
@ -78,21 +78,6 @@ class TVOpenGrWatchIE(TVOpenGrBaseIE):
self._sort_formats(formats)
return formats, subs
@staticmethod
def _scale_thumbnails_to_max_width(formats, thumbnails, url_width_re):
    """Rewrite thumbnail URLs and dimensions to match the widest format.

    The (width, height) pairs of all formats are compared as tuples and the
    largest one wins. If no format reports a width, thumbnails is returned
    as is. Otherwise the part of each thumbnail URL matched by url_width_re
    is replaced with the winning width.
    """
    dimension_keys = ('width', 'height')
    widest = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    max_width = widest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumb in thumbnails:
        rescaled_url = re.sub(url_width_re, str(max_width), thumb['url'])
        # The rescaled URL and new dimensions are listed before the original thumbnail
        scaled.append(merge_dicts(
            {'url': rescaled_url}, dict(zip(dimension_keys, widest)), thumb))
    return scaled
def _real_extract(self, url):
netloc, video_id, display_id = self._match_valid_url(url).group('netloc', 'id', 'slug')
if netloc.find('tvopen.gr') == -1:
@ -102,7 +87,7 @@ class TVOpenGrWatchIE(TVOpenGrBaseIE):
info['formats'], info['subtitles'] = self._extract_formats_and_subs(
self._download_json(self._API_ENDPOINT, video_id, query={'cid': video_id}),
video_id)
info['thumbnails'] = self._scale_thumbnails_to_max_width(
info['thumbnails'] = scale_thumbnails_to_max_format_width(
info['formats'], info['thumbnails'], r'(?<=/imgHandler/)\d+')
description, _html = next(get_elements_text_and_html_by_attribute('class', 'description', webpage))
if description and _html.startswith('<span '):

View file

@ -5271,6 +5271,28 @@ def join_nonempty(*values, delim='-', from_dict=None):
return delim.join(map(str, filter(None, values)))
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Rescale thumbnails to the dimensions of the widest format.

    The (width, height) pairs of all formats are compared as tuples and the
    largest one is selected. Then, for each thumbnail:
        * the part of its URL matched by url_width_re is replaced with the selected width
        * the selected width and height are merged into the thumbnail dict
    When no format reports a width, thumbnails is returned unchanged.

    This is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    widest = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    max_width = widest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumb in thumbnails:
        rescaled_url = re.sub(url_width_re, str(max_width), thumb['url'])
        # The rescaled URL and new dimensions are listed before the original thumbnail
        scaled.append(merge_dicts(
            {'url': rescaled_url}, dict(zip(dimension_keys, widest)), thumb))
    return scaled
def parse_http_range(range):
""" Parse value of "Range" or "Content-Range" HTTP header into tuple. """
if not range: