mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-12-14 07:12:41 +00:00
[NBC] Enhance embedURL extraction (closes #2549)
This commit is contained in:
parent
dc1eed93be
commit
0fe2ff78e6
3 changed files with 22 additions and 2 deletions
|
@ -53,6 +53,7 @@ from youtube_dl.utils import (
|
|||
unified_strdate,
|
||||
unsmuggle_url,
|
||||
uppercase_escape,
|
||||
lowercase_escape,
|
||||
url_basename,
|
||||
urlencode_postdata,
|
||||
version_tuple,
|
||||
|
@ -418,6 +419,10 @@ class TestUtil(unittest.TestCase):
|
|||
self.assertEqual(uppercase_escape('aä'), 'aä')
|
||||
self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
|
||||
|
||||
def test_lowercase_escape(self):
    # Each pair is (input, expected output): text without escapes must pass
    # through untouched, and a lowercase \uXXXX escape must be decoded.
    expectations = (
        ('aä', 'aä'),
        ('\\u0026', '&'),
    )
    for escaped, expected in expectations:
        self.assertEqual(lowercase_escape(escaped), expected)
|
||||
|
||||
def test_limit_length(self):
|
||||
self.assertEqual(limit_length(None, 12), None)
|
||||
self.assertEqual(limit_length('foo', 12), 'foo')
|
||||
|
|
|
@ -10,6 +10,8 @@ from ..compat import (
|
|||
from ..utils import (
|
||||
ExtractorError,
|
||||
find_xpath_attr,
|
||||
lowercase_escape,
|
||||
unescapeHTML,
|
||||
)
|
||||
|
||||
|
||||
|
@ -46,18 +48,23 @@ class NBCIE(InfoExtractor):
|
|||
'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
|
||||
},
|
||||
'skip': 'Only works from US',
|
||||
},
|
||||
{
|
||||
# This video has expired, but its page has an escaped embedURL
|
||||
'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
|
||||
'skip': 'Expired'
|
||||
}
|
||||
]
|
||||
|
||||
def _real_extract(self, url):
    """Locate the thePlatform player URL on an NBC page and hand it off.

    The page is downloaded and searched with two patterns, in order: the
    player element's ``data-mpx-url`` / ``src`` attribute, then a
    JSON-style ``"embedURL": "..."`` value. The result is returned via
    ``self.url_result`` so the matching extractor can process it.
    """
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)
    # The embedURL variant is embedded in a JSON-like blob, so it may carry
    # escaped slashes ("\/"), \uXXXX escapes and HTML entities; undo all of
    # them after dropping the '_no_endcard' marker from the URL.
    theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
        [
            r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
            r'"embedURL"\s*:\s*"([^"]+)"'
        ],
        webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
    # Protocol-relative URLs need an explicit scheme before being passed on.
    if theplatform_url.startswith('//'):
        theplatform_url = 'http:' + theplatform_url
    return self.url_result(theplatform_url)
|
||||
|
|
|
@ -1486,6 +1486,14 @@ def uppercase_escape(s):
|
|||
s)
|
||||
|
||||
|
||||
def lowercase_escape(s):
    r"""Decode lowercase ``\uXXXX`` escape sequences embedded in *s*.

    Only literal backslash-u sequences followed by exactly four hex digits
    are touched; all other text is returned unchanged.

    A high surrogate escape (``\ud800``-``\udbff``) immediately followed by
    a low surrogate escape (``\udc00``-``\udfff``) is decoded as the single
    code point the pair encodes, which is how JavaScript/JSON spell
    characters outside the Basic Multilingual Plane. Decoding the two halves
    separately would leave two lone surrogates in the result. An unpaired
    surrogate escape is still decoded on its own, as before.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')

    def _decode(m):
        high, low = m.group('high'), m.group('low')
        if high is None:
            # Plain \uXXXX escape (or an unpaired surrogate).
            return unicode_escape(m.group(0))[0]
        # Standard UTF-16 surrogate pair arithmetic.
        code_point = (
            0x10000
            + ((int(high, 16) - 0xD800) << 10)
            + (int(low, 16) - 0xDC00))
        # Go through a \UXXXXXXXX escape rather than chr()/unichr() so the
        # same code path works regardless of the interpreter's string width.
        return unicode_escape('\\U%08x' % code_point)[0]

    return re.sub(
        r'\\u(?P<high>[dD][89abAB][0-9a-fA-F]{2})'
        r'\\u(?P<low>[dD][c-fC-F][0-9a-fA-F]{2})'
        r'|\\u[0-9a-fA-F]{4}',
        _decode,
        s)
|
||||
|
||||
|
||||
def escape_rfc3986(s):
|
||||
"""Escape non-ASCII characters as suggested by RFC 3986"""
|
||||
if sys.version_info < (3, 0) and isinstance(s, compat_str):
|
||||
|
|
Loading…
Reference in a new issue