From 359df0fc423b4a5d5af8113d42648fdea22e81ea Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Sat, 27 Nov 2021 07:51:32 +0100
Subject: [PATCH] [nebula] Add NebulaCollectionIE and rewrite extractor (#1694)

Closes #1690
Authored by: hheimbuerger
---
 yt_dlp/extractor/extractors.py |   5 +-
 yt_dlp/extractor/nebula.py     | 374 +++++++++++++++++++--------------
 2 files changed, 217 insertions(+), 162 deletions(-)

diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index a277bf7226..2fb9515c0b 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -889,7 +889,10 @@ from .ndr import (
     NJoyEmbedIE,
 )
 from .ndtv import NDTVIE
-from .nebula import NebulaIE
+from .nebula import (
+    NebulaIE,
+    NebulaCollectionIE,
+)
 from .nerdcubed import NerdCubedFeedIE
 from .netzkino import NetzkinoIE
 from .neteasemusic import (
diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py
index 9698a358e1..d235805c35 100644
--- a/yt_dlp/extractor/nebula.py
+++ b/yt_dlp/extractor/nebula.py
@@ -1,22 +1,163 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import itertools
 import json
 import time
+import urllib
 
-from urllib.error import HTTPError
-from .common import InfoExtractor
-from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote
 from ..utils import (
     ExtractorError,
     parse_iso8601,
     try_get,
-    urljoin,
 )
+from .common import InfoExtractor
 
 
-class NebulaIE(InfoExtractor):
+class NebulaBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'watchnebula'
 
+    _nebula_api_token = None  # assigned in _login(); sent as 'Token ...' when auth_type == 'api'
+    _nebula_bearer_token = None  # assigned in _login(); sent as 'Bearer ...' when auth_type == 'bearer'
+    _zype_access_token = None  # assigned in _login(); embedded into the Zype player URL
+
+    def _perform_nebula_auth(self):
+        """
+        Log in with the configured credentials and return the received API token, which is also
+        stored in the 'nebula-auth' cookie. Without credentials or token: raise_login_required().
+        """
+        email, secret = self._get_login_info()
+        if not email or not secret:
+            self.raise_login_required()
+
+        request_headers = {
+            'content-type': 'application/json',
+            # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
+            'cookie': '',
+        }
+        login_result = self._download_json(
+            'https://api.watchnebula.com/api/v1/auth/login/', video_id=None, fatal=False,
+            data=json.dumps({'email': email, 'password': secret}).encode('utf8'),
+            headers=request_headers,
+            note='Logging in to Nebula with supplied credentials',
+            errnote='Authentication failed or rejected')
+        api_token = login_result.get('key') if login_result else None
+        if not api_token:
+            self.raise_login_required()
+
+        # save nebula token as cookie: URL-quoted compact JSON, expiring after 365 days
+        cookie_state = {'apiToken': api_token, 'isLoggingIn': False, 'isLoggingOut': False}
+        self._set_cookie(
+            'nebula.app', 'nebula-auth',
+            urllib.parse.quote(json.dumps(cookie_state, separators=(',', ':'))),
+            expire_time=int(time.time()) + 86400 * 365)
+
+        return api_token
+
+    def _retrieve_nebula_api_token(self):
+        """
+        Return the 'apiToken' stored in the 'nebula-auth' cookie for nebula.app. Without that
+        cookie, or if it holds no token, authenticate using credentials instead.
+        """
+        auth_cookie = self._get_cookies('https://nebula.app').get('nebula-auth')
+        if not auth_cookie:
+            return self._perform_nebula_auth()
+
+        self.to_screen('Authenticating to Nebula with token from cookie jar')
+        cookie_text = urllib.parse.unquote(auth_cookie.value)
+        stored_token = self._parse_json(cookie_text, None).get('apiToken')
+
+        # a missing or empty token means that the cookie cannot be used
+        return stored_token or self._perform_nebula_auth()
+
+    def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
+        """Download JSON from url with an API or bearer token; on 401/403 log in again and retry once."""
+        assert method in ('GET', 'POST',)
+        assert auth_type in ('api', 'bearer',)
+
+        def send_request():
+            credential = f'Bearer {self._nebula_bearer_token}' if auth_type != 'api' else f'Token {self._nebula_api_token}'
+            return self._download_json(
+                url, video_id, note=note, data=b'' if method == 'POST' else None,
+                headers={'Authorization': credential})
+
+        try:
+            return send_request()
+        except ExtractorError as exc:
+            if not (isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403)):
+                raise
+            # the tokens are read anew inside send_request(), so the retry uses the refreshed ones
+            self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}')
+            self._login()
+            return send_request()
+
+    def _fetch_nebula_bearer_token(self):
+        """
+        Request a Bearer token from the Nebula authorization endpoint, using the API token.
+        Calls made with auth_type='bearer' (e.g. fetching video meta data) send this token.
+        """
+        authorization = self._call_nebula_api(
+            'https://api.watchnebula.com/api/v1/authorization/', method='POST', note='Authorizing to Nebula')
+        return authorization['token']
+
+    def _fetch_zype_access_token(self):
+        """
+        Extract the Zype access token from the Nebula user object. The token is required to access
+        video streams -- here it is put into the generated Zype video URLs.
+        """
+        account = self._call_nebula_api(
+            'https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token')
+        zype_token = try_get(account, lambda x: x['zype_auth_info']['access_token'], str)
+        if zype_token:
+            return zype_token
+
+        if not try_get(account, lambda x: x['is_subscribed'], bool):
+            raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
+        # TODO: Reimplement the same Zype token polling the Nebula frontend implements
+        # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
+        raise ExtractorError(
+            'Unable to extract Zype access token from Nebula API authentication endpoint. '
+            'Open an arbitrary video in a browser with this account to generate a token', expected=True)
+
+    def _build_video_info(self, episode):
+        """
+        Turn an episode object into a url_transparent info dict pointing at the Zype player
+        embed (ie_key 'Zype'); the channel slug and title also serve as uploader data.
+        """
+        video_id, slug = episode['zype_id'], episode['channel_slug']
+        info = {
+            'id': video_id,
+            'display_id': episode['slug'],
+            '_type': 'url_transparent',
+            'ie_key': 'Zype',
+            'url': f'https://player.zype.com/embed/{video_id}.html?access_token={self._zype_access_token}',
+            'title': episode['title'],
+            'description': episode['description'],
+            'timestamp': parse_iso8601(episode['published_at']),
+            'thumbnails': [
+                # no 'id' is set: tn.get('name') appears to be null
+                {'url': thumb['original'], 'height': size}
+                for size, thumb in episode['assets']['thumbnail'].items()],
+            'duration': episode['duration'],
+        }
+        name, page = episode['channel_title'], f'https://nebula.app/{slug}'
+        info.update({
+            'channel': name, 'channel_id': slug, 'channel_url': page,
+            'uploader': name, 'uploader_id': slug, 'uploader_url': page,
+            'series': name, 'creator': name,
+        })
+        return info
+
+    def _login(self):  # (re)acquire all three tokens
+        self._nebula_api_token = self._retrieve_nebula_api_token()  # must come first: both calls below authenticate with it
+        self._nebula_bearer_token = self._fetch_nebula_bearer_token()
+        self._zype_access_token = self._fetch_zype_access_token()
+
+    def _real_initialize(self):  # presumably InfoExtractor's initialization hook, so subclasses log in first -- TODO confirm
+        self._login()
+
+
+class NebulaIE(NebulaBaseIE):
     _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)'
     _TESTS = [
         {
@@ -30,12 +171,13 @@ class NebulaIE(InfoExtractor):
                 'upload_date': '20180731',
                 'timestamp': 1533009600,
                 'channel': 'Lindsay Ellis',
+                'channel_id': 'lindsayellis',
                 'uploader': 'Lindsay Ellis',
+                'uploader_id': 'lindsayellis',
             },
             'params': {
                 'usenetrc': True,
             },
-            'skip': 'All Nebula content requires authentication',
         },
         {
             'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
@@ -47,13 +189,14 @@ class NebulaIE(InfoExtractor):
                 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
                 'upload_date': '20200327',
                 'timestamp': 1585348140,
-                'channel': 'The Logistics of D-Day',
-                'uploader': 'The Logistics of D-Day',
+                'channel': 'Real Engineering',
+                'channel_id': 'realengineering',
+                'uploader': 'Real Engineering',
+                'uploader_id': 'realengineering',
             },
             'params': {
                 'usenetrc': True,
             },
-            'skip': 'All Nebula content requires authentication',
         },
         {
             'url': 'https://nebula.app/videos/money-episode-1-the-draw',
@@ -66,173 +209,82 @@ class NebulaIE(InfoExtractor):
                 'upload_date': '20200323',
                 'timestamp': 1584980400,
                 'channel': 'Tom Scott Presents: Money',
+                'channel_id': 'tom-scott-presents-money',
                 'uploader': 'Tom Scott Presents: Money',
+                'uploader_id': 'tom-scott-presents-money',
             },
             'params': {
                 'usenetrc': True,
             },
-            'skip': 'All Nebula content requires authentication',
         },
         {
             'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
             'only_matching': True,
         },
     ]
-    _NETRC_MACHINE = 'watchnebula'
 
-    _nebula_token = None
-
-    def _retrieve_nebula_auth(self):
-        """
-        Log in to Nebula, and returns a Nebula API token
-        """
-
-        username, password = self._get_login_info()
-        if not (username and password):
-            self.raise_login_required()
-
-        self.report_login()
-        data = json.dumps({'email': username, 'password': password}).encode('utf8')
-        response = self._download_json(
-            'https://api.watchnebula.com/api/v1/auth/login/',
-            data=data, fatal=False, video_id=None,
-            headers={
-                'content-type': 'application/json',
-                # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
-                'cookie': ''
-            },
-            note='Authenticating to Nebula with supplied credentials',
-            errnote='Authentication failed or rejected')
-        if not response or not response.get('key'):
-            self.raise_login_required()
-
-        # save nebula token as cookie
-        self._set_cookie(
-            'nebula.app', 'nebula-auth',
-            compat_urllib_parse_quote(
-                json.dumps({
-                    "apiToken": response["key"],
-                    "isLoggingIn": False,
-                    "isLoggingOut": False,
-                }, separators=(",", ":"))),
-            expire_time=int(time.time()) + 86400 * 365,
-        )
-
-        return response['key']
-
-    def _retrieve_zype_api_key(self, page_url, display_id):
-        """
-        Retrieves the Zype API key
-        """
-
-        # Find the js that has the API key from the webpage and download it
-        webpage = self._download_webpage(page_url, video_id=display_id)
-        main_script_relpath = self._search_regex(
-            r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage,
-            group='script_relpath', name='script relative path', fatal=True)
-        main_script_abspath = urljoin(page_url, main_script_relpath)
-        main_script = self._download_webpage(main_script_abspath, video_id=display_id,
-                                             note='Retrieving Zype API key')
-
-        api_key = self._search_regex(
-            r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script,
-            group='api_key', name='API key', fatal=True)
-
-        return api_key
-
-    def _call_zype_api(self, path, params, video_id, api_key, note):
-        """
-        A helper for making calls to the Zype API.
-        """
-        query = {'api_key': api_key, 'per_page': 1}
-        query.update(params)
-        return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note)
-
-    def _call_nebula_api(self, path, video_id, access_token, note):
-        """
-        A helper for making calls to the Nebula API.
-        """
-        return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={
-            'Authorization': 'Token {access_token}'.format(access_token=access_token)
-        }, note=note)
-
-    def _fetch_zype_access_token(self, video_id):
-        try:
-            user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')
-        except ExtractorError as exc:
-            # if 401, attempt credential auth and retry
-            if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401:
-                self._nebula_token = self._retrieve_nebula_auth()
-                user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')
-            else:
-                raise
-
-        access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)
-        if not access_token:
-            if try_get(user_object, lambda x: x['is_subscribed'], bool):
-                # TODO: Reimplement the same Zype token polling the Nebula frontend implements
-                # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
-                raise ExtractorError(
-                    'Unable to extract Zype access token from Nebula API authentication endpoint. '
-                    'Open an arbitrary video in a browser with this account to generate a token',
-                    expected=True)
-            raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
-        return access_token
-
-    def _extract_channel_title(self, video_meta):
-        # TODO: Implement the API calls giving us the channel list,
-        # so that we can do the title lookup and then figure out the channel URL
-        categories = video_meta.get('categories', []) if video_meta else []
-        # the channel name is the value of the first category
-        for category in categories:
-            if category.get('value'):
-                return category['value'][0]
-
-    def _real_initialize(self):
-        # check cookie jar for valid token
-        nebula_cookies = self._get_cookies('https://nebula.app')
-        nebula_cookie = nebula_cookies.get('nebula-auth')
-        if nebula_cookie:
-            self.to_screen('Authenticating to Nebula with token from cookie jar')
-            nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value)
-            self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken')
-
-        # try to authenticate using credentials if no valid token has been found
-        if not self._nebula_token:
-            self._nebula_token = self._retrieve_nebula_auth()
+    def _fetch_video_metadata(self, slug):
+        """Request the metadata object of the video with the given slug, using the bearer token."""
+        return self._call_nebula_api(
+            f'https://content.watchnebula.com/video/{slug}/', slug,
+            auth_type='bearer', note='Fetching video meta data')
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        api_key = self._retrieve_zype_api_key(url, display_id)
+        """Resolve the slug in the URL to its video metadata and convert that to an info dict."""
+        video_slug = self._match_id(url)
+        return self._build_video_info(self._fetch_video_metadata(video_slug))
 
-        response = self._call_zype_api('/videos', {'friendly_title': display_id},
-                                       display_id, api_key, note='Retrieving metadata from Zype')
-        if len(response.get('response') or []) != 1:
-            raise ExtractorError('Unable to find video on Zype API')
-        video_meta = response['response'][0]
 
-        video_id = video_meta['_id']
-        zype_access_token = self._fetch_zype_access_token(display_id)
+class NebulaCollectionIE(NebulaBaseIE):
+    IE_NAME = 'nebula:collection'
+    _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!videos/)(?P<id>[-\w]+)'  # the lookahead leaves /videos/... URLs to NebulaIE
+    _TESTS = [
+        {
+            'url': 'https://nebula.app/tom-scott-presents-money',
+            'info_dict': {
+                'id': 'tom-scott-presents-money',
+                'title': 'Tom Scott Presents: Money',
+                'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
+            },
+            'playlist_count': 5,
+            'params': {
+                'usenetrc': True,  # credentials are expected to come from .netrc
+            },
+        }, {
+            'url': 'https://nebula.app/lindsayellis',
+            'info_dict': {
+                'id': 'lindsayellis',
+                'title': 'Lindsay Ellis',
+                'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
+            },
+            'playlist_mincount': 100,  # presumably large enough to span several API pages -- TODO confirm
+            'params': {
+                'usenetrc': True,
+            },
+        },
+    ]
 
-        channel_title = self._extract_channel_title(video_meta)
+    def _generate_playlist_entries(self, collection_id, channel):
+        """Yield an info dict per episode, following the 'next' links until there is none left."""
+        page = channel['episodes']
+        for next_page_num in itertools.count(2):
+            yield from map(self._build_video_info, page['results'])
+            if not page['next']:
+                return
+            # `channel` already was page 1, so the first follow-up request is announced as page 2
+            page = self._call_nebula_api(
+                page['next'], collection_id, auth_type='bearer',
+                note=f'Retrieving channel page {next_page_num}')['episodes']
 
-        return {
-            'id': video_id,
-            'display_id': display_id,
-            '_type': 'url_transparent',
-            'ie_key': 'Zype',
-            'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token),
-            'title': video_meta.get('title'),
-            'description': video_meta.get('description'),
-            'timestamp': parse_iso8601(video_meta.get('published_at')),
-            'thumbnails': [{
-                'id': tn.get('name'),  # this appears to be null
-                'url': tn['url'],
-                'width': tn.get('width'),
-                'height': tn.get('height'),
-            } for tn in video_meta.get('thumbnails', [])],
-            'duration': video_meta.get('duration'),
-            'channel': channel_title,
-            'uploader': channel_title,  # we chose uploader = channel name
-            # TODO: uploader_url, channel_id, channel_url
-        }
+    def _real_extract(self, url):
+        """Return a playlist of the episodes of the channel/collection whose slug is in the URL."""
+        slug = self._match_id(url)
+        first_page = self._call_nebula_api(
+            f'https://content.watchnebula.com/video/channels/{slug}/', slug,
+            auth_type='bearer', note='Retrieving channel')
+        details = first_page['details']
+
+        # the entries are a generator: further pages are requested only while it is consumed
+        return self.playlist_result(
+            entries=self._generate_playlist_entries(slug, first_page), playlist_id=slug,
+            playlist_title=details['title'], playlist_description=details['description'])