From 81da8cbc4513df16d0d04dc2992d6de9ab0f4038 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Mar 2016 22:05:25 +0600 Subject: [PATCH] [udemy] Switch to api 2.0 (Closes #9035) --- youtube_dl/extractor/udemy.py | 60 +++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 71bea5363e..2e54dbc112 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -17,6 +17,7 @@ from ..utils import ( int_or_none, sanitized_Request, unescapeHTML, + update_url_query, urlencode_postdata, ) @@ -54,6 +55,16 @@ class UdemyIE(InfoExtractor): 'only_matching': True, }] + def _extract_course_info(self, webpage, video_id): + course = self._parse_json( + unescapeHTML(self._search_regex( + r'ng-init=["\'].*\bcourse=({.+?});', webpage, 'course', default='{}')), + video_id, fatal=False) or {} + course_id = course.get('id') or self._search_regex( + (r'"id"\s*:\s*(\d+)', r'data-course-id=["\'](\d+)'), + webpage, 'course id') + return course_id, course.get('title') + def _enroll_course(self, base_url, webpage, course_id): def combine_url(base_url, url): return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url @@ -98,7 +109,7 @@ class UdemyIE(InfoExtractor): error_str += ' - %s' % error_data.get('formErrors') raise ExtractorError(error_str, expected=True) - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): + def _download_json(self, url_or_request, *args, **kwargs): headers = { 'X-Udemy-Snail-Case': 'true', 'X-Requested-With': 'XMLHttpRequest', @@ -116,7 +127,7 @@ class UdemyIE(InfoExtractor): else: url_or_request = sanitized_Request(url_or_request, headers=headers) - response = super(UdemyIE, self)._download_json(url_or_request, video_id, note) + response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs) self._handle_error(response) return response @@ -166,9 +177,7 @@ class UdemyIE(InfoExtractor): webpage = self._download_webpage(url, lecture_id) - course_id = self._search_regex( - (r'data-course-id=["\'](\d+)', r'"id"\s*:\s*(\d+)'), - webpage, 'course id') + course_id, _ = self._extract_course_info(webpage, lecture_id) try: lecture = self._download_lecture(course_id, lecture_id) @@ -309,29 +318,32 @@ class UdemyCourseIE(UdemyIE): webpage = self._download_webpage(url, course_path) - response = self._download_json( - 'https://www.udemy.com/api-1.1/courses/%s' % course_path, - course_path, 'Downloading course JSON') - - course_id = response['id'] - course_title = response.get('title') + course_id, title = self._extract_course_info(webpage, course_path) self._enroll_course(url, webpage, course_id) + course_url = update_url_query( + 'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id, + { + 'fields[chapter]': 'title,object_index', + 'fields[lecture]': 'title', + 'page_size': '1000', + }) + response = self._download_json( - 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, - course_id, 'Downloading course curriculum') + course_url, course_id, 'Downloading course curriculum') entries = [] - chapter, chapter_number = None, None - for asset in response: - asset_type = asset.get('assetType') or asset.get('asset_type') - if asset_type == 'Video': - asset_id = asset.get('id') - if asset_id: + chapter, chapter_number = [None] * 2 + for entry in response['results']: + clazz = entry.get('_class') + if clazz == 'lecture': + lecture_id = entry.get('id') + if lecture_id: entry = { '_type': 'url_transparent', - 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), + 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, entry['id']), + 'title': entry.get('title'), 'ie_key': UdemyIE.ie_key(), } if chapter_number: @@ -339,8 +351,8 @@ class UdemyCourseIE(UdemyIE): if chapter: entry['chapter'] = chapter entries.append(entry) - elif asset.get('type') == 'chapter': - chapter_number = asset.get('index') or asset.get('object_index') - chapter = asset.get('title') + elif clazz == 'chapter': + chapter_number = entry.get('object_index') + chapter = entry.get('title') - return self.playlist_result(entries, course_id, course_title) + return self.playlist_result(entries, course_id, title)