From 0d2a0ecac3d721b4b01ebc2f00f922740961e515 Mon Sep 17 00:00:00 2001 From: Alex Karabanov Date: Sun, 6 Nov 2022 22:30:59 +0400 Subject: [PATCH] [extractor/listennotes] Add extractor (#5310) Closes #5262 Authored by: lksj, pukkandan --- yt_dlp/compat/__init__.py | 2 +- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/listennotes.py | 86 +++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 yt_dlp/extractor/listennotes.py diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py index 6d85a6a1f..5d3db4b4c 100644 --- a/yt_dlp/compat/__init__.py +++ b/yt_dlp/compat/__init__.py @@ -14,7 +14,7 @@ passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( # HTMLParseError has been deprecated in Python 3.3 and removed in # Python 3.5. Introducing dummy exception for Python >3.5 for compatible # and uniform cross-version exception handling -class compat_HTMLParseError(Exception): +class compat_HTMLParseError(ValueError): pass diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1960692ef..8c70d1585 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -912,6 +912,7 @@ from .linkedin import ( ) from .linuxacademy import LinuxAcademyIE from .liputan6 import Liputan6IE +from .listennotes import ListenNotesIE from .litv import LiTVIE from .livejournal import LiveJournalIE from .livestream import ( diff --git a/yt_dlp/extractor/listennotes.py b/yt_dlp/extractor/listennotes.py new file mode 100644 index 000000000..4ebc9be4d --- /dev/null +++ b/yt_dlp/extractor/listennotes.py @@ -0,0 +1,86 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_id, + get_element_text_and_html_by_tag, + parse_duration, + strip_or_none, + traverse_obj, + try_call, +) + + +class ListenNotesIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?listennotes\.com/podcasts/[^/]+/[^/]+-(?P<id>.+)/' + _TESTS = [{ + 'url': 'https://www.listennotes.com/podcasts/thriving-on-overload/tim-oreilly-on-noticing-KrDgvNb_u1n/', + 'md5': '5b91a32f841e5788fb82b72a1a8af7f7', + 'info_dict': { + 'id': 'KrDgvNb_u1n', + 'ext': 'mp3', + 'title': 'md5:32236591a921adf17bbdbf0441b6c0e9', + 'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd', + 'duration': 2148.0, + 'channel': 'Thriving on Overload', + 'channel_id': 'ed84wITivxF', + 'episode_id': 'e1312583fa7b4e24acfbb5131050be00', + 'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg', + 'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/', + 'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'], + } + }, { + 'url': 'https://www.listennotes.com/podcasts/ask-noah-show/episode-177-wireguard-with-lwEA3154JzG/', + 'md5': '62fb4ffe7fc525632a1138bf72a5ce53', + 'info_dict': { + 'id': 'lwEA3154JzG', + 'ext': 'mp3', + 'title': 'Episode 177: WireGuard with Jason Donenfeld', + 'description': 'md5:24744f36456a3e95f83c1193a3458594', + 'duration': 3861.0, + 'channel': 'Ask Noah Show', + 'channel_id': '4DQTzdS5-j7', + 'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4', + 'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/', + 'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg', + 'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'], + } + }] + + def _clean_description(self, description): + return clean_html(re.sub(r'(</?(div|p)>\s*)+', '<br/><br/>', description or '')) + + def _real_extract(self, url): + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id) + data = self._search_json( + r'