From 9c1f99402fa25a5a691944c133432741af19829b Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Wed, 23 Sep 2020 23:09:00 +0200 Subject: [PATCH 1/7] [bandcamp] fix regexp for JSON matching on bandcamp --- youtube_dl/extractor/bandcamp.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f14b407dc8..ad18123206 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -91,10 +91,11 @@ class BandcampIE(InfoExtractor): duration = None formats = [] - track_info = self._parse_json( - self._search_regex( - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', - webpage, 'track info', default='{}'), title) + trackinfo_block = self._search_regex( + r'trackinfo":\[\s*({.+?})\s*\],"', + webpage, 'track info', default='{}') + quoted_json = trackinfo_block.replace('"', '"') + track_info = self._parse_json(quoted_json, title) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -117,7 +118,7 @@ class BandcampIE(InfoExtractor): def extract(key): return self._search_regex( - r'\b%s\s*["\']?\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' % key, + r',"%s":(")(?P(?:(?!").)+)"' % key, webpage, key, default=None, group='value') artist = extract('artist') From 14194392a813a12b3a1477ec75bcd0c8626ef3bb Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Sat, 26 Sep 2020 17:34:35 +0200 Subject: [PATCH 2/7] [bandcamp] use unescapeHTML instead of a simple replace of quotes --- youtube_dl/extractor/bandcamp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index ad18123206..55d110e280 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -92,10 +92,10 @@ class BandcampIE(InfoExtractor): formats = [] trackinfo_block = self._search_regex( - r'trackinfo":\[\s*({.+?})\s*\],"', + r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', 
webpage, 'track info', default='{}') - quoted_json = trackinfo_block.replace('"', '"') - track_info = self._parse_json(quoted_json, title) + unescaped_json = unescapeHTML(trackinfo_block) + track_info = self._parse_json(unescaped_json, title) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -118,7 +118,7 @@ class BandcampIE(InfoExtractor): def extract(key): return self._search_regex( - r',"%s":(")(?P(?:(?!").)+)"' % key, + r',(["\']|")%s\1:\1(?P(?:(?!\1).)+)\1' % key, webpage, key, default=None, group='value') artist = extract('artist') From f43a856334b633e3d2f778b455fb08a4a06fbf51 Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Sun, 27 Sep 2020 14:51:42 +0200 Subject: [PATCH 3/7] [bandcamp] match album titles inside the new JSON data block, and unescape the title properly --- youtube_dl/extractor/bandcamp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 55d110e280..f036a89ebd 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -316,10 +316,10 @@ class BandcampAlbumIE(InfoExtractor): if self._html_search_meta('duration', elem_content, default=None)] title = self._html_search_regex( - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', + r'album_title\s*(?:"|["\']):\s*(?:"|["\'])((?:\\.|[^"\\])+?)(?:"|["\'])', webpage, 'title', fatal=False) if title: - title = title.replace(r'\"', '"') + title = unescapeHTML(title) return { '_type': 'playlist', 'uploader_id': uploader_id, From 9385ec4b1c797ffab66b945f23fd4248c0c8a32e Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Sun, 27 Sep 2020 15:11:08 +0200 Subject: [PATCH 4/7] [bandcamp] fix the freeDownloadPage JSON lookup, and use the id from the URL to match the tracks --- youtube_dl/extractor/bandcamp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 
f036a89ebd..eccb867a0d 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -128,12 +128,12 @@ class BandcampIE(InfoExtractor): release_date = unified_strdate(extract('album_release_date')) download_link = self._search_regex( - r'freeDownloadPage\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + r'freeDownloadPage(?:["\']|"):\s*(["\']|")(?P(?:(?!\1).)+)\1', webpage, 'download link', default=None, group='url') if download_link: track_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P\d+),?$', - webpage, 'track id') + r'\?id=(?P\d+)&', + download_link, 'track id') download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') From 37f625598cb9b02cb06b3f12033cc29699d70818 Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Sun, 27 Sep 2020 15:52:55 +0200 Subject: [PATCH 5/7] [bandcamp] update youtube-dl test song information to match title as artist - track, and add missing keys from info_dict --- youtube_dl/extractor/bandcamp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index eccb867a0d..3d32b1e0f2 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,8 +33,11 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, + 'uploader': 'youtube-dl \\', + 'timestamp': 1354224127, + 'upload_date': '20121129', }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { From 75a83afe3b8fd9dfe242ca2de428c313a2bd3e0e Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Mon, 28 Sep 2020 19:42:56 +0200 Subject: [PATCH 6/7] [bandcamp] fix test song uploader name, cleanup remaining " and \ in data, including 
album titles --- youtube_dl/extractor/bandcamp.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3d32b1e0f2..3405b570af 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,9 +33,9 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, - 'uploader': 'youtube-dl \\', + 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", 'timestamp': 1354224127, 'upload_date': '20121129', }, @@ -43,7 +43,7 @@ class BandcampIE(InfoExtractor): }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '853e35bf34aa1d6fe2615ae612564b36', + 'md5': '5d92af55811e47f38962a54c30b07ef0', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -94,11 +94,12 @@ class BandcampIE(InfoExtractor): duration = None formats = [] - trackinfo_block = self._search_regex( + trackinfo_block = self._html_search_regex( r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', webpage, 'track info', default='{}') - unescaped_json = unescapeHTML(trackinfo_block) - track_info = self._parse_json(unescaped_json, title) + + track_info = self._parse_json(trackinfo_block, title) + if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -120,9 +121,10 @@ class BandcampIE(InfoExtractor): duration = float_or_none(track_info.get('duration')) def extract(key): - return self._search_regex( - r',(["\']|")%s\1:\1(?P(?:(?!\1).)+)\1' % key, + data = self._html_search_regex( + r',(["\']|")%s\1:\1(?P(?:\\\1|((?!\1).))+)\1' % key, webpage, key, default=None, group='value') + return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data artist = 
extract('artist') album = extract('album_title') @@ -319,10 +321,12 @@ class BandcampAlbumIE(InfoExtractor): if self._html_search_meta('duration', elem_content, default=None)] title = self._html_search_regex( - r'album_title\s*(?:"|["\']):\s*(?:"|["\'])((?:\\.|[^"\\])+?)(?:"|["\'])', - webpage, 'title', fatal=False) + r'album_title\s*(?:"|["\']):\s*("|["\'])(?P(?:\\\1|((?!\1).))+)\1', + webpage, 'title', fatal=False, group='album') + if title: - title = unescapeHTML(title) + title = title.replace(r'\"', '"') + return { '_type': 'playlist', 'uploader_id': uploader_id, From 03edd545a9e14b0fbcb36574248d8cf0e7a224d6 Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Tue, 29 Sep 2020 12:09:55 +0200 Subject: [PATCH 7/7] [bandcamp] Revert test song title, and extract title generally (which may fail, as the other title json values might come up), instead of out of trackinfo, as bandcamp prefixes it with artist - --- youtube_dl/extractor/bandcamp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3405b570af..04b8aa80f9 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,7 +33,7 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", 'timestamp': 1354224127, @@ -99,7 +99,6 @@ class BandcampIE(InfoExtractor): webpage, 'track info', default='{}') track_info = self._parse_json(trackinfo_block, title) - if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -115,7 +114,7 @@ class BandcampIE(InfoExtractor): 'acodec': ext, 'abr': int_or_none(abr_str), }) - track = track_info.get('title') + track_id = 
str_or_none(track_info.get('track_id') or track_info.get('id')) track_number = int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) @@ -126,6 +125,7 @@ class BandcampIE(InfoExtractor): webpage, key, default=None, group='value') return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data + track = extract('title') artist = extract('artist') album = extract('album_title') timestamp = unified_timestamp(