From 143db31d48802e26e975a94ab27263df381c9381 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 25 Mar 2021 03:32:15 +0530 Subject: [PATCH] Parse metadata from multiple fields Closes #196 --- README.md | 30 ++-- yt_dlp/YoutubeDL.py | 164 +++++++++++----------- yt_dlp/options.py | 11 +- yt_dlp/postprocessor/metadatafromfield.py | 39 ++--- yt_dlp/utils.py | 14 ++ 5 files changed, 143 insertions(+), 115 deletions(-) diff --git a/README.md b/README.md index 593fdcb1e..ba1b01702 100644 --- a/README.md +++ b/README.md @@ -670,18 +670,24 @@ Then simply run `make`. You can also run `make yt-dlp` instead to compile only t --add-metadata Write metadata to the video file --no-add-metadata Do not write metadata (default) --parse-metadata FIELD:FORMAT Parse additional metadata like title/artist - from other fields. Give field name to - extract data from, and format of the field - seperated by a ":". Either regular - expression with named capture groups or a - similar syntax to the output template can - also be used. The parsed parameters replace - any existing values and can be use in - output template. This option can be used - multiple times. Example: --parse-metadata - "title:%(artist)s - %(title)s" matches a - title like "Coldplay - Paradise". Example - (regex): --parse-metadata + from other fields. Give a template or field + name to extract data from and the format to + interpret it as, seperated by a ":". Either + regular expression with named capture + groups or a similar syntax to the output + template can be used for the FORMAT. + Similarly, the syntax for output template + can be used for FIELD to parse the data + from multiple fields. The parsed parameters + replace any existing values and can be used + in output templates. This option can be + used multiple times. Example: --parse- + metadata "title:%(artist)s - %(title)s" + matches a title like "Coldplay - Paradise". 
+ Example: --parse-metadata "%(series)s + %(episode_number)s:%(title)s" sets the + title using series and episode number. + Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)" --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 7fbd68ce2..72e6059ea 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -67,6 +67,7 @@ from .utils import ( float_or_none, format_bytes, format_field, + FORMAT_RE, formatSeconds, GeoRestrictedError, int_or_none, @@ -772,95 +773,93 @@ class YoutubeDL(object): 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') return outtmpl_dict + def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None): + """ Make the template and info_dict suitable for substitution (outtmpl % info_dict)""" + template_dict = dict(info_dict) + + # duration_string + template_dict['duration_string'] = ( # %(duration>%H-%M-%S)s is wrong if duration > 24hrs + formatSeconds(info_dict['duration'], '-') + if info_dict.get('duration', None) is not None + else None) + + # epoch + template_dict['epoch'] = int(time.time()) + + # autonumber + autonumber_size = self.params.get('autonumber_size') + if autonumber_size is None: + autonumber_size = 5 + template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads + + # resolution if not defined + if template_dict.get('resolution') is None: + if template_dict.get('width') and template_dict.get('height'): + template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height']) + elif template_dict.get('height'): + template_dict['resolution'] = '%sp' % template_dict['height'] + elif template_dict.get('width'): + template_dict['resolution'] = '%dx?' 
+ % template_dict['width'] + + if sanitize is None: + sanitize = lambda k, v: v + template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) + for k, v in template_dict.items() + if v is not None and not isinstance(v, (list, tuple, dict))) + na = self.params.get('outtmpl_na_placeholder', 'NA') + template_dict = collections.defaultdict(lambda: na, template_dict) + + # For fields playlist_index and autonumber convert all occurrences + # of %(field)s to %(field)0Nd for backward compatibility + field_size_compat_map = { + 'playlist_index': len(str(template_dict['n_entries'])), + 'autonumber': autonumber_size, + } + FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s' + mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl) + if mobj: + outtmpl = re.sub( + FIELD_SIZE_COMPAT_RE, + r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')], + outtmpl) + + numeric_fields = list(self._NUMERIC_FIELDS) + + # Format date + FORMAT_DATE_RE = FORMAT_RE.format(r'(?P<key>(?P<field>\w+)>(?P<format>.+?))') + for mobj in re.finditer(FORMAT_DATE_RE, outtmpl): + conv_type, field, frmt, key = mobj.group('type', 'field', 'format', 'key') + if key in template_dict: + continue + value = strftime_or_none(template_dict.get(field), frmt, na) + if conv_type in 'crs': # string + value = sanitize(field, value) + else: # number + numeric_fields.append(key) + value = float_or_none(value, default=None) + if value is not None: + template_dict[key] = value + + # Missing numeric fields used together with integer presentation types + # in format specification will break the argument substitution since + # string NA placeholder is returned for missing fields. We will patch + # output template for missing fields to meet string presentation type. 
+ for numeric_field in numeric_fields: + if numeric_field not in template_dict: + outtmpl = re.sub( + FORMAT_RE.format(re.escape(numeric_field)), + r'%({0})s'.format(numeric_field), outtmpl) + + return outtmpl, template_dict + def _prepare_filename(self, info_dict, tmpl_type='default'): try: - template_dict = dict(info_dict) - - template_dict['duration_string'] = ( # %(duration>%H-%M-%S)s is wrong if duration > 24hrs - formatSeconds(info_dict['duration'], '-') - if info_dict.get('duration', None) is not None - else None) - - template_dict['epoch'] = int(time.time()) - autonumber_size = self.params.get('autonumber_size') - if autonumber_size is None: - autonumber_size = 5 - template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads - if template_dict.get('resolution') is None: - if template_dict.get('width') and template_dict.get('height'): - template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height']) - elif template_dict.get('height'): - template_dict['resolution'] = '%sp' % template_dict['height'] - elif template_dict.get('width'): - template_dict['resolution'] = '%dx?' 
- % template_dict['width'] - sanitize = lambda k, v: sanitize_filename( compat_str(v), restricted=self.params.get('restrictfilenames'), is_id=(k == 'id' or k.endswith('_id'))) - template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) - for k, v in template_dict.items() - if v is not None and not isinstance(v, (list, tuple, dict))) - na = self.params.get('outtmpl_na_placeholder', 'NA') - template_dict = collections.defaultdict(lambda: na, template_dict) - outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default']) - force_ext = OUTTMPL_TYPES.get(tmpl_type) - - # For fields playlist_index and autonumber convert all occurrences - # of %(field)s to %(field)0Nd for backward compatibility - field_size_compat_map = { - 'playlist_index': len(str(template_dict['n_entries'])), - 'autonumber': autonumber_size, - } - FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s' - mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl) - if mobj: - outtmpl = re.sub( - FIELD_SIZE_COMPAT_RE, - r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')], - outtmpl) - - # As of [1] format syntax is: - # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type - # 1. 
https://docs.python.org/2/library/stdtypes.html#string-formatting - FORMAT_RE = r'''(?x) - (?<!%) - % - \({0}\) # mapping key - (?:[#0\-+ ]+)? # conversion flags (optional) - (?:\d+)? # minimum field width (optional) - (?:\.\d+)? # precision (optional) - [hlL]? # length modifier (optional) - (?P<type>[diouxXeEfFgGcrs%]) # conversion type - ''' - - numeric_fields = list(self._NUMERIC_FIELDS) - - # Format date - FORMAT_DATE_RE = FORMAT_RE.format(r'(?P<key>(?P<field>\w+)>(?P<format>.+?))') - for mobj in re.finditer(FORMAT_DATE_RE, outtmpl): - conv_type, field, frmt, key = mobj.group('type', 'field', 'format', 'key') - if key in template_dict: - continue - value = strftime_or_none(template_dict.get(field), frmt, na) - if conv_type in 'crs': # string - value = sanitize(field, value) - else: # number - numeric_fields.append(key) - value = float_or_none(value, default=None) - if value is not None: - template_dict[key] = value - - # Missing numeric fields used together with integer presentation types - # in format specification will break the argument substitution since - # string NA placeholder is returned for missing fields. We will patch - # output template for missing fields to meet string presentation type. - for numeric_field in numeric_fields: - if numeric_field not in template_dict: - outtmpl = re.sub( - FORMAT_RE.format(re.escape(numeric_field)), - r'%({0})s'.format(numeric_field), outtmpl) + outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize) # expand_path translates '%%' into '%' and '$$' into '$' # correspondingly that is not what we want since we need to keep @@ -875,6 +874,7 @@ class YoutubeDL(object): # title "Hello $PATH", we don't want `$PATH` to be expanded. 
filename = expand_path(outtmpl).replace(sep, '') % template_dict + force_ext = OUTTMPL_TYPES.get(tmpl_type) if force_ext is not None: filename = replace_extension(filename, force_ext, template_dict.get('ext')) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 49a275252..95ef27e26 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1147,13 +1147,18 @@ def parseOpts(overrideArguments=None): metavar='FIELD:FORMAT', dest='metafromfield', action='append', help=( 'Parse additional metadata like title/artist from other fields. ' - 'Give field name to extract data from, and format of the field seperated by a ":". ' + 'Give a template or field name to extract data from and the ' + 'format to interpret it as, seperated by a ":". ' 'Either regular expression with named capture groups or a ' - 'similar syntax to the output template can also be used. ' - 'The parsed parameters replace any existing values and can be use in output template. ' + 'similar syntax to the output template can be used for the FORMAT. ' + 'Similarly, the syntax for output template can be used for FIELD ' + 'to parse the data from multiple fields. ' + 'The parsed parameters replace any existing values and can be used in output templates. ' 'This option can be used multiple times. ' 'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like ' '"Coldplay - Paradise". ' + 'Example: --parse-metadata "%(series)s %(episode_number)s:%(title)s" ' + 'sets the title using series and episode number. 
' 'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"')) postproc.add_option( '--xattrs', diff --git a/yt_dlp/postprocessor/metadatafromfield.py b/yt_dlp/postprocessor/metadatafromfield.py index 716911b21..e67e591e1 100644 --- a/yt_dlp/postprocessor/metadatafromfield.py +++ b/yt_dlp/postprocessor/metadatafromfield.py @@ -8,7 +8,7 @@ from ..utils import str_or_none class MetadataFromFieldPP(PostProcessor): - regex = r'(?P<field>\w+):(?P<format>.+)$' + regex = r'(?P<in>.+):(?P<out>.+)$' def __init__(self, downloader, formats): PostProcessor.__init__(self, downloader) @@ -19,11 +19,20 @@ class MetadataFromFieldPP(PostProcessor): match = re.match(self.regex, f) assert match is not None self._data.append({ - 'field': match.group('field'), - 'format': match.group('format'), - 'regex': self.format_to_regex(match.group('format'))}) + 'in': match.group('in'), + 'out': match.group('out'), + 'tmpl': self.field_to_template(match.group('in')), + 'regex': self.format_to_regex(match.group('out')), + }) - def format_to_regex(self, fmt): + @staticmethod + def field_to_template(tmpl): + if re.match(r'\w+$', tmpl): + return '%%(%s)s' % tmpl + return tmpl + + @staticmethod + def format_to_regex(fmt): r""" Converts a string like '%(title)s - %(artist)s' @@ -37,7 +46,7 @@ class MetadataFromFieldPP(PostProcessor): # replace %(..)s with regex group and escape other string parts for match in re.finditer(r'%\((\w+)\)s', fmt): regex += re.escape(fmt[lastpos:match.start()]) - regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)' + regex += r'(?P<%s>[^\r\n]+)' % match.group(1) lastpos = match.end() if lastpos < len(fmt): regex += re.escape(fmt[lastpos:]) @@ -45,22 +54,16 @@ class MetadataFromFieldPP(PostProcessor): def run(self, info): for dictn in self._data: - field, regex = dictn['field'], dictn['regex'] - if field not in info: - self.report_warning('Video doesnot have a %s' % field) - continue - data_to_parse = str_or_none(info[field]) - if data_to_parse is None: - self.report_warning('Field %s cannot 
be parsed' % field) - continue - self.write_debug('Searching for r"%s" in %s' % (regex, field)) - match = re.search(regex, data_to_parse) + tmpl, info_copy = self._downloader.prepare_outtmpl(dictn['tmpl'], info) + data_to_parse = tmpl % info_copy + self.write_debug('Searching for r"%s" in %s' % (dictn['regex'], tmpl)) + match = re.search(dictn['regex'], data_to_parse) if match is None: - self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format'])) + self.report_warning('Could not interpret video %s as "%s"' % (dictn['in'], dictn['out'])) continue for attribute, value in match.groupdict().items(): info[attribute] = value - self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA')) + self.to_screen('parsed %s from "%s": %s' % (attribute, dictn['in'], value if value is not None else 'NA')) return [], info diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 3a8725c21..c14fdb509 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -4205,6 +4205,20 @@ OUTTMPL_TYPES = { 'pl_infojson': 'info.json', } +# As of [1] format syntax is: +# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type +# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting +FORMAT_RE = r'''(?x) + (?<!%) + % + \({0}\) # mapping key + (?:[#0\-+ ]+)? # conversion flags (optional) + (?:\d+)? # minimum field width (optional) + (?:\.\d+)? # precision (optional) + [hlL]? # length modifier (optional) + (?P<type>[diouxXeEfFgGcrs%]) # conversion type +''' + def limit_length(s, length): """ Add ellipses to overly long strings """