From 78fb87b2837e15124b5855734a951598dfe025fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 Nov 2013 12:54:13 +0100 Subject: [PATCH] Don't accept '>' inside the content attribute in OpenGraph regexes --- youtube_dl/extractor/common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e021768528..45dd01789b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -316,10 +316,12 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - esc_prop = re.escape(prop) + content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' + property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop) + template = r']+?%s[^>]+?%s' return [ - r']+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop, - r']+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop, + template % (property_re, content_re), + template % (content_re, property_re), ] def _og_search_property(self, prop, html, name=None, **kargs):