From e4dc2701efb789f5164fedac9f3964bb75e8c932 Mon Sep 17 00:00:00 2001 From: longzai <437172242@qq.com> Date: Thu, 11 Apr 2024 15:51:55 +0800 Subject: [PATCH] fix URL_REGEX 2 --- archivebox/util.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/archivebox/util.py b/archivebox/util.py index bccf3553..61d6322e 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -59,12 +59,11 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat() URL_REGEX = re.compile( r'(?=(' r'https?://' #match schemes http and https,but can't match ftp - r'(?:[A-Za-z0-9-]+\.)+[A-Za-z0-9-]+'#match domain - r'(?::\d+)?' #match port,mabey not occur - r'(?:/[^\\#\f\n\r\t\v]*)?' #match path and query,maybe not occur -## r'(?:#[^\]\[\(\)<>"\'\s]*){0,1}' #match fragment,but we don't need it actually + r'(?:[A-Za-z0-9-]+\.)*[A-Za-z0-9-]+'#match domain + r'[^\\#\f\n\r\t\v?&]*' #exclude '#' because don't need fragment, + #exclude '?' and '&' because url is invalid when '&' appear before '?' + r'(?:\?[^\\#\f\n\r\t\v]*)*' r'))', -## re.IGNORECASE, #don't need to consider case problem ) COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')