From e4974d3536973dab4b5e04cb9061a102db27c968 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 6 Jul 2021 23:17:03 -0400 Subject: [PATCH] support negation patterns by checking both re.search and re.match --- archivebox/index/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 252244f1..198cc563 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -141,7 +141,10 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]: continue if scheme(link.url) not in ('http', 'https', 'ftp'): continue - if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url): + if URL_BLACKLIST_PTN and (URL_BLACKLIST_PTN.match(link.url) or URL_BLACKLIST_PTN.search(link.url)): + # https://stackoverflow.com/questions/180986/what-is-the-difference-between-re-search-and-re-match + # we want both behaviors in order to support multiple patterns in the regex, + # and negation regexes like (?!someptnhere) to allow for whitelisting continue yield link