Add URL-specific method allow/deny lists

Allows enabling only allow-listed extractors, or disabling specific
deny-listed extractors, for URLs that match a configured regular
expression when a site is added.
Ross Williams 2023-07-31 11:34:03 -04:00
parent 46e80dd509
commit b44f7e68b1
3 changed files with 85 additions and 16 deletions
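
For illustration, the two new settings map a URL regular expression to a list of extractor names; both default to an empty mapping and are supplied as JSON when set through the environment, as the new tests do. The patterns and method names below mirror the tests added later in this commit:

    # Illustrative values; each regex is matched against an added site's URL.
    SAVE_ALLOWLIST = {
        r'/static': ['headers', 'singlefile'],       # matching URLs run only these extractors
        r'example\.com\.html$': ['headers'],
    }
    SAVE_DENYLIST = {
        r'/static': ['singlefile'],                  # matching URLs never run these extractors
    }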


@@ -124,6 +124,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
         'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
         'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
+        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
+        'SAVE_DENYLIST': {'type': dict, 'default': {},},
     },

     'ARCHIVE_METHOD_OPTIONS': {
@@ -355,6 +357,8 @@ def get_commit_hash(config):
 ############################## Derived Config ##################################

+ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
+
 DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
     'USER': {'default': lambda c: SYSTEM_USER},
@@ -371,8 +375,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
     'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)},  # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
-    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
-    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
+    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
+    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},
     'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
@@ -446,10 +450,11 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
     'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
     'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
+    'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
+    'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
 }

 ################################### Helpers ####################################
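
The derived SAVE_ALLOWLIST_PTN / SAVE_DENYLIST_PTN values are plain dicts keyed by compiled patterns. A minimal standalone sketch of that transformation (the helper name here is hypothetical, not part of the commit):

    import re
    from typing import Dict, List

    ALLOWDENYLIST_REGEX_FLAGS = re.IGNORECASE | re.UNICODE | re.MULTILINE

    def compile_method_lists(raw: Dict[str, List[str]]) -> Dict[re.Pattern, List[str]]:
        # {'url regex': [method, ...]}  ->  {compiled pattern: [method, ...]}
        return {re.compile(pattern, ALLOWDENYLIST_REGEX_FLAGS): methods
                for pattern, methods in raw.items()}

    SAVE_ALLOWLIST_PTN = compile_method_lists({r'/static': ['headers', 'singlefile']})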


@@ -4,12 +4,16 @@ import os
 import sys
 from pathlib import Path
-from typing import Optional, List, Iterable, Union
+from typing import Callable, Optional, List, Iterable, TypeAlias, Union
 from datetime import datetime, timezone

 from django.db.models import QuerySet

+from ..config import (
+    SAVE_ALLOWLIST_PTN,
+    SAVE_DENYLIST_PTN,
+)
 from ..core.settings import ERROR_LOG
-from ..index.schema import Link
+from ..index.schema import ArchiveResult, Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (
     load_link_details,
@@ -42,7 +46,11 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 from .headers import should_save_headers, save_headers

-def get_default_archive_methods():
+ShouldSaveFunction: TypeAlias = Callable[[Link, Optional[Path], Optional[bool]], bool]
+SaveFunction: TypeAlias = Callable[[Link, Optional[Path], int], ArchiveResult]
+ArchiveMethodEntry: TypeAlias = tuple[str, ShouldSaveFunction, SaveFunction]
+
+def get_default_archive_methods() -> List[ArchiveMethodEntry]:
     return [
         ('favicon', should_save_favicon, save_favicon),
         ('headers', should_save_headers, save_headers),
@@ -59,14 +67,31 @@ def get_default_archive_methods():
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]

+@enforce_types
+def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]:
+    DEFAULT_METHODS = get_default_archive_methods()
+    allowed_methods = {
+        m for pat, methods in
+        SAVE_ALLOWLIST_PTN.items()
+        if pat.search(link.url)
+        for m in methods
+    } or { m[0] for m in DEFAULT_METHODS }
+    denied_methods = {
+        m for pat, methods in
+        SAVE_DENYLIST_PTN.items()
+        if pat.search(link.url)
+        for m in methods
+    }
+    allowed_methods -= denied_methods
+    return (m for m in DEFAULT_METHODS if m[0] in allowed_methods)
+
 ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]

 @enforce_types
-def ignore_methods(to_ignore: List[str]):
+def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
     ARCHIVE_METHODS = get_default_archive_methods()
-    methods = filter(lambda x: x[0] not in to_ignore, ARCHIVE_METHODS)
-    methods = map(lambda x: x[0], methods)
-    return list(methods)
+    return [x[0] for x in ARCHIVE_METHODS if x[0] not in to_ignore]

 @enforce_types
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
@@ -79,11 +104,11 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
     except Snapshot.DoesNotExist:
         snapshot = write_link_to_sql_index(link)

-    ARCHIVE_METHODS = get_default_archive_methods()
+    active_methods = get_archive_methods_for_link(link)

     if methods:
-        ARCHIVE_METHODS = [
-            method for method in ARCHIVE_METHODS
+        active_methods = [
+            method for method in active_methods
             if method[0] in methods
         ]
@@ -100,7 +125,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
     stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
     start_ts = datetime.now(timezone.utc)

-    for method_name, should_run, method_function in ARCHIVE_METHODS:
+    for method_name, should_run, method_function in active_methods:
         try:
             if method_name not in link.history:
                 link.history[method_name] = []
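
The selection logic added above keeps every default method when no allowlist pattern matches the URL, then subtracts any deny-listed methods. The real get_archive_methods_for_link() works on the (name, should_save, save) tuples returned by get_default_archive_methods(); the standalone sketch below uses bare method names to keep it short, with example patterns mirroring the tests:

    import re

    def select_methods(url, default_methods, allowlist_ptn, denylist_ptn):
        # allowlist_ptn / denylist_ptn: {compiled pattern: [method names]}
        allowed = {m for pat, methods in allowlist_ptn.items()
                   if pat.search(url) for m in methods} or set(default_methods)
        denied = {m for pat, methods in denylist_ptn.items()
                  if pat.search(url) for m in methods}
        return [m for m in default_methods if m in allowed - denied]

    flags = re.IGNORECASE | re.UNICODE | re.MULTILINE
    allow = {re.compile(r'/static', flags): ['headers', 'singlefile']}
    deny = {re.compile(r'/static', flags): ['singlefile']}
    defaults = ['favicon', 'headers', 'singlefile', 'wget']

    print(select_methods('http://127.0.0.1:8080/static/example.com.html', defaults, allow, deny))
    # -> ['headers']   (allowlist narrows the set, denylist then removes singlefile)
    print(select_methods('https://example.org/about', defaults, allow, deny))
    # -> ['favicon', 'headers', 'singlefile', 'wget']   (no pattern matches, so defaults remain)

Note that per-method toggles still apply: archive_link() continues to call each method's should_save check, so an allow-listed extractor only runs if its SAVE_*/USE_* toggle is also enabled.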


@@ -13,12 +13,51 @@ def test_ignore_methods():
     Takes the passed method out of the default methods list and returns that value
     """
     ignored = ignore_methods(['title'])
-    assert should_save_title not in ignored
+    assert "title" not in ignored
+
+def test_save_allowdenylist_works(tmp_path, process, disable_extractors_dict):
+    allow_list = {
+        r'/static': ["headers", "singlefile"],
+        r'example\.com\.html$': ["headers"],
+    }
+    deny_list = {
+        "/static": ["singlefile"],
+    }
+    disable_extractors_dict.update({
+        "SAVE_HEADERS": "true",
+        "USE_SINGLEFILE": "true",
+        "SAVE_ALLOWLIST": pyjson.dumps(allow_list),
+        "SAVE_DENYLIST": pyjson.dumps(deny_list),
+    })
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    singlefile_file = archived_item_path / "singlefile.html"
+    assert not singlefile_file.exists()
+    headers_file = archived_item_path / "headers.json"
+    assert headers_file.exists()
+
+def test_save_denylist_works(tmp_path, process, disable_extractors_dict):
+    deny_list = {
+        "/static": ["singlefile"],
+    }
+    disable_extractors_dict.update({
+        "SAVE_HEADERS": "true",
+        "USE_SINGLEFILE": "true",
+        "SAVE_DENYLIST": pyjson.dumps(deny_list),
+    })
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    singlefile_file = archived_item_path / "singlefile.html"
+    assert not singlefile_file.exists()
+    headers_file = archived_item_path / "headers.json"
+    assert headers_file.exists()

 def test_singlefile_works(tmp_path, process, disable_extractors_dict):
     disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
     add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                  capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     output_file = archived_item_path / "singlefile.html"
     assert output_file.exists()
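
Outside the test suite, the same behaviour can be exercised roughly as sketched below; this assumes an already-initialized ArchiveBox collection in the working directory and a local server at 127.0.0.1:8080 serving the page, as the test fixtures do (the URL and patterns are copied from the tests above):

    import json
    import os
    import subprocess

    env = dict(os.environ)
    env.update({
        'USE_SINGLEFILE': 'true',
        'SAVE_HEADERS': 'true',
        'SAVE_ALLOWLIST': json.dumps({r'/static': ['headers', 'singlefile']}),
        'SAVE_DENYLIST': json.dumps({r'/static': ['singlefile']}),
    })

    # For URLs containing "/static", the resulting snapshot should contain
    # headers.json but no singlefile.html.
    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                   env=env, check=True)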