Add URL-specific method allow/deny lists

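Adds the SAVE_ALLOWLIST and SAVE_DENYLIST config options, each mapping a regular expression to a list of extractor names. When an added URL matches one or more allowlist patterns, only the extractors listed for those patterns are enabled (otherwise all default extractors are); extractors listed for any matching denylist pattern are then disabled.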
2024-11-21 19:53:06 +00:00 · 2023-07-31 11:34:03 -04:00 · 2023-07-31 11:34:03 -04:00 · b44f7e68b1
commit b44f7e68b1
parent 46e80dd509
3 changed files with 85 additions and 16 deletions
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -124,6 +124,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'SAVE_GIT':                 {'type': bool,  'default': True, 'aliases': ('FETCH_GIT',)},
        'SAVE_MEDIA':               {'type': bool,  'default': True, 'aliases': ('FETCH_MEDIA',)},
        'SAVE_ARCHIVE_DOT_ORG':     {'type': bool,  'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
+        'SAVE_ALLOWLIST':           {'type': dict,  'default': {},},
+        'SAVE_DENYLIST':            {'type': dict,  'default': {},},
    },

    'ARCHIVE_METHOD_OPTIONS': {
@@ -355,6 +357,8 @@ def get_commit_hash(config):
 ############################## Derived Config ##################################


+ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
+
 DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'TERM_WIDTH':               {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
    'USER':                     {'default': lambda c: SYSTEM_USER},
@@ -371,8 +375,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'CONFIG_FILE':              {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
    'COOKIES_FILE':             {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
    'CHROME_USER_DATA_DIR':     {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)},   # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
-    'URL_DENYLIST_PTN':         {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
-    'URL_ALLOWLIST_PTN':        {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
+    'URL_DENYLIST_PTN':         {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
+    'URL_ALLOWLIST_PTN':        {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'DIR_OUTPUT_PERMISSIONS':   {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},

    'ARCHIVEBOX_BINARY':        {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
@@ -446,10 +450,11 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'EXTERNAL_LOCATIONS':       {'default': lambda c: get_external_locations(c)},
    'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
    'CHROME_OPTIONS':           {'default': lambda c: get_chrome_info(c)},
+    'SAVE_ALLOWLIST_PTN':       {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
+    'SAVE_DENYLIST_PTN':       {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
 }


-
 ################################### Helpers ####################################


--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -4,12 +4,16 @@ import os
 import sys
 from pathlib import Path

-from typing import Optional, List, Iterable, Union
+from typing import Callable, Optional, List, Iterable, TypeAlias, Union
 from datetime import datetime, timezone
 from django.db.models import QuerySet

+from ..config import (
+    SAVE_ALLOWLIST_PTN,
+    SAVE_DENYLIST_PTN,
+)
 from ..core.settings import ERROR_LOG
-from ..index.schema import Link
+from ..index.schema import ArchiveResult, Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (
    load_link_details,
@@ -42,7 +46,11 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 from .headers import should_save_headers, save_headers


-def get_default_archive_methods():
+ShouldSaveFunction: TypeAlias = Callable[[Link, Optional[Path], Optional[bool]], bool]
+SaveFunction: TypeAlias = Callable[[Link, Optional[Path], int], ArchiveResult]
+ArchiveMethodEntry: TypeAlias = tuple[str, ShouldSaveFunction, SaveFunction]
+
+def get_default_archive_methods() -> List[ArchiveMethodEntry]:
    return [
        ('favicon', should_save_favicon, save_favicon),
        ('headers', should_save_headers, save_headers),
@@ -59,14 +67,31 @@ def get_default_archive_methods():
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    ]

+@enforce_types
+def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]:
+    DEFAULT_METHODS = get_default_archive_methods()
+    allowed_methods = {
+        m for pat, methods in
+        SAVE_ALLOWLIST_PTN.items()
+        if pat.search(link.url)
+        for m in methods
+    } or { m[0] for m in DEFAULT_METHODS }
+    denied_methods = {
+        m for pat, methods in
+        SAVE_DENYLIST_PTN.items()
+        if pat.search(link.url)
+        for m in methods
+    }
+    allowed_methods -= denied_methods
+
+    return (m for m in DEFAULT_METHODS if m[0] in allowed_methods)
+
 ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]

@enforce_types
-def ignore_methods(to_ignore: List[str]):
+def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
    ARCHIVE_METHODS = get_default_archive_methods()
-    methods = filter(lambda x: x[0] not in to_ignore, ARCHIVE_METHODS)
-    methods = map(lambda x: x[0], methods)
-    return list(methods)
+    return [x[0] for x in ARCHIVE_METHODS if x[0] not in to_ignore]

@enforce_types
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
@@ -79,11 +104,11 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
    except Snapshot.DoesNotExist:
        snapshot = write_link_to_sql_index(link)

-    ARCHIVE_METHODS = get_default_archive_methods()
+    active_methods = get_archive_methods_for_link(link)
    
    if methods:
-        ARCHIVE_METHODS = [
-            method for method in ARCHIVE_METHODS
+        active_methods = [
+            method for method in active_methods
            if method[0] in methods
        ]

@@ -100,7 +125,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
        start_ts = datetime.now(timezone.utc)

-        for method_name, should_run, method_function in ARCHIVE_METHODS:
+        for method_name, should_run, method_function in active_methods:
            try:
                if method_name not in link.history:
                    link.history[method_name] = []
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@@ -13,12 +13,51 @@ def test_ignore_methods():
    Takes the passed method out of the default methods list and returns that value
    """
    ignored = ignore_methods(['title'])
-    assert should_save_title not in ignored
+    assert "title" not in ignored
+
+def test_save_allowdenylist_works(tmp_path, process, disable_extractors_dict):
+    allow_list = {
+        r'/static': ["headers", "singlefile"],
+        r'example\.com\.html$': ["headers"],
+    }
+    deny_list = {
+        "/static": ["singlefile"],
+    }
+    disable_extractors_dict.update({
+        "SAVE_HEADERS": "true",
+        "USE_SINGLEFILE": "true",
+        "SAVE_ALLOWLIST": pyjson.dumps(allow_list),
+        "SAVE_DENYLIST": pyjson.dumps(deny_list),
+    })
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict) 
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    singlefile_file = archived_item_path / "singlefile.html"
+    assert not singlefile_file.exists()
+    headers_file = archived_item_path / "headers.json"
+    assert headers_file.exists()
+
+def test_save_denylist_works(tmp_path, process, disable_extractors_dict):
+    deny_list = {
+        "/static": ["singlefile"],
+    }
+    disable_extractors_dict.update({
+        "SAVE_HEADERS": "true",
+        "USE_SINGLEFILE": "true",
+        "SAVE_DENYLIST": pyjson.dumps(deny_list),
+    })
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict) 
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    singlefile_file = archived_item_path / "singlefile.html"
+    assert not singlefile_file.exists()
+    headers_file = archived_item_path / "headers.json"
+    assert headers_file.exists()

 def test_singlefile_works(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
-                                  capture_output=True, env=disable_extractors_dict) 
+                                  capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
    output_file = archived_item_path / "singlefile.html" 
    assert output_file.exists()