move wget and mercury into plugins

Nick Sweeting 2024-09-30 21:43:45 -07:00
parent dce79d63c6
commit 69522da4bb
6 changed files with 341 additions and 107 deletions

View file

@@ -59,6 +59,7 @@ from ..misc.logging import (
 from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
 
 ANSI = SHELL_CONFIG.ANSI
 LDAP = LDAP_CONFIG.LDAP_ENABLED
@@ -81,6 +82,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
     'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
+    'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
 
     'ARCHIVE_METHOD_TOGGLES': {
@@ -112,7 +115,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
         'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},    # + ' curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},    # + ' wget/{WGET_VERSION}'},
         'COOKIES_FILE': {'type': str, 'default': None},
@@ -143,16 +145,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
 
-        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
-                                                '--adjust-extension',
-                                                '--convert-links',
-                                                '--force-directories',
-                                                '--backup-converted',
-                                                '--span-hosts',
-                                                '--no-parent',
-                                                '-e', 'robots=off',
-                                                ]},
-        'WGET_EXTRA_ARGS': {'type': list, 'default': None},
         'CURL_ARGS': {'type': list, 'default': ['--silent',
                                                 '--location',
                                                 '--compressed'
@@ -161,16 +153,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'GIT_ARGS': {'type': list, 'default': ['--recursive']},
         'SINGLEFILE_ARGS': {'type': list, 'default': None},
         'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
-        'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
-        'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
     },
 
     'DEPENDENCY_CONFIG': {
         'USE_CURL': {'type': bool, 'default': True},
-        'USE_WGET': {'type': bool, 'default': True},
         'USE_SINGLEFILE': {'type': bool, 'default': True},
         'USE_READABILITY': {'type': bool, 'default': True},
-        'USE_MERCURY': {'type': bool, 'default': True},
         'USE_GIT': {'type': bool, 'default': True},
         'USE_CHROME': {'type': bool, 'default': True},
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
@@ -178,8 +166,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'CURL_BINARY': {'type': str, 'default': 'curl'},
         'GIT_BINARY': {'type': str, 'default': 'git'},
-        'WGET_BINARY': {'type': str, 'default': 'wget'},    # also can accept wget2
-        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
         'NODE_BINARY': {'type': str, 'default': 'node'},
         # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},    # also can accept youtube-dl
         # 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
@@ -232,21 +218,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
     'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
 
-    'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
-    'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
-    'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
-    # 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
-    'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
-    'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
-    'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
-    'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
-
-    'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
-    'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY']},
-    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},    # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
-    'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
-    'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
-
     'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
@@ -649,13 +620,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
             'enabled': config['USE_CURL'],
             'is_valid': bool(config['CURL_VERSION']),
         },
-        'WGET_BINARY': {
-            'path': bin_path(config['WGET_BINARY']),
-            'version': config['WGET_VERSION'],
-            'hash': bin_hash(config['WGET_BINARY']),
-            'enabled': config['USE_WGET'],
-            'is_valid': bool(config['WGET_VERSION']),
-        },
+        # 'WGET_BINARY': {
+        #     'path': bin_path(config['WGET_BINARY']),
+        #     'version': config['WGET_VERSION'],
+        #     'hash': bin_hash(config['WGET_BINARY']),
+        #     'enabled': config['USE_WGET'],
+        #     'is_valid': bool(config['WGET_VERSION']),
+        # },
         # 'NODE_BINARY': {
         #     'path': bin_path(config['NODE_BINARY']),
        #     'version': config['NODE_VERSION'],
@@ -663,13 +634,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
         #     'enabled': config['USE_NODE'],
         #     'is_valid': bool(config['NODE_VERSION']),
         # },
-        'MERCURY_BINARY': {
-            'path': bin_path(config['MERCURY_BINARY']),
-            'version': config['MERCURY_VERSION'],
-            'hash': bin_hash(config['MERCURY_BINARY']),
-            'enabled': config['USE_MERCURY'],
-            'is_valid': bool(config['MERCURY_VERSION']),
-        },
+        # 'MERCURY_BINARY': {
+        #     'path': bin_path(config['MERCURY_BINARY']),
+        #     'version': config['MERCURY_VERSION'],
+        #     'hash': bin_hash(config['MERCURY_BINARY']),
+        #     'enabled': config['USE_MERCURY'],
+        #     'is_valid': bool(config['MERCURY_VERSION']),
+        # },
         'GIT_BINARY': {
             'path': bin_path(config['GIT_BINARY']),
             'version': config['GIT_VERSION'],
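The as_legacy_config_schema() calls above are what keep the old flat CONFIG_SCHEMA working while the WGET/MERCURY options move into plugin ConfigSets. A hypothetical sketch of that bridge (the real implementation lives in abx.archivebox.base_configset and may differ in detail): each pydantic field is converted back into a legacy {'type': ..., 'default': ...} entry keyed by the field name.

# Hypothetical sketch only -- not the real abx.archivebox.base_configset code.
# Assumes the ConfigSet has already resolved lambda defaults to concrete values.
from typing import Any, Dict
from pydantic import BaseModel

def as_legacy_config_schema(config: BaseModel) -> Dict[str, Dict[str, Any]]:
    schema = {}
    for name, field in type(config).model_fields.items():
        schema[name] = {
            'type': field.annotation,           # e.g. bool for SAVE_WGET
            'default': getattr(config, name),   # the resolved per-instance value
        }
    return schema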

View file

@@ -11,16 +11,9 @@ from archivebox.misc.system import run, atomic_write
 from archivebox.misc.util import (
     enforce_types,
     is_static_file,
-    dedupe,
-)
-from ..config.legacy import (
-    TIMEOUT,
-    SAVE_MERCURY,
-    DEPENDENCIES,
-    MERCURY_VERSION,
-    MERCURY_ARGS,
-    MERCURY_EXTRA_ARGS,
 )
+from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY
 
 from ..logging_util import TimedProgress
@@ -49,35 +42,36 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
     if is_static_file(link.url):
         return False
 
-    out_dir = out_dir or Path(link.link_dir)
+    out_dir = Path(out_dir or link.link_dir)
     if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
-    return SAVE_MERCURY
+    return MERCURY_CONFIG.SAVE_MERCURY
 
 
 @enforce_types
-def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=MERCURY_CONFIG.MERCURY_TIMEOUT) -> ArchiveResult:
     """download reader friendly version using @postlight/mercury-parser"""
 
     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute() / get_output_path()
     output = get_output_path()
 
+    mercury_binary = MERCURY_BINARY.load()
+    assert mercury_binary.abspath and mercury_binary.version
+
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
    try:
         output_folder.mkdir(exist_ok=True)
 
         # later options take precedence
-        options = [
-            *MERCURY_ARGS,
-            *MERCURY_EXTRA_ARGS,
-        ]
-
         # By default, get plain text version of article
         cmd = [
-            DEPENDENCIES['MERCURY_BINARY']['path'],
+            str(mercury_binary.abspath),
+            *MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
+            '--format=text',
             link.url,
-            *dedupe(options)
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
         try:
@@ -92,7 +86,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
 
         # Get HTML version of article
         cmd = [
-            DEPENDENCIES['MERCURY_BINARY']['path'],
+            str(mercury_binary.abspath),
+            *MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
             link.url
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
@@ -119,7 +114,7 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=MERCURY_VERSION,
+        cmd_version=str(mercury_binary.version),
         output=output,
         status=status,
         **timer.stats,
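For reference, the two run() calls above boil down to invocations like the following, assuming postlight-parser is on PATH (the example URL is illustrative; the extractor writes the results out under the mercury/ output folder, with mercury/content.html as the canonical output path per the plugin below):

from subprocess import run

url = 'https://example.com/some-article'   # illustrative URL, not from this commit

# first pass: plain-text extraction (--format=text)
text_result = run(['postlight-parser', '--format=text', url],
                  capture_output=True, text=True, timeout=60)

# second pass: no --format flag, so the parser emits its default output,
# from which the HTML content is taken
html_result = run(['postlight-parser', url],
                  capture_output=True, text=True, timeout=60)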

View file

@@ -6,7 +6,6 @@ from pathlib import Path
 from typing import Optional
 from datetime import datetime, timezone
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import (
     enforce_types,
@@ -17,22 +16,10 @@ from archivebox.misc.util import (
     urldecode,
     dedupe,
 )
-from ..config.legacy import (
-    WGET_ARGS,
-    WGET_EXTRA_ARGS,
-    TIMEOUT,
-    SAVE_WGET,
-    SAVE_WARC,
-    WGET_BINARY,
-    WGET_VERSION,
-    RESTRICT_FILE_NAMES,
-    CHECK_SSL_VALIDITY,
-    SAVE_WGET_REQUISITES,
-    WGET_AUTO_COMPRESSION,
-    WGET_USER_AGENT,
-    COOKIES_FILE,
-)
+from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG
+
 from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
def get_output_path(): def get_output_path():
@@ -54,38 +41,43 @@ def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Option
     if not overwrite and output_path and (out_dir / output_path).exists():
         return False
 
-    return SAVE_WGET
+    return WGET_CONFIG.SAVE_WGET
 
 
 @enforce_types
-def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=WGET_CONFIG.WGET_TIMEOUT) -> ArchiveResult:
     """download full site using wget"""
 
-    out_dir = out_dir or link.link_dir
-    if SAVE_WARC:
+    out_dir = Path(out_dir or link.link_dir)
+    assert out_dir.exists()
+
+    if WGET_CONFIG.SAVE_WARC:
         warc_dir = out_dir / "warc"
         warc_dir.mkdir(exist_ok=True)
         warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
 
+    wget_binary = WGET_BINARY.load()
+    assert wget_binary.abspath and wget_binary.version
+
     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
     output: ArchiveOutput = None
     # later options take precedence
     options = [
-        *WGET_ARGS,
-        *WGET_EXTRA_ARGS,
+        *WGET_CONFIG.WGET_ARGS,
+        *WGET_CONFIG.WGET_EXTRA_ARGS,
         '--timeout={}'.format(timeout),
-        *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
-        *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
-        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
-        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
-        *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
-        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
-        *([] if SAVE_WARC else ['--timestamping']),
-        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
+        *(['--restrict-file-names={}'.format(WGET_CONFIG.WGET_RESTRICT_FILE_NAMES)] if WGET_CONFIG.WGET_RESTRICT_FILE_NAMES else []),
+        *(['--warc-file={}'.format(str(warc_path))] if WGET_CONFIG.SAVE_WARC else []),
+        *(['--page-requisites'] if WGET_CONFIG.SAVE_WGET_REQUISITES else []),
+        *(['--user-agent={}'.format(WGET_CONFIG.WGET_USER_AGENT)] if WGET_CONFIG.WGET_USER_AGENT else []),
+        *(['--load-cookies', str(WGET_CONFIG.WGET_COOKIES_FILE)] if WGET_CONFIG.WGET_COOKIES_FILE else []),
+        *(['--compression=auto'] if WGET_CONFIG.WGET_AUTO_COMPRESSION else []),
+        *([] if WGET_CONFIG.SAVE_WARC else ['--timestamping']),
+        *([] if WGET_CONFIG.WGET_CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
         # '--server-response',  # print headers for better error parsing
     ]
     cmd = [
-        WGET_BINARY,
+        str(wget_binary.abspath),
         *dedupe(options),
         link.url,
     ]
@@ -137,7 +129,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=WGET_VERSION,
+        cmd_version=str(wget_binary.version),
        output=output,
         status=status,
         **timer.stats,
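The "later options take precedence" comment above is implemented by dedupe() from archivebox.misc.util. A rough behavioral sketch (not the actual implementation): when a flag appears more than once, e.g. because WGET_EXTRA_ARGS repeats something from WGET_ARGS, only the latest value survives.

# Rough sketch of the behavior, not archivebox.misc.util.dedupe itself.
def dedupe_flags(options: list) -> list:
    deduped = {}
    for option in options:
        # key on the flag name before any '=', so the last value wins
        deduped[option.split('=', 1)[0]] = option
    return list(deduped.values())

assert dedupe_flags(['--timeout=60', '--no-verbose', '--timeout=120']) == ['--timeout=120', '--no-verbose']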

View file

@@ -0,0 +1,82 @@
+__package__ = 'plugins_extractor.mercury'
+
+from typing import List, Optional, Dict
+from pathlib import Path
+from subprocess import run
+
+from pydantic import InstanceOf, Field
+from pydantic_pkgr import BinProvider, BinName, bin_abspath
+
+from abx.archivebox.base_plugin import BasePlugin, BaseHook
+from abx.archivebox.base_configset import BaseConfigSet
+from abx.archivebox.base_binary import BaseBinary, BinProviderName, ProviderLookupDict, env
+from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
+
+from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+
+class MercuryConfig(BaseConfigSet):
+
+    SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
+
+    MERCURY_BINARY: str = Field(default='postlight-parser')
+    MERCURY_EXTRA_ARGS: List[str] = []
+
+    SAVE_MERCURY_REQUISITES: bool = Field(default=True)
+    MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
+
+    MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
+
+
+MERCURY_CONFIG = MercuryConfig()
+
+
+class MercuryBinary(BaseBinary):
+    name: BinName = MERCURY_CONFIG.MERCURY_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        LIB_NPM_BINPROVIDER.name: {
+            'packages': lambda: ['@postlight/parser@^2.2.3'],
+            'version': lambda: run([str(LIB_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH), f'--prefix={LIB_NPM_BINPROVIDER.npm_prefix}', 'info', '@postlight/parser', 'version'], text=True, capture_output=True).stdout.strip(),
+        },
+        SYS_NPM_BINPROVIDER.name: {
+            'packages': lambda: [],    # never try to install things globally
+            'version': lambda: run([str(SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH), '-g', 'info', '@postlight/parser', 'version'], text=True, capture_output=True).stdout.strip(),
+        },
+        env.name: {
+            'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
+        },
+    }
+
+
+MERCURY_BINARY = MercuryBinary()
+
+
+class MercuryExtractor(BaseExtractor):
+    name: ExtractorName = 'mercury'
+    binary: str = MERCURY_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path | None:
+        return snapshot.link_dir / 'mercury' / 'content.html'
+
+
+MERCURY_EXTRACTOR = MercuryExtractor()
+
+
+class MercuryPlugin(BasePlugin):
+    app_label: str = 'mercury'
+    verbose_name: str = 'MERCURY'
+
+    hooks: List[InstanceOf[BaseHook]] = [
+        MERCURY_CONFIG,
+        MERCURY_BINARY,
+        MERCURY_EXTRACTOR,
+    ]
+
+
+PLUGIN = MercuryPlugin()
+DJANGO_APP = PLUGIN.AppConfig
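Usage sketch, mirroring what the updated mercury extractor above does with these objects: load() resolves the binary through the declared binproviders, and the extractor asserts both abspath and version before running it.

mercury = MERCURY_BINARY.load()
assert mercury.abspath and mercury.version   # same guard the extractor uses
print(f'using postlight-parser {mercury.version} at {mercury.abspath}')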

View file

@@ -1,17 +1,21 @@
+__package__ = 'plugins_extractor.wget'
+
 import sys
-from typing import List, Optional
+from typing import List, Optional, Dict
 from pathlib import Path
+from subprocess import run, DEVNULL
 
 from rich import print
 from pydantic import InstanceOf, Field, model_validator
-from pydantic_pkgr import BinProvider, BinName
+from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
 
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-from archivebox.extractors.wget import wget_output_path
+
+from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+
+from .wget_util import wget_output_path
class WgetConfig(BaseConfigSet): class WgetConfig(BaseConfigSet):
@@ -34,13 +38,13 @@ class WgetConfig(BaseConfigSet):
     ]
     WGET_EXTRA_ARGS: List[str] = []
 
-    WGET_AUTO_COMPRESSION: bool = Field(default=True)
     SAVE_WGET_REQUISITES: bool = Field(default=True)
-    WGET_USER_AGENT: str = Field(default='', alias='USER_AGENT')
-    WGET_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
-    WGET_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
-    WGET_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
-    WGET_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')
+    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
+
+    WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
@model_validator(mode='after') @model_validator(mode='after')
def validate_use_ytdlp(self): def validate_use_ytdlp(self):
@@ -53,6 +57,22 @@ class WgetConfig(BaseConfigSet):
             print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
             print(file=sys.stderr)
         return self
+
+    @property
+    def WGET_AUTO_COMPRESSION(self) -> bool:
+        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
+            return self._WGET_AUTO_COMPRESSION
+        try:
+            cmd = [
+                self.WGET_BINARY,
+                "--compression=auto",
+                "--help",
+            ]
+            self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode
+            return self._WGET_AUTO_COMPRESSION
+        except (FileNotFoundError, OSError):
+            self._WGET_AUTO_COMPRESSION = False
+            return False
 
 WGET_CONFIG = WgetConfig()
@@ -60,6 +80,12 @@ WGET_CONFIG = WgetConfig()
 class WgetBinary(BaseBinary):
     name: BinName = WGET_CONFIG.WGET_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        brew.name: {
+            'abspath': lambda: bin_abspath(WGET_CONFIG.WGET_BINARY, PATH=f'/opt/homebrew/opt/wget/bin:{brew.PATH}'),
+        },
+    }
 
 WGET_BINARY = WgetBinary()
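The WGET_AUTO_COMPRESSION property above replaces the old wget_supports_compression() helper from the legacy config: wget only accepts --compression=auto when built with the right libraries, so running `wget --compression=auto --help` and checking the exit code detects support without any network traffic. A standalone version of the same probe:

from subprocess import run, DEVNULL

def wget_supports_compression(wget_binary: str = 'wget') -> bool:
    # a nonzero exit code means this wget build rejects the --compression flag
    try:
        return run([wget_binary, '--compression=auto', '--help'],
                   stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode == 0
    except (FileNotFoundError, OSError):
        return False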

View file

@@ -0,0 +1,168 @@
+__package__ = 'archivebox.extractors'
+
+import re
+from pathlib import Path
+from typing import Optional
+
+from archivebox.misc.util import (
+    enforce_types,
+    without_fragment,
+    without_query,
+    path,
+    domain,
+    urldecode,
+)
+
+
+@enforce_types
+def unsafe_wget_output_path(link) -> Optional[str]:
+    # There used to be a bunch of complex reverse-engineering path mapping logic here,
+    # but it was removed in favor of just walking through the output folder recursively to try to find the
+    # html file that wget produced. It's *much much much* slower than deriving it statically, and is currently
+    # one of the main bottlenecks of ArchiveBox's performance (the output data is often on a slow HDD or network mount).
+    # But it's STILL better than trying to figure out URL -> html filepath mappings ourselves from first principles.
+    full_path = without_fragment(without_query(path(link.url))).strip('/')
+    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
+    for _ in range(4):
+        try:
+            if search_dir.exists():
+                if search_dir.is_dir():
+                    html_files = [
+                        f for f in search_dir.iterdir()
+                        if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
+                    ]
+                    if html_files:
+                        return str(html_files[0].relative_to(link.link_dir))
+
+                    # sometimes wget'd URLs have no ext and return non-html
+                    # e.g. /some/example/rss/all -> some RSS XML content)
+                    #      /some/other/url.o4g   -> some binary unrecognized ext)
+                    # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
+                    last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
+                    for file_present in search_dir.iterdir():
+                        if file_present.name == last_part_of_url:   # compare by filename, iterdir() yields full paths
+                            return str(file_present.relative_to(link.link_dir))
+        except OSError:
+            # OSError 36 and others can happen here, caused by trying to check for impossible paths
+            # (paths derived from URLs can often contain illegal unicode characters or be too long,
+            # causing the OS / filesystem to reject trying to open them with a system-level error)
+            pass
+
+        # Move up one directory level
+        search_dir = search_dir.parent
+
+        if str(search_dir) == link.link_dir:
+            break
+
+    # check for literally any file present that isnt an empty folder
+    domain_dir = Path(domain(link.url).replace(":", "+"))
+    files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
+    if files_within:
+        return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
+
+    # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
+    # that it's better we just pretend it doesnt exist
+    # this is why ArchiveBox specializes in REDUNDANTLY saving copies of sites with multiple different tools
+    return None
+
+
+@enforce_types
+def wget_output_path(link, nocache: bool=False) -> Optional[str]:
+    """calculate the path to the wgetted .html file, since wget may
+    adjust some paths to be different than the base_url path.
+
+    See docs on: wget --adjust-extension (-E), --restrict-file-names=windows|unix|ascii, --convert-links
+
+    WARNING: this function is extremely error prone because mapping URLs to filesystem paths deterministically
+    is basically impossible. Every OS and filesystem has different requirements on what special characters are
+    allowed, and URLs are *full* of all kinds of special characters, illegal unicode, and generally unsafe strings
+    that you dont want anywhere near your filesystem. Also URLs can be obscenely long, but most filesystems dont
+    accept paths longer than 250 characters. On top of all that, this function only exists to try to reverse engineer
+    wget's approach to solving this problem, so this is a shittier, less tested version of their already insanely
+    complicated attempt to do this. Here be dragons:
+    - https://github.com/ArchiveBox/ArchiveBox/issues/549
+    - https://github.com/ArchiveBox/ArchiveBox/issues/1373
+    - https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
+    - and probably many more that I didn't realize were caused by this...
+
+    The only constructive thing we could possibly do to this function is to figure out how to remove it.
+
+    Preach loudly to anyone who will listen: never attempt to map URLs to filesystem paths,
+    and pray you never have to deal with the aftermath of someone else's attempt to do so...
+    """
+    # Wget downloads can save in a number of different ways depending on the url:
+    #    https://example.com
+    #       > example.com/index.html
+    #    https://example.com?v=zzVa_tX1OiI
+    #       > example.com/index.html@v=zzVa_tX1OiI.html
+    #    https://www.example.com/?v=zzVa_tX1OiI
+    #       > example.com/index.html@v=zzVa_tX1OiI.html
+    #    https://example.com/abc
+    #       > example.com/abc.html
+    #    https://example.com/abc/
+    #       > example.com/abc/index.html
+    #    https://example.com/abc?v=zzVa_tX1OiI.html
+    #       > example.com/abc@v=zzVa_tX1OiI.html
+    #    https://example.com/abc/?v=zzVa_tX1OiI.html
+    #       > example.com/abc/index.html@v=zzVa_tX1OiI.html
+    #    https://example.com/abc/test.html
+    #       > example.com/abc/test.html
+    #    https://example.com/abc/test?v=zzVa_tX1OiI
+    #       > example.com/abc/test@v=zzVa_tX1OiI.html
+    #    https://example.com/abc/test/?v=zzVa_tX1OiI
+    #       > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
+
+    cache_key = f'{link.url_hash}:{link.timestamp}-{link.downloaded_at and link.downloaded_at.timestamp()}-wget-output-path'
+
+    if not nocache:
+        from django.core.cache import cache
+        cached_result = cache.get(cache_key)
+        if cached_result:
+            return cached_result
+
+    # There's also lots of complexity around how the urlencoding and renaming
+    # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
+    # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
+    # 4 characters, paths with multiple extensions, etc. the list goes on...
+
+    output_path = None
+    try:
+        output_path = unsafe_wget_output_path(link)
+    except Exception:
+        pass    # better to pretend it just failed to download than expose gnarly OSErrors to users
+
+    # check for unprintable unicode characters
+    # https://github.com/ArchiveBox/ArchiveBox/issues/1373
+    if output_path:
+        safe_path = output_path.encode('utf-8', 'replace').decode()
+        if output_path != safe_path:
+            # contains unprintable unicode characters that will break other parts of archivebox
+            # better to pretend it doesnt exist and fallback to parent dir than crash archivebox
+            output_path = None
+
+    # check for a path that is just too long to safely handle across different OS's
+    # https://github.com/ArchiveBox/ArchiveBox/issues/549
+    if output_path and len(output_path) > 250:
+        output_path = None
+
+    if output_path:
+        if not nocache:
+            cache.set(cache_key, output_path)
+        return output_path
+
+    # fallback to just the domain dir
+    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
+    if search_dir.is_dir():
+        return domain(link.url).replace(":", "+")
+
+    # fallback to just the domain dir without port
+    search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
+    if search_dir.is_dir():
+        return domain(link.url).split(":", 1)[0]
+
+    return None
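A usage sketch for the function above with a stand-in Link object (FakeLink and its field values are hypothetical; the real Link model lives in archivebox.index.schema). With nocache=True no Django cache is touched, which makes this the easiest way to exercise the path-guessing logic directly:

from dataclasses import dataclass

@dataclass
class FakeLink:            # hypothetical stand-in with just the attributes read above
    url: str
    link_dir: str
    url_hash: str = 'abc123'
    timestamp: str = '1700000000.0'
    downloaded_at: object = None

link = FakeLink(url='https://example.com/abc/', link_dir='/data/archive/1700000000.0')

# if wget saved example.com/abc/index.html inside link_dir, then per the table above:
#   wget_output_path(link, nocache=True) == 'example.com/abc/index.html'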