move wget and mercury into plugins

2024-11-22 12:13:05 +00:00 · 2024-09-30 21:43:45 -07:00 · 2024-09-30 21:43:45 -07:00 · 69522da4bb
commit 69522da4bb
parent dce79d63c6
6 changed files with 341 additions and 107 deletions
--- a/archivebox/config/legacy.py
+++ b/archivebox/config/legacy.py
@ -59,6 +59,7 @@ from ..misc.logging import (
 from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.wget.apps import WGET_CONFIG

 ANSI = SHELL_CONFIG.ANSI
 LDAP = LDAP_CONFIG.LDAP_ENABLED
@ -82,6 +83,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
    
    'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
    
+    'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
+

    'ARCHIVE_METHOD_TOGGLES': {
        'SAVE_TITLE':               {'type': bool,  'default': True, 'aliases': ('FETCH_TITLE',)},
@ -112,7 +115,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {

        'USER_AGENT':               {'type': str,   'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
        'CURL_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT']}, # + ' curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT']}, #  + ' wget/{WGET_VERSION}'},

        'COOKIES_FILE':             {'type': str,   'default': None},

@ -143,16 +145,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'YOUTUBEDL_EXTRA_ARGS':     {'type': list,  'default': None},


-        'WGET_ARGS':                {'type': list,  'default': ['--no-verbose',
-                                                                '--adjust-extension',
-                                                                '--convert-links',
-                                                                '--force-directories',
-                                                                '--backup-converted',
-                                                                '--span-hosts',
-                                                                '--no-parent',
-                                                                '-e', 'robots=off',
-                                                                ]},
-        'WGET_EXTRA_ARGS':          {'type': list,  'default': None},
        'CURL_ARGS':                {'type': list,  'default': ['--silent',
                                                                '--location',
                                                                '--compressed'
@ -161,16 +153,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'GIT_ARGS':                 {'type': list,  'default': ['--recursive']},
        'SINGLEFILE_ARGS':          {'type': list,  'default': None},
        'SINGLEFILE_EXTRA_ARGS':    {'type': list,  'default': None},
-        'MERCURY_ARGS':             {'type': list,  'default': ['--format=text']},
-        'MERCURY_EXTRA_ARGS':       {'type': list,  'default': None},
    },

    'DEPENDENCY_CONFIG': {
        'USE_CURL':                 {'type': bool,  'default': True},
-        'USE_WGET':                 {'type': bool,  'default': True},
        'USE_SINGLEFILE':           {'type': bool,  'default': True},
        'USE_READABILITY':          {'type': bool,  'default': True},
-        'USE_MERCURY':              {'type': bool,  'default': True},
        'USE_GIT':                  {'type': bool,  'default': True},
        'USE_CHROME':               {'type': bool,  'default': True},
        'USE_YOUTUBEDL':            {'type': bool,  'default': True},
@ -178,8 +166,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {

        'CURL_BINARY':              {'type': str,   'default': 'curl'},
        'GIT_BINARY':               {'type': str,   'default': 'git'},
-        'WGET_BINARY':              {'type': str,   'default': 'wget'},     # also can accept wget2
-        'MERCURY_BINARY':           {'type': str,   'default': lambda c: bin_path('postlight-parser')},
        'NODE_BINARY':              {'type': str,   'default': 'node'},
        # 'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},   # also can accept youtube-dl
        # 'SINGLEFILE_BINARY':        {'type': str,   'default': lambda c: bin_path('single-file')},
@ -232,21 +218,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'SAVE_FAVICON':             {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
    'SAVE_ARCHIVE_DOT_ORG':     {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},

-    'USE_WGET':                 {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
-    'WGET_VERSION':             {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
-    'WGET_AUTO_COMPRESSION':    {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
-    # 'WGET_USER_AGENT':          {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
-    'SAVE_WGET':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
-    'SAVE_WARC':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
-    'WGET_ARGS':                {'default': lambda c: c['WGET_ARGS'] or []},
-    'WGET_EXTRA_ARGS':          {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
-
-    'USE_MERCURY':              {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
-    'SAVE_MERCURY':             {'default': lambda c: c['USE_MERCURY']},
-    'MERCURY_VERSION':          {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
-    'MERCURY_ARGS':             {'default': lambda c: c['MERCURY_ARGS'] or []},
-    'MERCURY_EXTRA_ARGS':       {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
-
    'USE_GIT':                  {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
    'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
    'SAVE_GIT':                 {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
@ -649,13 +620,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
            'enabled': config['USE_CURL'],
            'is_valid': bool(config['CURL_VERSION']),
        },
-        'WGET_BINARY': {
-            'path': bin_path(config['WGET_BINARY']),
-            'version': config['WGET_VERSION'],
-            'hash': bin_hash(config['WGET_BINARY']),
-            'enabled': config['USE_WGET'],
-            'is_valid': bool(config['WGET_VERSION']),
-        },
+        # 'WGET_BINARY': {
+        #     'path': bin_path(config['WGET_BINARY']),
+        #     'version': config['WGET_VERSION'],
+        #     'hash': bin_hash(config['WGET_BINARY']),
+        #     'enabled': config['USE_WGET'],
+        #     'is_valid': bool(config['WGET_VERSION']),
+        # },
        # 'NODE_BINARY': {
        #     'path': bin_path(config['NODE_BINARY']),
        #     'version': config['NODE_VERSION'],
@ -663,13 +634,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
        #     'enabled': config['USE_NODE'],
        #     'is_valid': bool(config['NODE_VERSION']),
        # },
-        'MERCURY_BINARY': {
-            'path': bin_path(config['MERCURY_BINARY']),
-            'version': config['MERCURY_VERSION'],
-            'hash': bin_hash(config['MERCURY_BINARY']),
-            'enabled': config['USE_MERCURY'],
-            'is_valid': bool(config['MERCURY_VERSION']),
-        },
+        # 'MERCURY_BINARY': {
+        #     'path': bin_path(config['MERCURY_BINARY']),
+        #     'version': config['MERCURY_VERSION'],
+        #     'hash': bin_hash(config['MERCURY_BINARY']),
+        #     'enabled': config['USE_MERCURY'],
+        #     'is_valid': bool(config['MERCURY_VERSION']),
+        # },
        'GIT_BINARY': {
            'path': bin_path(config['GIT_BINARY']),
            'version': config['GIT_VERSION'],
--- a/archivebox/extractors/mercury.py
+++ b/archivebox/extractors/mercury.py
@ -11,16 +11,9 @@ from archivebox.misc.system import run, atomic_write
 from archivebox.misc.util import (
    enforce_types,
    is_static_file,
-    dedupe,
-)
-from ..config.legacy import (
-    TIMEOUT,
-    SAVE_MERCURY,
-    DEPENDENCIES,
-    MERCURY_VERSION,
-    MERCURY_ARGS,
-    MERCURY_EXTRA_ARGS,
 )
+from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY
+
 from ..logging_util import TimedProgress


@ -49,35 +42,36 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
    if is_static_file(link.url):
        return False

-    out_dir = out_dir or Path(link.link_dir)
+    out_dir = Path(out_dir or link.link_dir)
+
    if not overwrite and (out_dir / get_output_path()).exists():
        return False

-    return SAVE_MERCURY
+    return MERCURY_CONFIG.SAVE_MERCURY


@enforce_types
-def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=MERCURY_CONFIG.MERCURY_TIMEOUT) -> ArchiveResult:
    """download reader friendly version using @postlight/mercury-parser"""

    out_dir = Path(out_dir or link.link_dir)
    output_folder = out_dir.absolute() / get_output_path()
    output = get_output_path()
    
+    mercury_binary = MERCURY_BINARY.load()
+    assert mercury_binary.abspath and mercury_binary.version
+
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        output_folder.mkdir(exist_ok=True)
        # later options take precedence
-        options = [
-            *MERCURY_ARGS,
-            *MERCURY_EXTRA_ARGS,
-        ]
        # By default, get plain text version of article
        cmd = [
-            DEPENDENCIES['MERCURY_BINARY']['path'],
+            str(mercury_binary.abspath),
+            *MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
+            '--format=text',
            link.url,
-            *dedupe(options)
        ]
        result = run(cmd, cwd=out_dir, timeout=timeout)
        try:
@ -92,7 +86,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)

        # Get HTML version of article
        cmd = [
-            DEPENDENCIES['MERCURY_BINARY']['path'],
+            str(mercury_binary.abspath),
+            *MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
            link.url
        ]
        result = run(cmd, cwd=out_dir, timeout=timeout)
@ -119,7 +114,7 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
-        cmd_version=MERCURY_VERSION,
+        cmd_version=str(mercury_binary.version),
        output=output,
        status=status,
        **timer.stats,
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@ -6,7 +6,6 @@ from pathlib import Path
 from typing import Optional
 from datetime import datetime, timezone

-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import (
    enforce_types,
@ -17,22 +16,10 @@ from archivebox.misc.util import (
    urldecode,
    dedupe,
 )
-from ..config.legacy import (
-    WGET_ARGS,
-    WGET_EXTRA_ARGS,
-    TIMEOUT,
-    SAVE_WGET,
-    SAVE_WARC,
-    WGET_BINARY,
-    WGET_VERSION,
-    RESTRICT_FILE_NAMES,
-    CHECK_SSL_VALIDITY,
-    SAVE_WGET_REQUISITES,
-    WGET_AUTO_COMPRESSION,
-    WGET_USER_AGENT,
-    COOKIES_FILE,
-)
+from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG
+
 from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError


 def get_output_path():
@ -54,38 +41,43 @@ def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Option
    if not overwrite and output_path and (out_dir / output_path).exists():
        return False

-    return SAVE_WGET
+    return WGET_CONFIG.SAVE_WGET


@enforce_types
-def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=WGET_CONFIG.WGET_TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

-    out_dir = out_dir or link.link_dir
-    if SAVE_WARC:
+    out_dir = Path(out_dir or link.link_dir)
+    assert out_dir.exists()
+    
+    if WGET_CONFIG.SAVE_WARC:
        warc_dir = out_dir / "warc"
        warc_dir.mkdir(exist_ok=True)
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))

+    wget_binary = WGET_BINARY.load()
+    assert wget_binary.abspath and wget_binary.version
+
    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    # later options take precedence
    options = [
-        *WGET_ARGS,
-        *WGET_EXTRA_ARGS,
+        *WGET_CONFIG.WGET_ARGS,
+        *WGET_CONFIG.WGET_EXTRA_ARGS,
        '--timeout={}'.format(timeout),
-        *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
-        *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
-        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
-        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
-        *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
-        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
-        *([] if SAVE_WARC else ['--timestamping']),
-        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
+        *(['--restrict-file-names={}'.format(WGET_CONFIG.WGET_RESTRICT_FILE_NAMES)] if WGET_CONFIG.WGET_RESTRICT_FILE_NAMES else []),
+        *(['--warc-file={}'.format(str(warc_path))] if WGET_CONFIG.SAVE_WARC else []),
+        *(['--page-requisites'] if WGET_CONFIG.SAVE_WGET_REQUISITES else []),
+        *(['--user-agent={}'.format(WGET_CONFIG.WGET_USER_AGENT)] if WGET_CONFIG.WGET_USER_AGENT else []),
+        *(['--load-cookies', str(WGET_CONFIG.WGET_COOKIES_FILE)] if WGET_CONFIG.WGET_COOKIES_FILE else []),
+        *(['--compression=auto'] if WGET_CONFIG.WGET_AUTO_COMPRESSION else []),
+        *([] if WGET_CONFIG.SAVE_WARC else ['--timestamping']),
+        *([] if WGET_CONFIG.WGET_CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        # '--server-response',  # print headers for better error parsing
    ]
    cmd = [
-        WGET_BINARY,
+        str(wget_binary.abspath),
        *dedupe(options),
        link.url,
    ]
@ -137,7 +129,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
-        cmd_version=WGET_VERSION,
+        cmd_version=str(wget_binary.version),
        output=output,
        status=status,
        **timer.stats,
--- a/archivebox/plugins_extractor/mercury/apps.py
+++ b/archivebox/plugins_extractor/mercury/apps.py
@ -0,0 +1,82 @@
+__package__ = 'plugins_extractor.mercury'
+
+from typing import List, Optional, Dict
+from pathlib import Path
+from subprocess import run
+
+from pydantic import InstanceOf, Field
+from pydantic_pkgr import BinProvider, BinName, bin_abspath
+
+from abx.archivebox.base_plugin import BasePlugin, BaseHook
+from abx.archivebox.base_configset import BaseConfigSet
+from abx.archivebox.base_binary import BaseBinary, BinProviderName,ProviderLookupDict, env
+from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
+
+from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+class MercuryConfig(BaseConfigSet):
+    """Configuration options for the Mercury (postlight-parser) extractor.
+
+    Options whose default is written as a zero-argument lambda point at the
+    matching global option in ARCHIVING_CONFIG / STORAGE_CONFIG.
+    NOTE(review): presumably BaseConfigSet resolves those callables into
+    concrete values when the config is loaded -- confirm against BaseConfigSet.
+    """
+
+    # master toggle returned by should_save_mercury(); also settable under the alias USE_MERCURY
+    SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
+    
+    # name of the CLI binary; used as MercuryBinary.name
+    MERCURY_BINARY: str = Field(default='postlight-parser')
+    # extra CLI arguments spliced into both mercury invocations made by save_mercury()
+    MERCURY_EXTRA_ARGS: List[str] = []
+    
+    # NOTE(review): the two options below are not read by any mercury code visible in this change -- confirm they are needed
+    SAVE_MERCURY_REQUISITES: bool = Field(default=True)
+    MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
+    
+    # default value of the `timeout` argument of save_mercury()
+    MERCURY_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    # NOTE(review): the three options below are not read by any mercury code visible in this change -- confirm they are needed
+    MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
+    
+
+
+# module-level singleton imported by the mercury extractor and by the classes below
+MERCURY_CONFIG = MercuryConfig()
+
+
+class MercuryBinary(BaseBinary):
+    name: BinName = MERCURY_CONFIG.MERCURY_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        LIB_NPM_BINPROVIDER.name: {
+            'packages': lambda: ['@postlight/parser@^2.2.3'],
+            'version': lambda: run([str(LIB_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH), f'--prefix={LIB_NPM_BINPROVIDER.npm_prefix}', 'info', '@postlight/parser', 'version'], text=True, capture_output=True).stdout.strip(),
+        },
+        SYS_NPM_BINPROVIDER.name: {
+            'packages': lambda: [],   # never try to install things globally
+            'version': lambda: run([str(SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH), '-g', 'info', '@postlight/parser', 'version'], text=True, capture_output=True).stdout.strip(),
+        },
+        env.name: {
+            'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
+        },
+    }
+
+MERCURY_BINARY = MercuryBinary()
+
+
+class MercuryExtractor(BaseExtractor):
+    name: ExtractorName = 'mercury'
+    binary: str = MERCURY_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path | None:
+        return snapshot.link_dir / 'mercury' / 'content.html'
+
+MERCURY_EXTRACTOR = MercuryExtractor()
+
+
+
+class MercuryPlugin(BasePlugin):
+    """Plugin definition bundling the mercury config, binary and extractor hooks."""
+    app_label: str = 'mercury'
+    verbose_name: str = 'MERCURY'
+    
+    # hook instances this plugin provides (all defined earlier in this module)
+    hooks: List[InstanceOf[BaseHook]] = [
+        MERCURY_CONFIG,
+        MERCURY_BINARY,
+        MERCURY_EXTRACTOR,
+    ]
+
+
+# module-level plugin singleton
+PLUGIN = MercuryPlugin()
+# NOTE(review): presumably the Django AppConfig that registers this plugin as a Django app -- confirm against BasePlugin
+DJANGO_APP = PLUGIN.AppConfig
--- a/archivebox/plugins_extractor/wget/apps.py
+++ b/archivebox/plugins_extractor/wget/apps.py
@ -1,17 +1,21 @@
+__package__ = 'plugins_extractor.wget'
+
 import sys
-from typing import List, Optional
+from typing import List, Optional, Dict
 from pathlib import Path
+from subprocess import run, DEVNULL

 from rich import print
 from pydantic import InstanceOf, Field, model_validator
-from pydantic_pkgr import BinProvider, BinName
+from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict

 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName

-from archivebox.extractors.wget import wget_output_path
+from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from .wget_util import wget_output_path


 class WgetConfig(BaseConfigSet):
@ -34,13 +38,13 @@ class WgetConfig(BaseConfigSet):
    ]
    WGET_EXTRA_ARGS: List[str] = []
    
-    WGET_AUTO_COMPRESSION: bool = Field(default=True)
    SAVE_WGET_REQUISITES: bool = Field(default=True)
-    WGET_USER_AGENT: str = Field(default='', alias='USER_AGENT')
-    WGET_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
-    WGET_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
-    WGET_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
-    WGET_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')
+    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
+    
+    WGET_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
    
    @model_validator(mode='after')
    def validate_use_ytdlp(self):
@ -54,6 +58,22 @@ class WgetConfig(BaseConfigSet):
            print(file=sys.stderr)
        return self
    
+    @property
+    def WGET_AUTO_COMPRESSION(self) -> bool:
+        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
+            return self._WGET_AUTO_COMPRESSION
+        try:
+            cmd = [
+                self.WGET_BINARY,
+                "--compression=auto",
+                "--help",
+            ]
+            self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode
+            return self._WGET_AUTO_COMPRESSION
+        except (FileNotFoundError, OSError):
+            self._WGET_AUTO_COMPRESSION = False
+            return False
+
 WGET_CONFIG = WgetConfig()


@ -61,6 +81,12 @@ class WgetBinary(BaseBinary):
    name: BinName = WGET_CONFIG.WGET_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
    
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        brew.name: {
+            'abspath': lambda: bin_abspath(WGET_CONFIG.WGET_BINARY, PATH=f'/opt/homebrew/opt/wget/bin:{brew.PATH}'),
+        },
+    }
+
 WGET_BINARY = WgetBinary()


--- a/archivebox/plugins_extractor/wget/wget_util.py
+++ b/archivebox/plugins_extractor/wget/wget_util.py
@ -0,0 +1,168 @@
+__package__ = 'archivebox.extractors'
+
+import re
+from pathlib import Path
+
+from typing import Optional
+
+
+from archivebox.misc.util import (
+    enforce_types,
+    without_fragment,
+    without_query,
+    path,
+    domain,
+    urldecode,
+)
+
+@enforce_types
+def unsafe_wget_output_path(link) -> Optional[str]:
+    # There used to be a bunch of complex reverse-engineering path mapping logic here,
+    # but it was removed in favor of just walking through the output folder recursively to try to find the
+    # html file that wget produced. It's *much much much* slower than deriving it statically, and is currently
+    # one of the main bottlenecks of ArchiveBox's performance (the output data is often on a slow HDD or network mount).
+    # But it's STILL better than trying to figure out URL -> html filepath mappings ourselves from first principles.
+    full_path = without_fragment(without_query(path(link.url))).strip('/')
+    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
+    for _ in range(4):
+        try:
+            if search_dir.exists():
+                if search_dir.is_dir():
+                    html_files = [
+                        f for f in search_dir.iterdir()
+                        if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
+                    ]
+                    if html_files:
+                        return str(html_files[0].relative_to(link.link_dir))
+
+                    # sometimes wget'd URLs have no ext and return non-html
+                    # e.g. /some/example/rss/all -> some RSS XML content)
+                    #      /some/other/url.o4g   -> some binary unrecognized ext)
+                    # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
+                    last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
+                    for file_present in search_dir.iterdir():
+                        if file_present == last_part_of_url:
+                            return str((search_dir / file_present).relative_to(link.link_dir))
+        except OSError:
+            # OSError 36 and others can happen here, caused by trying to check for impossible paths
+            # (paths derived from URLs can often contain illegal unicode characters or be too long,
+            # causing the OS / filesystem to reject trying to open them with a system-level error)
+            pass
+
+        # Move up one directory level
+        search_dir = search_dir.parent
+
+        if str(search_dir) == link.link_dir:
+            break
+
+    # check for literally any file present that isnt an empty folder
+    domain_dir = Path(domain(link.url).replace(":", "+"))
+    files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
+    if files_within:
+        return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
+
+    # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
+    # that it's better we just pretend it doesnt exist
+    # this is why ArchiveBox's specializes in REDUNDANTLY saving copies of sites with multiple different tools
+    return None
+
+
+@enforce_types
+def wget_output_path(link, nocache: bool=False) -> Optional[str]:
+    """calculate the path to the wgetted .html file, since wget may
+    adjust some paths to be different than the base_url path.
+
+    See docs on: wget --adjust-extension (-E), --restrict-file-names=windows|unix|ascii, --convert-links
+
+    WARNING: this function is extremely error prone because mapping URLs to filesystem paths deterministically
+    is basically impossible. Every OS and filesystem have different requirements on what special characters are
+    allowed, and URLs are *full* of all kinds of special characters, illegal unicode, and generally unsafe strings
+    that you dont want anywhere near your filesystem. Also URLs can be obscenely long, but most filesystems dont
+    accept paths longer than 250 characters. On top of all that, this function only exists to try to reverse engineer
+    wget's approach to solving this problem, so this is a shittier, less tested version of their already insanely
+    complicated attempt to do this. Here be dragons:
+        - https://github.com/ArchiveBox/ArchiveBox/issues/549
+        - https://github.com/ArchiveBox/ArchiveBox/issues/1373
+        - https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
+        - and probably many more that I didn't realize were caused by this...
+
+    The only constructive thing we could possibly do to this function is to figure out how to remove it.
+
+    Preach loudly to anyone who will listen: never attempt to map URLs to filesystem paths,
+    and pray you never have to deal with the aftermath of someone else's attempt to do so...
+    """
+    
+    # Wget downloads can save in a number of different ways depending on the url:
+    #    https://example.com
+    #       > example.com/index.html
+    #    https://example.com?v=zzVa_tX1OiI
+    #       > example.com/index.html@v=zzVa_tX1OiI.html
+    #    https://www.example.com/?v=zzVa_tX1OiI
+    #       > example.com/index.html@v=zzVa_tX1OiI.html
+
+    #    https://example.com/abc
+    #       > example.com/abc.html
+    #    https://example.com/abc/
+    #       > example.com/abc/index.html
+    #    https://example.com/abc?v=zzVa_tX1OiI.html
+    #       > example.com/abc@v=zzVa_tX1OiI.html
+    #    https://example.com/abc/?v=zzVa_tX1OiI.html
+    #       > example.com/abc/index.html@v=zzVa_tX1OiI.html
+
+    #    https://example.com/abc/test.html
+    #       > example.com/abc/test.html
+    #    https://example.com/abc/test?v=zzVa_tX1OiI
+    #       > example.com/abc/test@v=zzVa_tX1OiI.html
+    #    https://example.com/abc/test/?v=zzVa_tX1OiI
+    #       > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
+
+    cache_key = f'{link.url_hash}:{link.timestamp}-{link.downloaded_at and link.downloaded_at.timestamp()}-wget-output-path'
+    
+    if not nocache:
+        from django.core.cache import cache
+        cached_result = cache.get(cache_key)
+        if cached_result:
+            return cached_result
+
+
+    # There's also lots of complexity around how the urlencoding and renaming
+    # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
+    # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
+    # 4 characters, paths with multipe extensions, etc. the list goes on...
+
+    output_path = None
+    try:
+        output_path = unsafe_wget_output_path(link)
+    except Exception as err:
+        pass           # better to pretend it just failed to download than expose gnarly OSErrors to users
+
+    # check for unprintable unicode characters
+    # https://github.com/ArchiveBox/ArchiveBox/issues/1373
+    if output_path:
+        safe_path = output_path.encode('utf-8', 'replace').decode()
+        if output_path != safe_path:
+            # contains unprintable unicode characters that will break other parts of archivebox
+            # better to pretend it doesnt exist and fallback to parent dir than crash archivebox
+            output_path = None
+
+    # check for a path that is just too long to safely handle across different OS's
+    # https://github.com/ArchiveBox/ArchiveBox/issues/549
+    if output_path and len(output_path) > 250:
+        output_path = None
+
+    if output_path:
+        if not nocache:
+            cache.set(cache_key, output_path)
+        return output_path
+
+    # fallback to just the domain dir
+    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
+    if search_dir.is_dir():
+        return domain(link.url).replace(":", "+")
+
+    # fallback to just the domain dir without port
+    search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
+    if search_dir.is_dir():
+        return domain(link.url).split(":", 1)[0]
+
+    return None