From 69522da4bbfef4e6c5a162b891c4631793d25f00 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 30 Sep 2024 21:43:45 -0700
Subject: [PATCH] move wget and mercury into plugins

---
 archivebox/config/legacy.py                   |  63 ++-----
 archivebox/extractors/mercury.py              |  35 ++--
 archivebox/extractors/wget.py                 |  56 +++---
 archivebox/plugins_extractor/mercury/apps.py  |  82 +++++++++
 archivebox/plugins_extractor/wget/apps.py     |  44 ++++-
 .../plugins_extractor/wget/wget_util.py       | 168 ++++++++++++++++++
 6 files changed, 341 insertions(+), 107 deletions(-)
 create mode 100644 archivebox/plugins_extractor/mercury/apps.py
 create mode 100644 archivebox/plugins_extractor/wget/wget_util.py
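For orientation before the file-by-file diff: each extractor's settings, binary resolution, and extractor hooks move out of the monolithic legacy config into one self-contained plugin per tool. A minimal sketch of the target shape (hypothetical Example* names, written against the same abx.archivebox API the real plugins below use):

    from typing import List
    from pydantic import InstanceOf
    from pydantic_pkgr import BinProvider, BinName

    from abx.archivebox.base_plugin import BasePlugin, BaseHook
    from abx.archivebox.base_configset import BaseConfigSet
    from abx.archivebox.base_binary import BaseBinary, env

    class ExampleConfig(BaseConfigSet):
        SAVE_EXAMPLE: bool = True      # replaces a CONFIG_SCHEMA entry in legacy.py

    class ExampleBinary(BaseBinary):
        name: BinName = 'example-bin'  # resolved lazily via .load(), not bin_path() at import time
        binproviders_supported: List[InstanceOf[BinProvider]] = [env]

    class ExamplePlugin(BasePlugin):
        app_label: str = 'example'
        verbose_name: str = 'EXAMPLE'
        hooks: List[InstanceOf[BaseHook]] = [ExampleConfig(), ExampleBinary()]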
diff --git a/archivebox/config/legacy.py b/archivebox/config/legacy.py
index 46695fd7..c7b88b4a 100644
--- a/archivebox/config/legacy.py
+++ b/archivebox/config/legacy.py
@@ -59,6 +59,7 @@ from ..misc.logging import (
 from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
 
 ANSI = SHELL_CONFIG.ANSI
 LDAP = LDAP_CONFIG.LDAP_ENABLED
@@ -81,6 +82,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
     'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
 
     'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
+
+    'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
 
     'ARCHIVE_METHOD_TOGGLES': {
@@ -112,7 +115,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
         'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},  # + ' curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},  # + ' wget/{WGET_VERSION}'},
 
         'COOKIES_FILE': {'type': str, 'default': None},
@@ -143,16 +145,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
 
-        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
-                                                '--adjust-extension',
-                                                '--convert-links',
-                                                '--force-directories',
-                                                '--backup-converted',
-                                                '--span-hosts',
-                                                '--no-parent',
-                                                '-e', 'robots=off',
-                                                ]},
-        'WGET_EXTRA_ARGS': {'type': list, 'default': None},
         'CURL_ARGS': {'type': list, 'default': ['--silent',
                                                 '--location',
                                                 '--compressed'
@@ -161,16 +153,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'GIT_ARGS': {'type': list, 'default': ['--recursive']},
         'SINGLEFILE_ARGS': {'type': list, 'default': None},
         'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
-        'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
-        'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
     },
 
     'DEPENDENCY_CONFIG': {
         'USE_CURL': {'type': bool, 'default': True},
-        'USE_WGET': {'type': bool, 'default': True},
         'USE_SINGLEFILE': {'type': bool, 'default': True},
         'USE_READABILITY': {'type': bool, 'default': True},
-        'USE_MERCURY': {'type': bool, 'default': True},
         'USE_GIT': {'type': bool, 'default': True},
         'USE_CHROME': {'type': bool, 'default': True},
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
@@ -178,8 +166,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
         'CURL_BINARY': {'type': str, 'default': 'curl'},
         'GIT_BINARY': {'type': str, 'default': 'git'},
-        'WGET_BINARY': {'type': str, 'default': 'wget'},  # also can accept wget2
-        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
         'NODE_BINARY': {'type': str, 'default': 'node'},
         # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},  # also can accept youtube-dl
         # 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
@@ -232,21 +218,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
     'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
 
-    'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
-    'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
-    'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
-    # 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
-    'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
-    'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
-    'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
-    'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
-
-    'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
-    'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY']},
-    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
-    'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
-    'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
-
     'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
@@ -649,13 +620,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
             'enabled': config['USE_CURL'],
             'is_valid': bool(config['CURL_VERSION']),
         },
-        'WGET_BINARY': {
-            'path': bin_path(config['WGET_BINARY']),
-            'version': config['WGET_VERSION'],
-            'hash': bin_hash(config['WGET_BINARY']),
-            'enabled': config['USE_WGET'],
-            'is_valid': bool(config['WGET_VERSION']),
-        },
+        # 'WGET_BINARY': {
+        #     'path': bin_path(config['WGET_BINARY']),
+        #     'version': config['WGET_VERSION'],
+        #     'hash': bin_hash(config['WGET_BINARY']),
+        #     'enabled': config['USE_WGET'],
+        #     'is_valid': bool(config['WGET_VERSION']),
+        # },
         # 'NODE_BINARY': {
         #     'path': bin_path(config['NODE_BINARY']),
         #     'version': config['NODE_VERSION'],
         #     'hash': bin_hash(config['NODE_BINARY']),
         #     'enabled': config['USE_NODE'],
         #     'is_valid': bool(config['NODE_VERSION']),
         # },
-        'MERCURY_BINARY': {
-            'path': bin_path(config['MERCURY_BINARY']),
-            'version': config['MERCURY_VERSION'],
-            'hash': bin_hash(config['MERCURY_BINARY']),
-            'enabled': config['USE_MERCURY'],
-            'is_valid': bool(config['MERCURY_VERSION']),
-        },
+        # 'MERCURY_BINARY': {
+        #     'path': bin_path(config['MERCURY_BINARY']),
+        #     'version': config['MERCURY_VERSION'],
+        #     'hash': bin_hash(config['MERCURY_BINARY']),
+        #     'enabled': config['USE_MERCURY'],
+        #     'is_valid': bool(config['MERCURY_VERSION']),
+        # },
         'GIT_BINARY': {
             'path': bin_path(config['GIT_BINARY']),
             'version': config['GIT_VERSION'],
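The only bridge legacy.py keeps is as_legacy_config_schema(), which projects a plugin's typed ConfigSet back into the old {'type', 'default'} dict format. The real adapter lives in abx.archivebox.base_configset; a rough sketch of what it plausibly emits, based on the entries above (illustration only, not the real implementation):

    from typing import Any, Dict

    def as_legacy_config_schema_sketch(configset) -> Dict[str, Dict[str, Any]]:
        # walk the pydantic v2 field metadata and rebuild the legacy
        # ConfigDefaultDict shape used by CONFIG_SCHEMA
        schema = {}
        for name, field in type(configset).model_fields.items():
            schema[name] = {
                'type': field.annotation,
                'default': field.default,  # may be a lambda for computed defaults
                'aliases': (field.alias,) if field.alias else (),
            }
        return schema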
diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py
index d6c8f934..a0cb86fa 100644
--- a/archivebox/extractors/mercury.py
+++ b/archivebox/extractors/mercury.py
@@ -11,16 +11,9 @@ from archivebox.misc.system import run, atomic_write
 from archivebox.misc.util import (
     enforce_types,
     is_static_file,
-    dedupe,
-)
-from ..config.legacy import (
-    TIMEOUT,
-    SAVE_MERCURY,
-    DEPENDENCIES,
-    MERCURY_VERSION,
-    MERCURY_ARGS,
-    MERCURY_EXTRA_ARGS,
 )
+from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY
+
 from ..logging_util import TimedProgress
 
 
@@ -49,35 +42,36 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
     if is_static_file(link.url):
         return False
 
-    out_dir = out_dir or Path(link.link_dir)
+    out_dir = Path(out_dir or link.link_dir)
+
     if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
-    return SAVE_MERCURY
+    return MERCURY_CONFIG.SAVE_MERCURY
 
 
 @enforce_types
-def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=MERCURY_CONFIG.MERCURY_TIMEOUT) -> ArchiveResult:
     """download reader friendly version using @postlight/mercury-parser"""
 
     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute() / get_output_path()
     output = get_output_path()
+
+    mercury_binary = MERCURY_BINARY.load()
+    assert mercury_binary.abspath and mercury_binary.version
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
         output_folder.mkdir(exist_ok=True)
         # later options take precedence
-        options = [
-            *MERCURY_ARGS,
-            *MERCURY_EXTRA_ARGS,
-        ]
         # By default, get plain text version of article
         cmd = [
-            DEPENDENCIES['MERCURY_BINARY']['path'],
+            str(mercury_binary.abspath),
+            *MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
+            '--format=text',
             link.url,
-            *dedupe(options)
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
         try:
@@ -92,7 +86,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
 
         # Get HTML version of article
         cmd = [
-            DEPENDENCIES['MERCURY_BINARY']['path'],
+            str(mercury_binary.abspath),
+            *MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
             link.url
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
@@ -119,7 +114,7 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=MERCURY_VERSION,
+        cmd_version=str(mercury_binary.version),
         output=output,
         status=status,
         **timer.stats,
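The rewritten save_mercury() swaps the legacy DEPENDENCIES['MERCURY_BINARY']['path'] lookup for the plugin's Binary object, resolved at call time. A hedged usage sketch of that pattern (exact load() failure semantics belong to pydantic_pkgr):

    from archivebox.plugins_extractor.mercury.apps import MERCURY_BINARY, MERCURY_CONFIG

    mercury = MERCURY_BINARY.load()   # tries the lib npm dir, then system npm, then $PATH
    if mercury.abspath and mercury.version:
        cmd = [
            str(mercury.abspath),
            *MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
            '--format=text',          # plain-text pass; the HTML pass omits this flag
            'https://example.com',
        ]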
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index f96db5f9..5afc6442 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -6,7 +6,6 @@ from pathlib import Path
 from typing import Optional
 from datetime import datetime, timezone
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import (
     enforce_types,
@@ -17,22 +16,10 @@ from archivebox.misc.util import (
     urldecode,
     dedupe,
 )
-from ..config.legacy import (
-    WGET_ARGS,
-    WGET_EXTRA_ARGS,
-    TIMEOUT,
-    SAVE_WGET,
-    SAVE_WARC,
-    WGET_BINARY,
-    WGET_VERSION,
-    RESTRICT_FILE_NAMES,
-    CHECK_SSL_VALIDITY,
-    SAVE_WGET_REQUISITES,
-    WGET_AUTO_COMPRESSION,
-    WGET_USER_AGENT,
-    COOKIES_FILE,
-)
+from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG
+
 from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 
 
 def get_output_path():
@@ -54,38 +41,43 @@ def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Option
     if not overwrite and output_path and (out_dir / output_path).exists():
         return False
 
-    return SAVE_WGET
+    return WGET_CONFIG.SAVE_WGET
 
 
 @enforce_types
-def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=WGET_CONFIG.WGET_TIMEOUT) -> ArchiveResult:
     """download full site using wget"""
 
-    out_dir = out_dir or link.link_dir
-    if SAVE_WARC:
+    out_dir = Path(out_dir or link.link_dir)
+    assert out_dir.exists()
+
+    if WGET_CONFIG.SAVE_WARC:
         warc_dir = out_dir / "warc"
         warc_dir.mkdir(exist_ok=True)
         warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
 
+    wget_binary = WGET_BINARY.load()
+    assert wget_binary.abspath and wget_binary.version
+
     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
     output: ArchiveOutput = None
     # later options take precedence
     options = [
-        *WGET_ARGS,
-        *WGET_EXTRA_ARGS,
+        *WGET_CONFIG.WGET_ARGS,
+        *WGET_CONFIG.WGET_EXTRA_ARGS,
         '--timeout={}'.format(timeout),
-        *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
-        *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
-        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
-        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
-        *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
-        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
-        *([] if SAVE_WARC else ['--timestamping']),
-        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
+        *(['--restrict-file-names={}'.format(WGET_CONFIG.WGET_RESTRICT_FILE_NAMES)] if WGET_CONFIG.WGET_RESTRICT_FILE_NAMES else []),
+        *(['--warc-file={}'.format(str(warc_path))] if WGET_CONFIG.SAVE_WARC else []),
+        *(['--page-requisites'] if WGET_CONFIG.SAVE_WGET_REQUISITES else []),
+        *(['--user-agent={}'.format(WGET_CONFIG.WGET_USER_AGENT)] if WGET_CONFIG.WGET_USER_AGENT else []),
+        *(['--load-cookies', str(WGET_CONFIG.WGET_COOKIES_FILE)] if WGET_CONFIG.WGET_COOKIES_FILE else []),
+        *(['--compression=auto'] if WGET_CONFIG.WGET_AUTO_COMPRESSION else []),
+        *([] if WGET_CONFIG.SAVE_WARC else ['--timestamping']),
+        *([] if WGET_CONFIG.WGET_CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
         # '--server-response',  # print headers for better error parsing
     ]
     cmd = [
-        WGET_BINARY,
+        str(wget_binary.abspath),
         *dedupe(options),
         link.url,
     ]
@@ -137,7 +129,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=WGET_VERSION,
+        cmd_version=str(wget_binary.version),
         output=output,
         status=status,
         **timer.stats,
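The option assembly above leans on the "later options take precedence" contract: WGET_ARGS defaults come first, user WGET_EXTRA_ARGS after, and dedupe() collapses repeated flags. Assuming dedupe() (from archivebox.misc.util) keeps the last occurrence of each option key, which is what that comment implies, the effect looks like:

    options = [
        '--restrict-file-names=windows',  # default from WGET_ARGS
        '--restrict-file-names=unix',     # user override from WGET_EXTRA_ARGS
    ]
    # dedupe(options) -> ['--restrict-file-names=unix']   (illustration of the assumed contract)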
diff --git a/archivebox/plugins_extractor/mercury/apps.py b/archivebox/plugins_extractor/mercury/apps.py
new file mode 100644
index 00000000..78d505b2
--- /dev/null
+++ b/archivebox/plugins_extractor/mercury/apps.py
@@ -0,0 +1,82 @@
+__package__ = 'plugins_extractor.mercury'
+
+from typing import List, Optional, Dict
+from pathlib import Path
+from subprocess import run
+
+from pydantic import InstanceOf, Field
+from pydantic_pkgr import BinProvider, BinName, bin_abspath
+
+from abx.archivebox.base_plugin import BasePlugin, BaseHook
+from abx.archivebox.base_configset import BaseConfigSet
+from abx.archivebox.base_binary import BaseBinary, BinProviderName, ProviderLookupDict, env
+from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
+
+from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+
+class MercuryConfig(BaseConfigSet):
+
+    SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
+
+    MERCURY_BINARY: str = Field(default='postlight-parser')
+    MERCURY_EXTRA_ARGS: List[str] = []
+
+    SAVE_MERCURY_REQUISITES: bool = Field(default=True)
+    MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
+
+    MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
+
+
+MERCURY_CONFIG = MercuryConfig()
+
+
+class MercuryBinary(BaseBinary):
+    name: BinName = MERCURY_CONFIG.MERCURY_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        LIB_NPM_BINPROVIDER.name: {
+            'packages': lambda: ['@postlight/parser@^2.2.3'],
+            'version': lambda: run([str(LIB_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH), f'--prefix={LIB_NPM_BINPROVIDER.npm_prefix}', 'info', '@postlight/parser', 'version'], text=True, capture_output=True).stdout.strip(),
+        },
+        SYS_NPM_BINPROVIDER.name: {
+            'packages': lambda: [],  # never try to install things globally
+            'version': lambda: run([str(SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH), '-g', 'info', '@postlight/parser', 'version'], text=True, capture_output=True).stdout.strip(),
+        },
+        env.name: {
+            'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
+        },
+    }
+
+MERCURY_BINARY = MercuryBinary()
+
+
+class MercuryExtractor(BaseExtractor):
+    name: ExtractorName = 'mercury'
+    binary: str = MERCURY_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path | None:
+        return snapshot.link_dir / 'mercury' / 'content.html'
+
+MERCURY_EXTRACTOR = MercuryExtractor()
+
+
+class MercuryPlugin(BasePlugin):
+    app_label: str = 'mercury'
+    verbose_name: str = 'MERCURY'
+
+    hooks: List[InstanceOf[BaseHook]] = [
+        MERCURY_CONFIG,
+        MERCURY_BINARY,
+        MERCURY_EXTRACTOR,
+    ]
+
+
+PLUGIN = MercuryPlugin()
+DJANGO_APP = PLUGIN.AppConfig
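Two details worth noting in MercuryConfig/MercuryBinary: SAVE_MERCURY keeps answering to the old USE_MERCURY name via a pydantic alias, and the env provider pins a 999.999.999 sentinel version because postlight-parser can't report a real one until postlight/parser#750 lands (the same caveat the removed legacy.py line documented). A sketch of the alias behavior, assuming BaseConfigSet populates fields from the environment by alias:

    import os

    os.environ['USE_MERCURY'] = 'False'   # legacy name from ARCHIVE_METHOD_TOGGLES
    config = MercuryConfig()              # assumption: env values are read by field alias
    assert config.SAVE_MERCURY is False   # new name resolves through alias='USE_MERCURY'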
diff --git a/archivebox/plugins_extractor/wget/apps.py b/archivebox/plugins_extractor/wget/apps.py
index 19db12f0..e272df06 100644
--- a/archivebox/plugins_extractor/wget/apps.py
+++ b/archivebox/plugins_extractor/wget/apps.py
@@ -1,17 +1,21 @@
+__package__ = 'plugins_extractor.wget'
+
 import sys
-from typing import List, Optional
+from typing import List, Optional, Dict
 from pathlib import Path
+from subprocess import run, DEVNULL
 
 from rich import print
 from pydantic import InstanceOf, Field, model_validator
-from pydantic_pkgr import BinProvider, BinName
+from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
 
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 
-from archivebox.extractors.wget import wget_output_path
+from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from .wget_util import wget_output_path
 
 
 class WgetConfig(BaseConfigSet):
@@ -34,13 +38,13 @@ class WgetConfig(BaseConfigSet):
     ]
     WGET_EXTRA_ARGS: List[str] = []
 
-    WGET_AUTO_COMPRESSION: bool = Field(default=True)
     SAVE_WGET_REQUISITES: bool = Field(default=True)
-    WGET_USER_AGENT: str = Field(default='', alias='USER_AGENT')
-    WGET_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
-    WGET_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
-    WGET_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
-    WGET_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')
+    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
+
+    WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
 
     @model_validator(mode='after')
     def validate_use_ytdlp(self):
@@ -53,6 +57,22 @@ class WgetConfig(BaseConfigSet):
             print('    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
             print(file=sys.stderr)
         return self
+
+    @property
+    def WGET_AUTO_COMPRESSION(self) -> bool:
+        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
+            return self._WGET_AUTO_COMPRESSION
+        try:
+            cmd = [
+                self.WGET_BINARY,
+                "--compression=auto",
+                "--help",
+            ]
+            self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode
+            return self._WGET_AUTO_COMPRESSION
+        except (FileNotFoundError, OSError):
+            self._WGET_AUTO_COMPRESSION = False
+            return False
 
 WGET_CONFIG = WgetConfig()
 
@@ -60,6 +80,12 @@ WGET_CONFIG = WgetConfig()
 class WgetBinary(BaseBinary):
     name: BinName = WGET_CONFIG.WGET_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        brew.name: {
+            'abspath': lambda: bin_abspath(WGET_CONFIG.WGET_BINARY, PATH=f'/opt/homebrew/opt/wget/bin:{brew.PATH}'),
+        },
+    }
 
 WGET_BINARY = WgetBinary()
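WGET_AUTO_COMPRESSION also stops being a DYNAMIC_CONFIG_SCHEMA entry and becomes a lazy property that probes the binary once, then memoizes the answer. The probe boils down to this standalone check (illustration of the same logic):

    from subprocess import run, DEVNULL

    supports_compression = run(
        ['wget', '--compression=auto', '--help'],  # exits 0 only if wget recognizes the flag
        stdout=DEVNULL, stderr=DEVNULL, timeout=3,
    ).returncode == 0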
diff --git a/archivebox/plugins_extractor/wget/wget_util.py b/archivebox/plugins_extractor/wget/wget_util.py
new file mode 100644
index 00000000..84c07668
--- /dev/null
+++ b/archivebox/plugins_extractor/wget/wget_util.py
@@ -0,0 +1,168 @@
+__package__ = 'plugins_extractor.wget'
+
+import re
+from pathlib import Path
+
+from typing import Optional
+
+
+from archivebox.misc.util import (
+    enforce_types,
+    without_fragment,
+    without_query,
+    path,
+    domain,
+    urldecode,
+)
+
+@enforce_types
+def unsafe_wget_output_path(link) -> Optional[str]:
+    # There used to be a bunch of complex reverse-engineering path mapping logic here,
+    # but it was removed in favor of just walking through the output folder recursively to try to find the
+    # html file that wget produced. It's *much much much* slower than deriving it statically, and is currently
+    # one of the main bottlenecks of ArchiveBox's performance (the output data is often on a slow HDD or network mount).
+    # But it's STILL better than trying to figure out URL -> html filepath mappings ourselves from first principles.
+    full_path = without_fragment(without_query(path(link.url))).strip('/')
+    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
+    for _ in range(4):
+        try:
+            if search_dir.exists():
+                if search_dir.is_dir():
+                    html_files = [
+                        f for f in search_dir.iterdir()
+                        if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
+                    ]
+                    if html_files:
+                        return str(html_files[0].relative_to(link.link_dir))
+
+                    # sometimes wget'd URLs have no ext and return non-html
+                    # e.g. /some/example/rss/all -> some RSS XML content
+                    #      /some/other/url.o4g  -> some binary unrecognized ext
+                    # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
+                    last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
+                    for file_present in search_dir.iterdir():
+                        if file_present.name == last_part_of_url:
+                            return str(file_present.relative_to(link.link_dir))
+        except OSError:
+            # OSError 36 and others can happen here, caused by trying to check for impossible paths
+            # (paths derived from URLs can often contain illegal unicode characters or be too long,
+            # causing the OS / filesystem to reject trying to open them with a system-level error)
+            pass
+
+        # Move up one directory level
+        search_dir = search_dir.parent
+
+        if str(search_dir) == link.link_dir:
+            break
+
+    # check for literally any file present that isnt an empty folder
+    domain_dir = Path(domain(link.url).replace(":", "+"))
+    files_within = [fpath for fpath in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(fpath).endswith('.orig')]
+    if files_within:
+        return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
+
+    # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
+    # that it's better we just pretend it doesnt exist
+    # this is why ArchiveBox specializes in REDUNDANTLY saving copies of sites with multiple different tools
+    return None
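One subtlety in the filename match above: pathlib.Path objects never compare equal to plain strings, so the candidate file has to be compared via its .name attribute. A quick illustration:

    from pathlib import Path

    candidate = Path('/data/archive/1/example.com/feed')
    candidate == 'feed'        # False: Path vs str is always unequal
    candidate.name == 'feed'   # True: compare the final path component instead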
+
+
+@enforce_types
+def wget_output_path(link, nocache: bool=False) -> Optional[str]:
+    """calculate the path to the wgetted .html file, since wget may
+    adjust some paths to be different than the base_url path.
+
+    See docs on: wget --adjust-extension (-E), --restrict-file-names=windows|unix|ascii, --convert-links
+
+    WARNING: this function is extremely error prone because mapping URLs to filesystem paths deterministically
+    is basically impossible. Every OS and filesystem has different requirements on what special characters are
+    allowed, and URLs are *full* of all kinds of special characters, illegal unicode, and generally unsafe strings
+    that you dont want anywhere near your filesystem. Also URLs can be obscenely long, but most filesystems dont
+    accept paths longer than 250 characters. On top of all that, this function only exists to try to reverse engineer
+    wget's approach to solving this problem, so this is a shittier, less tested version of their already insanely
+    complicated attempt to do this. Here be dragons:
+    - https://github.com/ArchiveBox/ArchiveBox/issues/549
+    - https://github.com/ArchiveBox/ArchiveBox/issues/1373
+    - https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
+    - and probably many more that I didn't realize were caused by this...
+
+    The only constructive thing we could possibly do to this function is to figure out how to remove it.
+
+    Preach loudly to anyone who will listen: never attempt to map URLs to filesystem paths,
+    and pray you never have to deal with the aftermath of someone else's attempt to do so...
+    """
+
+    # Wget downloads can save in a number of different ways depending on the url:
+    #    https://example.com
+    #       > example.com/index.html
+    #    https://example.com?v=zzVa_tX1OiI
+    #       > example.com/index.html@v=zzVa_tX1OiI.html
+    #    https://www.example.com/?v=zzVa_tX1OiI
+    #       > example.com/index.html@v=zzVa_tX1OiI.html
+
+    #    https://example.com/abc
+    #       > example.com/abc.html
+    #    https://example.com/abc/
+    #       > example.com/abc/index.html
+    #    https://example.com/abc?v=zzVa_tX1OiI.html
+    #       > example.com/abc@v=zzVa_tX1OiI.html
+    #    https://example.com/abc/?v=zzVa_tX1OiI.html
+    #       > example.com/abc/index.html@v=zzVa_tX1OiI.html
+
+    #    https://example.com/abc/test.html
+    #       > example.com/abc/test.html
+    #    https://example.com/abc/test?v=zzVa_tX1OiI
+    #       > example.com/abc/test@v=zzVa_tX1OiI.html
+    #    https://example.com/abc/test/?v=zzVa_tX1OiI
+    #       > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
+
+    cache_key = f'{link.url_hash}:{link.timestamp}-{link.downloaded_at and link.downloaded_at.timestamp()}-wget-output-path'
+
+    if not nocache:
+        from django.core.cache import cache
+        cached_result = cache.get(cache_key)
+        if cached_result:
+            return cached_result
+
+
+    # There's also lots of complexity around how the urlencoding and renaming
+    # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
+    # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
+    # 4 characters, paths with multiple extensions, etc. the list goes on...
+
+    output_path = None
+    try:
+        output_path = unsafe_wget_output_path(link)
+    except Exception:
+        pass  # better to pretend it just failed to download than expose gnarly OSErrors to users
+
+    # check for unprintable unicode characters
+    # https://github.com/ArchiveBox/ArchiveBox/issues/1373
+    if output_path:
+        safe_path = output_path.encode('utf-8', 'replace').decode()
+        if output_path != safe_path:
+            # contains unprintable unicode characters that will break other parts of archivebox
+            # better to pretend it doesnt exist and fallback to parent dir than crash archivebox
+            output_path = None
+
+    # check for a path that is just too long to safely handle across different OS's
+    # https://github.com/ArchiveBox/ArchiveBox/issues/549
+    if output_path and len(output_path) > 250:
+        output_path = None
+
+    if output_path:
+        if not nocache:
+            cache.set(cache_key, output_path)
+        return output_path
+
+    # fallback to just the domain dir
+    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
+    if search_dir.is_dir():
+        return domain(link.url).replace(":", "+")
+
+    # fallback to just the domain dir without port
+    search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
+    if search_dir.is_dir():
+        return domain(link.url).split(":", 1)[0]
+
+    return None
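Net effect for callers (should_save_wget() above, plus the wget extractor's output checks): a best-effort path relative to the snapshot dir, with progressively weaker fallbacks (exact html file, any file under the domain dir, bare domain dir, domain dir without port, then None). A hedged usage sketch, assuming a Link row from the index:

    rel = wget_output_path(link)                    # fast path: served from the Django cache
    if rel is None:
        rel = wget_output_path(link, nocache=True)  # force a fresh (slow) filesystem walk
    # rel is a path relative to link.link_dir, or None if wget saved nothing usable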