diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 921c258a..df2b01c8 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -78,7 +78,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'}, 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, - 'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'} + 'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'}, 'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'}, 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}, diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 7522ddb8..185a01cb 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -6,18 +6,18 @@ from typing import Optional, List, Dict, Tuple from collections import defaultdict from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, PIPE, DEVNULL, chmod_file +from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, ) from ..config import ( - VERSION, TIMEOUT, + CHECK_SSL_VALIDITY, SAVE_ARCHIVE_DOT_ORG, CURL_BINARY, CURL_VERSION, - CHECK_SSL_VALIDITY + CURL_USER_AGENT, ) from ..cli.logging import TimedProgress @@ -45,17 +45,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=T submit_url = 'https://web.archive.org/save/{}'.format(link.url) cmd = [ CURL_BINARY, + '--silent', '--location', '--head', - '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from '--max-time', str(timeout), + *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), submit_url, ] status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: - result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout) + result = run(cmd, cwd=out_dir, timeout=timeout) content_location, errors = parse_archive_dot_org_response(result.stdout) if content_location: archive_org_url = 'https://web.archive.org{}'.format(content_location[0]) diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py index 331531c0..b46137b6 100644 --- a/archivebox/extractors/dom.py +++ b/archivebox/extractors/dom.py @@ -5,7 +5,7 @@ import os from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, PIPE, chmod_file +from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, @@ -47,7 +47,7 @@ def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A timer = TimedProgress(timeout, prefix=' ') try: with open(output_path, 'w+') as f: - result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout) + result = run(cmd, stdout=f, cwd=out_dir, timeout=timeout) if result.returncode: hints = result.stderr.decode() diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 40433a69..2f5e87ba 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -5,7 +5,7 @@ import os from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput -from ..system import chmod_file, run, PIPE +from ..system import chmod_file, run from ..util import enforce_types, domain from ..config import ( TIMEOUT, @@ -38,14 +38,14 @@ def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) '--max-time', str(timeout), '--location', '--output', str(output), - *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else [], + *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), 'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)), ] status = 'pending' timer = TimedProgress(timeout, prefix=' ') try: - run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) + run(cmd, cwd=out_dir, timeout=timeout) chmod_file(output, cwd=out_dir) status = 'succeeded' except Exception as err: diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py index 54e67d8b..75674ab8 100644 --- a/archivebox/extractors/git.py +++ b/archivebox/extractors/git.py @@ -5,7 +5,7 @@ import os from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, PIPE, chmod_file +from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, @@ -64,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) + result = run(cmd, cwd=output_path, timeout=timeout + 1) if result.returncode == 128: # ignore failed re-download when the folder already exists diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 861f3459..554f27c9 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -5,7 +5,7 @@ import os from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, PIPE, chmod_file +from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, @@ -66,7 +66,7 @@ def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEO status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) + result = run(cmd, cwd=output_path, timeout=timeout + 1) chmod_file(output, cwd=out_dir) if result.returncode: if (b'ERROR: Unsupported URL' in result.stderr diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index c29f3b22..3786c4cc 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -5,7 +5,7 @@ import os from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, PIPE, chmod_file +from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, @@ -45,7 +45,7 @@ def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) + result = run(cmd, cwd=out_dir, timeout=timeout) if result.returncode: hints = (result.stderr or result.stdout).decode() diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py index d2879c95..33936499 100644 --- a/archivebox/extractors/screenshot.py +++ b/archivebox/extractors/screenshot.py @@ -5,7 +5,7 @@ import os from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, PIPE, chmod_file +from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, @@ -45,7 +45,7 @@ def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOU status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) + result = run(cmd, cwd=out_dir, timeout=timeout) if result.returncode: hints = (result.stderr or result.stdout).decode() diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 497c0ffb..b54d5a04 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -12,9 +12,11 @@ from ..util import ( ) from ..config import ( TIMEOUT, + CHECK_SSL_VALIDITY, SAVE_TITLE, CURL_BINARY, CURL_VERSION, + CURL_USER_AGENT, ) from ..cli.logging import TimedProgress @@ -44,6 +46,11 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> output: ArchiveOutput = None cmd = [ CURL_BINARY, + '--silent', + '--max-time', str(timeout), + '--location', + *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), + *([] if CHECK_SSL_VALIDITY else ['--insecure']), link.url, '|', 'grep', diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 4f6d7000..50d0111d 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -7,7 +7,7 @@ from typing import Optional from datetime import datetime from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, PIPE +from ..system import run from ..util import ( enforce_types, is_static_file, @@ -81,7 +81,7 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout) + result = run(cmd, cwd=out_dir, timeout=timeout) output = wget_output_path(link) # parse out number of files downloaded from last line of stderr: diff --git a/archivebox/system.py b/archivebox/system.py index 4200ec9b..4f238ceb 100644 --- a/archivebox/system.py +++ b/archivebox/system.py @@ -4,69 +4,44 @@ __package__ = 'archivebox' import os import shutil -import json as pyjson +from json import dump +from pathlib import Path from typing import Optional, Union, Set, Tuple +from subprocess import run as subprocess_run from crontab import CronTab -from atomicwrites import atomic_write as awrite - -from subprocess import ( - Popen, - PIPE, - DEVNULL, - CompletedProcess, - TimeoutExpired, - CalledProcessError, -) +from atomicwrites import atomic_write as lib_atomic_write from .util import enforce_types, ExtendedEncoder from .config import OUTPUT_PERMISSIONS -def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs): +def run(*args, input=None, capture_output=True, text=True, timeout=None, check=False, **kwargs): """Patched of subprocess.run to fix blocking io making timeout=innefective""" if input is not None: if 'stdin' in kwargs: raise ValueError('stdin and input arguments may not both be used.') - kwargs['stdin'] = PIPE if capture_output: if ('stdout' in kwargs) or ('stderr' in kwargs): raise ValueError('stdout and stderr arguments may not be used ' 'with capture_output.') - kwargs['stdout'] = PIPE - kwargs['stderr'] = PIPE - with Popen(*popenargs, **kwargs) as process: - try: - stdout, stderr = process.communicate(input, timeout=timeout) - except TimeoutExpired: - process.kill() - try: - stdout, stderr = process.communicate(input, timeout=2) - except: - pass - raise TimeoutExpired(popenargs[0][0], timeout) - except BaseException: - process.kill() - # We don't call process.wait() as .__exit__ does that for us. - raise - retcode = process.poll() - if check and retcode: - raise CalledProcessError(retcode, process.args, - output=stdout, stderr=stderr) - return CompletedProcess(process.args, retcode, stdout, stderr) + return subprocess_run(*args, input=input, capture_output=capture_output, text=text, timeout=timeout, check=check, **kwargs) - -def atomic_write(path: str, contents: Union[dict, str, bytes], overwrite: bool=True) -> None: +@enforce_types +def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None: """Safe atomic write to filesystem by writing to temp file + atomic rename""" - with awrite(path, overwrite=overwrite) as f: + mode = 'wb+' if isinstance(contents, bytes) else 'w' + + # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}') + with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f: if isinstance(contents, dict): - pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) - else: + dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) + elif isinstance(contents, (bytes, str)): f.write(contents) @enforce_types @@ -76,7 +51,7 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim if not os.path.exists(os.path.join(cwd, path)): raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path)) - chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout) + chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, timeout=timeout) if chmod_result.returncode == 1: print(' ', chmod_result.stderr.decode()) raise Exception('Failed to chmod {}/{}'.format(cwd, path))