diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 815be551..a9429d42 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -35,6 +35,7 @@ from .dom import should_save_dom, save_dom
 from .git import should_save_git, save_git
 from .media import should_save_media, save_media
 from .archive_org import should_save_archive_dot_org, save_archive_dot_org
+from .headers import should_save_headers, save_headers
 
 def get_default_archive_methods():
     return [
@@ -49,6 +50,7 @@ def get_default_archive_methods():
         ('mercury', should_save_mercury, save_mercury),
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),
+        ('headers', should_save_headers, save_headers),
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py
new file mode 100644
index 00000000..9987d4ed
--- /dev/null
+++ b/archivebox/extractors/headers.py
@@ -0,0 +1,71 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..system import atomic_write
+from ..util import (
+    enforce_types,
+    get_headers,
+    is_static_file,
+)
+from ..config import (
+    TIMEOUT,
+    CURL_BINARY,
+    CURL_USER_AGENT,
+    CURL_VERSION,
+    CHECK_SSL_VALIDITY,
+)
+from ..logging_util import TimedProgress
+
+
+@enforce_types
+def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool:
+    if is_static_file(link.url):
+        return False
+
+    output = Path(out_dir or link.link_dir) / 'headers.json'
+    return not output.exists()
+
+
+@enforce_types
+def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """Download site headers"""
+
+    out_dir = Path(out_dir or link.link_dir)
+    output_folder = out_dir.absolute()
+    output: ArchiveOutput = 'headers.json'
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+
+    # equivalent curl invocation, recorded in the index alongside the result
+    cmd = [
+        CURL_BINARY,
+        '-s',
+        '-I',
+        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
+        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        link.url,
+    ]
+    try:
+        json_headers = get_headers(link.url, timeout=timeout)
+
+        output_folder.mkdir(exist_ok=True)
+        atomic_write(str(output_folder / 'headers.json'), json_headers)
+    except (Exception, OSError) as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=CURL_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
diff --git a/archivebox/util.py b/archivebox/util.py
index f5a6e2d7..43994bbd 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -173,6 +173,21 @@ def download_url(url: str, timeout: int=None) -> str:
     return response.text
 
 
+@enforce_types
+def get_headers(url: str, timeout: int=None) -> str:
+    """Fetch a remote URL and return its response headers as a JSON string"""
+    from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
+    timeout = timeout or TIMEOUT
+    # note: issues a full GET request; only the response headers are kept
+    response = requests.get(
+        url,
+        headers={'User-Agent': WGET_USER_AGENT},
+        verify=CHECK_SSL_VALIDITY,
+        timeout=timeout,
+    )
+
+    return pyjson.dumps(dict(response.headers))
+
 @enforce_types
 def chrome_args(**options) -> List[str]:
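
Note for reviewers: a minimal sketch of how the new get_headers helper behaves, assuming an ArchiveBox checkout on PYTHONPATH, the requests dependency installed, and network access (https://example.com is just a placeholder URL):

# sketch, not part of the patch: exercise the new helper directly
from archivebox.util import get_headers

raw = get_headers('https://example.com', timeout=10)
print(raw)  # JSON string of the response headers,
            # e.g. '{"Content-Type": "text/html; charset=UTF-8", ...}'

save_headers() writes this same JSON string to headers.json inside the snapshot directory via atomic_write, and records the equivalent curl -s -I command in the ArchiveResult so it shows up in the index like the other extractors.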