# archivebox/extractors/wget.py
__package__ = 'archivebox.extractors'

import re

from pathlib import Path
from typing import Optional
from datetime import datetime, timezone

from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
    enforce_types,
    without_fragment,
    without_query,
    path,
    domain,
    urldecode,
)
from ..config import (
    WGET_ARGS,
    TIMEOUT,
    SAVE_WGET,
    SAVE_WARC,
    WGET_BINARY,
    WGET_VERSION,
    RESTRICT_FILE_NAMES,
    CHECK_SSL_VALIDITY,
    SAVE_WGET_REQUISITES,
    WGET_AUTO_COMPRESSION,
    WGET_USER_AGENT,
    COOKIES_FILE,
)
from ..logging_util import TimedProgress
@enforce_types
def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    """Return True if wget should run for this link.

    Skips the download when a previous wget output already exists in the
    snapshot directory (unless overwrite is requested); otherwise defers
    to the SAVE_WGET config flag.
    """
    target_dir = out_dir or Path(link.link_dir)
    existing_output = wget_output_path(link)

    already_archived = bool(existing_output) and (target_dir / existing_output).exists()
    if already_archived and not overwrite:
        return False

    return SAVE_WGET
@enforce_types
def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget

    Runs wget in the snapshot directory (optionally writing a WARC archive),
    then locates the downloaded output file on disk via wget_output_path(),
    since wget's output naming cannot be predicted up front.

    Args:
        link: the Link to archive
        out_dir: snapshot directory to download into (defaults to link.link_dir)
        timeout: max seconds to let wget run

    Returns:
        ArchiveResult with status 'succeeded'/'failed', the relative output
        path (or the raised exception on failure), and timing stats.
    """
    # BUGFIX: coerce to Path so the `/` operators below work — link.link_dir
    # is a plain str elsewhere in this file (see Path(link.link_dir) usages),
    # so `out_dir / "warc"` would raise TypeError when out_dir was None
    out_dir = out_dir or Path(link.link_dir)
    if SAVE_WARC:
        warc_dir = out_dir / "warc"
        warc_dir.mkdir(exist_ok=True)
        # name the WARC after the current unix timestamp so reruns don't collide
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    cmd = [
        WGET_BINARY,
        # '--server-response', # print headers for better error parsing
        *WGET_ARGS,
        '--timeout={}'.format(timeout),
        *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
        *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
        *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
        # --timestamping conflicts with WARC output, so only pass it otherwise
        *([] if SAVE_WARC else ['--timestamping']),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        link.url,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )
        hints = (
            'Got wget response code: {}.'.format(result.returncode),
            *output_tail,
        )

        # Check for common failure cases
        if (result.returncode > 0 and files_downloaded < 1) or output is None:
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Wget failed or got an error from the server', hints)

        if (out_dir / output).exists():
            chmod_file(output, cwd=str(out_dir))
        else:
            print(f' {out_dir}/{output}')
            raise ArchiveError('Failed to find wget output after running', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension (-E)

    Returns the output path relative to link.link_dir, or None if no
    plausible wget output file could be found on disk.
    """
    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
    # Since the wget algorithm for -E (appending .html) is incredibly complex
    # and there's no way to get the computed output path from wget
    # in order to avoid having to reverse-engineer how they calculate it,
    # we just look in the output folder read the filename wget used from the filesystem
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)

    # walk up to 4 directory levels from the deepest candidate path,
    # preferring any html-ish file found at each level
    for _ in range(4):
        if search_dir.exists():
            if search_dir.is_dir():
                html_files = [
                    f for f in search_dir.iterdir()
                    if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
                ]
                if html_files:
                    return str(html_files[0].relative_to(link.link_dir))

                # sometimes wget'd URLs have no ext and return non-html
                # e.g. /some/example/rss/all -> some RSS XML content)
                #      /some/other/url.o4g -> some binary unrecognized ext)
                # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
                last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
                for file_present in search_dir.iterdir():
                    # BUGFIX: iterdir() yields full Path objects, so compare by
                    # basename — comparing a Path to a str was always False and
                    # made this extensionless-file fallback dead code
                    if file_present.name == last_part_of_url:
                        return str(file_present.relative_to(link.link_dir))

        # Move up one directory level
        search_dir = search_dir.parent

        if str(search_dir) == link.link_dir:
            break

    # check for literally any file present that isnt an empty folder
    domain_dir = Path(domain(link.url).replace(":", "+"))
    files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
    if files_within:
        # glob() returns paths already rooted at link.link_dir, so take them
        # relative directly (the old `domain_dir / abs_path` join was a no-op:
        # joining onto an absolute path discards the left operand)
        return str(files_within[-1].relative_to(link.link_dir))

    # fallback to just the domain dir
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
    if search_dir.is_dir():
        return domain(link.url).replace(":", "+")

    return None