mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 06:34:16 +00:00
Add EXTRA_*_ARGS
for wget, curl, and singlefile
This commit is contained in:
parent
31d05d8526
commit
4e69d2c9e1
8 changed files with 88 additions and 35 deletions
|
@ -187,12 +187,15 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'--no-parent',
|
||||
'-e', 'robots=off',
|
||||
]},
|
||||
'WGET_EXTRA_ARGS': {'type': list, 'default': None},
|
||||
'CURL_ARGS': {'type': list, 'default': ['--silent',
|
||||
'--location',
|
||||
'--compressed'
|
||||
]},
|
||||
'CURL_EXTRA_ARGS': {'type': list, 'default': None},
|
||||
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
|
||||
'SINGLEFILE_ARGS': {'type': list, 'default' : None},
|
||||
'SINGLEFILE_ARGS': {'type': list, 'default': None},
|
||||
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
|
||||
'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
|
||||
},
|
||||
|
||||
|
@ -530,6 +533,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
|||
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
|
||||
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
|
||||
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
|
||||
'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
|
||||
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
|
||||
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
|
||||
|
||||
|
@ -540,12 +544,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
|||
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
|
||||
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
|
||||
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
|
||||
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
|
||||
|
||||
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
|
||||
|
||||
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
|
||||
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
|
||||
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
|
||||
'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
|
||||
|
||||
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
|
||||
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
|
||||
|
|
|
@ -10,10 +10,12 @@ from ..system import run, chmod_file
|
|||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
CURL_ARGS,
|
||||
CURL_EXTRA_ARGS,
|
||||
CHECK_SSL_VALIDITY,
|
||||
SAVE_ARCHIVE_DOT_ORG,
|
||||
CURL_BINARY,
|
||||
|
@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
|
|||
output: ArchiveOutput = 'archive.org.txt'
|
||||
archive_org_url = None
|
||||
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*CURL_ARGS,
|
||||
# earlier options take precedence
|
||||
options = [
|
||||
'--head',
|
||||
'--max-time', str(timeout),
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
*CURL_EXTRA_ARGS,
|
||||
*CURL_ARGS,
|
||||
]
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*dedupe(*options),
|
||||
submit_url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
|
|
|
@ -6,13 +6,18 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..system import chmod_file, run
|
||||
from ..util import enforce_types, domain
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
domain,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_FAVICON,
|
||||
FAVICON_PROVIDER,
|
||||
CURL_BINARY,
|
||||
CURL_ARGS,
|
||||
CURL_EXTRA_ARGS,
|
||||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY,
|
||||
CURL_USER_AGENT,
|
||||
|
@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
|||
|
||||
out_dir = out_dir or link.link_dir
|
||||
output: ArchiveOutput = 'favicon.ico'
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*CURL_ARGS,
|
||||
# earlier options take precedence
|
||||
options = [
|
||||
'--max-time', str(timeout),
|
||||
'--output', str(output),
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
*CURL_EXTRA_ARGS,
|
||||
*CURL_ARGS,
|
||||
]
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*dedupe(*options),
|
||||
FAVICON_PROVIDER.format(domain(link.url)),
|
||||
]
|
||||
status = 'failed'
|
||||
|
|
|
@ -9,11 +9,13 @@ from ..system import atomic_write
|
|||
from ..util import (
|
||||
enforce_types,
|
||||
get_headers,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
CURL_BINARY,
|
||||
CURL_ARGS,
|
||||
CURL_EXTRA_ARGS,
|
||||
CURL_USER_AGENT,
|
||||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY,
|
||||
|
@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
|
|||
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*CURL_ARGS,
|
||||
# earlier options take precedence
|
||||
options = [
|
||||
'--head',
|
||||
'--max-time', str(timeout),
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
*CURL_EXTRA_ARGS,
|
||||
*CURL_ARGS,
|
||||
]
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*dedupe(*options),
|
||||
link.url,
|
||||
]
|
||||
try:
|
||||
|
|
|
@ -11,6 +11,7 @@ from ..util import (
|
|||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
|
@ -18,6 +19,7 @@ from ..config import (
|
|||
DEPENDENCIES,
|
||||
SINGLEFILE_VERSION,
|
||||
SINGLEFILE_ARGS,
|
||||
SINGLEFILE_EXTRA_ARGS,
|
||||
CHROME_BINARY,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
@ -46,11 +48,6 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
|
||||
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
|
||||
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
|
||||
options = [
|
||||
*SINGLEFILE_ARGS,
|
||||
'--browser-executable-path={}'.format(CHROME_BINARY),
|
||||
browser_args,
|
||||
]
|
||||
|
||||
# Deduplicate options (single-file doesn't like when you use the same option two times)
|
||||
#
|
||||
|
@ -58,19 +55,15 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
# My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
|
||||
# specificity, therefore the user sets it with a lot intent, therefore it should take precedence
|
||||
# kind of like the ergonomic principle of lexical scope in programming languages.
|
||||
seen_option_names = []
|
||||
def test_seen(argument):
|
||||
option_name = argument.split("=")[0]
|
||||
if option_name in seen_option_names:
|
||||
return False
|
||||
else:
|
||||
seen_option_names.append(option_name)
|
||||
return True
|
||||
deduped_options = list(filter(test_seen, options))
|
||||
|
||||
options = [
|
||||
'--browser-executable-path={}'.format(CHROME_BINARY),
|
||||
browser_args,
|
||||
*SINGLEFILE_EXTRA_ARGS,
|
||||
*SINGLEFILE_ARGS,
|
||||
]
|
||||
cmd = [
|
||||
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
|
||||
*deduped_options,
|
||||
*dedupe(*options),
|
||||
link.url,
|
||||
output,
|
||||
]
|
||||
|
|
|
@ -10,6 +10,7 @@ from ..util import (
|
|||
enforce_types,
|
||||
download_url,
|
||||
htmldecode,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
|
@ -17,6 +18,7 @@ from ..config import (
|
|||
SAVE_TITLE,
|
||||
CURL_BINARY,
|
||||
CURL_ARGS,
|
||||
CURL_EXTRA_ARGS,
|
||||
CURL_VERSION,
|
||||
CURL_USER_AGENT,
|
||||
)
|
||||
|
@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
|
|||
from core.models import Snapshot
|
||||
|
||||
output: ArchiveOutput = None
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*CURL_ARGS,
|
||||
# earlier options take precedence
|
||||
options = [
|
||||
'--max-time', str(timeout),
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
*CURL_EXTRA_ARGS,
|
||||
*CURL_ARGS,
|
||||
]
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
*dedupe(*options),
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
|
|
|
@ -15,9 +15,11 @@ from ..util import (
|
|||
path,
|
||||
domain,
|
||||
urldecode,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
WGET_ARGS,
|
||||
WGET_EXTRA_ARGS,
|
||||
TIMEOUT,
|
||||
SAVE_WGET,
|
||||
SAVE_WARC,
|
||||
|
@ -55,10 +57,8 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
|||
|
||||
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
|
||||
output: ArchiveOutput = None
|
||||
cmd = [
|
||||
WGET_BINARY,
|
||||
# '--server-response', # print headers for better error parsing
|
||||
*WGET_ARGS,
|
||||
# earlier options take precedence
|
||||
options = [
|
||||
'--timeout={}'.format(timeout),
|
||||
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
|
||||
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
|
||||
|
@ -68,6 +68,13 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
|||
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
|
||||
*([] if SAVE_WARC else ['--timestamping']),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
|
||||
# '--server-response', # print headers for better error parsing
|
||||
*WGET_EXTRA_ARGS,
|
||||
*WGET_ARGS,
|
||||
]
|
||||
cmd = [
|
||||
WGET_BINARY,
|
||||
*dedupe(*options),
|
||||
link.url,
|
||||
]
|
||||
|
||||
|
|
|
@ -317,6 +317,23 @@ def ansi_to_html(text):
|
|||
return COLOR_REGEX.sub(single_sub, text)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def dedupe(*options: List[str]) -> List[str]:
|
||||
"""
|
||||
Deduplicates the given options. Options that come earlier in the list clobber
|
||||
later conflicting options.
|
||||
"""
|
||||
seen_option_names = []
|
||||
def test_seen(argument):
|
||||
option_name = argument.split("=")[0]
|
||||
if option_name in seen_option_names:
|
||||
return False
|
||||
else:
|
||||
seen_option_names.append(option_name)
|
||||
return True
|
||||
return list(filter(test_seen, options))
|
||||
|
||||
|
||||
class AttributeDict(dict):
|
||||
"""Helper to allow accessing dict values via Example.key or Example['key']"""
|
||||
|
||||
|
|
Loading…
Reference in a new issue