mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-26 06:00:22 +00:00
bump timeouts and improve curl archive method
This commit is contained in:
parent
370adb56bf
commit
7ea36c4adb
2 changed files with 18 additions and 8 deletions
|
@ -214,6 +214,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
|||
'--span-hosts',
|
||||
'--no-parent',
|
||||
'--restrict-file-names=unix',
|
||||
f'--timeout={timeout}',
|
||||
*(('--warc-file={}'.format(warc_path),) if warc else ()),
|
||||
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
|
||||
*(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
|
||||
|
@ -222,7 +223,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
|||
]
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # index.html
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # index.html
|
||||
end()
|
||||
output = wget_output_path(link, look_in=domain_dir)
|
||||
|
||||
|
@ -265,13 +266,13 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
|
|||
*chrome_headless(user_data_dir=user_data_dir),
|
||||
'--print-to-pdf',
|
||||
'--hide-scrollbars',
|
||||
'--timeout=58000',
|
||||
'--timeout={timeout * 1000}',
|
||||
*(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
|
||||
link['url']
|
||||
]
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.pdf
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # output.pdf
|
||||
end()
|
||||
if result.returncode:
|
||||
print(' ', (result.stderr or result.stdout).decode())
|
||||
|
@ -304,14 +305,14 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_
|
|||
'--screenshot',
|
||||
'--window-size={}'.format(resolution),
|
||||
'--hide-scrollbars',
|
||||
'--timeout=58000',
|
||||
'--timeout={timeout * 1000}',
|
||||
*(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
|
||||
# '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true
|
||||
link['url'],
|
||||
]
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # sreenshot.png
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # sreenshot.png
|
||||
end()
|
||||
if result.returncode:
|
||||
print(' ', (result.stderr or result.stdout).decode())
|
||||
|
@ -344,12 +345,13 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
|
|||
CMD = [
|
||||
*chrome_headless(user_data_dir=user_data_dir),
|
||||
'--dump-dom',
|
||||
'--timeout={timeout * 1000}',
|
||||
link['url']
|
||||
]
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
with open(output_path, 'w+') as f:
|
||||
result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.html
|
||||
result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # output.html
|
||||
end()
|
||||
if result.returncode:
|
||||
print(' ', (result.stderr).decode())
|
||||
|
@ -379,7 +381,15 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
|||
submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
|
||||
|
||||
success = False
|
||||
CMD = ['curl', '-L', '-I', '-X', 'GET', submit_url]
|
||||
CMD = [
|
||||
'curl',
|
||||
'--location',
|
||||
'--head',
|
||||
'--max-time', str(timeout),
|
||||
'--get',
|
||||
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
||||
submit_url,
|
||||
]
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # archive.org.txt
|
||||
|
|
|
@ -123,7 +123,7 @@ def progress(seconds=TIMEOUT, prefix=''):
|
|||
chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
|
||||
chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
|
||||
|
||||
def progress_bar(seconds=seconds, prefix=prefix):
|
||||
def progress_bar(seconds, prefix):
|
||||
"""show timer in the form of progress bar, with percentage and seconds remaining"""
|
||||
try:
|
||||
for s in range(seconds * chunks):
|
||||
|
|
Loading…
Reference in a new issue