mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 22:54:22 +00:00
bump timeouts and improve curl archive method
This commit is contained in:
parent
370adb56bf
commit
7ea36c4adb
2 changed files with 18 additions and 8 deletions
|
@ -214,6 +214,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
'--span-hosts',
|
'--span-hosts',
|
||||||
'--no-parent',
|
'--no-parent',
|
||||||
'--restrict-file-names=unix',
|
'--restrict-file-names=unix',
|
||||||
|
f'--timeout={timeout}',
|
||||||
*(('--warc-file={}'.format(warc_path),) if warc else ()),
|
*(('--warc-file={}'.format(warc_path),) if warc else ()),
|
||||||
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
|
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
|
||||||
*(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
|
*(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
|
||||||
|
@ -222,7 +223,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
]
|
]
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # index.html
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # index.html
|
||||||
end()
|
end()
|
||||||
output = wget_output_path(link, look_in=domain_dir)
|
output = wget_output_path(link, look_in=domain_dir)
|
||||||
|
|
||||||
|
@ -265,13 +266,13 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
|
||||||
*chrome_headless(user_data_dir=user_data_dir),
|
*chrome_headless(user_data_dir=user_data_dir),
|
||||||
'--print-to-pdf',
|
'--print-to-pdf',
|
||||||
'--hide-scrollbars',
|
'--hide-scrollbars',
|
||||||
'--timeout=58000',
|
'--timeout={timeout * 1000}',
|
||||||
*(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
|
*(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
|
||||||
link['url']
|
link['url']
|
||||||
]
|
]
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.pdf
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # output.pdf
|
||||||
end()
|
end()
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
print(' ', (result.stderr or result.stdout).decode())
|
print(' ', (result.stderr or result.stdout).decode())
|
||||||
|
@ -304,14 +305,14 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_
|
||||||
'--screenshot',
|
'--screenshot',
|
||||||
'--window-size={}'.format(resolution),
|
'--window-size={}'.format(resolution),
|
||||||
'--hide-scrollbars',
|
'--hide-scrollbars',
|
||||||
'--timeout=58000',
|
'--timeout={timeout * 1000}',
|
||||||
*(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
|
*(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
|
||||||
# '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true
|
# '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true
|
||||||
link['url'],
|
link['url'],
|
||||||
]
|
]
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # screenshot.png
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # screenshot.png
|
||||||
end()
|
end()
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
print(' ', (result.stderr or result.stdout).decode())
|
print(' ', (result.stderr or result.stdout).decode())
|
||||||
|
@ -344,12 +345,13 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
|
||||||
CMD = [
|
CMD = [
|
||||||
*chrome_headless(user_data_dir=user_data_dir),
|
*chrome_headless(user_data_dir=user_data_dir),
|
||||||
'--dump-dom',
|
'--dump-dom',
|
||||||
|
'--timeout={timeout * 1000}',
|
||||||
link['url']
|
link['url']
|
||||||
]
|
]
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
with open(output_path, 'w+') as f:
|
with open(output_path, 'w+') as f:
|
||||||
result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.html
|
result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # output.html
|
||||||
end()
|
end()
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
print(' ', (result.stderr).decode())
|
print(' ', (result.stderr).decode())
|
||||||
|
@ -379,7 +381,15 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
||||||
submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
|
submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
|
||||||
|
|
||||||
success = False
|
success = False
|
||||||
CMD = ['curl', '-L', '-I', '-X', 'GET', submit_url]
|
CMD = [
|
||||||
|
'curl',
|
||||||
|
'--location',
|
||||||
|
'--head',
|
||||||
|
'--max-time', str(timeout),
|
||||||
|
'--get',
|
||||||
|
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
||||||
|
submit_url,
|
||||||
|
]
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # archive.org.txt
|
result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # archive.org.txt
|
||||||
|
|
|
@ -123,7 +123,7 @@ def progress(seconds=TIMEOUT, prefix=''):
|
||||||
chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
|
chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
|
||||||
chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
|
chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
|
||||||
|
|
||||||
def progress_bar(seconds=seconds, prefix=prefix):
|
def progress_bar(seconds, prefix):
|
||||||
"""show timer in the form of progress bar, with percentage and seconds remaining"""
|
"""show timer in the form of progress bar, with percentage and seconds remaining"""
|
||||||
try:
|
try:
|
||||||
for s in range(seconds * chunks):
|
for s in range(seconds * chunks):
|
||||||
|
|
Loading…
Reference in a new issue