bump timeouts and improve curl archive method

Nick Sweeting 2019-01-20 12:34:15 -05:00
parent 370adb56bf
commit 7ea36c4adb
2 changed files with 18 additions and 8 deletions


@@ -214,6 +214,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
         '--span-hosts',
         '--no-parent',
         '--restrict-file-names=unix',
+        f'--timeout={timeout}',
         *(('--warc-file={}'.format(warc_path),) if warc else ()),
         *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
         *(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
@@ -222,7 +223,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
     ]
     end = progress(timeout, prefix=' ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # index.html
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5)  # index.html
         end()
         output = wget_output_path(link, look_in=domain_dir)
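
The wget change is twofold: a `--timeout` flag now caps wget's own DNS, connect, and read stalls, and the subprocess backstop was widened from `timeout + 1` to `timeout + 5` seconds so wget can hit its own limit and exit cleanly before Python kills it. A minimal sketch of that pattern (function and argument names here are illustrative, not the project's):

    from subprocess import PIPE, TimeoutExpired, run

    def fetch_with_wget(url, timeout=60):
        cmd = [
            'wget',
            '--timeout={}'.format(timeout),  # wget-side limit on DNS/connect/read stalls
            url,
        ]
        try:
            # timeout + 5 gives wget a chance to hit its own limit and exit
            # cleanly before Python forcibly kills the process
            return run(cmd, stdout=PIPE, stderr=PIPE, timeout=timeout + 5)
        except TimeoutExpired:
            return None
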
@@ -265,13 +266,13 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
         *chrome_headless(user_data_dir=user_data_dir),
         '--print-to-pdf',
         '--hide-scrollbars',
-        '--timeout=58000',
+        f'--timeout={timeout * 1000}',
         *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
         link['url']
     ]
     end = progress(timeout, prefix=' ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # output.pdf
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5)  # output.pdf
         end()
         if result.returncode:
             print(' ', (result.stderr or result.stdout).decode())
@@ -304,14 +305,14 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_
         '--screenshot',
         '--window-size={}'.format(resolution),
         '--hide-scrollbars',
-        '--timeout=58000',
+        f'--timeout={timeout * 1000}',
         *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
         # '--full-page',   # TODO: make this actually work using ./bin/screenshot fullPage: true
         link['url'],
     ]
     end = progress(timeout, prefix=' ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # sreenshot.png
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5)  # screenshot.png
         end()
         if result.returncode:
             print(' ', (result.stderr or result.stdout).decode())
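
Both Chrome-based methods previously hardcoded `--timeout=58000`; the flag is now derived from each method's `timeout` argument. Chrome's `--timeout` expects milliseconds, hence the `* 1000`. A minimal sketch of the shared pattern (the binary name and extra flags are assumptions; the real code assembles them via `chrome_headless()`):

    from subprocess import PIPE, run

    def chrome_capture(url, mode_flag='--print-to-pdf', timeout=60):
        cmd = [
            'chromium-browser', '--headless', '--disable-gpu',
            mode_flag,                       # '--print-to-pdf' or '--screenshot'
            '--hide-scrollbars',
            f'--timeout={timeout * 1000}',   # Chrome expects milliseconds
            url,
        ]
        return run(cmd, stdout=PIPE, stderr=PIPE, timeout=timeout + 5)
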
@@ -344,12 +345,13 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
     CMD = [
         *chrome_headless(user_data_dir=user_data_dir),
         '--dump-dom',
+        f'--timeout={timeout * 1000}',
         link['url']
     ]
     end = progress(timeout, prefix=' ')
     try:
         with open(output_path, 'w+') as f:
-            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # output.html
+            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 5)  # output.html
         end()
         if result.returncode:
             print(' ', (result.stderr).decode())
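
`fetch_dom` differs from the other Chrome methods only in its output handling: `--dump-dom` prints the rendered HTML to stdout, so an open file handle is passed as the subprocess's stdout instead of capturing with PIPE. A minimal sketch (names assumed):

    from subprocess import PIPE, run

    def dump_dom(url, output_path='output.html', timeout=60):
        cmd = [
            'chromium-browser', '--headless', '--disable-gpu',
            '--dump-dom',
            f'--timeout={timeout * 1000}',
            url,
        ]
        # write the rendered DOM directly to the output file
        with open(output_path, 'w+') as f:
            return run(cmd, stdout=f, stderr=PIPE, timeout=timeout + 5)
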
@@ -379,7 +381,15 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
     submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
     success = False
-    CMD = ['curl', '-L', '-I', '-X', 'GET', submit_url]
+    CMD = [
+        'curl',
+        '--location',
+        '--head',
+        '--max-time', str(timeout),
+        '--get',
+        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
+        submit_url,
+    ]
     end = progress(timeout, prefix=' ')
     try:
         result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # archive.org.txt
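
The curl invocation now uses long-form flags, enforces the timeout on the curl side via `--max-time`, and only verifies certificates when CHECK_SSL_VALIDITY is set. The response handling isn't shown in this diff; as one hedged sketch of how the HEAD output might be consumed, assuming the Wayback Machine reports the snapshot path in a `Content-Location` response header:

    from subprocess import DEVNULL, PIPE, run

    def archive_dot_org(url, timeout=60, check_ssl=True):
        submit_url = 'https://web.archive.org/save/{}'.format(url)
        cmd = [
            'curl',
            '--location',                 # follow redirects, like -L
            '--head',                     # headers only, like -I
            '--max-time', str(timeout),   # curl-side timeout in seconds
            *(() if check_ssl else ('--insecure',)),
            submit_url,
        ]
        result = run(cmd, stdout=PIPE, stderr=DEVNULL, timeout=timeout + 1)
        # assumption: the snapshot path comes back as a Content-Location header
        for line in result.stdout.decode().splitlines():
            if line.lower().startswith('content-location:'):
                return 'https://web.archive.org' + line.split(':', 1)[1].strip()
        return None
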


@@ -123,7 +123,7 @@ def progress(seconds=TIMEOUT, prefix=''):
     chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
     chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
-    def progress_bar(seconds=seconds, prefix=prefix):
+    def progress_bar(seconds, prefix):
         """show timer in the form of progress bar, with percentage and seconds remaining"""
         try:
             for s in range(seconds * chunks):
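
Dropping the `seconds=seconds, prefix=prefix` defaults means `progress_bar`'s arguments must now be passed explicitly by its caller. A minimal sketch of the surrounding pattern, assuming the bar runs in a child process that the returned `end()` callback terminates (only the signature change is visible in the diff, so the rest is illustrative):

    import sys
    import time
    from multiprocessing import Process

    def progress(seconds, prefix=''):
        """Start a background progress bar; returns a function that stops it."""
        def progress_bar(seconds, prefix):
            # explicit parameters instead of defaults bound at definition time
            for s in range(seconds):
                sys.stdout.write('\r{}{}s remaining'.format(prefix, seconds - s))
                sys.stdout.flush()
                time.sleep(1)

        p = Process(target=progress_bar, args=(seconds, prefix))
        p.start()

        def end():
            p.terminate()  # stop the bar early once the wrapped task finishes
            sys.stdout.write('\n')
        return end
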