mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-22 20:23:12 +00:00
nicer error formats
This commit is contained in:
parent
de6eb649e0
commit
071b39b944
1 changed file with 40 additions and 24 deletions
|
@ -224,27 +224,27 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
]
|
]
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # index.html
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) # index.html
|
||||||
end()
|
end()
|
||||||
output = wget_output_path(link, look_in=domain_dir)
|
output = wget_output_path(link, look_in=domain_dir)
|
||||||
|
|
||||||
# Check for common failure cases
|
# Check for common failure cases
|
||||||
if result.returncode > 0:
|
if result.returncode > 0:
|
||||||
print(' got wget response code {}:'.format(result.returncode))
|
print(' Got wget response code {}:'.format(result.returncode))
|
||||||
if result.returncode != 8:
|
print('\n'.join(' ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()))
|
||||||
print('\n'.join(' ' + line for line in (result.stderr or result.stdout).decode().rsplit('\n', 10)[-10:] if line.strip()))
|
|
||||||
if b'403: Forbidden' in result.stderr:
|
if b'403: Forbidden' in result.stderr:
|
||||||
raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
|
raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
|
||||||
if b'404: Not Found' in result.stderr:
|
if b'404: Not Found' in result.stderr:
|
||||||
raise Exception('404 Not Found')
|
raise Exception('404 Not Found')
|
||||||
if b'ERROR 500: Internal Server Error' in result.stderr:
|
if b'ERROR 500: Internal Server Error' in result.stderr:
|
||||||
raise Exception('500 Internal Server Error')
|
raise Exception('500 Internal Server Error')
|
||||||
if result.returncode == 4:
|
raise Exception('Got an error from the server')
|
||||||
raise Exception('Failed wget download')
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
end()
|
end()
|
||||||
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
|
print(' {}Some resources were skipped: {}{}'.format(ANSI['lightyellow'], e, ANSI['reset']))
|
||||||
print(' {}Warning: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
print(' Run to see full output:')
|
||||||
|
print(' cd {};'.format(link_dir))
|
||||||
|
print(' {}'.format(' '.join(CMD)))
|
||||||
output = e
|
output = e
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -267,13 +267,13 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
|
||||||
*chrome_headless(user_data_dir=user_data_dir),
|
*chrome_headless(user_data_dir=user_data_dir),
|
||||||
'--print-to-pdf',
|
'--print-to-pdf',
|
||||||
'--hide-scrollbars',
|
'--hide-scrollbars',
|
||||||
'--timeout={timeout * 1000}',
|
'--timeout={}'.format((timeout) * 1000),
|
||||||
*(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
|
*(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
|
||||||
link['url']
|
link['url']
|
||||||
]
|
]
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # output.pdf
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) # output.pdf
|
||||||
end()
|
end()
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
print(' ', (result.stderr or result.stdout).decode())
|
print(' ', (result.stderr or result.stdout).decode())
|
||||||
|
@ -282,8 +282,10 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
|
||||||
output = 'output.pdf'
|
output = 'output.pdf'
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
end()
|
end()
|
||||||
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
|
|
||||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||||
|
print(' Run to see full output:')
|
||||||
|
print(' cd {};'.format(link_dir))
|
||||||
|
print(' {}'.format(' '.join(CMD)))
|
||||||
output = e
|
output = e
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -306,14 +308,14 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_
|
||||||
'--screenshot',
|
'--screenshot',
|
||||||
'--window-size={}'.format(resolution),
|
'--window-size={}'.format(resolution),
|
||||||
'--hide-scrollbars',
|
'--hide-scrollbars',
|
||||||
'--timeout={timeout * 1000}',
|
'--timeout={}'.format((timeout) * 1000),
|
||||||
*(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
|
*(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
|
||||||
# '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true
|
# '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true
|
||||||
link['url'],
|
link['url'],
|
||||||
]
|
]
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # sreenshot.png
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) # sreenshot.png
|
||||||
end()
|
end()
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
print(' ', (result.stderr or result.stdout).decode())
|
print(' ', (result.stderr or result.stdout).decode())
|
||||||
|
@ -322,8 +324,10 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_
|
||||||
output = 'screenshot.png'
|
output = 'screenshot.png'
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
end()
|
end()
|
||||||
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
|
|
||||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||||
|
print(' Run to see full output:')
|
||||||
|
print(' cd {};'.format(link_dir))
|
||||||
|
print(' {}'.format(' '.join(CMD)))
|
||||||
output = e
|
output = e
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -346,13 +350,13 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
|
||||||
CMD = [
|
CMD = [
|
||||||
*chrome_headless(user_data_dir=user_data_dir),
|
*chrome_headless(user_data_dir=user_data_dir),
|
||||||
'--dump-dom',
|
'--dump-dom',
|
||||||
'--timeout={timeout * 1000}',
|
'--timeout={}'.format((timeout) * 1000),
|
||||||
link['url']
|
link['url']
|
||||||
]
|
]
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
with open(output_path, 'w+') as f:
|
with open(output_path, 'w+') as f:
|
||||||
result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 5) # output.html
|
result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout) # output.html
|
||||||
end()
|
end()
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
print(' ', (result.stderr).decode())
|
print(' ', (result.stderr).decode())
|
||||||
|
@ -361,8 +365,10 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
|
||||||
output = 'output.html'
|
output = 'output.html'
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
end()
|
end()
|
||||||
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
|
|
||||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||||
|
print(' Run to see full output:')
|
||||||
|
print(' cd {};'.format(link_dir))
|
||||||
|
print(' {}'.format(' '.join(CMD)))
|
||||||
output = e
|
output = e
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -393,7 +399,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
||||||
]
|
]
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # archive.org.txt
|
result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout) # archive.org.txt
|
||||||
end()
|
end()
|
||||||
|
|
||||||
# Parse archive.org response headers
|
# Parse archive.org response headers
|
||||||
|
@ -422,8 +428,9 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
||||||
raise Exception('Failed to find "content-location" URL header in Archive.org response.')
|
raise Exception('Failed to find "content-location" URL header in Archive.org response.')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
end()
|
end()
|
||||||
print(' Visit url to see output:', ' '.join(CMD))
|
|
||||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||||
|
print(' Run to see full output:')
|
||||||
|
print(' {}'.format(' '.join(CMD)))
|
||||||
output = e
|
output = e
|
||||||
|
|
||||||
if success:
|
if success:
|
||||||
|
@ -444,11 +451,15 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
|
||||||
if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
|
if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
|
||||||
return {'output': 'favicon.ico', 'status': 'skipped'}
|
return {'output': 'favicon.ico', 'status': 'skipped'}
|
||||||
|
|
||||||
CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)]
|
CMD = [
|
||||||
|
'curl',
|
||||||
|
'--max-time', str(timeout),
|
||||||
|
'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
|
||||||
|
]
|
||||||
fout = open('{}/favicon.ico'.format(link_dir), 'w')
|
fout = open('{}/favicon.ico'.format(link_dir), 'w')
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # favicon.ico
|
run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout) # favicon.ico
|
||||||
fout.close()
|
fout.close()
|
||||||
end()
|
end()
|
||||||
chmod_file('favicon.ico', cwd=link_dir)
|
chmod_file('favicon.ico', cwd=link_dir)
|
||||||
|
@ -456,8 +467,9 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
fout.close()
|
fout.close()
|
||||||
end()
|
end()
|
||||||
print(' Run to see full output:', ' '.join(CMD))
|
|
||||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||||
|
print(' Run to see full output:')
|
||||||
|
print(' {}'.format(' '.join(CMD)))
|
||||||
output = e
|
output = e
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -510,8 +522,10 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
|
||||||
raise Exception('Failed to download media')
|
raise Exception('Failed to download media')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
end()
|
end()
|
||||||
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
|
|
||||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||||
|
print(' Run to see full output:')
|
||||||
|
print(' cd {};'.format(link_dir))
|
||||||
|
print(' {}'.format(' '.join(CMD)))
|
||||||
output = e
|
output = e
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -545,8 +559,10 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
|
||||||
raise Exception('Failed git download')
|
raise Exception('Failed git download')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
end()
|
end()
|
||||||
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
|
|
||||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||||
|
print(' Run to see full output:')
|
||||||
|
print(' cd {};'.format(link_dir))
|
||||||
|
print(' {}'.format(' '.join(CMD)))
|
||||||
output = e
|
output = e
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
Loading…
Reference in a new issue