move dependency checking into config file

This commit is contained in:
Nick Sweeting 2019-03-22 22:05:45 -04:00
parent 096832210c
commit 4c499d77b6
4 changed files with 144 additions and 136 deletions

View file

@ -22,7 +22,6 @@ from config import (
GIT_SHA,
)
from util import (
check_dependencies,
save_remote_source,
save_stdin_source,
)
@ -33,7 +32,7 @@ from logs import (
)
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = GIT_SHA
__VERSION__ = GIT_SHA[:9]
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
@ -42,11 +41,13 @@ def print_help():
print('ArchiveBox: The self-hosted internet archive.\n')
print("Documentation:")
print(" https://github.com/pirate/ArchiveBox/wiki\n")
print("Usage:")
print(" echo 'https://examplecom' | ./bin/archivebox\n")
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
print(" ./bin/archivebox https://example.com/feed.rss\n")
print(" ./bin/archivebox 15109948213.123\n")
print("UI Usage:")
print(" Open output/index.html to view your archive.\n")
print("CLI Usage:")
print(" echo 'https://example.com' | ./archive\n")
print(" ./archive ~/Downloads/bookmarks_export.html\n")
print(" ./archive https://example.com/feed.rss\n")
print(" ./archive 15109948213.123\n")
def main(*args):
@ -54,6 +55,10 @@ def main(*args):
print_help()
raise SystemExit(0)
if set(args).intersection(('--version', 'version')):
print('ArchiveBox version {}'.format(__VERSION__))
raise SystemExit(0)
### Handle CLI arguments
# ./archive bookmarks.html
# ./archive 1523422111.234
@ -95,7 +100,6 @@ def main(*args):
def update_archive_data(import_path=None, resume=None):
"""The main ArchiveBox entrancepoint. Everything starts here."""
check_dependencies()
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path

View file

@ -297,7 +297,7 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT):
output = 'output.pdf'
cmd = [
*chrome_args(timeout=timeout),
*chrome_args(TIMEOUT=timeout),
'--print-to-pdf',
link['url'],
]
@ -339,7 +339,7 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
output = 'screenshot.png'
cmd = [
*chrome_args(timeout=timeout),
*chrome_args(TIMEOUT=timeout),
'--screenshot',
link['url'],
]
@ -382,7 +382,7 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT):
output = 'output.html'
output_path = os.path.join(link_dir, output)
cmd = [
*chrome_args(timeout=timeout),
*chrome_args(TIMEOUT=timeout),
'--dump-dom',
link['url']
]

View file

@ -1,8 +1,9 @@
import os
import re
import sys
import shutil
from subprocess import run, PIPE
from subprocess import run, PIPE, DEVNULL
# ******************************************************************************
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
@ -68,7 +69,6 @@ SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
PYTHON_PATH = os.path.join(REPO_DIR, 'archivebox')
TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates')
CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
@ -137,44 +137,115 @@ if not USE_COLOR:
# dont show colors if USE_COLOR is False
ANSI = {k: '' for k in ANSI.keys()}
### Confirm Environment Setup
GIT_SHA = 'unknown'
try:
GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
except Exception:
print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
CHROME_VERSION = 'unknown'
try:
chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0]
except Exception:
if USE_CHROME:
print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?')
WGET_VERSION = 'unknown'
try:
wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
except Exception:
if USE_WGET:
print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?')
WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION)
try:
COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None
except Exception:
print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?')
raise
### Check Python environment
# Refuse to run on interpreters older than Python 3.5.
# The check compares version tuples instead of a float: float('3.10') is 3.1,
# which would wrongly classify Python 3.10+ as older than 3.5.
# python_vers is kept only as a human-readable "major.minor" string for the message.
python_vers = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
if sys.version_info < (3, 5):
    print('{}[X] Python version is not new enough: {} (>=3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
    print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
    raise SystemExit(1)
if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
print('')
print(' Confirm that it\'s fixed by opening a new shell and running:')
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
print('')
print(' Alternatively, run this script with:')
print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
print('')
print(' Confirm that it\'s fixed by opening a new shell and running:')
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
print('')
print(' Alternatively, run this script with:')
print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
### Get code version by parsing git log
# Resolve the SHA of the most recent commit touching the repo directory.
# GIT_SHA stays 'unknown' when git cannot be executed OR when it runs but
# prints nothing to stdout, so later GIT_SHA[:9] slices and user-agent
# formatting never receive an empty string.
GIT_SHA = 'unknown'
try:
    _git_rev_list = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR)
    GIT_SHA = _git_rev_list.stdout.strip().decode() or 'unknown'
except Exception:
    print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
### Get absolute path for cookies file
try:
COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None
except Exception:
print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?')
raise
### Make sure curl is installed
if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
### Make sure wget is installed and calculate version
if FETCH_WGET or FETCH_WARC:
if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
WGET_VERSION = 'unknown'
try:
wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
except Exception:
if USE_WGET:
print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?')
WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION)
### Make sure chrome is installed and calculate version
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
# parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
try:
result = run([CHROME_BINARY, '--version'], stdout=PIPE)
version_str = result.stdout.decode('utf-8')
version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
version = [l for l in version_lines if l.isdigit()][-1]
if int(version) < 59:
print(version_lines)
print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
except (IndexError, TypeError, OSError):
print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
CHROME_VERSION = 'unknown'
try:
chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0]
except Exception:
if USE_CHROME:
print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?')
### Make sure git is installed
if FETCH_GIT:
if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
### Make sure youtube-dl is installed
if FETCH_MEDIA:
if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
except KeyboardInterrupt:
raise SystemExit(1)

View file

@ -25,11 +25,6 @@ from config import (
OUTPUT_PERMISSIONS,
TIMEOUT,
SHOW_PROGRESS,
CURL_BINARY,
WGET_BINARY,
CHROME_BINARY,
GIT_BINARY,
YOUTUBEDL_BINARY,
FETCH_TITLE,
FETCH_FAVICON,
FETCH_WGET,
@ -124,70 +119,6 @@ def check_links_structure(links):
if links:
check_link_structure(links[0])
def check_dependencies():
    """Check that all necessary dependencies are installed, and have valid versions.

    Checks, in order: the running Python version, then curl, wget, Chrome,
    git and youtube-dl, each only when a FETCH_*/SUBMIT_* flag that needs it
    is enabled. Every failed check prints an explanation and install hint.

    Raises:
        SystemExit(1): when a required dependency is missing or too old, when
            the user interrupts the check, or when the check itself fails
            unexpectedly.
    """
    try:
        # Compare version tuples, not floats: float('3.10') == 3.1 would make
        # Python 3.10+ look older than 3.5.
        python_vers = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
        if sys.version_info < (3, 5):
            print('{}[X] Python version is not new enough: {} (>=3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
            print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
            raise SystemExit(1)

        # curl is used for favicons and archive.org submission
        if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
            if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
                print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_WGET or FETCH_WARC:
            if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
                print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
            # Only presence is checked with `which` here; the version is parsed below.
            if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
                print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

            # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
            try:
                result = run([CHROME_BINARY, '--version'], stdout=PIPE)
                version_str = result.stdout.decode('utf-8')
                # Reduce the matching version line to its major version number
                version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
                # An IndexError here (no purely numeric line) is reported below as a parse failure
                version = [l for l in version_lines if l.isdigit()][-1]
                if int(version) < 59:
                    print(version_lines)
                    print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
                    print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                    raise SystemExit(1)
            except (IndexError, TypeError, OSError):
                print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
                print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_GIT:
            if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
                print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_MEDIA:
            if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
                print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)
    except KeyboardInterrupt:
        # User aborted the check: exit quietly with a failure status
        raise SystemExit(1)
    except Exception as err:
        # Still exit with status 1, but say why instead of exiting silently
        # (e.g. a dependency binary that exists but cannot be executed).
        print('{}[X] Dependency check failed: {}: {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']))
        raise SystemExit(1)
def check_url_parsing_invariants():
"""Check that plain text regex URL parsing works as expected"""
@ -284,7 +215,7 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
match = re.search(HTML_TITLE_REGEX, html)
return match.group(1).strip() if match else None
except Exception as err:
except Exception as err: # noqa
# print('[!] Failed to fetch title because of {}: {}'.format(
# err.__class__.__name__,
# err,
@ -560,10 +491,13 @@ def progress_bar(seconds, prefix):
pass
class TimedProgress:
"""Show a progress bar and measure elapsed time until .end() is called"""
def __init__(self, seconds, prefix=''):
if SHOW_PROGRESS:
self.p = Process(target=progress_bar, args=(seconds, prefix))
self.p.start()
self.stats = {
'start_ts': datetime.now(),
'end_ts': None,
@ -571,7 +505,7 @@ class TimedProgress:
}
def end(self):
"""immediately finish progress and clear the progressbar line"""
"""immediately end progress, clear the progressbar line, and save end_ts"""
end_ts = datetime.now()
self.stats.update({
@ -591,6 +525,8 @@ class TimedProgress:
sys.stdout.flush()
def download_url(url, timeout=TIMEOUT):
"""Download the contents of a remote url and return the text"""
req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
if CHECK_SSL_VALIDITY:
@ -615,34 +551,31 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
resolution=RESOLUTION, timeout=TIMEOUT):
def chrome_args(**options):
"""helper to build up a chrome shell command with arguments"""
cmd_args = [binary]
cmd_args = [options['CHROME_BINARY']]
if headless:
if options['HEADLESS']:
cmd_args += ('--headless',)
if not sandbox:
if not options['CHROME_SANDBOX']:
# dont use GPU or sandbox when running inside docker container
cmd_args += ('--no-sandbox', '--disable-gpu')
if not check_ssl_validity:
if not options['CHECK_SSL_VALIDITY']:
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
if user_agent:
cmd_args += ('--user-agent={}'.format(user_agent),)
if options['CHROME_USER_AGENT']:
cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)
if resolution:
cmd_args += ('--window-size={}'.format(RESOLUTION),)
if options['RESOLUTION']:
cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
if timeout:
cmd_args += ('--timeout={}'.format((timeout) * 1000),)
if options['TIMEOUT']:
cmd_args += ('--timeout={}'.format((options['TIMEOUT']) * 1000),)
if user_data_dir:
cmd_args.append('--user-data-dir={}'.format(user_data_dir))
if options['CHROME_USER_DATA_DIR']:
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
return cmd_args