Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2025-02-16 21:38:33 +00:00)
move dependency checking into config file
This commit is contained in:
parent: 096832210c
commit: 4c499d77b6
4 changed files with 144 additions and 136 deletions
|
@@ -22,7 +22,6 @@ from config import (
|
||||||
GIT_SHA,
|
GIT_SHA,
|
||||||
)
|
)
|
||||||
from util import (
|
from util import (
|
||||||
check_dependencies,
|
|
||||||
save_remote_source,
|
save_remote_source,
|
||||||
save_stdin_source,
|
save_stdin_source,
|
||||||
)
|
)
|
||||||
|
@@ -33,7 +32,7 @@ from logs import (
|
||||||
)
|
)
|
||||||
|
|
||||||
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
|
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
|
||||||
__VERSION__ = GIT_SHA
|
__VERSION__ = GIT_SHA[:9]
|
||||||
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
|
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
|
||||||
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
|
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
|
||||||
|
|
||||||
|
@@ -42,11 +41,13 @@ def print_help():
|
||||||
print('ArchiveBox: The self-hosted internet archive.\n')
|
print('ArchiveBox: The self-hosted internet archive.\n')
|
||||||
print("Documentation:")
|
print("Documentation:")
|
||||||
print(" https://github.com/pirate/ArchiveBox/wiki\n")
|
print(" https://github.com/pirate/ArchiveBox/wiki\n")
|
||||||
print("Usage:")
|
print("UI Usage:")
|
||||||
print(" echo 'https://examplecom' | ./bin/archivebox\n")
|
print(" Open output/index.html to view your archive.\n")
|
||||||
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
|
print("CLI Usage:")
|
||||||
print(" ./bin/archivebox https://example.com/feed.rss\n")
|
print(" echo 'https://example.com' | ./archive\n")
|
||||||
print(" ./bin/archivebox 15109948213.123\n")
|
print(" ./archive ~/Downloads/bookmarks_export.html\n")
|
||||||
|
print(" ./archive https://example.com/feed.rss\n")
|
||||||
|
print(" ./archive 15109948213.123\n")
|
||||||
|
|
||||||
|
|
||||||
def main(*args):
|
def main(*args):
|
||||||
|
@@ -54,6 +55,10 @@ def main(*args):
|
||||||
print_help()
|
print_help()
|
||||||
raise SystemExit(0)
|
raise SystemExit(0)
|
||||||
|
|
||||||
|
if set(args).intersection(('--version', 'version')):
|
||||||
|
print('ArchiveBox version {}'.format(__VERSION__))
|
||||||
|
raise SystemExit(0)
|
||||||
|
|
||||||
### Handle CLI arguments
|
### Handle CLI arguments
|
||||||
# ./archive bookmarks.html
|
# ./archive bookmarks.html
|
||||||
# ./archive 1523422111.234
|
# ./archive 1523422111.234
|
||||||
|
@@ -95,7 +100,6 @@ def main(*args):
|
||||||
|
|
||||||
def update_archive_data(import_path=None, resume=None):
|
def update_archive_data(import_path=None, resume=None):
|
||||||
"""The main ArchiveBox entrancepoint. Everything starts here."""
|
"""The main ArchiveBox entrancepoint. Everything starts here."""
|
||||||
check_dependencies()
|
|
||||||
|
|
||||||
# Step 1: Load list of links from the existing index
|
# Step 1: Load list of links from the existing index
|
||||||
# merge in and dedupe new links from import_path
|
# merge in and dedupe new links from import_path
|
||||||
|
|
|
@@ -297,7 +297,7 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT):
|
||||||
|
|
||||||
output = 'output.pdf'
|
output = 'output.pdf'
|
||||||
cmd = [
|
cmd = [
|
||||||
*chrome_args(timeout=timeout),
|
*chrome_args(TIMEOUT=timeout),
|
||||||
'--print-to-pdf',
|
'--print-to-pdf',
|
||||||
link['url'],
|
link['url'],
|
||||||
]
|
]
|
||||||
|
@@ -339,7 +339,7 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
|
||||||
|
|
||||||
output = 'screenshot.png'
|
output = 'screenshot.png'
|
||||||
cmd = [
|
cmd = [
|
||||||
*chrome_args(timeout=timeout),
|
*chrome_args(TIMEOUT=timeout),
|
||||||
'--screenshot',
|
'--screenshot',
|
||||||
link['url'],
|
link['url'],
|
||||||
]
|
]
|
||||||
|
@@ -382,7 +382,7 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT):
|
||||||
output = 'output.html'
|
output = 'output.html'
|
||||||
output_path = os.path.join(link_dir, output)
|
output_path = os.path.join(link_dir, output)
|
||||||
cmd = [
|
cmd = [
|
||||||
*chrome_args(timeout=timeout),
|
*chrome_args(TIMEOUT=timeout),
|
||||||
'--dump-dom',
|
'--dump-dom',
|
||||||
link['url']
|
link['url']
|
||||||
]
|
]
|
||||||
|
|
|
@@ -1,8 +1,9 @@
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
from subprocess import run, PIPE
|
from subprocess import run, PIPE, DEVNULL
|
||||||
|
|
||||||
# ******************************************************************************
|
# ******************************************************************************
|
||||||
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
|
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
|
||||||
|
@@ -68,7 +69,6 @@ SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
|
||||||
PYTHON_PATH = os.path.join(REPO_DIR, 'archivebox')
|
PYTHON_PATH = os.path.join(REPO_DIR, 'archivebox')
|
||||||
TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates')
|
TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates')
|
||||||
|
|
||||||
|
|
||||||
CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
|
CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
|
||||||
USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
|
USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
|
||||||
USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
|
USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
|
||||||
|
@@ -137,44 +137,115 @@ if not USE_COLOR:
|
||||||
# dont show colors if USE_COLOR is False
|
# dont show colors if USE_COLOR is False
|
||||||
ANSI = {k: '' for k in ANSI.keys()}
|
ANSI = {k: '' for k in ANSI.keys()}
|
||||||
|
|
||||||
|
|
||||||
### Confirm Environment Setup
|
### Confirm Environment Setup
|
||||||
GIT_SHA = 'unknown'
|
|
||||||
try:
|
|
||||||
GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
|
||||||
except Exception:
|
|
||||||
print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
|
|
||||||
|
|
||||||
CHROME_VERSION = 'unknown'
|
|
||||||
try:
|
|
||||||
chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
|
||||||
CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0]
|
|
||||||
except Exception:
|
|
||||||
if USE_CHROME:
|
|
||||||
print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?')
|
|
||||||
|
|
||||||
WGET_VERSION = 'unknown'
|
|
||||||
try:
|
|
||||||
wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
|
||||||
WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
|
|
||||||
except Exception:
|
|
||||||
if USE_WGET:
|
|
||||||
print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?')
|
|
||||||
|
|
||||||
WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None
|
### Check Python environment
|
||||||
except Exception:
|
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
|
||||||
print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?')
|
if python_vers < 3.5:
|
||||||
raise
|
print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
|
||||||
|
print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
|
if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
|
||||||
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
|
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
|
||||||
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
|
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
|
||||||
print('')
|
print('')
|
||||||
print(' Confirm that it\'s fixed by opening a new shell and running:')
|
print(' Confirm that it\'s fixed by opening a new shell and running:')
|
||||||
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
|
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
|
||||||
print('')
|
print('')
|
||||||
print(' Alternatively, run this script with:')
|
print(' Alternatively, run this script with:')
|
||||||
print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
|
print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
|
||||||
|
|
||||||
|
### Get code version by parsing git log
|
||||||
|
GIT_SHA = 'unknown'
|
||||||
|
try:
|
||||||
|
GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
||||||
|
except Exception:
|
||||||
|
print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
|
||||||
|
|
||||||
|
### Get absolute path for cookies file
|
||||||
|
try:
|
||||||
|
COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None
|
||||||
|
except Exception:
|
||||||
|
print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?')
|
||||||
|
raise
|
||||||
|
|
||||||
|
### Make sure curl is installed
|
||||||
|
if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
|
||||||
|
if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
|
||||||
|
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
|
||||||
|
print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
|
||||||
|
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
### Make sure wget is installed and calculate version
|
||||||
|
if FETCH_WGET or FETCH_WARC:
|
||||||
|
if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
|
||||||
|
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
|
||||||
|
print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
|
||||||
|
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
WGET_VERSION = 'unknown'
|
||||||
|
try:
|
||||||
|
wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
||||||
|
WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
|
||||||
|
except Exception:
|
||||||
|
if USE_WGET:
|
||||||
|
print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?')
|
||||||
|
|
||||||
|
WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION)
|
||||||
|
|
||||||
|
### Make sure chrome is installed and calculate version
|
||||||
|
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
|
||||||
|
if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
|
||||||
|
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
|
||||||
|
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
|
||||||
|
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
# parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
|
||||||
|
try:
|
||||||
|
result = run([CHROME_BINARY, '--version'], stdout=PIPE)
|
||||||
|
version_str = result.stdout.decode('utf-8')
|
||||||
|
version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
|
||||||
|
version = [l for l in version_lines if l.isdigit()][-1]
|
||||||
|
if int(version) < 59:
|
||||||
|
print(version_lines)
|
||||||
|
print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
|
||||||
|
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
||||||
|
raise SystemExit(1)
|
||||||
|
except (IndexError, TypeError, OSError):
|
||||||
|
print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
|
||||||
|
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
|
||||||
|
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
CHROME_VERSION = 'unknown'
|
||||||
|
try:
|
||||||
|
chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
||||||
|
CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0]
|
||||||
|
except Exception:
|
||||||
|
if USE_CHROME:
|
||||||
|
print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?')
|
||||||
|
|
||||||
|
### Make sure git is installed
|
||||||
|
if FETCH_GIT:
|
||||||
|
if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
|
||||||
|
print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
|
||||||
|
print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
|
||||||
|
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
### Make sure youtube-dl is installed
|
||||||
|
if FETCH_MEDIA:
|
||||||
|
if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
|
||||||
|
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
|
||||||
|
print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
|
||||||
|
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
|
@@ -25,11 +25,6 @@ from config import (
|
||||||
OUTPUT_PERMISSIONS,
|
OUTPUT_PERMISSIONS,
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
SHOW_PROGRESS,
|
SHOW_PROGRESS,
|
||||||
CURL_BINARY,
|
|
||||||
WGET_BINARY,
|
|
||||||
CHROME_BINARY,
|
|
||||||
GIT_BINARY,
|
|
||||||
YOUTUBEDL_BINARY,
|
|
||||||
FETCH_TITLE,
|
FETCH_TITLE,
|
||||||
FETCH_FAVICON,
|
FETCH_FAVICON,
|
||||||
FETCH_WGET,
|
FETCH_WGET,
|
||||||
|
@@ -124,70 +119,6 @@ def check_links_structure(links):
|
||||||
if links:
|
if links:
|
||||||
check_link_structure(links[0])
|
check_link_structure(links[0])
|
||||||
|
|
||||||
def check_dependencies():
|
|
||||||
"""Check that all necessary dependencies are installed, and have valid versions"""
|
|
||||||
|
|
||||||
try:
|
|
||||||
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
|
|
||||||
if python_vers < 3.5:
|
|
||||||
print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
|
|
||||||
print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
|
|
||||||
if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
|
|
||||||
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
|
|
||||||
print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
|
|
||||||
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
if FETCH_WGET or FETCH_WARC:
|
|
||||||
if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
|
|
||||||
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
|
|
||||||
print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
|
|
||||||
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
|
|
||||||
if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
|
|
||||||
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
|
|
||||||
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
|
|
||||||
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
# parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
|
|
||||||
try:
|
|
||||||
result = run([CHROME_BINARY, '--version'], stdout=PIPE)
|
|
||||||
version_str = result.stdout.decode('utf-8')
|
|
||||||
version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
|
|
||||||
version = [l for l in version_lines if l.isdigit()][-1]
|
|
||||||
if int(version) < 59:
|
|
||||||
print(version_lines)
|
|
||||||
print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
|
|
||||||
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
|
||||||
raise SystemExit(1)
|
|
||||||
except (IndexError, TypeError, OSError):
|
|
||||||
print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
|
|
||||||
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
|
|
||||||
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
if FETCH_GIT:
|
|
||||||
if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
|
|
||||||
print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
|
|
||||||
print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
|
|
||||||
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
if FETCH_MEDIA:
|
|
||||||
if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
|
|
||||||
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
|
|
||||||
print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
|
|
||||||
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
|
|
||||||
raise SystemExit(1)
|
|
||||||
except (KeyboardInterrupt, Exception):
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
def check_url_parsing_invariants():
|
def check_url_parsing_invariants():
|
||||||
"""Check that plain text regex URL parsing works as expected"""
|
"""Check that plain text regex URL parsing works as expected"""
|
||||||
|
|
||||||
|
@@ -284,7 +215,7 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
|
||||||
|
|
||||||
match = re.search(HTML_TITLE_REGEX, html)
|
match = re.search(HTML_TITLE_REGEX, html)
|
||||||
return match.group(1).strip() if match else None
|
return match.group(1).strip() if match else None
|
||||||
except Exception as err:
|
except Exception as err: # noqa
|
||||||
# print('[!] Failed to fetch title because of {}: {}'.format(
|
# print('[!] Failed to fetch title because of {}: {}'.format(
|
||||||
# err.__class__.__name__,
|
# err.__class__.__name__,
|
||||||
# err,
|
# err,
|
||||||
|
@@ -560,10 +491,13 @@ def progress_bar(seconds, prefix):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class TimedProgress:
|
class TimedProgress:
|
||||||
|
"""Show a progress bar and measure elapsed time until .end() is called"""
|
||||||
|
|
||||||
def __init__(self, seconds, prefix=''):
|
def __init__(self, seconds, prefix=''):
|
||||||
if SHOW_PROGRESS:
|
if SHOW_PROGRESS:
|
||||||
self.p = Process(target=progress_bar, args=(seconds, prefix))
|
self.p = Process(target=progress_bar, args=(seconds, prefix))
|
||||||
self.p.start()
|
self.p.start()
|
||||||
|
|
||||||
self.stats = {
|
self.stats = {
|
||||||
'start_ts': datetime.now(),
|
'start_ts': datetime.now(),
|
||||||
'end_ts': None,
|
'end_ts': None,
|
||||||
|
@@ -571,7 +505,7 @@ class TimedProgress:
|
||||||
}
|
}
|
||||||
|
|
||||||
def end(self):
|
def end(self):
|
||||||
"""immediately finish progress and clear the progressbar line"""
|
"""immediately end progress, clear the progressbar line, and save end_ts"""
|
||||||
|
|
||||||
end_ts = datetime.now()
|
end_ts = datetime.now()
|
||||||
self.stats.update({
|
self.stats.update({
|
||||||
|
@@ -591,6 +525,8 @@ class TimedProgress:
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
|
|
||||||
def download_url(url, timeout=TIMEOUT):
|
def download_url(url, timeout=TIMEOUT):
|
||||||
|
"""Download the contents of a remote url and return the text"""
|
||||||
|
|
||||||
req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
|
req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
|
||||||
|
|
||||||
if CHECK_SSL_VALIDITY:
|
if CHECK_SSL_VALIDITY:
|
||||||
|
@@ -615,34 +551,31 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
|
||||||
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
|
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
|
||||||
|
|
||||||
|
|
||||||
def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
|
def chrome_args(**options):
|
||||||
headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
|
|
||||||
check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
|
|
||||||
resolution=RESOLUTION, timeout=TIMEOUT):
|
|
||||||
"""helper to build up a chrome shell command with arguments"""
|
"""helper to build up a chrome shell command with arguments"""
|
||||||
|
|
||||||
cmd_args = [binary]
|
cmd_args = [options['CHROME_BINARY']]
|
||||||
|
|
||||||
if headless:
|
if options['HEADLESS']:
|
||||||
cmd_args += ('--headless',)
|
cmd_args += ('--headless',)
|
||||||
|
|
||||||
if not sandbox:
|
if not options['CHROME_SANDBOX']:
|
||||||
# dont use GPU or sandbox when running inside docker container
|
# dont use GPU or sandbox when running inside docker container
|
||||||
cmd_args += ('--no-sandbox', '--disable-gpu')
|
cmd_args += ('--no-sandbox', '--disable-gpu')
|
||||||
|
|
||||||
if not check_ssl_validity:
|
if not options['CHECK_SSL_VALIDITY']:
|
||||||
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
|
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
|
||||||
|
|
||||||
if user_agent:
|
if options['CHROME_USER_AGENT']:
|
||||||
cmd_args += ('--user-agent={}'.format(user_agent),)
|
cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)
|
||||||
|
|
||||||
if resolution:
|
if options['RESOLUTION']:
|
||||||
cmd_args += ('--window-size={}'.format(RESOLUTION),)
|
cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
|
||||||
|
|
||||||
if timeout:
|
if options['TIMEOUT']:
|
||||||
cmd_args += ('--timeout={}'.format((timeout) * 1000),)
|
cmd_args += ('--timeout={}'.format((options['TIMEOUT']) * 1000),)
|
||||||
|
|
||||||
if user_data_dir:
|
if options['CHROME_USER_DATA_DIR']:
|
||||||
cmd_args.append('--user-data-dir={}'.format(user_data_dir))
|
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
|
||||||
|
|
||||||
return cmd_args
|
return cmd_args
|
||||||
|
|
Loading…
Add table
Reference in a new issue