Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2025-02-23 00:38:27 +00:00)
move pdf, screenshot, dom, singlefile, and ytdlp extractor config to new plugin system
This commit is contained in:
parent
a2a586e369
commit
a5ffd4e9d3
11 changed files with 333 additions and 353 deletions
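At a high level, this commit replaces module-level config constants (SAVE_PDF, SAVE_DOM, CHROME_BINARY, YOUTUBEDL_ARGS, and so on from config.py) with per-plugin config objects that each extractor imports lazily at call time. A minimal sketch of the new access pattern, using plain pydantic and hypothetical names rather than ArchiveBox's internal plugantic.BaseConfigSet:

# illustrative sketch only, not part of the diff; field names are assumptions
# modeled on the ChromeConfig class added further below
from pydantic import BaseModel, Field

class ExampleChromeConfig(BaseModel):
    CHROME_BINARY: str = Field(default='chrome')
    CHROME_TIMEOUT: int = Field(default=60)
    SAVE_PDF: bool = Field(default=True)

EXAMPLE_CHROME_CONFIG = ExampleChromeConfig()

def should_save_pdf(url_is_static: bool) -> bool:
    # extractors now read their toggles from the plugin config object
    # instead of importing SAVE_PDF from ..config at module import time
    if url_is_static:
        return False
    return EXAMPLE_CHROME_CONFIG.SAVE_PDF

print(should_save_pdf(False))   # True with the defaults above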
@ -30,6 +30,7 @@ import inspect
|
|||
import getpass
|
||||
import shutil
|
||||
import requests
|
||||
import archivebox
|
||||
|
||||
from hashlib import md5
|
||||
from pathlib import Path
|
||||
|
@ -62,7 +63,6 @@ from .misc.logging import (
|
|||
stderr,
|
||||
hint,
|
||||
)
|
||||
from .misc.checks import check_system_config
|
||||
|
||||
# print('STARTING CONFIG LOADING')
|
||||
|
||||
|
@ -167,15 +167,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
|
||||
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
|
||||
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
|
||||
'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},
|
||||
|
||||
'COOKIES_FILE': {'type': str, 'default': None},
|
||||
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
|
||||
|
||||
'CHROME_TIMEOUT': {'type': int, 'default': 0},
|
||||
'CHROME_HEADLESS': {'type': bool, 'default': True},
|
||||
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
|
||||
'CHROME_EXTRA_ARGS': {'type': list, 'default': None},
|
||||
|
||||
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
|
||||
'--restrict-filenames',
|
||||
|
@ -267,7 +260,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
|
||||
'NODE_BINARY': {'type': str, 'default': 'node'},
|
||||
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
|
||||
'CHROME_BINARY': {'type': str, 'default': None},
|
||||
|
||||
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
|
||||
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
|
||||
|
@ -551,7 +543,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
|||
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
|
||||
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
|
||||
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
|
||||
'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
|
||||
|
||||
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
|
||||
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
|
||||
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
|
||||
|
@ -595,7 +587,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
|||
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
|
||||
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
|
||||
|
||||
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
|
||||
# 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
|
||||
|
||||
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
|
||||
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
|
||||
|
@ -620,15 +612,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
|||
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
|
||||
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
|
||||
|
||||
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
|
||||
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
|
||||
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
|
||||
'CHROME_USER_AGENT': {'default': lambda c: c['CHROME_USER_AGENT'].format(**c)},
|
||||
|
||||
'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
|
||||
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
|
||||
'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
|
||||
'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
|
||||
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
|
||||
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
|
||||
|
||||
|
@ -638,8 +621,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
|||
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
|
||||
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
|
||||
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
|
||||
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
|
||||
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
|
||||
|
||||
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
|
||||
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
|
||||
}
|
||||
|
@ -1183,21 +1165,20 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
|||
'enabled': config['USE_YOUTUBEDL'],
|
||||
'is_valid': bool(config['YOUTUBEDL_VERSION']),
|
||||
},
|
||||
'CHROME_BINARY': {
|
||||
'path': bin_path(config['CHROME_BINARY']),
|
||||
'version': config['CHROME_VERSION'],
|
||||
'hash': bin_hash(config['CHROME_BINARY']),
|
||||
'enabled': config['USE_CHROME'],
|
||||
'is_valid': bool(config['CHROME_VERSION']),
|
||||
},
|
||||
'RIPGREP_BINARY': {
|
||||
'path': bin_path(config['RIPGREP_BINARY']),
|
||||
'version': config['RIPGREP_VERSION'],
|
||||
'hash': bin_hash(config['RIPGREP_BINARY']),
|
||||
'enabled': config['USE_RIPGREP'],
|
||||
'is_valid': bool(config['RIPGREP_VERSION']),
|
||||
},
|
||||
# TODO: add an entry for the sonic search backend?
|
||||
# 'CHROME_BINARY': {
|
||||
# 'path': bin_path(config['CHROME_BINARY']),
|
||||
# 'version': config['CHROME_VERSION'],
|
||||
# 'hash': bin_hash(config['CHROME_BINARY']),
|
||||
# 'enabled': config['USE_CHROME'],
|
||||
# 'is_valid': bool(config['CHROME_VERSION']),
|
||||
# },
|
||||
# 'RIPGREP_BINARY': {
|
||||
# 'path': bin_path(config['RIPGREP_BINARY']),
|
||||
# 'version': config['RIPGREP_VERSION'],
|
||||
# 'hash': bin_hash(config['RIPGREP_BINARY']),
|
||||
# 'enabled': config['USE_RIPGREP'],
|
||||
# 'is_valid': bool(config['RIPGREP_VERSION']),
|
||||
# },
|
||||
# 'SONIC_BINARY': {
|
||||
# 'path': bin_path(config['SONIC_BINARY']),
|
||||
# 'version': config['SONIC_VERSION'],
|
||||
|
@ -1207,20 +1188,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
|||
# },
|
||||
}
|
||||
|
||||
def get_chrome_info(config: ConfigDict) -> ConfigValue:
|
||||
return {
|
||||
'TIMEOUT': config['TIMEOUT'],
|
||||
'RESOLUTION': config['RESOLUTION'],
|
||||
'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
|
||||
'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
|
||||
'CHROME_TIMEOUT': config['CHROME_TIMEOUT'],
|
||||
'CHROME_HEADLESS': config['CHROME_HEADLESS'],
|
||||
'CHROME_SANDBOX': config['CHROME_SANDBOX'],
|
||||
'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
|
||||
'CHROME_USER_DATA_DIR': config['CHROME_USER_DATA_DIR'],
|
||||
}
|
||||
|
||||
|
||||
# ******************************************************************************
|
||||
# ******************************************************************************
|
||||
# ******************************** Load Config *********************************
|
||||
|
@ -1264,27 +1231,6 @@ os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # n
|
|||
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
|
||||
sys.path.append(CONFIG.NODE_BIN_PATH)
|
||||
|
||||
# OPTIONAL: also look around the host system for node modules to use
|
||||
# avoid enabling this unless absolutely needed,
|
||||
# having overlapping potential sources of libs is a big source of bugs/confusing to users
|
||||
# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
|
||||
# sys.path.append(DEV_NODE_BIN_PATH)
|
||||
# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
|
||||
# sys.path.append(USER_NODE_BIN_PATH)
|
||||
|
||||
# disable stderr "you really shouldnt disable ssl" warnings with library config
|
||||
if not CONFIG['CHECK_SSL_VALIDITY']:
|
||||
import urllib3
|
||||
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
# get SQLite database version, compile options, and runtime options
|
||||
# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
|
||||
#cursor = sqlite3.connect(':memory:').cursor()
|
||||
#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
|
||||
#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
|
||||
#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
|
||||
#cursor.close()
|
||||
|
||||
########################### Config Validity Checkers ###########################
|
||||
|
||||
|
@ -1308,13 +1254,19 @@ def bump_startup_progress_bar():
|
|||
if INITIAL_STARTUP_PROGRESS:
|
||||
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore
|
||||
|
||||
|
||||
def setup_django_minimal():
|
||||
sys.path.append(str(archivebox.PACKAGE_DIR))
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
|
||||
django.setup()
|
||||
|
||||
|
||||
def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
|
||||
global INITIAL_STARTUP_PROGRESS
|
||||
global INITIAL_STARTUP_PROGRESS_TASK
|
||||
|
||||
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
|
||||
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
|
||||
check_system_config(config)
|
||||
|
||||
output_dir = out_dir or Path(config['OUTPUT_DIR'])
|
||||
|
||||
|
|
|
@@ -8,13 +8,6 @@ from ..system import run, chmod_file, atomic_write
from ..util import (
    enforce_types,
    is_static_file,
    chrome_args,
    chrome_cleanup,
)
from ..config import (
    TIMEOUT,
    SAVE_DOM,
    CHROME_VERSION,
)
from ..logging_util import TimedProgress

@@ -25,6 +18,8 @@ def get_output_path():

@enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    from plugins_extractor.chrome.apps import CHROME_CONFIG

    if is_static_file(link.url):
        return False

@@ -33,42 +28,48 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    if (out_dir / get_output_path()).stat().st_size > 1:
        return False

    return SAVE_DOM
    return CHROME_CONFIG.SAVE_DOM

@enforce_types
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
    """print HTML of site to file using chrome --dump-html"""

    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY

    CHROME_BIN = CHROME_BINARY.load()
    assert CHROME_BIN.abspath and CHROME_BIN.version

    out_dir = out_dir or Path(link.link_dir)
    output: ArchiveOutput = get_output_path()
    output_path = out_dir / output
    cmd = [
        *chrome_args(),
        str(CHROME_BIN.abspath),
        *CHROME_CONFIG.chrome_args(),
        '--dump-dom',
        link.url
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout)
        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
        atomic_write(output_path, result.stdout)

        if result.returncode:
            hints = result.stderr.decode()
            hints = result.stderr
            raise ArchiveError('Failed to save DOM', hints)

        chmod_file(output, cwd=str(out_dir))
    except Exception as err:
        status = 'failed'
        output = err
        chrome_cleanup()
        CHROME_BINARY.chrome_cleanup_lockfile()
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=CHROME_VERSION,
        cmd_version=str(CHROME_BIN.version),
        output=output,
        status=status,
        **timer.stats,

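For context on the run(..., text=True) change above: the run() helper wraps Python's subprocess, and with text=True stdout/stderr come back as str rather than bytes, which is why the .decode() calls and b'...' literals are dropped throughout these extractors. A minimal standalone illustration (not ArchiveBox code):

# demonstrates the text=True behavior the reworked extractors rely on
import subprocess

proc = subprocess.run(['echo', 'HTTP Error 404'], capture_output=True, text=True)
assert isinstance(proc.stdout, str)       # str, not bytes, so no .decode() is needed
print('HTTP Error 404' in proc.stdout)    # substring checks now compare str to str
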
@ -5,20 +5,7 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
MEDIA_TIMEOUT,
|
||||
SAVE_MEDIA,
|
||||
YOUTUBEDL_ARGS,
|
||||
YOUTUBEDL_EXTRA_ARGS,
|
||||
YOUTUBEDL_BINARY,
|
||||
YOUTUBEDL_VERSION,
|
||||
CHECK_SSL_VALIDITY
|
||||
)
|
||||
from ..util import enforce_types, is_static_file, dedupe
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
@ -38,6 +25,8 @@ def get_embed_path(archiveresult=None):
|
|||
|
||||
@enforce_types
|
||||
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||
from plugins_extractor.ytdlp.apps import YTDLP_CONFIG
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
|
@ -45,45 +34,52 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
|
|||
if not overwrite and (out_dir / get_output_path()).exists():
|
||||
return False
|
||||
|
||||
return SAVE_MEDIA
|
||||
return YTDLP_CONFIG.USE_YTDLP
|
||||
|
||||
@enforce_types
|
||||
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
|
||||
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
|
||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
|
||||
|
||||
|
||||
# from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||
from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
|
||||
|
||||
YTDLP_BIN = YTDLP_BINARY.load()
|
||||
assert YTDLP_BIN.abspath and YTDLP_BIN.version
|
||||
|
||||
timeout = timeout or YTDLP_CONFIG.YTDLP_TIMEOUT
|
||||
out_dir = out_dir or Path(link.link_dir)
|
||||
output: ArchiveOutput = get_output_path()
|
||||
output_path = out_dir / output
|
||||
output_path.mkdir(exist_ok=True)
|
||||
# later options take precedence
|
||||
options = [
|
||||
*YOUTUBEDL_ARGS,
|
||||
*YOUTUBEDL_EXTRA_ARGS,
|
||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
|
||||
*YTDLP_CONFIG.YTDLP_EXTRA_ARGS,
|
||||
*([] if YTDLP_CONFIG.YTDLP_CHECK_SSL_VALIDITY else ['--no-check-certificate']),
|
||||
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
|
||||
]
|
||||
cmd = [
|
||||
YOUTUBEDL_BINARY,
|
||||
str(YTDLP_BIN.abspath),
|
||||
*dedupe(options),
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, cwd=str(output_path), timeout=timeout + 1)
|
||||
result = run(cmd, cwd=str(output_path), timeout=timeout + 1, text=True)
|
||||
chmod_file(output, cwd=str(out_dir))
|
||||
if result.returncode:
|
||||
if (b'ERROR: Unsupported URL' in result.stderr
|
||||
or b'HTTP Error 404' in result.stderr
|
||||
or b'HTTP Error 403' in result.stderr
|
||||
or b'URL could be a direct video link' in result.stderr
|
||||
or b'Unable to extract container ID' in result.stderr):
|
||||
if ('ERROR: Unsupported URL' in result.stderr
|
||||
or 'HTTP Error 404' in result.stderr
|
||||
or 'HTTP Error 403' in result.stderr
|
||||
or 'URL could be a direct video link' in result.stderr
|
||||
or 'Unable to extract container ID' in result.stderr):
|
||||
# These happen too frequently on non-media pages to warrant printing to console
|
||||
pass
|
||||
else:
|
||||
hints = (
|
||||
'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
|
||||
*result.stderr.decode().split('\n'),
|
||||
'Got yt-dlp response code: {}.'.format(result.returncode),
|
||||
*result.stderr.split('\n'),
|
||||
)
|
||||
raise ArchiveError('Failed to save media', hints)
|
||||
except Exception as err:
|
||||
|
@ -117,7 +113,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
|
|||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=str(out_dir),
|
||||
cmd_version=YOUTUBEDL_VERSION,
|
||||
cmd_version=str(YTDLP_BIN.version),
|
||||
output=output,
|
||||
status=status,
|
||||
index_texts=index_texts,
|
||||
|
|
|
@ -8,13 +8,6 @@ from ..system import run, chmod_file
|
|||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
chrome_cleanup,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_PDF,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
@ -25,6 +18,8 @@ def get_output_path():
|
|||
|
||||
@enforce_types
|
||||
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
|
@ -32,34 +27,40 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
|
|||
if not overwrite and (out_dir / get_output_path()).exists():
|
||||
return False
|
||||
|
||||
return SAVE_PDF
|
||||
return CHROME_CONFIG.SAVE_PDF
|
||||
|
||||
|
||||
@enforce_types
|
||||
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||
"""print PDF of site to file using chrome --headless"""
|
||||
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||
|
||||
CHROME_BIN = CHROME_BINARY.load()
|
||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||
|
||||
out_dir = out_dir or Path(link.link_dir)
|
||||
output: ArchiveOutput = get_output_path()
|
||||
cmd = [
|
||||
*chrome_args(),
|
||||
str(CHROME_BIN.abspath),
|
||||
*CHROME_CONFIG.chrome_args(),
|
||||
'--print-to-pdf',
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, cwd=str(out_dir), timeout=timeout)
|
||||
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
|
||||
|
||||
if result.returncode:
|
||||
hints = (result.stderr or result.stdout).decode()
|
||||
hints = (result.stderr or result.stdout)
|
||||
raise ArchiveError('Failed to save PDF', hints)
|
||||
|
||||
chmod_file(get_output_path(), cwd=str(out_dir))
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
chrome_cleanup()
|
||||
CHROME_BINARY.chrome_cleanup_lockfile()
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
|
@ -67,7 +68,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
|||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=str(out_dir),
|
||||
cmd_version=CHROME_VERSION,
|
||||
cmd_version=str(CHROME_BINARY.version),
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
|
|
|
@ -5,17 +5,7 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
chrome_cleanup,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_SCREENSHOT,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
from ..util import enforce_types, is_static_file
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
@ -25,6 +15,8 @@ def get_output_path():
|
|||
|
||||
@enforce_types
|
||||
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
|
@ -32,40 +24,45 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
|
|||
if not overwrite and (out_dir / get_output_path()).exists():
|
||||
return False
|
||||
|
||||
return SAVE_SCREENSHOT
|
||||
return CHROME_CONFIG.SAVE_SCREENSHOT
|
||||
|
||||
@enforce_types
|
||||
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||
"""take screenshot of site using chrome --headless"""
|
||||
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||
CHROME_BIN = CHROME_BINARY.load()
|
||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||
|
||||
out_dir = out_dir or Path(link.link_dir)
|
||||
output: ArchiveOutput = get_output_path()
|
||||
cmd = [
|
||||
*chrome_args(),
|
||||
str(CHROME_BIN.abspath),
|
||||
*CHROME_CONFIG.chrome_args(),
|
||||
'--screenshot',
|
||||
link.url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, cwd=str(out_dir), timeout=timeout)
|
||||
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
|
||||
|
||||
if result.returncode:
|
||||
hints = (result.stderr or result.stdout).decode()
|
||||
hints = (result.stderr or result.stdout)
|
||||
raise ArchiveError('Failed to save screenshot', hints)
|
||||
|
||||
chmod_file(output, cwd=str(out_dir))
|
||||
except Exception as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
chrome_cleanup()
|
||||
CHROME_BINARY.chrome_cleanup_lockfile()
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=str(out_dir),
|
||||
cmd_version=CHROME_VERSION,
|
||||
cmd_version=str(CHROME_BIN.version),
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
|
|
|
@ -7,22 +7,7 @@ import json
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
dedupe,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_SINGLEFILE,
|
||||
DEPENDENCIES,
|
||||
SINGLEFILE_VERSION,
|
||||
SINGLEFILE_ARGS,
|
||||
SINGLEFILE_EXTRA_ARGS,
|
||||
CHROME_BINARY,
|
||||
COOKIES_FILE,
|
||||
)
|
||||
from ..util import enforce_types, is_static_file, dedupe
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
@ -32,6 +17,8 @@ def get_output_path():
|
|||
|
||||
@enforce_types
|
||||
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
||||
|
@ -39,30 +26,35 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite:
|
|||
if not overwrite and (out_dir / get_output_path()).exists():
|
||||
return False
|
||||
|
||||
return SAVE_SINGLEFILE
|
||||
return SINGLEFILE_CONFIG.SAVE_SINGLEFILE
|
||||
|
||||
|
||||
@enforce_types
|
||||
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||
"""download full site using single-file"""
|
||||
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
|
||||
|
||||
CHROME_BIN = CHROME_BINARY.load()
|
||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||
SINGLEFILE_BIN = SINGLEFILE_BINARY.load()
|
||||
assert SINGLEFILE_BIN.abspath and SINGLEFILE_BIN.version
|
||||
|
||||
out_dir = out_dir or Path(link.link_dir)
|
||||
output = get_output_path()
|
||||
|
||||
browser_args = chrome_args(CHROME_TIMEOUT=0)
|
||||
browser_args = CHROME_CONFIG.chrome_args(CHROME_TIMEOUT=0)
|
||||
|
||||
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
|
||||
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
|
||||
# later options take precedence
|
||||
options = [
|
||||
'--browser-executable-path={}'.format(CHROME_BINARY),
|
||||
*(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
|
||||
browser_args,
|
||||
*SINGLEFILE_ARGS,
|
||||
*SINGLEFILE_EXTRA_ARGS,
|
||||
'--browser-executable-path={}'.format(CHROME_BIN.abspath),
|
||||
*(["--browser-cookies-file={}".format(SINGLEFILE_CONFIG.SINGLEFILE_COOKIES_FILE)] if SINGLEFILE_CONFIG.SINGLEFILE_COOKIES_FILE else []),
|
||||
'--browser-args={}'.format(json.dumps(browser_args)),
|
||||
*SINGLEFILE_CONFIG.SINGLEFILE_EXTRA_ARGS,
|
||||
]
|
||||
cmd = [
|
||||
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
|
||||
str(SINGLEFILE_BIN.abspath),
|
||||
*dedupe(options),
|
||||
link.url,
|
||||
output,
|
||||
|
@ -72,13 +64,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
timer = TimedProgress(timeout, prefix=' ')
|
||||
result = None
|
||||
try:
|
||||
result = run(cmd, cwd=str(out_dir), timeout=timeout)
|
||||
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True, capture_output=True)
|
||||
|
||||
# parse out number of files downloaded from last line of stderr:
|
||||
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||
output_tail = [
|
||||
line.strip()
|
||||
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
|
||||
for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
|
||||
if line.strip()
|
||||
]
|
||||
hints = (
|
||||
|
@ -93,9 +85,9 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
except (Exception, OSError) as err:
|
||||
status = 'failed'
|
||||
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
|
||||
cmd[2] = browser_args.replace('"', "\\\"")
|
||||
cmd[2] = cmd[2].replace('"', "\\\"")
|
||||
if result:
|
||||
err.hints = (result.stdout + result.stderr).decode().split('\n')
|
||||
err.hints = (result.stdout + result.stderr).split('\n')
|
||||
output = err
|
||||
finally:
|
||||
timer.end()
|
||||
|
@ -103,7 +95,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
return ArchiveResult(
|
||||
cmd=cmd,
|
||||
pwd=str(out_dir),
|
||||
cmd_version=SINGLEFILE_VERSION,
|
||||
cmd_version=str(SINGLEFILE_BIN.version),
|
||||
output=output,
|
||||
status=status,
|
||||
**timer.stats,
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
__package__ = 'archivebox'
|
||||
|
||||
import os
|
||||
import time
|
||||
import sys
|
||||
import shutil
|
||||
import platform
|
||||
import archivebox
|
||||
|
||||
from typing import Dict, List, Optional, Iterable, IO, Union
|
||||
from pathlib import Path
|
||||
|
@ -69,6 +69,7 @@ from .extractors import archive_links, archive_link, ignore_methods
|
|||
from .misc.logging import stderr, hint
|
||||
from .misc.checks import check_data_folder, check_dependencies
|
||||
from .config import (
|
||||
setup_django_minimal,
|
||||
ConfigDict,
|
||||
ANSI,
|
||||
IS_TTY,
|
||||
|
@ -81,8 +82,6 @@ from .config import (
|
|||
TIMEZONE,
|
||||
ENFORCE_ATOMIC_WRITES,
|
||||
OUTPUT_PERMISSIONS,
|
||||
PYTHON_BINARY,
|
||||
ARCHIVEBOX_BINARY,
|
||||
ONLY_NEW,
|
||||
OUTPUT_DIR,
|
||||
SOURCES_DIR,
|
||||
|
@ -95,31 +94,22 @@ from .config import (
|
|||
HTML_INDEX_FILENAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
ALLOWED_IN_OUTPUT_DIR,
|
||||
SEARCH_BACKEND_ENGINE,
|
||||
LDAP,
|
||||
get_version,
|
||||
write_config_file,
|
||||
VERSION,
|
||||
VERSIONS_AVAILABLE,
|
||||
CAN_UPGRADE,
|
||||
COMMIT_HASH,
|
||||
BUILD_TIME,
|
||||
CODE_LOCATIONS,
|
||||
DATA_LOCATIONS,
|
||||
DEPENDENCIES,
|
||||
CHROME_BINARY,
|
||||
CHROME_VERSION,
|
||||
YOUTUBEDL_BINARY,
|
||||
YOUTUBEDL_VERSION,
|
||||
SINGLEFILE_VERSION,
|
||||
READABILITY_VERSION,
|
||||
MERCURY_VERSION,
|
||||
NODE_VERSION,
|
||||
load_all_config,
|
||||
CONFIG,
|
||||
USER_CONFIG,
|
||||
ADMIN_USERNAME,
|
||||
ADMIN_PASSWORD,
|
||||
get_real_name,
|
||||
setup_django,
|
||||
)
|
||||
|
@ -216,6 +206,11 @@ def version(quiet: bool=False,
|
|||
out_dir: Path=OUTPUT_DIR) -> None:
|
||||
"""Print the ArchiveBox version and dependency information"""
|
||||
|
||||
setup_django_minimal()
|
||||
from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG
|
||||
from plugins_auth.ldap.apps import LDAP_CONFIG
|
||||
from django.conf import settings
|
||||
|
||||
print(VERSION)
|
||||
|
||||
if not quiet:
|
||||
|
@ -227,7 +222,7 @@ def version(quiet: bool=False,
|
|||
|
||||
p = platform.uname()
|
||||
print(
|
||||
'ArchiveBox v{}'.format(get_version(CONFIG)),
|
||||
'ArchiveBox v{}'.format(archivebox.__version__),
|
||||
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
|
||||
f'BUILD_TIME={BUILD_TIME}',
|
||||
)
|
||||
|
@ -241,29 +236,35 @@ def version(quiet: bool=False,
|
|||
)
|
||||
OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
|
||||
print(
|
||||
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||
f'FS_USER={PUID}:{PGID}',
|
||||
f'FS_PERMS={OUTPUT_PERMISSIONS}',
|
||||
f'FS_USER={SHELL_CONFIG.PUID}:{SHELL_CONFIG.PGID}',
|
||||
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
|
||||
)
|
||||
print(
|
||||
f'DEBUG={DEBUG}',
|
||||
f'IS_TTY={IS_TTY}',
|
||||
f'DEBUG={SHELL_CONFIG.DEBUG}',
|
||||
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
|
||||
f'TZ={TIMEZONE}',
|
||||
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
|
||||
f'LDAP={LDAP}',
|
||||
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
|
||||
f'LDAP={LDAP_CONFIG.LDAP_ENABLED}',
|
||||
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
|
||||
)
|
||||
print()
|
||||
|
||||
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
|
||||
print('{white}[i] Old dependency versions:{reset}'.format(**ANSI))
|
||||
for name, dependency in DEPENDENCIES.items():
|
||||
print(printable_dependency_version(name, dependency))
|
||||
|
||||
# add a newline between core dependencies and extractor dependencies for easier reading
|
||||
if name == 'ARCHIVEBOX_BINARY':
|
||||
print()
|
||||
|
||||
|
||||
print()
|
||||
print('{white}[i] New dependency versions:{reset}'.format(**ANSI))
|
||||
for name, binary in settings.BINARIES.items():
|
||||
loaded_bin = binary.load()
|
||||
print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath)
|
||||
|
||||
print()
|
||||
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
|
||||
for name, path in CODE_LOCATIONS.items():
|
||||
|
@ -431,10 +432,11 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
|
|||
print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
from plugins_sys.config.apps import SERVER_CONFIG
|
||||
|
||||
if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
|
||||
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
|
||||
print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
|
||||
User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
|
||||
User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
|
||||
|
||||
if existing_index:
|
||||
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
|
||||
|
@ -693,8 +695,8 @@ def add(urls: Union[str, List[str]],
|
|||
|
||||
# tail_worker_logs(worker['stdout_logfile'])
|
||||
|
||||
if CAN_UPGRADE:
|
||||
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
# if CAN_UPGRADE:
|
||||
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
|
||||
return new_links
|
||||
|
||||
|
@ -967,6 +969,8 @@ def list_folders(links: List[Link],
|
|||
def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
||||
"""Automatically install all ArchiveBox dependencies and extras"""
|
||||
|
||||
|
||||
|
||||
if not (out_dir / ARCHIVE_DIR_NAME).exists():
|
||||
run_subcommand('init', stdin=None, pwd=out_dir)
|
||||
|
||||
|
@ -980,24 +984,26 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
|||
|
||||
stderr('\n[+] Installing enabled ArchiveBox dependencies automatically...', color='green')
|
||||
|
||||
from plugins_pkg.pip.apps import PYTHON_BINARY
|
||||
|
||||
stderr('\n Installing YOUTUBEDL_BINARY automatically using pip...')
|
||||
if YOUTUBEDL_VERSION:
|
||||
print(f'{YOUTUBEDL_VERSION} is already installed', YOUTUBEDL_BINARY)
|
||||
else:
|
||||
try:
|
||||
run_shell([
|
||||
PYTHON_BINARY, '-m', 'pip',
|
||||
PYTHON_BINARY.load().abspath, '-m', 'pip',
|
||||
'install',
|
||||
'--upgrade',
|
||||
'--no-cache-dir',
|
||||
'--no-warn-script-location',
|
||||
'yt-dlp',
|
||||
], capture_output=False, cwd=out_dir)
|
||||
], capture_output=False, cwd=out_dir, text=True)
|
||||
pkg_path = run_shell([
|
||||
PYTHON_BINARY, '-m', 'pip',
|
||||
PYTHON_BINARY.load().abspath, '-m', 'pip',
|
||||
'show',
|
||||
'yt-dlp',
|
||||
], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
|
||||
], capture_output=True, text=True, cwd=out_dir).stdout.split('Location: ')[-1].split('\n', 1)[0]
|
||||
NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt-dlp' / '__main__.py'
|
||||
os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
|
||||
assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
|
||||
|
@ -1006,33 +1012,18 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
|||
stderr(f'[X] Failed to install python packages: {e}', color='red')
|
||||
raise SystemExit(1)
|
||||
|
||||
if platform.machine() == 'armv7l':
|
||||
stderr('\n Skip the automatic installation of CHROME_BINARY because playwright is not available on armv7.')
|
||||
else:
|
||||
stderr('\n Installing CHROME_BINARY automatically using playwright...')
|
||||
if CHROME_VERSION:
|
||||
print(f'{CHROME_VERSION} is already installed', CHROME_BINARY)
|
||||
else:
|
||||
try:
|
||||
run_shell([
|
||||
PYTHON_BINARY, '-m', 'pip',
|
||||
'install',
|
||||
'--upgrade',
|
||||
'--no-cache-dir',
|
||||
'--no-warn-script-location',
|
||||
'playwright',
|
||||
], capture_output=False, cwd=out_dir)
|
||||
run_shell([PYTHON_BINARY, '-m', 'playwright', 'install', 'chromium'], capture_output=False, cwd=out_dir)
|
||||
proc = run_shell([PYTHON_BINARY, '-c', 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)'], capture_output=True, text=True, cwd=out_dir)
|
||||
NEW_CHROME_BINARY = proc.stdout.decode().strip() if isinstance(proc.stdout, bytes) else proc.stdout.strip()
|
||||
assert NEW_CHROME_BINARY and len(NEW_CHROME_BINARY), 'CHROME_BINARY must contain a path'
|
||||
config(f'CHROME_BINARY={NEW_CHROME_BINARY}', set=True, out_dir=out_dir)
|
||||
except BaseException as e: # lgtm [py/catch-base-exception]
|
||||
stderr(f'[X] Failed to install chromium using playwright: {e.__class__.__name__} {e}', color='red')
|
||||
raise SystemExit(1)
|
||||
|
||||
from plugins_extractor.chrome.apps import CHROME_BINARY
|
||||
|
||||
CHROME_BINARY.load_or_install()
|
||||
|
||||
from plugins_pkg.npm.apps import NPM_BINARY
|
||||
from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
|
||||
|
||||
SINGLEFILE_BINARY.load_or_install()
|
||||
|
||||
stderr('\n Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
|
||||
if not NODE_VERSION:
|
||||
if not NPM_BINARY.load().version:
|
||||
stderr('[X] You must first install node & npm using your system package manager', color='red')
|
||||
hint([
|
||||
'https://github.com/nodesource/distributions#table-of-contents',
|
||||
|
@ -1077,7 +1068,9 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
|||
|
||||
stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
|
||||
|
||||
run_shell([PYTHON_BINARY, ARCHIVEBOX_BINARY, '--version'], capture_output=False, cwd=out_dir)
|
||||
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
||||
|
||||
run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)
|
||||
|
||||
@enforce_types
|
||||
def config(config_options_str: Optional[str]=None,
|
||||
|
@ -1192,6 +1185,8 @@ def schedule(add: bool=False,
|
|||
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
|
||||
|
||||
check_data_folder(CONFIG)
|
||||
setup_django_minimal()
|
||||
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
||||
|
||||
Path(LOGS_DIR).mkdir(exist_ok=True)
|
||||
|
||||
|
@ -1212,7 +1207,7 @@ def schedule(add: bool=False,
|
|||
'cd',
|
||||
quoted(out_dir),
|
||||
'&&',
|
||||
quoted(ARCHIVEBOX_BINARY),
|
||||
quoted(ARCHIVEBOX_BINARY.load().abspath),
|
||||
*([
|
||||
'add',
|
||||
*(['--overwrite'] if overwrite else []),
|
||||
|
@ -1300,8 +1295,8 @@ def schedule(add: bool=False,
|
|||
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
|
||||
raise SystemExit(1)
|
||||
|
||||
if CAN_UPGRADE:
|
||||
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
# if CAN_UPGRADE:
|
||||
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
@ -1386,6 +1381,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
|
|||
"""Run an ArchiveBox Django management command"""
|
||||
|
||||
check_data_folder(CONFIG)
|
||||
setup_django_minimal()
|
||||
from django.core.management import execute_from_command_line
|
||||
|
||||
if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
|
||||
|
@ -1393,7 +1389,9 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
|
|||
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
|
||||
stderr('')
|
||||
|
||||
execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
|
||||
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
||||
|
||||
execute_from_command_line([ARCHIVEBOX_BINARY.load().abspath, 'manage', *(args or ['help'])])
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
|
|
@@ -1,5 +1,6 @@
__package__ = 'archivebox.plugins_extractor.chrome'

import sys
import platform
from pathlib import Path
from typing import List, Optional, Dict, ClassVar

@@ -7,7 +8,8 @@ from typing import List, Optional, Dict, ClassVar
from django.conf import settings

# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field
from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import (
    BinProvider,
    BinName,

@@ -25,9 +27,12 @@ from plugantic.base_binary import BaseBinary, env
from plugantic.base_hook import BaseHook

# Depends on Other Plugins:
from plugins_sys.config.apps import ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER

from ...util import dedupe


CHROMIUM_BINARY_NAMES_LINUX = [
    "chromium",

@@ -82,11 +87,113 @@ def create_macos_app_symlink(target: Path, shortcut: Path):
class ChromeConfig(BaseConfigSet):
    section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"

    CHROME_BINARY: str = Field(default='chrome')
    CHROME_ARGS: List[str] | None = Field(default=None)
    CHROME_EXTRA_ARGS: List[str] = Field(default=[])
    CHROME_DEFAULT_ARGS: List[str] = Field(default=lambda: ['--timeout={TIMEOUT-10}'])
    USE_CHROME: bool = Field(default=True)

    # Chrome Binary
    CHROME_BINARY: str = Field(default='chrome')
    CHROME_EXTRA_ARGS: List[str] = Field(default=[])

    # Chrome Options Tuning
    CHROME_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10)
    CHROME_HEADLESS: bool = Field(default=True)
    CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
    CHROME_RESOLUTION: str = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
    CHROME_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)

    # Cookies & Auth
    CHROME_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    CHROME_USER_DATA_DIR: Path | None = Field(default=None)
    CHROME_PROFILE_NAME: str = Field(default='Default')

    # Extractor Toggles
    SAVE_SCREENSHOT: bool = Field(default=True, alias='FETCH_SCREENSHOT')
    SAVE_DOM: bool = Field(default=True, alias='FETCH_DOM')
    SAVE_PDF: bool = Field(default=True, alias='FETCH_PDF')

    @model_validator(mode='after')
    def validate_use_chrome(self):
        if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
            print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
            print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
            print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
            print(file=sys.stderr)
            print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
            print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
            print(file=sys.stderr)

        # if user has specified a user data dir, make sure its valid
        if self.CHROME_USER_DATA_DIR and self.CHROME_USER_DATA_DIR.exists():
            # check to make sure user_data_dir/<profile_name> exists
            if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).exists():
                print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
                print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
                print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
                print(' For more info see:', file=sys.stderr)
                print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
                if '/Default' in str(self.CHROME_USER_DATA_DIR):
                    print(file=sys.stderr)
                    print(' Try removing /Default from the end e.g.:', file=sys.stderr)
                    print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)

                # hard error is too annoying here, instead just set it to nothing
                # raise SystemExit(2)
                self.CHROME_USER_DATA_DIR = None
        else:
            self.CHROME_USER_DATA_DIR = None

        return self

    def chrome_args(self, **options) -> List[str]:
        """helper to build up a chrome shell command with arguments"""

        # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/

        options = self.model_copy(update=options)

        cmd_args = [*options.CHROME_EXTRA_ARGS]

        if options.CHROME_HEADLESS:
            cmd_args += ["--headless=new"]    # expects chrome version >= 111

        if not options.CHROME_SANDBOX:
            # assume this means we are running inside a docker container
            # in docker, GPU support is limited, sandboxing is unecessary,
            # and SHM is limited to 64MB by default (which is too low to be usable).
            cmd_args += (
                "--no-sandbox",
                "--no-zygote",
                "--disable-dev-shm-usage",
                "--disable-software-rasterizer",
                "--run-all-compositor-stages-before-draw",
                "--hide-scrollbars",
                "--autoplay-policy=no-user-gesture-required",
                "--no-first-run",
                "--use-fake-ui-for-media-stream",
                "--use-fake-device-for-media-stream",
                "--disable-sync",
                # "--password-store=basic",
            )

        # disable automatic updating when running headless, as there's no user to see the upgrade prompts
        cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)

        # set window size for screenshot/pdf/etc. rendering
        cmd_args += ('--window-size={}'.format(options.CHROME_RESOLUTION),)

        if not options.CHROME_CHECK_SSL_VALIDITY:
            cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

        if options.CHROME_USER_AGENT:
            cmd_args += ('--user-agent={}'.format(options.CHROME_USER_AGENT),)

        if options.CHROME_TIMEOUT:
            cmd_args += ('--timeout={}'.format(options.CHROME_TIMEOUT * 1000),)

        if options.CHROME_USER_DATA_DIR:
            cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
            cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME))

        return dedupe(cmd_args)

CHROME_CONFIG = ChromeConfig()
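Because chrome_args() starts from self.model_copy(update=options), any ChromeConfig field can be overridden per call with a keyword argument, which is how the reworked extractors earlier in this diff tune the shared config (for example, save_singlefile passes CHROME_TIMEOUT=0). A rough usage sketch, with an arbitrary example URL:

# sketch of how an extractor assembles a Chrome command after this change
from plugins_extractor.chrome.apps import CHROME_BINARY, CHROME_CONFIG

CHROME_BIN = CHROME_BINARY.load()                     # resolve the installed chrome executable
assert CHROME_BIN.abspath and CHROME_BIN.version

cmd = [
    str(CHROME_BIN.abspath),
    *CHROME_CONFIG.chrome_args(CHROME_TIMEOUT=0),     # keyword args override the config defaults
    '--print-to-pdf',
    'https://example.com',
]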

@@ -122,6 +229,18 @@ class ChromeBinary(BaseBinary):
        # otherwise on linux we can symlink directly to binary executable
        symlink.symlink_to(binary.abspath)

    @staticmethod
    def chrome_cleanup_lockfile():
        """
        Cleans up any state or runtime files that chrome leaves behind when killed by
        a timeout or other error
        """
        lock_file = Path("~/.config/chromium/SingletonLock")

        if SHELL_CONFIG.IN_DOCKER and lock_file.exists():
            lock_file.unlink()


CHROME_BINARY = ChromeBinary()

@@ -24,40 +24,21 @@ from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER

###################### Config ##########################

class SinglefileToggleConfigs(BaseConfigSet):
    section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_TOGGLES'
class SinglefileConfig(BaseConfigSet):
    section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG'

    SAVE_SINGLEFILE: bool = True


class SinglefileOptionsConfigs(BaseConfigSet):
    section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_OPTIONS'

    SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)


class SinglefileDependencyConfigs(BaseConfigSet):
    section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'

    SINGLEFILE_BINARY: str = Field(default='wget')
    SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None)
    SINGLEFILE_EXTRA_ARGS: List[str] = []
    SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']

class SinglefileConfigs(SinglefileToggleConfigs, SinglefileOptionsConfigs, SinglefileDependencyConfigs):
    # section: ClassVar[ConfigSectionName] = 'ALL_CONFIGS'
    pass

DEFAULT_GLOBAL_CONFIG = {
    'CHECK_SSL_VALIDITY': False,
    'SAVE_SINGLEFILE': True,
    'TIMEOUT': 120,
}

SINGLEFILE_CONFIG = SinglefileConfigs(**DEFAULT_GLOBAL_CONFIG)
SINGLEFILE_CONFIG = SinglefileConfig()


SINGLEFILE_MIN_VERSION = '1.1.54'

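The three per-section Singlefile config classes above are collapsed into a single SinglefileConfig whose fields default to values from the shared ARCHIVING_CONFIG (the Field(default=lambda: ...) form is presumably resolved by the plugantic BaseConfigSet base class). A rough equivalent of that deferred-default idea in plain pydantic, using made-up example names, would rely on default_factory:

# illustrative only; ExampleArchivingConfig / ExampleSinglefileConfig are hypothetical names
from pydantic import BaseModel, Field

class ExampleArchivingConfig(BaseModel):
    TIMEOUT: int = 60
    CHECK_SSL_VALIDITY: bool = True

ARCHIVING = ExampleArchivingConfig()

class ExampleSinglefileConfig(BaseModel):
    SINGLEFILE_TIMEOUT: int = Field(default_factory=lambda: ARCHIVING.TIMEOUT)
    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default_factory=lambda: ARCHIVING.CHECK_SSL_VALIDITY)

print(ExampleSinglefileConfig())   # picks up ARCHIVING's current values at instantiation time
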
@@ -1,6 +1,8 @@
import sys
from typing import List, Dict, ClassVar
from subprocess import run, PIPE
from pydantic import InstanceOf, Field

from pydantic import InstanceOf, Field, model_validator, AliasChoices

from django.conf import settings

@@ -10,20 +12,37 @@ from plugantic.base_configset import BaseConfigSet, ConfigSectionName
from plugantic.base_binary import BaseBinary, env, apt, brew
from plugantic.base_hook import BaseHook

from plugins_sys.config.apps import ARCHIVING_CONFIG
from plugins_pkg.pip.apps import pip

###################### Config ##########################


class YtdlpDependencyConfigs(BaseConfigSet):
class YtdlpConfig(BaseConfigSet):
    section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"

    USE_YTDLP: bool = True
    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))

    YTDLP_BINARY: str = Field(default='yt-dlp')
    YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
    YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')

    YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)

    @model_validator(mode='after')
    def validate_use_ytdlp(self):
        if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
            print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr)
            print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr)
            print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
            print(file=sys.stderr)
            print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
            print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
            print(file=sys.stderr)
        return self

DEFAULT_GLOBAL_CONFIG = {}
YTDLP_CONFIG = YtdlpDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)

YTDLP_CONFIG = YtdlpConfig()
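USE_YTDLP above declares validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'), so configs that still use the old option names keep working after the rename. A small self-contained illustration of that pydantic v2 behavior (standalone example, not ArchiveBox code):

from pydantic import BaseModel, Field, AliasChoices

class ExampleYtdlpConfig(BaseModel):
    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))

# values supplied under either legacy key still populate the new field
print(ExampleYtdlpConfig(USE_YOUTUBEDL=False).USE_YTDLP)   # False
print(ExampleYtdlpConfig(SAVE_MEDIA=False).USE_YTDLP)      # False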


@@ -31,6 +50,9 @@ class YtdlpBinary(BaseBinary):
    name: BinName = YTDLP_CONFIG.YTDLP_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]

YTDLP_BINARY = YtdlpBinary()


class FfmpegBinary(BaseBinary):
    name: BinName = 'ffmpeg'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

@@ -53,10 +75,9 @@ class FfmpegBinary(BaseBinary):
    # def get_ffmpeg_version(self) -> Optional[str]:
    #     return self.exec(cmd=['-version']).stdout


YTDLP_BINARY = YtdlpBinary()
FFMPEG_BINARY = FfmpegBinary()


# class YtdlpExtractor(BaseExtractor):
#     name: str = 'ytdlp'
#     binary: str = 'ytdlp'

@ -18,8 +18,6 @@ from requests.exceptions import RequestException, ReadTimeout
|
|||
|
||||
from base32_crockford import encode as base32_encode # type: ignore
|
||||
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
||||
from os.path import lexists
|
||||
from os import remove as remove_file
|
||||
|
||||
try:
|
||||
import chardet
|
||||
|
@ -282,82 +280,6 @@ def get_headers(url: str, timeout: int=None) -> str:
|
|||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def chrome_args(**options) -> List[str]:
|
||||
"""helper to build up a chrome shell command with arguments"""
|
||||
|
||||
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
|
||||
|
||||
from .config import (
|
||||
CHROME_OPTIONS,
|
||||
CHROME_VERSION,
|
||||
CHROME_EXTRA_ARGS,
|
||||
)
|
||||
|
||||
options = {**CHROME_OPTIONS, **options}
|
||||
|
||||
if not options['CHROME_BINARY']:
|
||||
raise Exception('Could not find any CHROME_BINARY installed on your system')
|
||||
|
||||
cmd_args = [options['CHROME_BINARY']]
|
||||
|
||||
cmd_args += CHROME_EXTRA_ARGS
|
||||
|
||||
if options['CHROME_HEADLESS']:
|
||||
cmd_args += ("--headless=new",) # expects chrome version >= 111
|
||||
|
||||
if not options['CHROME_SANDBOX']:
|
||||
# assume this means we are running inside a docker container
|
||||
# in docker, GPU support is limited, sandboxing is unecessary,
|
||||
# and SHM is limited to 64MB by default (which is too low to be usable).
|
||||
cmd_args += (
|
||||
"--no-sandbox",
|
||||
"--no-zygote",
|
||||
"--disable-dev-shm-usage",
|
||||
"--disable-software-rasterizer",
|
||||
"--run-all-compositor-stages-before-draw",
|
||||
"--hide-scrollbars",
|
||||
"--autoplay-policy=no-user-gesture-required",
|
||||
"--no-first-run",
|
||||
"--use-fake-ui-for-media-stream",
|
||||
"--use-fake-device-for-media-stream",
|
||||
"--disable-sync",
|
||||
# "--password-store=basic",
|
||||
)
|
||||
|
||||
# disable automatic updating when running headless, as there's no user to see the upgrade prompts
|
||||
cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)
|
||||
|
||||
# set window size for screenshot/pdf/etc. rendering
|
||||
cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
|
||||
|
||||
if not options['CHECK_SSL_VALIDITY']:
|
||||
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
|
||||
|
||||
if options['CHROME_USER_AGENT']:
|
||||
cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)
|
||||
|
||||
if options['CHROME_TIMEOUT']:
|
||||
cmd_args += ('--timeout={}'.format(options['CHROME_TIMEOUT'] * 1000),)
|
||||
|
||||
if options['CHROME_USER_DATA_DIR']:
|
||||
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
|
||||
cmd_args.append('--profile-directory=Default')
|
||||
|
||||
return dedupe(cmd_args)
|
||||
|
||||
|
||||
def chrome_cleanup():
|
||||
"""
|
||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
||||
a timeout or other error
|
||||
"""
|
||||
|
||||
from .config import IN_DOCKER
|
||||
|
||||
if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
|
||||
remove_file("/home/archivebox/.config/chromium/SingletonLock")
|
||||
|
||||
@enforce_types
|
||||
def ansi_to_html(text: str) -> str:
|
||||
"""
|
||||
|
|