speed up startup time, add rich startup progress bar, split logging and checks into misc, fix search index backend import bug

This commit is contained in:
Nick Sweeting 2024-09-24 19:04:38 -07:00
parent 7ffb81f61b
commit 64c7100cf9
No known key found for this signature in database
22 changed files with 566 additions and 762 deletions

View file

@ -1,5 +1,7 @@
__package__ = 'archivebox'
# print('INSTALLING MONKEY PATCHES')
from .monkey_patches import *
import os
@ -28,3 +30,5 @@ def _detect_installed_version():
__version__ = _detect_installed_version()
# print('DONE INSTALLING MONKEY PATCHES')
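The body of `_detect_installed_version()` isn't shown in this hunk, but the pattern it names is a common stdlib one; a plausible sketch, where the function name, package name, and fallback value are assumptions for illustration:

```python
import importlib.metadata

def detect_installed_version(package: str = 'archivebox') -> str:
    """Look up the installed version; fall back when running from a source checkout."""
    try:
        return importlib.metadata.version(package)
    except importlib.metadata.PackageNotFoundError:
        return '0.0.0'  # illustrative fallback, not necessarily the project's real behavior
```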

View file

@ -1,16 +1,20 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox'
import os
import sys
import argparse
import threading
from time import sleep
import archivebox
from typing import Optional, Dict, List, IO, Union, Iterable
from time import sleep
from collections.abc import Mapping
from typing import Optional, List, IO, Union, Iterable
from pathlib import Path
from ..config import OUTPUT_DIR, check_data_folder, check_migrations, stderr
from ..misc.checks import check_data_folder, check_migrations
from ..misc.logging import stderr
from importlib import import_module
@ -18,13 +22,46 @@ BUILTIN_LIST = list
CLI_DIR = Path(__file__).resolve().parent
# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version') # dont require valid data folder at all
main_cmds = ('init', 'config', 'setup') # dont require existing db present
archive_cmds = ('add', 'remove', 'update', 'list', 'status') # require existing db present
fake_db = ("oneshot",) # use fake in-memory db
display_first = (*meta_cmds, *main_cmds, *archive_cmds)
# def list_subcommands() -> Dict[str, str]:
# """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
# COMMANDS = []
# for filename in os.listdir(CLI_DIR):
# if is_cli_module(filename):
# subcommand = filename.replace('archivebox_', '').replace('.py', '')
# module = import_module('.archivebox_{}'.format(subcommand), __package__)
# assert is_valid_cli_module(module, subcommand)
# COMMANDS.append((subcommand, module.main.__doc__))
# globals()[subcommand] = module.main
# display_order = lambda cmd: (
# display_first.index(cmd[0])
# if cmd[0] in display_first else
# 100 + len(cmd[0])
# )
# return dict(sorted(COMMANDS, key=display_order))
# just define it statically, it's much faster:
SUBCOMMAND_MODULES = {
'help': 'archivebox_help',
'version': 'archivebox_version',
'init': 'archivebox_init',
'config': 'archivebox_config',
'setup': 'archivebox_setup',
'add': 'archivebox_add',
'remove': 'archivebox_remove',
'update': 'archivebox_update',
'list': 'archivebox_list',
'status': 'archivebox_status',
'schedule': 'archivebox_schedule',
'server': 'archivebox_server',
'shell': 'archivebox_shell',
'manage': 'archivebox_manage',
'oneshot': 'archivebox_oneshot',
}
# every imported command module must have these properties in order to be valid
required_attrs = ('__package__', '__command__', 'main')
@ -36,6 +73,38 @@ is_valid_cli_module = lambda module, subcommand: (
and module.__command__.split(' ')[-1] == subcommand
)
class LazySubcommands(Mapping):
def keys(self):
return SUBCOMMAND_MODULES.keys()
def values(self):
return [self[key] for key in self.keys()]
def items(self):
return [(key, self[key]) for key in self.keys()]
def __getitem__(self, key):
module = import_module(f'.{SUBCOMMAND_MODULES[key]}', __package__)
assert is_valid_cli_module(module, key)
return module.main
def __iter__(self):
return iter(SUBCOMMAND_MODULES.keys())
def __len__(self):
return len(SUBCOMMAND_MODULES)
CLI_SUBCOMMANDS = LazySubcommands()
# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version') # dont require valid data folder at all
main_cmds = ('init', 'config', 'setup') # dont require existing db present
archive_cmds = ('add', 'remove', 'update', 'list', 'status') # require existing db present
fake_db = ("oneshot",) # use fake in-memory db
display_first = (*meta_cmds, *main_cmds, *archive_cmds)
IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler') # threads we dont have to wait for before exiting
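The `LazySubcommands` mapping is what buys most of the startup speedup: listing or counting commands no longer imports every `archivebox_*` command module. A minimal standalone sketch of the same pattern, using stdlib modules as stand-ins for the real command modules:

```python
from collections.abc import Mapping
from importlib import import_module

# stand-in registry: command name -> (module, callable); not the real archivebox commands
REGISTRY = {'encode': ('base64', 'b64encode'), 'compress': ('zlib', 'compress')}

class LazyCommands(Mapping):
    """Defers each module import until the command is first accessed."""
    def __getitem__(self, key):
        module_name, func_name = REGISTRY[key]
        module = import_module(module_name)   # import happens here, not at startup
        return getattr(module, func_name)
    def __iter__(self):
        return iter(REGISTRY)
    def __len__(self):
        return len(REGISTRY)

COMMANDS = LazyCommands()
print(list(COMMANDS))             # cheap: no imports triggered yet
print(COMMANDS['encode'](b'hi'))  # imports base64 only at this point
```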
@ -71,29 +140,9 @@ def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: It
raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
def list_subcommands() -> Dict[str, str]:
"""find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
COMMANDS = []
for filename in os.listdir(CLI_DIR):
if is_cli_module(filename):
subcommand = filename.replace('archivebox_', '').replace('.py', '')
module = import_module('.archivebox_{}'.format(subcommand), __package__)
assert is_valid_cli_module(module, subcommand)
COMMANDS.append((subcommand, module.main.__doc__))
globals()[subcommand] = module.main
display_order = lambda cmd: (
display_first.index(cmd[0])
if cmd[0] in display_first else
100 + len(cmd[0])
)
return dict(sorted(COMMANDS, key=display_order))
def run_subcommand(subcommand: str,
subcommand_args: List[str]=None,
subcommand_args: List[str] | None = None,
stdin: Optional[IO]=None,
pwd: Union[Path, str, None]=None) -> None:
"""Run a given ArchiveBox subcommand with the given list of args"""
@ -101,18 +150,18 @@ def run_subcommand(subcommand: str,
subcommand_args = subcommand_args or []
if subcommand not in meta_cmds:
from ..config import setup_django
from ..config import setup_django, CONFIG
cmd_requires_db = subcommand in archive_cmds
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
if cmd_requires_db:
check_data_folder(pwd)
check_data_folder(CONFIG)
setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
if cmd_requires_db:
check_migrations()
check_migrations(CONFIG)
module = import_module('.archivebox_{}'.format(subcommand), __package__)
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore
@ -121,17 +170,28 @@ def run_subcommand(subcommand: str,
wait_for_bg_threads_to_exit(timeout=60)
SUBCOMMANDS = list_subcommands()
class NotProvided:
pass
def __len__(self):
return 0
def __bool__(self):
return False
def __repr__(self):
return '<not provided>'
Omitted = Union[None, NotProvided]
OMITTED = NotProvided()
def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, pwd: Optional[str]=None) -> None:
args = sys.argv[1:] if args is NotProvided else args
stdin = sys.stdin if stdin is NotProvided else stdin
def main(args: List[str] | Omitted=OMITTED, stdin: IO | Omitted=OMITTED, pwd: str | None=None) -> None:
# print('STARTING CLI MAIN ENTRYPOINT')
args = sys.argv[1:] if args is OMITTED else args
stdin = sys.stdin if stdin is OMITTED else stdin
subcommands = list_subcommands()
parser = argparse.ArgumentParser(
prog=__command__,
description='ArchiveBox: The self-hosted internet archive',
@ -141,19 +201,19 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,
group.add_argument(
'--help', '-h',
action='store_true',
help=subcommands['help'],
help=CLI_SUBCOMMANDS['help'].__doc__,
)
group.add_argument(
'--version',
action='store_true',
help=subcommands['version'],
help=CLI_SUBCOMMANDS['version'].__doc__,
)
group.add_argument(
"subcommand",
type=str,
help= "The name of the subcommand to run",
nargs='?',
choices=subcommands.keys(),
choices=CLI_SUBCOMMANDS.keys(),
default=None,
)
parser.add_argument(
@ -174,23 +234,13 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,
log_cli_command(
subcommand=command.subcommand,
subcommand_args=command.subcommand_args,
stdin=stdin,
pwd=pwd or OUTPUT_DIR
stdin=stdin or None,
pwd=pwd or archivebox.DATA_DIR,
)
run_subcommand(
subcommand=command.subcommand,
subcommand_args=command.subcommand_args,
stdin=stdin,
pwd=pwd or OUTPUT_DIR,
stdin=stdin or None,
pwd=pwd or archivebox.DATA_DIR,
)
__all__ = (
'SUBCOMMANDS',
'list_subcommands',
'run_subcommand',
*SUBCOMMANDS.keys(),
)
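The `NotProvided`/`OMITTED` sentinel above exists because `None` is already a meaningful value for `stdin`; only an identity check against a dedicated singleton can tell "omitted" apart from "explicitly None". A condensed sketch of the same idea:

```python
import sys
from typing import IO, Union

class NotProvided:
    """Falsy singleton marking 'argument was omitted' (None remains a legal value)."""
    def __bool__(self):
        return False
    def __repr__(self):
        return '<not provided>'

OMITTED = NotProvided()
Omitted = Union[None, NotProvided]

def main(stdin: 'IO | Omitted' = OMITTED) -> None:
    # identity check: `is OMITTED` cannot be fooled by other falsy values
    stdin = sys.stdin if stdin is OMITTED else stdin
    print(repr(stdin))

main()            # no argument -> falls back to sys.stdin
main(stdin=None)  # explicit None is preserved, meaning 'no stdin at all'
```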

View file

@ -28,21 +28,19 @@ import sys
import json
import inspect
import getpass
import platform
import shutil
import requests
from hashlib import md5
from pathlib import Path
from benedict import benedict
from datetime import datetime, timezone
from typing import Optional, Type, Tuple, Dict, Union, List
from typing import Optional, Type, Tuple, Dict
from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
from configparser import ConfigParser
from collections import defaultdict
import importlib.metadata
from pydantic_pkgr import SemVer
from rich.progress import Progress
import django
from django.db.backends.sqlite3.base import Database as sqlite3
@ -56,6 +54,17 @@ from .config_stubs import (
ConfigDefaultDict,
)
from .misc.logging import (
CONSOLE,
SHOW_PROGRESS,
DEFAULT_CLI_COLORS,
ANSI,
COLOR_DICT,
stderr,
hint,
)
from .misc.checks import check_system_config
# print('STARTING CONFIG LOADING')
# load fallback libraries from vendor dir
@ -70,7 +79,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SHELL_CONFIG': {
'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now
'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']},
'IN_DOCKER': {'type': bool, 'default': False},
'IN_QEMU': {'type': bool, 'default': False},
'PUID': {'type': int, 'default': os.getuid()},
@ -306,32 +315,7 @@ ROBOTS_TXT_FILENAME = 'robots.txt'
FAVICON_FILENAME = 'favicon.ico'
CONFIG_FILENAME = 'ArchiveBox.conf'
DEFAULT_CLI_COLORS = benedict(
{
"reset": "\033[00;00m",
"lightblue": "\033[01;30m",
"lightyellow": "\033[01;33m",
"lightred": "\033[01;35m",
"red": "\033[01;31m",
"green": "\033[01;32m",
"blue": "\033[01;34m",
"white": "\033[01;37m",
"black": "\033[01;30m",
}
)
ANSI = AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})
COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
'00': [(0, 0, 0), (0, 0, 0)],
'30': [(0, 0, 0), (0, 0, 0)],
'31': [(255, 0, 0), (128, 0, 0)],
'32': [(0, 200, 0), (0, 128, 0)],
'33': [(255, 255, 0), (128, 128, 0)],
'34': [(0, 0, 255), (0, 0, 128)],
'35': [(255, 0, 255), (128, 0, 128)],
'36': [(0, 255, 255), (0, 128, 128)],
'37': [(255, 255, 255), (255, 255, 255)],
})
STATICFILE_EXTENSIONS = {
# 99.999% of the time, URLs ending in these extensions are static files
@ -880,37 +864,6 @@ def parse_version_string(version: str) -> Tuple[int, int, int]:
return tuple(int(part) for part in base.split('.'))[:3]
# Logging Helpers
def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if color:
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
else:
strs = [' '.join(str(a) for a in args), '\n']
sys.stdout.write(prefix + ''.join(strs))
def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if color:
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
else:
strs = [' '.join(str(a) for a in args), '\n']
sys.stderr.write(prefix + ''.join(strs))
def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[ConfigDict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if isinstance(text, str):
stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi))
else:
stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
for line in text[1:]:
stderr('{} {}'.format(prefix, line))
# Dependency Metadata Helpers
def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3) -> Optional[str]:
@ -919,6 +872,10 @@ def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3)
abspath = bin_path(binary)
if not binary or not abspath:
return None
return '999.999.999'
# Now handled by new BinProvider plugin system, no longer needed:
try:
bin_env = os.environ | {'LANG': 'C'}
@ -960,6 +917,9 @@ def bin_path(binary: Optional[str]) -> Optional[str]:
return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
def bin_hash(binary: Optional[str]) -> Optional[str]:
return 'UNUSED'
# DEPRECATED: now handled by new BinProvider plugin system, no longer needed:
if binary is None:
return None
abs_path = bin_path(binary)
@ -1329,246 +1289,123 @@ if not CONFIG['CHECK_SSL_VALIDITY']:
########################### Config Validity Checkers ###########################
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0
def check_system_config(config: ConfigDict=CONFIG) -> None:
### Check system environment
if config['USER'] == 'root' or str(config['PUID']) == "0":
stderr('[!] ArchiveBox should never be run as root!', color='red')
stderr(' For more information, see the security overview documentation:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
if config['IN_DOCKER']:
attempted_command = ' '.join(sys.argv[:3])
stderr('')
stderr(' {lightred}Hint{reset}: When using Docker, you must run commands with {green}docker run{reset} instead of {lightyellow}docker exec{reset}, e.g.:'.format(**config['ANSI']))
stderr(f' docker compose run archivebox {attempted_command}')
stderr(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}')
stderr(' or:')
stderr(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"')
stderr(f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"')
raise SystemExit(2)
### Check Python environment
if sys.version_info[:3] < (3, 7, 0):
stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(2)
if int(CONFIG['DJANGO_VERSION'].split('.')[0]) < 3:
stderr(f'[X] Django version is not new enough: {config["DJANGO_VERSION"]} (>3.0 is required)', color='red')
stderr(' Upgrade django using pip or your system package manager: pip3 install --upgrade django')
raise SystemExit(2)
if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
stderr('')
stderr(' Confirm that it\'s fixed by opening a new shell and running:')
stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
raise SystemExit(2)
# stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
# stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
if config['CHROME_USER_DATA_DIR'] is not None and Path(config['CHROME_USER_DATA_DIR']).exists():
if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
stderr(f' {config["CHROME_USER_DATA_DIR"]}')
stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
stderr(' For more info see:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
if '/Default' in str(config['CHROME_USER_DATA_DIR']):
stderr()
stderr(' Try removing /Default from the end e.g.:')
stderr(' CHROME_USER_DATA_DIR="{}"'.format(str(config['CHROME_USER_DATA_DIR']).split('/Default')[0]))
# hard error is too annoying here, instead just set it to nothing
# raise SystemExit(2)
config['CHROME_USER_DATA_DIR'] = None
else:
config['CHROME_USER_DATA_DIR'] = None
def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
invalid_dependencies = [
(name, info) for name, info in config['DEPENDENCIES'].items()
if info['enabled'] and not info['is_valid']
]
if invalid_dependencies and show_help:
stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
for dependency, info in invalid_dependencies:
stderr(
' ! {}: {} ({})'.format(
dependency,
info['path'] or 'unable to find binary',
info['version'] or 'unable to detect version',
)
)
if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
hint(('To install all packages automatically run: archivebox setup',
f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
''), prefix=' ')
stderr('')
if config['TIMEOUT'] < 5:
stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run successfully.')
stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
stderr()
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
stderr()
elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
stderr()
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
stderr()
if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
stderr(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
stderr(' (Setting it somewhere over 60 seconds is recommended)')
stderr()
stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
stderr()
def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None:
output_dir = out_dir or config['OUTPUT_DIR']
assert isinstance(output_dir, (str, Path))
archive_dir_exists = (Path(output_dir) / ARCHIVE_DIR_NAME).exists()
if not archive_dir_exists:
stderr('[X] No archivebox index found in the current directory.', color='red')
stderr(f' {output_dir}', color='lightyellow')
stderr()
stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
stderr(' cd path/to/your/archive/folder')
stderr(' archivebox [command]')
stderr()
stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
stderr(' archivebox init')
raise SystemExit(2)
def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
output_dir = out_dir or config['OUTPUT_DIR']
from .index.sql import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
if pending_migrations:
stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
stderr(f' {output_dir}')
stderr()
stderr(f' To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:')
stderr(' archivebox init')
raise SystemExit(3)
(Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / CACHE_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / LIB_DIR_NAME / 'bin').mkdir(exist_ok=True, parents=True)
(Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True, parents=True)
def bump_startup_progress_bar():
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
if INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore
def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
check_system_config()
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
check_system_config(config)
output_dir = out_dir or Path(config['OUTPUT_DIR'])
assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
try:
from django.core.management import call_command
sys.path.append(str(config['PACKAGE_DIR']))
os.environ.setdefault('OUTPUT_DIR', str(output_dir))
assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
# Check to make sure JSON extension is available in our Sqlite3 instance
bump_startup_progress_bar()
try:
cursor = sqlite3.connect(':memory:').cursor()
cursor.execute('SELECT JSON(\'{"a": "b"}\')')
except sqlite3.OperationalError as exc:
stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
hint([
'Upgrade your Python version or install the extension manually:',
'https://code.djangoproject.com/wiki/JSON1Extension'
])
from django.core.management import call_command
if in_memory_db:
# some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
# immediately to get a usable in-memory-database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
django.setup()
call_command("migrate", interactive=False, verbosity=0)
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
django.setup()
sys.path.append(str(config['PACKAGE_DIR']))
os.environ.setdefault('OUTPUT_DIR', str(output_dir))
assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
from django.conf import settings
# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
if check_db:
# Enable WAL mode in sqlite3
from django.db import connection
with connection.cursor() as cursor:
# Set Journal mode to WAL to allow for multiple writers
current_mode = cursor.execute("PRAGMA journal_mode")
if current_mode != 'wal':
cursor.execute("PRAGMA journal_mode=wal;")
# Set max blocking delay for concurrent writes and write sync mode
# https://litestream.io/tips/#busy-timeout
cursor.execute("PRAGMA busy_timeout = 5000;")
cursor.execute("PRAGMA synchronous = NORMAL;")
# Create cache table in DB if needed
# Check to make sure JSON extension is available in our Sqlite3 instance
try:
from django.core.cache import cache
cache.get('test', None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
cursor = sqlite3.connect(':memory:').cursor()
cursor.execute('SELECT JSON(\'{"a": "b"}\')')
except sqlite3.OperationalError as exc:
stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
hint([
'Upgrade your Python version or install the extension manually:',
'https://code.djangoproject.com/wiki/JSON1Extension'
])
bump_startup_progress_bar()
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
if in_memory_db:
# some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
# immediately to get a usable in-memory-database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
django.setup()
bump_startup_progress_bar()
call_command("migrate", interactive=False, verbosity=0)
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
django.setup()
bump_startup_progress_bar()
sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
assert sql_index_path.exists(), (
f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
from django.conf import settings
# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
if settings.DEBUG_LOGFIRE:
from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
SQLite3Instrumentor().instrument()
if check_db:
# Enable WAL mode in sqlite3
from django.db import connection
with connection.cursor() as cursor:
import logfire
# Set Journal mode to WAL to allow for multiple writers
current_mode = cursor.execute("PRAGMA journal_mode")
if current_mode != 'wal':
cursor.execute("PRAGMA journal_mode=wal;")
logfire.configure()
logfire.instrument_django(is_sql_commentor_enabled=True)
logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
# Set max blocking delay for concurrent writes and write sync mode
# https://litestream.io/tips/#busy-timeout
cursor.execute("PRAGMA busy_timeout = 5000;")
cursor.execute("PRAGMA synchronous = NORMAL;")
except KeyboardInterrupt:
raise SystemExit(2)
# Create cache table in DB if needed
try:
from django.core.cache import cache
cache.get('test', None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
bump_startup_progress_bar()
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
assert sql_index_path.exists(), (
f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
bump_startup_progress_bar()
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
if settings.DEBUG_LOGFIRE:
from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
SQLite3Instrumentor().instrument()
import logfire
logfire.configure()
logfire.instrument_django(is_sql_commentor_enabled=True)
logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
except KeyboardInterrupt:
raise SystemExit(2)
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = None
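Two of the sqlite3 steps that `setup_django()` now performs inside the progress bar are worth seeing in isolation: the JSON1 extension probe and the WAL/concurrency pragmas. A minimal standalone sketch (the database path is illustrative):

```python
import sqlite3

# 1) confirm the JSON1 extension is compiled into this sqlite3 build
try:
    sqlite3.connect(':memory:').execute('SELECT JSON(\'{"a": "b"}\')')
except sqlite3.OperationalError as exc:
    raise SystemExit(f'SQLite3 is missing the JSON1 extension: {exc}')

# 2) enable WAL mode + concurrency-friendly pragmas on the real database
conn = sqlite3.connect('/tmp/example_index.sqlite3')
cursor = conn.cursor()
if cursor.execute('PRAGMA journal_mode').fetchone()[0] != 'wal':
    cursor.execute('PRAGMA journal_mode=wal;')   # multiple readers alongside one writer
cursor.execute('PRAGMA busy_timeout = 5000;')    # wait up to 5s on locks instead of erroring
cursor.execute('PRAGMA synchronous = NORMAL;')   # safe with WAL, fewer fsyncs
conn.close()
```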

View file

@ -170,6 +170,7 @@ STATICFILES_DIRS = [
*[
str(plugin_dir / 'static')
for plugin_dir in PLUGIN_DIRS.values()
if (plugin_dir / 'static').is_dir()
],
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'),
]
@ -179,6 +180,7 @@ TEMPLATE_DIRS = [
*[
str(plugin_dir / 'templates')
for plugin_dir in PLUGIN_DIRS.values()
if (plugin_dir / 'templates').is_dir()
],
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'),
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'),

View file

@ -141,18 +141,22 @@ SETTINGS_LOGGING = {
"api": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
"propagate": False,
},
"checks": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
"propagate": False,
},
"core": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
"propagate": False,
},
"plugins_extractor": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
"propagate": False,
},
"httpx": {
"handlers": ["outbound_webhooks"],
@ -164,6 +168,7 @@ SETTINGS_LOGGING = {
"handlers": ["default", "logfile"],
"level": "INFO",
"filters": ["noisyrequestsfilter"],
"propagate": False,
},
"django.utils.autoreload": {
"propagate": False,

View file

@ -230,7 +230,7 @@ def progress_bar(seconds: int, prefix: str='') -> None:
print()
def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str):
cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format(
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
@ -526,11 +526,11 @@ def log_removal_finished(all_links: int, to_remove: int):
def log_shell_welcome_msg():
from .cli import list_subcommands
from .cli import CLI_SUBCOMMANDS
print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
print('{green}from core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI))
print('{green}from cli import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
print('{green}from cli import *\n {}{reset}'.format("\n ".join(CLI_SUBCOMMANDS.keys()), **ANSI))
print()
print('[i] Welcome to the ArchiveBox Shell!')
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage')

View file

@ -16,7 +16,7 @@ from django.db.models import QuerySet
from django.utils import timezone
from .cli import (
list_subcommands,
CLI_SUBCOMMANDS,
run_subcommand,
display_first,
meta_cmds,
@ -66,9 +66,9 @@ from .index.html import (
)
from .index.csv import links_to_csv
from .extractors import archive_links, archive_link, ignore_methods
from .misc.logging import stderr, hint
from .misc.checks import check_data_folder, check_dependencies
from .config import (
stderr,
hint,
ConfigDict,
ANSI,
IS_TTY,
@ -98,8 +98,6 @@ from .config import (
SEARCH_BACKEND_ENGINE,
LDAP,
get_version,
check_dependencies,
check_data_folder,
write_config_file,
VERSION,
VERSIONS_AVAILABLE,
@ -146,7 +144,7 @@ from .logging_util import (
def help(out_dir: Path=OUTPUT_DIR) -> None:
"""Print the ArchiveBox help message and usage"""
all_subcommands = list_subcommands()
all_subcommands = CLI_SUBCOMMANDS
COMMANDS_HELP_TEXT = '\n '.join(
f'{cmd.ljust(20)} {summary}'
for cmd, summary in all_subcommands.items()
@ -281,7 +279,7 @@ def version(quiet: bool=False,
print('{white}[i] Data locations:{reset} (not in a data directory)'.format(**ANSI))
print()
check_dependencies()
check_dependencies(CONFIG)
@enforce_types
@ -469,7 +467,7 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
def status(out_dir: Path=OUTPUT_DIR) -> None:
"""Print out some info and statistics about the archive collection"""
check_data_folder(out_dir=out_dir)
check_data_folder(CONFIG)
from core.models import Snapshot
from django.contrib.auth import get_user_model
@ -609,8 +607,8 @@ def add(urls: Union[str, List[str]],
run_subcommand('init', stdin=None, pwd=out_dir)
# Load list of links from the existing index
check_data_folder(out_dir=out_dir)
check_dependencies()
check_data_folder(CONFIG)
check_dependencies(CONFIG)
new_links: List[Link] = []
all_links = load_main_index(out_dir=out_dir)
@ -705,7 +703,7 @@ def remove(filter_str: Optional[str]=None,
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Remove the specified URLs from the archive"""
check_data_folder(out_dir=out_dir)
check_data_folder(CONFIG)
if snapshots is None:
if filter_str and filter_patterns:
@ -792,8 +790,8 @@ def update(resume: Optional[float]=None,
from core.models import ArchiveResult
from .search import index_links
check_data_folder(out_dir=out_dir)
check_dependencies()
check_data_folder(CONFIG)
check_dependencies(CONFIG)
new_links: List[Link] = [] # TODO: Remove input argument: only_new
extractors = extractors.split(",") if extractors else []
@ -863,7 +861,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
out_dir: Path=OUTPUT_DIR) -> Iterable[Link]:
"""List, filter, and export information about archive entries"""
check_data_folder(out_dir=out_dir)
check_data_folder(CONFIG)
if filter_patterns and filter_patterns_str:
stderr(
@ -911,7 +909,7 @@ def list_links(snapshots: Optional[QuerySet]=None,
before: Optional[float]=None,
out_dir: Path=OUTPUT_DIR) -> Iterable[Link]:
check_data_folder(out_dir=out_dir)
check_data_folder(CONFIG)
if snapshots:
all_snapshots = snapshots
@ -935,7 +933,7 @@ def list_folders(links: List[Link],
status: str,
out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
check_data_folder(out_dir=out_dir)
check_data_folder(CONFIG)
STATUS_FUNCTIONS = {
"indexed": get_indexed_folders,
@ -1080,7 +1078,7 @@ def config(config_options_str: Optional[str]=None,
out_dir: Path=OUTPUT_DIR) -> None:
"""Get and set your ArchiveBox project configuration values"""
check_data_folder(out_dir=out_dir)
check_data_folder(CONFIG)
if config_options and config_options_str:
stderr(
@ -1183,7 +1181,7 @@ def schedule(add: bool=False,
out_dir: Path=OUTPUT_DIR):
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
check_data_folder(out_dir=out_dir)
check_data_folder(CONFIG)
Path(LOGS_DIR).mkdir(exist_ok=True)
@ -1324,7 +1322,7 @@ def server(runserver_args: Optional[List[str]]=None,
config.SHOW_PROGRESS = False
config.DEBUG = config.DEBUG or debug
check_data_folder(out_dir=out_dir)
check_data_folder(CONFIG)
from django.core.management import call_command
from django.contrib.auth.models import User
@ -1417,7 +1415,7 @@ def server(runserver_args: Optional[List[str]]=None,
def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
"""Run an ArchiveBox Django management command"""
check_data_folder(out_dir=out_dir)
check_data_folder(CONFIG)
from django.core.management import execute_from_command_line
if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
@ -1432,7 +1430,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
def shell(out_dir: Path=OUTPUT_DIR) -> None:
"""Enter an interactive ArchiveBox Django shell"""
check_data_folder(out_dir=out_dir)
check_data_folder(CONFIG)
from django.core.management import call_command
call_command("shell_plus")

View file

@ -7,7 +7,7 @@ if __name__ == '__main__':
# versions of ./manage.py commands whenever possible. When that's not possible
# (e.g. makemigrations), you can comment out this check temporarily
allowed_commands = ['makemigrations', 'migrate', 'startapp','squashmigrations', 'generate_stubs']
allowed_commands = ['makemigrations', 'migrate', 'startapp','squashmigrations', 'generate_stubs', 'test']
if not any(cmd in sys.argv for cmd in allowed_commands):
print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")

View file

archivebox/misc/checks.py (new file, +159 lines)
View file

@ -0,0 +1,159 @@
__package__ = 'archivebox.misc'
# TODO: migrate all of these to new plugantic/base_check.py Check system
import sys
from benedict import benedict
from pathlib import Path
from .logging import stderr, hint
def check_system_config(config: benedict) -> None:
### Check system environment
if config['USER'] == 'root' or str(config['PUID']) == "0":
stderr('[!] ArchiveBox should never be run as root!', color='red')
stderr(' For more information, see the security overview documentation:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
if config['IN_DOCKER']:
attempted_command = ' '.join(sys.argv[:3])
stderr('')
stderr(' {lightred}Hint{reset}: When using Docker, you must run commands with {green}docker run{reset} instead of {lightyellow}docker exec{reset}, e.g.:'.format(**config['ANSI']))
stderr(f' docker compose run archivebox {attempted_command}')
stderr(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}')
stderr(' or:')
stderr(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"')
stderr(f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"')
raise SystemExit(2)
### Check Python environment
if sys.version_info[:3] < (3, 7, 0):
stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(2)
if int(config['DJANGO_VERSION'].split('.')[0]) < 3:
stderr(f'[X] Django version is not new enough: {config["DJANGO_VERSION"]} (>3.0 is required)', color='red')
stderr(' Upgrade django using pip or your system package manager: pip3 install --upgrade django')
raise SystemExit(2)
if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
stderr('')
stderr(' Confirm that it\'s fixed by opening a new shell and running:')
stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
raise SystemExit(2)
# stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
# stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
if config['CHROME_USER_DATA_DIR'] is not None and Path(config['CHROME_USER_DATA_DIR']).exists():
if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
stderr(f' {config["CHROME_USER_DATA_DIR"]}')
stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
stderr(' For more info see:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
if '/Default' in str(config['CHROME_USER_DATA_DIR']):
stderr()
stderr(' Try removing /Default from the end e.g.:')
stderr(' CHROME_USER_DATA_DIR="{}"'.format(str(config['CHROME_USER_DATA_DIR']).split('/Default')[0]))
# hard error is too annoying here, instead just set it to nothing
# raise SystemExit(2)
config['CHROME_USER_DATA_DIR'] = None
else:
config['CHROME_USER_DATA_DIR'] = None
def check_dependencies(config: benedict, show_help: bool=True) -> None:
invalid_dependencies = [
(name, info) for name, info in config['DEPENDENCIES'].items()
if info['enabled'] and not info['is_valid']
]
if invalid_dependencies and show_help:
stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
for dependency, info in invalid_dependencies:
stderr(
' ! {}: {} ({})'.format(
dependency,
info['path'] or 'unable to find binary',
info['version'] or 'unable to detect version',
)
)
if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
hint(('To install all packages automatically run: archivebox setup',
f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
''), prefix=' ')
stderr('')
if config['TIMEOUT'] < 5:
stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run successfully.')
stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
stderr()
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
stderr()
elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
stderr()
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
stderr()
if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
stderr(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
stderr(' (Setting it somewhere over 60 seconds is recommended)')
stderr()
stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
stderr()
def check_data_folder(config: benedict) -> None:
output_dir = config['OUTPUT_DIR']
archive_dir_exists = (Path(output_dir) / 'archive').exists()
if not archive_dir_exists:
stderr('[X] No archivebox index found in the current directory.', color='red')
stderr(f' {output_dir}', color='lightyellow')
stderr()
stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
stderr(' cd path/to/your/archive/folder')
stderr(' archivebox [command]')
stderr()
stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
stderr(' archivebox init')
raise SystemExit(2)
def check_migrations(config: benedict):
output_dir = config['OUTPUT_DIR']
from ..index.sql import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
if pending_migrations:
stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
stderr(f' {output_dir}')
stderr()
stderr(f' To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:')
stderr(' archivebox init')
raise SystemExit(3)
(Path(output_dir) / config['SOURCES_DIR_NAME']).mkdir(exist_ok=True)
(Path(output_dir) / config['LOGS_DIR_NAME']).mkdir(exist_ok=True)
(Path(output_dir) / config['CACHE_DIR_NAME']).mkdir(exist_ok=True)
(Path(output_dir) / config['LIB_DIR_NAME'] / 'bin').mkdir(exist_ok=True, parents=True)
(Path(output_dir) / config['PERSONAS_DIR_NAME'] / 'Default').mkdir(exist_ok=True, parents=True)

View file

@ -0,0 +1,30 @@
from functools import wraps
from time import time
def timed_function(func):
"""
Very simple profiling decorator for debugging.
Usage:
@timed_function
def my_func():
...
More advanced alternatives:
- viztracer ../.venv/bin/archivebox manage check # https://viztracer.readthedocs.io/en/latest/filter.html
- python -m cProfile -o archivebox.prof ../.venv/bin/archivebox manage check; snakeviz archivebox.prof
- Django Debug Toolbar + django-debug-toolbar-flamegraph
+ Django Requests Tracker (requests-tracker)
"""
@wraps(func)
def wrap(*args, **kwargs):
if args and hasattr(args[0], '__module__'):
module = args[0].__module__
else:
module = func.__module__
ts_start = time()
result = func(*args, **kwargs)
ts_end = time()
ms_elapsed = int((ts_end-ts_start) * 1000)
print(f'[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)')
return result
return wrap
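A quick usage sketch for the decorator above (the printed timing is approximate):

```python
from time import sleep

@timed_function
def slow_add(a: int, b: int) -> int:
    sleep(0.25)
    return a + b

slow_add(1, 2)  # prints something like: [DEBUG][250ms] __main__.slow_add(...)
```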

View file

@ -0,0 +1,77 @@
__package__ = 'archivebox.misc'
# TODO: merge/dedupe this file with archivebox/logging_util.py
import os
import sys
from typing import Optional, Union, Tuple, List
from collections import defaultdict
from benedict import benedict
from rich.console import Console
from ..config_stubs import ConfigDict
SHOW_PROGRESS = None
if os.environ.get('SHOW_PROGRESS', 'None') in ('True', '1', 'true', 'yes'):
SHOW_PROGRESS = True
CONSOLE = Console(force_interactive=SHOW_PROGRESS)
SHOW_PROGRESS = CONSOLE.is_interactive if SHOW_PROGRESS is None else SHOW_PROGRESS
DEFAULT_CLI_COLORS = benedict(
{
"reset": "\033[00;00m",
"lightblue": "\033[01;30m",
"lightyellow": "\033[01;33m",
"lightred": "\033[01;35m",
"red": "\033[01;31m",
"green": "\033[01;32m",
"blue": "\033[01;34m",
"white": "\033[01;37m",
"black": "\033[01;30m",
}
)
ANSI = benedict({k: '' for k in DEFAULT_CLI_COLORS.keys()})
COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
'00': [(0, 0, 0), (0, 0, 0)],
'30': [(0, 0, 0), (0, 0, 0)],
'31': [(255, 0, 0), (128, 0, 0)],
'32': [(0, 200, 0), (0, 128, 0)],
'33': [(255, 255, 0), (128, 128, 0)],
'34': [(0, 0, 255), (0, 0, 128)],
'35': [(255, 0, 255), (128, 0, 128)],
'36': [(0, 255, 255), (0, 128, 128)],
'37': [(255, 255, 255), (255, 255, 255)],
})
# Logging Helpers
def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if color:
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
else:
strs = [' '.join(str(a) for a in args), '\n']
sys.stdout.write(prefix + ''.join(strs))
def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if color:
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
else:
strs = [' '.join(str(a) for a in args), '\n']
sys.stderr.write(prefix + ''.join(strs))
def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[ConfigDict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if isinstance(text, str):
stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi))
else:
stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
for line in text[1:]:
stderr('{} {}'.format(prefix, line))
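The tri-state `SHOW_PROGRESS` detection at the top of this file lets an env var force progress bars on, and otherwise defers to rich's TTY autodetection. Condensed:

```python
import os
from rich.console import Console

show_progress = None  # None = let rich decide from the terminal
if os.environ.get('SHOW_PROGRESS', 'None') in ('True', '1', 'true', 'yes'):
    show_progress = True

console = Console(force_interactive=show_progress)
show_progress = console.is_interactive if show_progress is None else show_progress
print(f'interactive={console.is_interactive} show_progress={show_progress}')
```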

View file

@ -10,7 +10,6 @@ import datetime
from django.utils import timezone
timezone.utc = datetime.timezone.utc
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
# from signal_webhooks.apps import DjangoSignalWebhooksConfig
# DjangoSignalWebhooksConfig.verbose_name = 'API'

View file

@ -371,9 +371,9 @@
"license": "Apache-2.0"
},
"node_modules/bare-events": {
"version": "2.4.2",
"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.4.2.tgz",
"integrity": "sha512-qMKFd2qG/36aA4GwvKq8MxnPgCQAmBWmSyLWsJcbn8v03wvIPQ/hG1Ms8bPzndZxMDoHpxez5VOS+gC9Yi24/Q==",
"version": "2.5.0",
"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.5.0.tgz",
"integrity": "sha512-/E8dDe9dsbLyh2qrZ64PEPadOQ0F4gbl1sUJOrmph7xOiIxfY8vwab/4bFLh4Y88/Hk/ujKcrQKc+ps0mv873A==",
"license": "Apache-2.0",
"optional": true
},

View file

@ -3,6 +3,7 @@ __package__ = "archivebox.plugantic"
from typing import Dict, List
from typing_extensions import Self
from benedict import benedict
from pydantic import Field, InstanceOf, validate_call
from pydantic_pkgr import (
Binary,
@ -17,7 +18,6 @@ from pydantic_pkgr import (
from django.conf import settings
from .base_hook import BaseHook, HookType
from ..config_stubs import AttrDict
class BaseBinProvider(BaseHook, BinProvider):
@ -38,7 +38,7 @@ class BaseBinProvider(BaseHook, BinProvider):
def register(self, settings, parent_plugin=None):
# self._plugin = parent_plugin # for debugging only, never rely on this!
settings.BINPROVIDERS = getattr(settings, "BINPROVIDERS", None) or AttrDict({})
settings.BINPROVIDERS = getattr(settings, "BINPROVIDERS", None) or benedict({})
settings.BINPROVIDERS[self.id] = self
super().register(settings, parent_plugin=parent_plugin)
@ -58,7 +58,7 @@ class BaseBinary(BaseHook, Binary):
def register(self, settings, parent_plugin=None):
# self._plugin = parent_plugin # for debugging only, never rely on this!
settings.BINARIES = getattr(settings, "BINARIES", None) or AttrDict({})
settings.BINARIES = getattr(settings, "BINARIES", None) or benedict({})
settings.BINARIES[self.id] = self
super().register(settings, parent_plugin=parent_plugin)
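The swap from `AttrDict` to `benedict` works because benedict is a `dict` subclass that, with its default keyattr behavior, also allows attribute-style access, so existing `settings.BINARIES.<name>`-style lookups keep working. A small sketch, assuming benedict's defaults:

```python
from benedict import benedict

BINARIES = benedict({})
BINARIES['wget'] = 'example-binary'   # plain dict-style write
print(BINARIES['wget'])               # dict-style read
print(BINARIES.wget)                  # attribute-style read, like AttrDict
```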

View file

@ -28,7 +28,7 @@ class BaseCheck(BaseHook):
def register(self, settings, parent_plugin=None):
# self._plugin = parent_plugin # backref to parent is for debugging only, never rely on this!
self.register_with_django_check_system() # (SIDE EFFECT)
self.register_with_django_check_system(settings) # (SIDE EFFECT)
# install hook into settings.CHECKS
settings.CHECKS = getattr(settings, "CHECKS", None) or AttrDict({})
@ -37,12 +37,9 @@ class BaseCheck(BaseHook):
# record installed hook in settings.HOOKS
super().register(settings, parent_plugin=parent_plugin)
def register_with_django_check_system(self):
def register_with_django_check_system(self, settings):
def run_check(app_configs, **kwargs) -> List[Warning]:
from django.conf import settings
import logging
return self.check(settings, logging.getLogger("checks"))
run_check.__name__ = self.id
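For context, `register_with_django_check_system()` is wiring the hook into Django's system check framework; a minimal sketch of that mechanism, with hypothetical names:

```python
from django.core.checks import Warning, register

def run_check(app_configs, **kwargs):
    # return a list of issues; an empty list means the check passed
    return [Warning('example warning from a hypothetical hook', id='checks.W001')]

run_check.__name__ = 'my_hook_id'  # mirrors the rename done above for nicer output
register(run_check, 'my_tag')      # now runs on `manage.py check` and at startup
```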

View file

@ -96,14 +96,13 @@ class BaseHook(BaseModel):
# e.g. /admin/environment/config/LdapConfig/
return f"/admin/environment/{self.hook_type.lower()}/{self.id}/"
def register(self, settings, parent_plugin=None):
"""Load a record of an installed hook into global Django settings.HOOKS at runtime."""
self._plugin = parent_plugin # for debugging only, never rely on this!
# assert json.dumps(self.model_json_schema(), indent=4), f"Hook {self.hook_module} has invalid JSON schema."
print(' -', self.hook_module, '.register()')
# print(' -', self.hook_module, '.register()')
# record installed hook in settings.HOOKS
settings.HOOKS[self.id] = self
@ -118,7 +117,7 @@ class BaseHook(BaseModel):
def ready(self, settings):
"""Runs any runtime code needed when AppConfig.ready() is called (after all models are imported)."""
print(' -', self.hook_module, '.ready()')
# print(' -', self.hook_module, '.ready()')
assert self.id in settings.HOOKS, f"Tried to ready hook {self.hook_module} but it is not registered in settings.HOOKS."

View file

@ -1,6 +1,5 @@
__package__ = 'archivebox.plugantic'
import json
import inspect
from pathlib import Path
@ -18,10 +17,11 @@ from pydantic import (
computed_field,
validate_call,
)
from benedict import benedict
from .base_hook import BaseHook, HookType
from ..config import AttrDict
from ..config import bump_startup_progress_bar
class BasePlugin(BaseModel):
@ -90,7 +90,8 @@ class BasePlugin(BaseModel):
assert self.app_label and self.app_label and self.verbose_name, f'{self.__class__.__name__} is missing .name or .app_label or .verbose_name'
assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema."
# assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema."
return self
@property
@ -114,13 +115,13 @@ class BasePlugin(BaseModel):
@property
def HOOKS_BY_ID(self) -> Dict[str, InstanceOf[BaseHook]]:
return AttrDict({hook.id: hook for hook in self.hooks})
return benedict({hook.id: hook for hook in self.hooks})
@property
def HOOKS_BY_TYPE(self) -> Dict[HookType, Dict[str, InstanceOf[BaseHook]]]:
hooks = AttrDict({})
hooks = benedict({})
for hook in self.hooks:
hooks[hook.hook_type] = hooks.get(hook.hook_type) or AttrDict({})
hooks[hook.hook_type] = hooks.get(hook.hook_type) or benedict({})
hooks[hook.hook_type][hook.id] = hook
return hooks
@ -131,10 +132,10 @@ class BasePlugin(BaseModel):
from django.conf import settings as django_settings
settings = django_settings
print()
print(self.plugin_module_full, '.register()')
# print()
# print(self.plugin_module_full, '.register()')
assert json.dumps(self.model_json_schema(), indent=4), f'Plugin {self.plugin_module} has invalid JSON schema.'
# assert json.dumps(self.model_json_schema(), indent=4), f'Plugin {self.plugin_module} has invalid JSON schema.'
assert self.id not in settings.PLUGINS, f'Tried to register plugin {self.plugin_module} but it conflicts with existing plugin of the same name ({self.app_label}).'
@ -149,6 +150,7 @@ class BasePlugin(BaseModel):
settings.PLUGINS[self.id]._is_registered = True
# print('√ REGISTERED PLUGIN:', self.plugin_module)
bump_startup_progress_bar()
def ready(self, settings=None):
"""Runs any runtime code needed when AppConfig.ready() is called (after all models are imported)."""
@ -157,8 +159,8 @@ class BasePlugin(BaseModel):
from django.conf import settings as django_settings
settings = django_settings
print()
print(self.plugin_module_full, '.ready()')
# print()
# print(self.plugin_module_full, '.ready()')
assert (
self.id in settings.PLUGINS and settings.PLUGINS[self.id]._is_registered
@ -171,6 +173,7 @@ class BasePlugin(BaseModel):
hook.ready(settings)
settings.PLUGINS[self.id]._is_ready = True
bump_startup_progress_bar()
# @validate_call
# def install_binaries(self) -> Self:

View file

@ -83,338 +83,3 @@ class JSONSchemaWithLambdas(GenerateJsonSchema):
# for computed_field properties render them like this instead:
# inspect.getsource(field.wrapped_property.fget).split('def ', 1)[-1].split('\n', 1)[-1].strip().strip('return '),
### Basic Assertions
# test_input = """
# [SERVER_CONFIG]
# IS_TTY=False
# USE_COLOR=False
# SHOW_PROGRESS=False
# IN_DOCKER=False
# IN_QEMU=False
# PUID=501
# PGID=20
# OUTPUT_DIR=/opt/archivebox/data
# CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf
# ONLY_NEW=True
# TIMEOUT=60
# MEDIA_TIMEOUT=3600
# OUTPUT_PERMISSIONS=644
# RESTRICT_FILE_NAMES=windows
# URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$
# URL_ALLOWLIST=None
# ADMIN_USERNAME=None
# ADMIN_PASSWORD=None
# ENFORCE_ATOMIC_WRITES=True
# TAG_SEPARATOR_PATTERN=[,]
# SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# BIND_ADDR=127.0.0.1:8000
# ALLOWED_HOSTS=*
# DEBUG=False
# PUBLIC_INDEX=True
# PUBLIC_SNAPSHOTS=True
# PUBLIC_ADD_VIEW=False
# FOOTER_INFO=Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.
# SNAPSHOTS_PER_PAGE=40
# CUSTOM_TEMPLATES_DIR=None
# TIME_ZONE=UTC
# TIMEZONE=UTC
# REVERSE_PROXY_USER_HEADER=Remote-User
# REVERSE_PROXY_WHITELIST=
# LOGOUT_REDIRECT_URL=/
# PREVIEW_ORIGINALS=True
# LDAP=False
# LDAP_SERVER_URI=None
# LDAP_BIND_DN=None
# LDAP_BIND_PASSWORD=None
# LDAP_USER_BASE=None
# LDAP_USER_FILTER=None
# LDAP_USERNAME_ATTR=None
# LDAP_FIRSTNAME_ATTR=None
# LDAP_LASTNAME_ATTR=None
# LDAP_EMAIL_ATTR=None
# LDAP_CREATE_SUPERUSER=False
# SAVE_TITLE=True
# SAVE_FAVICON=True
# SAVE_WGET=True
# SAVE_WGET_REQUISITES=True
# SAVE_SINGLEFILE=True
# SAVE_READABILITY=True
# SAVE_MERCURY=True
# SAVE_HTMLTOTEXT=True
# SAVE_PDF=True
# SAVE_SCREENSHOT=True
# SAVE_DOM=True
# SAVE_HEADERS=True
# SAVE_WARC=True
# SAVE_GIT=True
# SAVE_MEDIA=True
# SAVE_ARCHIVE_DOT_ORG=True
# RESOLUTION=1440,2000
# GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht
# CHECK_SSL_VALIDITY=True
# MEDIA_MAX_SIZE=750m
# USER_AGENT=None
# CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)
# WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5
# CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)
# COOKIES_FILE=None
# CHROME_USER_DATA_DIR=None
# CHROME_TIMEOUT=0
# CHROME_HEADLESS=True
# CHROME_SANDBOX=True
# CHROME_EXTRA_ARGS=[]
# YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)']
# YOUTUBEDL_EXTRA_ARGS=[]
# WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off']
# WGET_EXTRA_ARGS=[]
# CURL_ARGS=['--silent', '--location', '--compressed']
# CURL_EXTRA_ARGS=[]
# GIT_ARGS=['--recursive']
# SINGLEFILE_ARGS=[]
# SINGLEFILE_EXTRA_ARGS=[]
# MERCURY_ARGS=['--format=text']
# MERCURY_EXTRA_ARGS=[]
# FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={}
# USE_INDEXING_BACKEND=True
# USE_SEARCHING_BACKEND=True
# SEARCH_BACKEND_ENGINE=ripgrep
# SEARCH_BACKEND_HOST_NAME=localhost
# SEARCH_BACKEND_PORT=1491
# SEARCH_BACKEND_PASSWORD=SecretPassword
# SEARCH_PROCESS_HTML=True
# SONIC_COLLECTION=archivebox
# SONIC_BUCKET=snapshots
# SEARCH_BACKEND_TIMEOUT=90
# FTS_SEPARATE_DATABASE=True
# FTS_TOKENIZERS=porter unicode61 remove_diacritics 2
# FTS_SQLITE_MAX_LENGTH=1000000000
# USE_CURL=True
# USE_WGET=True
# USE_SINGLEFILE=True
# USE_READABILITY=True
# USE_MERCURY=True
# USE_GIT=True
# USE_CHROME=True
# USE_NODE=True
# USE_YOUTUBEDL=True
# USE_RIPGREP=True
# CURL_BINARY=curl
# GIT_BINARY=git
# WGET_BINARY=wget
# SINGLEFILE_BINARY=single-file
# READABILITY_BINARY=readability-extractor
# MERCURY_BINARY=postlight-parser
# YOUTUBEDL_BINARY=yt-dlp
# NODE_BINARY=node
# RIPGREP_BINARY=rg
# CHROME_BINARY=chrome
# POCKET_CONSUMER_KEY=None
# USER=squash
# PACKAGE_DIR=/opt/archivebox/archivebox
# TEMPLATES_DIR=/opt/archivebox/archivebox/templates
# ARCHIVE_DIR=/opt/archivebox/data/archive
# SOURCES_DIR=/opt/archivebox/data/sources
# LOGS_DIR=/opt/archivebox/data/logs
# PERSONAS_DIR=/opt/archivebox/data/personas
# URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE)
# URL_ALLOWLIST_PTN=None
# DIR_OUTPUT_PERMISSIONS=755
# ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox
# VERSION=0.8.0
# COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f
# BUILD_TIME=2024-05-15 03:28:05 1715768885
# VERSIONS_AVAILABLE=None
# CAN_UPGRADE=False
# PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10
# PYTHON_ENCODING=UTF-8
# PYTHON_VERSION=3.10.14
# DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py
# DJANGO_VERSION=5.0.6 final (0)
# SQLITE_BINARY=/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py
# SQLITE_VERSION=2.6.0
# CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0)
# WGET_VERSION=GNU Wget 1.24.5
# WGET_AUTO_COMPRESSION=True
# RIPGREP_VERSION=ripgrep 14.1.0
# SINGLEFILE_VERSION=None
# READABILITY_VERSION=None
# MERCURY_VERSION=None
# GIT_VERSION=git version 2.44.0
# YOUTUBEDL_VERSION=2024.04.09
# CHROME_VERSION=Google Chrome 124.0.6367.207
# NODE_VERSION=v21.7.3
# """
# expected_output = TOML_HEADER + '''[SERVER_CONFIG]
# IS_TTY = false
# USE_COLOR = false
# SHOW_PROGRESS = false
# IN_DOCKER = false
# IN_QEMU = false
# PUID = 501
# PGID = 20
# OUTPUT_DIR = "/opt/archivebox/data"
# CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf"
# ONLY_NEW = true
# TIMEOUT = 60
# MEDIA_TIMEOUT = 3600
# OUTPUT_PERMISSIONS = 644
# RESTRICT_FILE_NAMES = "windows"
# URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$"
# URL_ALLOWLIST = null
# ADMIN_USERNAME = null
# ADMIN_PASSWORD = null
# ENFORCE_ATOMIC_WRITES = true
# TAG_SEPARATOR_PATTERN = "[,]"
# SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# BIND_ADDR = "127.0.0.1:8000"
# ALLOWED_HOSTS = "*"
# DEBUG = false
# PUBLIC_INDEX = true
# PUBLIC_SNAPSHOTS = true
# PUBLIC_ADD_VIEW = false
# FOOTER_INFO = "Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
# SNAPSHOTS_PER_PAGE = 40
# CUSTOM_TEMPLATES_DIR = null
# TIME_ZONE = "UTC"
# TIMEZONE = "UTC"
# REVERSE_PROXY_USER_HEADER = "Remote-User"
# REVERSE_PROXY_WHITELIST = ""
# LOGOUT_REDIRECT_URL = "/"
# PREVIEW_ORIGINALS = true
# LDAP = false
# LDAP_SERVER_URI = null
# LDAP_BIND_DN = null
# LDAP_BIND_PASSWORD = null
# LDAP_USER_BASE = null
# LDAP_USER_FILTER = null
# LDAP_USERNAME_ATTR = null
# LDAP_FIRSTNAME_ATTR = null
# LDAP_LASTNAME_ATTR = null
# LDAP_EMAIL_ATTR = null
# LDAP_CREATE_SUPERUSER = false
# SAVE_TITLE = true
# SAVE_FAVICON = true
# SAVE_WGET = true
# SAVE_WGET_REQUISITES = true
# SAVE_SINGLEFILE = true
# SAVE_READABILITY = true
# SAVE_MERCURY = true
# SAVE_HTMLTOTEXT = true
# SAVE_PDF = true
# SAVE_SCREENSHOT = true
# SAVE_DOM = true
# SAVE_HEADERS = true
# SAVE_WARC = true
# SAVE_GIT = true
# SAVE_MEDIA = true
# SAVE_ARCHIVE_DOT_ORG = true
# RESOLUTION = [1440, 2000]
# GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht"
# CHECK_SSL_VALIDITY = true
# MEDIA_MAX_SIZE = "750m"
# USER_AGENT = null
# CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)"
# WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5"
# CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)"
# COOKIES_FILE = null
# CHROME_USER_DATA_DIR = null
# CHROME_TIMEOUT = false
# CHROME_HEADLESS = true
# CHROME_SANDBOX = true
# CHROME_EXTRA_ARGS = []
# YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"]
# YOUTUBEDL_EXTRA_ARGS = []
# WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"]
# WGET_EXTRA_ARGS = []
# CURL_ARGS = ["--silent", "--location", "--compressed"]
# CURL_EXTRA_ARGS = []
# GIT_ARGS = ["--recursive"]
# SINGLEFILE_ARGS = []
# SINGLEFILE_EXTRA_ARGS = []
# MERCURY_ARGS = ["--format=text"]
# MERCURY_EXTRA_ARGS = []
# FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}"
# USE_INDEXING_BACKEND = true
# USE_SEARCHING_BACKEND = true
# SEARCH_BACKEND_ENGINE = "ripgrep"
# SEARCH_BACKEND_HOST_NAME = "localhost"
# SEARCH_BACKEND_PORT = 1491
# SEARCH_BACKEND_PASSWORD = "SecretPassword"
# SEARCH_PROCESS_HTML = true
# SONIC_COLLECTION = "archivebox"
# SONIC_BUCKET = "snapshots"
# SEARCH_BACKEND_TIMEOUT = 90
# FTS_SEPARATE_DATABASE = true
# FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2"
# FTS_SQLITE_MAX_LENGTH = 1000000000
# USE_CURL = true
# USE_WGET = true
# USE_SINGLEFILE = true
# USE_READABILITY = true
# USE_MERCURY = true
# USE_GIT = true
# USE_CHROME = true
# USE_NODE = true
# USE_YOUTUBEDL = true
# USE_RIPGREP = true
# CURL_BINARY = "curl"
# GIT_BINARY = "git"
# WGET_BINARY = "wget"
# SINGLEFILE_BINARY = "single-file"
# READABILITY_BINARY = "readability-extractor"
# MERCURY_BINARY = "postlight-parser"
# YOUTUBEDL_BINARY = "yt-dlp"
# NODE_BINARY = "node"
# RIPGREP_BINARY = "rg"
# CHROME_BINARY = "chrome"
# POCKET_CONSUMER_KEY = null
# USER = "squash"
# PACKAGE_DIR = "/opt/archivebox/archivebox"
# TEMPLATES_DIR = "/opt/archivebox/archivebox/templates"
# ARCHIVE_DIR = "/opt/archivebox/data/archive"
# SOURCES_DIR = "/opt/archivebox/data/sources"
# LOGS_DIR = "/opt/archivebox/data/logs"
# PERSONAS_DIR = "/opt/archivebox/data/personas"
# URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)"
# URL_ALLOWLIST_PTN = null
# DIR_OUTPUT_PERMISSIONS = 755
# ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox"
# VERSION = "0.8.0"
# COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f"
# BUILD_TIME = "2024-05-15 03:28:05 1715768885"
# VERSIONS_AVAILABLE = null
# CAN_UPGRADE = false
# PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10"
# PYTHON_ENCODING = "UTF-8"
# PYTHON_VERSION = "3.10.14"
# DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py"
# DJANGO_VERSION = "5.0.6 final (0)"
# SQLITE_BINARY = "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py"
# SQLITE_VERSION = "2.6.0"
# CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)"
# WGET_VERSION = "GNU Wget 1.24.5"
# WGET_AUTO_COMPRESSION = true
# RIPGREP_VERSION = "ripgrep 14.1.0"
# SINGLEFILE_VERSION = null
# READABILITY_VERSION = null
# MERCURY_VERSION = null
# GIT_VERSION = "git version 2.44.0"
# YOUTUBEDL_VERSION = "2024.04.09"
# CHROME_VERSION = "Google Chrome 124.0.6367.207"
# NODE_VERSION = "v21.7.3"'''
# first_output = convert(test_input) # make sure ini -> toml parses correctly
# second_output = convert(first_output) # make sure toml -> toml parses/dumps consistently
# assert first_output == second_output == expected_output # make sure parsing is idempotent
# # DEBUGGING
# import sys
# import difflib
# sys.stdout.writelines(difflib.context_diff(first_output.splitlines(keepends=True), second_output.splitlines(keepends=True), fromfile='first', tofile='second'))  # context_diff expects sequences of lines, not raw strings
# print(repr(second_output))
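# The assertions above encode a round-trip property: converting INI -> TOML
# once, then re-converting that TOML output, must reach a fixed point. A
# self-contained sketch of the same check (generic names, not ArchiveBox's
# actual test harness):

from typing import Callable

def check_idempotent(convert: Callable[[str], str], text: str) -> str:
    """Assert that convert() is idempotent on its own output."""
    once = convert(text)    # e.g. INI -> TOML
    twice = convert(once)   # TOML -> TOML should change nothing
    assert once == twice, 'converter is not a fixed point on its own output'
    return once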

View file

@ -1,3 +1,5 @@
__package__ = 'archivebox.plugins_extractor.chrome'
import platform
from pathlib import Path
from typing import List, Optional, Dict, ClassVar
@ -77,40 +79,16 @@ def create_macos_app_symlink(target: Path, shortcut: Path):
###################### Config ##########################
class ChromeDependencyConfigs(BaseConfigSet):
class ChromeConfig(BaseConfigSet):
section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"
CHROME_BINARY: str = Field(default='chrome')
CHROME_ARGS: Optional[List[str]] = Field(default=None)
CHROME_EXTRA_ARGS: List[str] = []
CHROME_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
# def load(self) -> Self:
# # for each field in the model, load its value
# # load from each source in order of precedence (lowest to highest):
# # - schema default
# # - ArchiveBox.conf INI file
# # - environment variables
# # - command-line arguments
# LOADED_VALUES: Dict[str, Any] = {}
CHROME_BINARY: str = Field(default='chrome')
CHROME_ARGS: List[str] | None = Field(default=None)
CHROME_EXTRA_ARGS: List[str] = Field(default=[])
CHROME_DEFAULT_ARGS: List[str] = Field(default_factory=lambda: ['--timeout={TIMEOUT-10}'])
# for field_name, field in self.__fields__.items():
# def_value = field.default_factory() if field.default_factory else field.default
# ini_value = settings.INI_CONFIG.get_value(field_name)
# env_value = settings.ENV_CONFIG.get_value(field_name)
# cli_value = settings.CLI_CONFIG.get_value(field_name)
# run_value = settings.RUN_CONFIG.get_value(field_name)
# value = run_value or cli_value or env_value or ini_value or def_value
class ChromeConfigs(ChromeDependencyConfigs):
# section: ConfigSectionName = 'ALL_CONFIGS'
pass
DEFAULT_GLOBAL_CONFIG = {
}
CHROME_CONFIG = ChromeConfigs(**DEFAULT_GLOBAL_CONFIG)
CHROME_CONFIG = ChromeConfig()
class ChromeBinary(BaseBinary):
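# The commented-out load() above resolves precedence with `or`-chaining
# (run > cli > env > ini > schema default), which silently drops valid falsy
# values like False, 0, or ''. A generic sentinel-based alternative -- an
# illustrative helper, not ArchiveBox API:

_MISSING = object()

def resolve_config_value(field_name, default, *sources):
    """sources are mappings ordered from lowest to highest precedence."""
    value = default
    for source in sources:
        candidate = source.get(field_name, _MISSING)
        if candidate is not _MISSING:  # presence wins, even for falsy values
            value = candidate
    return value

# e.g. resolve_config_value('CHROME_HEADLESS', True, ini_cfg, env_cfg, cli_cfg, run_cfg)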
@ -133,6 +111,7 @@ class ChromeBinary(BaseBinary):
def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None:
if not (binary.abspath and binary.abspath.exists()):
return
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
@ -146,7 +125,6 @@ class ChromeBinary(BaseBinary):
CHROME_BINARY = ChromeBinary()
PLUGIN_BINARIES = [CHROME_BINARY]
class ChromePlugin(BasePlugin):
app_label: str = 'chrome'
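# Note on the Field defaults above: in plain pydantic a callable default must
# go through default_factory; Field(default=lambda: [...]) stores the lambda
# itself as the default value. Standalone sketch with a plain BaseModel
# (assumed semantics -- BaseConfigSet may differ):

from typing import List
from pydantic import BaseModel, Field

class ExampleConfig(BaseModel):
    # each instance gets a fresh list built by the factory
    default_args: List[str] = Field(default_factory=lambda: ['--timeout={TIMEOUT-10}'])

assert ExampleConfig().default_args == ['--timeout={TIMEOUT-10}']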

View file

@ -149,6 +149,7 @@ class CheckUserIsNotRoot(BaseCheck):
)
logger.debug('[√] UID is not root')
return errors
class CheckPipEnvironment(BaseCheck):
label: str = "CheckPipEnvironment"

View file

@ -14,7 +14,7 @@ from .utils import get_indexable_content, log_index_started
def import_backend():
for backend in settings.SEARCH_BACKENDS:
for backend in settings.SEARCH_BACKENDS.values():
if backend.name == settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE:
return backend
raise Exception(f'Could not load {settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE} as search backend')
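# This one-line change is the search-index backend loading fix: iterating a
# mapping yields its *keys*, so the old loop compared name strings (which have
# no .name attribute) and could never return a backend. Minimal illustration
# with a plain dict standing in for settings.SEARCH_BACKENDS:

class Backend:
    def __init__(self, name):
        self.name = name

SEARCH_BACKENDS = {'ripgrep': Backend('ripgrep'), 'sonic': Backend('sonic')}

for backend in SEARCH_BACKENDS:            # old behavior: backend is the key string
    assert isinstance(backend, str)        # backend.name here would raise AttributeError

for backend in SEARCH_BACKENDS.values():   # fixed: backend is the Backend object
    assert backend.name in SEARCH_BACKENDS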