diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 045650ff..167c13f0 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -51,6 +51,7 @@ from .pkgs import load_vendored_pkgs # noqa load_vendored_pkgs() # print('DONE LOADING VENDORED LIBRARIES') +# print('LOADING ABX PLUGIN SPECIFICATIONS') # Load ABX Plugin Specifications + Default Implementations import abx # noqa import abx_spec_archivebox # noqa @@ -74,7 +75,7 @@ abx.pm.register(abx_spec_searchbackend.PLUGIN_SPEC()) # Cast to ArchiveBoxPluginSpec to enable static type checking of pm.hook.call() methods abx.pm = cast(abx.ABXPluginManager[abx_spec_archivebox.ArchiveBoxPluginSpec], abx.pm) pm = abx.pm - +# print('DONE LOADING ABX PLUGIN SPECIFICATIONS') # Load all pip-installed ABX-compatible plugins ABX_ECOSYSTEM_PLUGINS = abx.get_pip_installed_plugins(group='abx') @@ -94,7 +95,9 @@ USER_PLUGINS = abx.find_plugins_in_dir(Path(os.getcwd()) / 'user_plugins') # Import all plugins and register them with ABX Plugin Manager ALL_PLUGINS = {**ABX_ECOSYSTEM_PLUGINS, **ARCHIVEBOX_BUILTIN_PLUGINS, **USER_PLUGINS} +# print('LOADING ALL PLUGINS') LOADED_PLUGINS = abx.load_plugins(ALL_PLUGINS) +# print('DONE LOADING ALL PLUGINS') # Setup basic config, constants, paths, and version from .config.constants import CONSTANTS # noqa diff --git a/archivebox/__main__.py b/archivebox/__main__.py index 1b6ea657..2d75ebef 100755 --- a/archivebox/__main__.py +++ b/archivebox/__main__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""This is the main entry point for the ArchiveBox CLI.""" +"""This is the entrypoint for python -m archivebox ...""" __package__ = 'archivebox' import archivebox # noqa # make sure monkey patches are applied before anything else @@ -15,5 +15,4 @@ ASCII_LOGO_MINI = r""" /_/ \_\_| \___|_| |_|_| \_/ \___|____/ \___/_/\_\ """ -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py index f5168929..15e8a984 100644 --- a/archivebox/api/v1_cli.py +++ b/archivebox/api/v1_cli.py @@ -6,13 +6,6 @@ from enum import Enum from ninja import Router, Schema -from archivebox.main import ( - add, - remove, - update, - list_all, - schedule, -) from archivebox.misc.util import ansi_to_html from archivebox.config.common import ARCHIVING_CONFIG @@ -60,13 +53,11 @@ class AddCommandSchema(Schema): urls: List[str] tag: str = "" depth: int = 0 - update: bool = not ARCHIVING_CONFIG.ONLY_NEW # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW - update_all: bool = False - index_only: bool = False - overwrite: bool = False - init: bool = False - extractors: str = "" parser: str = "auto" + extract: str = "" + update: bool = not ARCHIVING_CONFIG.ONLY_NEW # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW + overwrite: bool = False + index_only: bool = False class UpdateCommandSchema(Schema): resume: Optional[float] = 0 @@ -93,7 +84,7 @@ class ScheduleCommandSchema(Schema): class ListCommandSchema(Schema): filter_patterns: Optional[List[str]] = ['https://example.com'] filter_type: str = FilterTypeChoices.substring - status: Optional[StatusChoices] = StatusChoices.indexed + status: StatusChoices = StatusChoices.indexed after: Optional[float] = 0 before: Optional[float] = 999999999999999 sort: str = 'bookmarked_at' @@ -115,16 +106,16 @@ class RemoveCommandSchema(Schema): @router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]') def cli_add(request, args: AddCommandSchema): + from 
archivebox.cli.archivebox_add import add + result = add( urls=args.urls, tag=args.tag, depth=args.depth, update=args.update, - update_all=args.update_all, index_only=args.index_only, overwrite=args.overwrite, - init=args.init, - extractors=args.extractors, + extract=args.extract, parser=args.parser, ) @@ -139,6 +130,8 @@ def cli_add(request, args: AddCommandSchema): @router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]') def cli_update(request, args: UpdateCommandSchema): + from archivebox.cli.archivebox_update import update + result = update( resume=args.resume, only_new=args.only_new, @@ -162,6 +155,8 @@ def cli_update(request, args: UpdateCommandSchema): @router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]') def cli_schedule(request, args: ScheduleCommandSchema): + from archivebox.cli.archivebox_schedule import schedule + result = schedule( import_path=args.import_path, add=args.add, @@ -184,9 +179,11 @@ def cli_schedule(request, args: ScheduleCommandSchema): -@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns] (use this endpoint with ?filter_type=search to search for snapshots)') -def cli_list(request, args: ListCommandSchema): - result = list_all( +@router.post("/search", response=CLICommandResponseSchema, summary='archivebox search [args] [filter_patterns]') +def cli_search(request, args: ListCommandSchema): + from archivebox.cli.archivebox_search import search + + result = search( filter_patterns=args.filter_patterns, filter_type=args.filter_type, status=args.status, @@ -221,6 +218,8 @@ def cli_list(request, args: ListCommandSchema): @router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]') def cli_remove(request, args: RemoveCommandSchema): + from archivebox.cli.archivebox_remove import remove + result = remove( yes=True, # no way to interactively ask for confirmation via API, so we force yes delete=args.delete, diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 8649063f..c3cdc742 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -1,264 +1,117 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox' - import os import sys -import argparse -import threading - -from time import sleep -from collections.abc import Mapping - -from rich import print - -from typing import Optional, List, IO, Union, Iterable -from pathlib import Path - from importlib import import_module -BUILTIN_LIST = list +import rich_click as click +from rich import print + +from archivebox.config.version import VERSION -CLI_DIR = Path(__file__).resolve().parent -# rewrite setup -> install for backwards compatibility -if len(sys.argv) > 1 and sys.argv[1] == 'setup': - from rich import print - print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead') - sys.argv[1] = 'install' if '--debug' in sys.argv: os.environ['DEBUG'] = 'True' sys.argv.remove('--debug') -# def list_subcommands() -> Dict[str, str]: -# """find and import all valid archivebox_.py files in CLI_DIR""" -# COMMANDS = [] -# for filename in os.listdir(CLI_DIR): -# if is_cli_module(filename): -# subcommand = filename.replace('archivebox_', '').replace('.py', '') -# module = import_module('.archivebox_{}'.format(subcommand), __package__) -# assert is_valid_cli_module(module, subcommand) -# COMMANDS.append((subcommand, 
module.main.__doc__)) -# globals()[subcommand] = module.main -# display_order = lambda cmd: ( -# display_first.index(cmd[0]) -# if cmd[0] in display_first else -# 100 + len(cmd[0]) -# ) -# return dict(sorted(COMMANDS, key=display_order)) - -# just define it statically, it's much faster: -SUBCOMMAND_MODULES = { - 'help': 'archivebox_help', - 'version': 'archivebox_version' , +class ArchiveBoxGroup(click.Group): + """lazy loading click group for archivebox commands""" + meta_commands = { + 'help': 'archivebox.cli.archivebox_help.main', + 'version': 'archivebox.cli.archivebox_version.main', + } + setup_commands = { + 'init': 'archivebox.cli.archivebox_init.main', + 'install': 'archivebox.cli.archivebox_install.main', + } + archive_commands = { + 'add': 'archivebox.cli.archivebox_add.main', + 'remove': 'archivebox.cli.archivebox_remove.main', + 'update': 'archivebox.cli.archivebox_update.main', + 'search': 'archivebox.cli.archivebox_search.main', + 'status': 'archivebox.cli.archivebox_status.main', + 'config': 'archivebox.cli.archivebox_config.main', + 'schedule': 'archivebox.cli.archivebox_schedule.main', + 'server': 'archivebox.cli.archivebox_server.main', + 'shell': 'archivebox.cli.archivebox_shell.main', + 'manage': 'archivebox.cli.archivebox_manage.main', + } + all_subcommands = { + **meta_commands, + **setup_commands, + **archive_commands, + } + renamed_commands = { + 'setup': 'install', + 'list': 'search', + 'import': 'add', + 'archive': 'add', + 'export': 'search', + } - 'init': 'archivebox_init', - 'install': 'archivebox_install', - ############################################## - 'config': 'archivebox_config', - 'add': 'archivebox_add', - 'remove': 'archivebox_remove', - 'update': 'archivebox_update', - 'list': 'archivebox_list', - 'status': 'archivebox_status', + + def get_command(self, ctx, cmd_name): + # handle renamed commands + if cmd_name in self.renamed_commands: + new_name = self.renamed_commands[cmd_name] + print(f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`') + cmd_name = new_name + ctx.invoked_subcommand = cmd_name + + # handle lazy loading of commands + if cmd_name in self.all_subcommands: + return self._lazy_load(cmd_name) + + # fall-back to using click's default command lookup + return super().get_command(ctx, cmd_name) + + @classmethod + def _lazy_load(cls, cmd_name): + import_path = cls.all_subcommands[cmd_name] + modname, funcname = import_path.rsplit('.', 1) + + # print(f'LAZY LOADING {import_path}') + mod = import_module(modname) + func = getattr(mod, funcname) + + if not hasattr(func, '__doc__'): + raise ValueError(f'lazy loading of {import_path} failed - no docstring found on method') + + # if not isinstance(cmd, click.BaseCommand): + # raise ValueError(f'lazy loading of {import_path} failed - not a click command') + + return func + + +@click.group(cls=ArchiveBoxGroup, invoke_without_command=True) +@click.option('--help', '-h', is_flag=True, help='Show help') +@click.version_option(version=VERSION, package_name='archivebox', message='%(version)s') +@click.pass_context +def cli(ctx, help=False): + """ArchiveBox: The self-hosted internet archive""" - 'schedule': 'archivebox_schedule', - 'server': 'archivebox_server', - 'shell': 'archivebox_shell', - 'manage': 'archivebox_manage', - - # 'oneshot': 'archivebox_oneshot', -} - -# every imported command module must have these properties in order to be valid -required_attrs = ('__package__', '__command__', 'main') - -# basic checks to make sure imported files are valid 
subcommands -is_cli_module = lambda fname: fname.startswith('archivebox_') and fname.endswith('.py') -is_valid_cli_module = lambda module, subcommand: ( - all(hasattr(module, attr) for attr in required_attrs) - and module.__command__.split(' ')[-1] == subcommand -) - -class LazySubcommands(Mapping): - def keys(self): - return SUBCOMMAND_MODULES.keys() + if help or ctx.invoked_subcommand is None: + ctx.invoke(ctx.command.get_command(ctx, 'help')) - def values(self): - return [self[key] for key in self.keys()] - - def items(self): - return [(key, self[key]) for key in self.keys()] - - def __getitem__(self, key): - module = import_module(f'.{SUBCOMMAND_MODULES[key]}', __package__) - assert is_valid_cli_module(module, key) - return module.main - - def __iter__(self): - return iter(SUBCOMMAND_MODULES.keys()) - - def __len__(self): - return len(SUBCOMMAND_MODULES) + if ctx.invoked_subcommand in ArchiveBoxGroup.archive_commands: + # print('SETUP DJANGO AND CHECK DATA FOLDER') + from archivebox.config.django import setup_django + from archivebox.misc.checks import check_data_folder + setup_django() + check_data_folder() -CLI_SUBCOMMANDS = LazySubcommands() - - -# these common commands will appear sorted before any others for ease-of-use -meta_cmds = ('help', 'version') # dont require valid data folder at all -setup_cmds = ('init', 'setup', 'install') # require valid data folder, but dont require DB present in it yet -archive_cmds = ('add', 'remove', 'update', 'list', 'status', 'schedule', 'server', 'shell', 'manage') # require valid data folder + existing db present -fake_db = ("oneshot",) # use fake in-memory db - -display_first = (*meta_cmds, *setup_cmds, *archive_cmds) - - -IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler') # threads we dont have to wait for before exiting - - -def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int: - """ - Block until the specified threads exit. e.g. pass thread_names=('default_hook_handler',) to wait for webhooks. - Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes. - """ - - wait_for_all: bool = thread_names == () - - thread_matches = lambda thread, ptns: any(ptn in repr(thread) for ptn in ptns) - - should_wait = lambda thread: ( - not thread_matches(thread, ignore_names) - and (wait_for_all or thread_matches(thread, thread_names))) - - for tries in range(timeout): - all_threads = [*threading.enumerate()] - blocking_threads = [*filter(should_wait, all_threads)] - threads_summary = ', '.join(repr(t) for t in blocking_threads) - if blocking_threads: - sleep(1) - if tries == 5: # only show stderr message if we need to wait more than 5s - print( - f'[…] Waiting up to {timeout}s for background jobs (e.g. 
webhooks) to finish...', - threads_summary, - file=sys.stderr, - ) - else: - return tries - - raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}') - - - -def run_subcommand(subcommand: str, - subcommand_args: List[str] | None = None, - stdin: Optional[IO]=None, - pwd: Union[Path, str, None]=None) -> None: - """Run a given ArchiveBox subcommand with the given list of args""" - - subcommand_args = subcommand_args or [] - - from archivebox.misc.checks import check_migrations - from archivebox.config.django import setup_django - - # print('DATA_DIR is', DATA_DIR) - # print('pwd is', os.getcwd()) - - cmd_requires_db = (subcommand in archive_cmds) - init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args - - check_db = cmd_requires_db and not init_pending - - setup_django(in_memory_db=subcommand in fake_db, check_db=check_db) - - for ignore_pattern in ('help', '-h', '--help', 'version', '--version'): - if ignore_pattern in sys.argv[:4]: - cmd_requires_db = False - break - - if subcommand in archive_cmds: - if cmd_requires_db: - check_migrations() - - module = import_module('.archivebox_{}'.format(subcommand), __package__) - module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore - - # wait for webhooks, signals, and other background jobs to finish before exit - wait_for_bg_threads_to_exit(timeout=60) - - - - - -class NotProvided: - def __len__(self): - return 0 - def __bool__(self): - return False - def __repr__(self): - return '' - -Omitted = Union[None, NotProvided] - -OMITTED = NotProvided() - - -def main(args: List[str] | Omitted=OMITTED, stdin: IO | Omitted=OMITTED, pwd: str | None=None) -> None: - # print('STARTING CLI MAIN ENTRYPOINT') - - args = sys.argv[1:] if args is OMITTED else args - stdin = sys.stdin if stdin is OMITTED else stdin - - parser = argparse.ArgumentParser( - prog=__command__, - description='ArchiveBox: The self-hosted internet archive', - add_help=False, - ) - group = parser.add_mutually_exclusive_group() - group.add_argument( - '--help', '-h', - action='store_true', - help=CLI_SUBCOMMANDS['help'].__doc__, - ) - group.add_argument( - '--version', - action='store_true', - help=CLI_SUBCOMMANDS['version'].__doc__, - ) - group.add_argument( - "subcommand", - type=str, - help= "The name of the subcommand to run", - nargs='?', - choices=CLI_SUBCOMMANDS.keys(), - default=None, - ) - parser.add_argument( - "subcommand_args", - help="Arguments for the subcommand", - nargs=argparse.REMAINDER, - ) - command = parser.parse_args(args or ()) - - if command.version: - command.subcommand = 'version' - elif command.help or command.subcommand is None: - command.subcommand = 'help' - - if command.subcommand not in ('version',): - from archivebox.misc.logging_util import log_cli_command - - log_cli_command( - subcommand=command.subcommand, - subcommand_args=command.subcommand_args, - stdin=stdin or None, - ) +def main(args=None, prog_name=None): + # show `docker run archivebox xyz` in help messages if running in docker + IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') + prog_name = prog_name or ('docker compose run archivebox' if IN_DOCKER else 'archivebox') try: - run_subcommand( - subcommand=command.subcommand, - subcommand_args=command.subcommand_args, - stdin=stdin or None, - ) + cli(args=args, prog_name=prog_name) except KeyboardInterrupt: print('\n\n[red][X] Got CTRL+C. 
Exiting...[/red]') + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 095539d3..1457925c 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -4,10 +4,10 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox add' import sys -import argparse -from typing import IO, TYPE_CHECKING +from typing import TYPE_CHECKING +import rich_click as click from django.utils import timezone from django.db.models import QuerySet @@ -18,7 +18,6 @@ from archivebox.config.common import ARCHIVING_CONFIG from archivebox.config.django import setup_django from archivebox.config.permissions import USER, HOSTNAME from archivebox.misc.checks import check_data_folder -from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr from archivebox.parsers import PARSERS @@ -29,22 +28,142 @@ if TYPE_CHECKING: ORCHESTRATOR = None +# OLD VERSION: +# def add(urls: Union[str, List[str]], +# tag: str='', +# depth: int=0, +# update: bool=not ARCHIVING_CONFIG.ONLY_NEW, +# update_all: bool=False, +# index_only: bool=False, +# overwrite: bool=False, +# # duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically +# init: bool=False, +# extractors: str="", +# parser: str="auto", +# created_by_id: int | None=None, +# out_dir: Path=DATA_DIR) -> List[Link]: +# """Add a new URL or list of URLs to your archive""" + +# from core.models import Snapshot, Tag +# # from workers.supervisord_util import start_cli_workers, tail_worker_logs +# # from workers.tasks import bg_archive_link + + +# assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' + +# extractors = extractors.split(",") if extractors else [] + +# if init: +# run_subcommand('init', stdin=None, pwd=out_dir) + +# # Load list of links from the existing index +# check_data_folder() + +# # worker = start_cli_workers() + +# new_links: List[Link] = [] +# all_links = load_main_index(out_dir=out_dir) + +# log_importing_started(urls=urls, depth=depth, index_only=index_only) +# if isinstance(urls, str): +# # save verbatim stdin to sources +# write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir) +# elif isinstance(urls, list): +# # save verbatim args to sources +# write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir) + + +# new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser) + +# # If we're going one level deeper, download each link and look for more links +# new_links_depth = [] +# if new_links and depth == 1: +# log_crawl_started(new_links) +# for new_link in new_links: +# try: +# downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir) +# new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url) +# except Exception as err: +# stderr('[!] 
Failed to get contents of URL {new_link.url}', err, color='red') + +# imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) + +# new_links = dedupe_links(all_links, imported_links) + +# write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id) +# all_links = load_main_index(out_dir=out_dir) + +# tags = [ +# Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0] +# for name in tag.split(',') +# if name.strip() +# ] +# if tags: +# for link in imported_links: +# snapshot = Snapshot.objects.get(url=link.url) +# snapshot.tags.add(*tags) +# snapshot.tags_str(nocache=True) +# snapshot.save() +# # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}') + +# if index_only: +# # mock archive all the links using the fake index_only extractor method in order to update their state +# if overwrite: +# archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id) +# else: +# archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id) +# else: +# # fully run the archive extractor methods for each link +# archive_kwargs = { +# "out_dir": out_dir, +# "created_by_id": created_by_id, +# } +# if extractors: +# archive_kwargs["methods"] = extractors + +# stderr() + +# ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S') + +# if update: +# stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green') +# archive_links(imported_links, overwrite=overwrite, **archive_kwargs) +# elif update_all: +# stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green') +# archive_links(all_links, overwrite=overwrite, **archive_kwargs) +# elif overwrite: +# stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green') +# archive_links(imported_links, overwrite=True, **archive_kwargs) +# elif new_links: +# stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green') +# archive_links(new_links, overwrite=False, **archive_kwargs) + +# # tail_worker_logs(worker['stdout_logfile']) + +# # if CAN_UPGRADE: +# # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n") + +# return new_links + + + def add(urls: str | list[str], - tag: str='', depth: int=0, - update: bool=not ARCHIVING_CONFIG.ONLY_NEW, - update_all: bool=False, - index_only: bool=False, - overwrite: bool=False, - extractors: str="", + tag: str='', parser: str="auto", + extract: str="", persona: str='Default', + overwrite: bool=False, + update: bool=not ARCHIVING_CONFIG.ONLY_NEW, + index_only: bool=False, bg: bool=False, created_by_id: int | None=None) -> QuerySet['Snapshot']: """Add a new URL or list of URLs to your archive""" global ORCHESTRATOR + depth = int(depth) + assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' # 0. 
setup abx, django, check_data_folder @@ -56,7 +175,6 @@ def add(urls: str | list[str], from archivebox.base_models.models import get_or_create_system_user_pk - created_by_id = created_by_id or get_or_create_system_user_pk() # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt @@ -72,7 +190,7 @@ def add(urls: str | list[str], 'ONLY_NEW': not update, 'INDEX_ONLY': index_only, 'OVERWRITE': overwrite, - 'EXTRACTORS': extractors, + 'EXTRACTORS': extract, 'DEFAULT_PERSONA': persona or 'Default', }) # 3. create a new Crawl pointing to the Seed @@ -91,118 +209,23 @@ def add(urls: str | list[str], return crawl.snapshot_set.all() -def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=None) -> None: +@click.command() +@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away') +@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3') +@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs') +@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...') +@click.option('--persona', default='Default', help='Authentication profile to use when archiving') +@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously') +@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them') +@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now') +# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones') +@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately') +@click.argument('urls', nargs=-1, type=click.Path()) +def main(**kwargs): """Add a new URL or list of URLs to your archive""" - parser = argparse.ArgumentParser( - prog=__command__, - description=add.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--tag', '-t', - type=str, - default='', - help="Tag the added URLs with the provided tags e.g. 
--tag=tag1,tag2,tag3", - ) - parser.add_argument( - '--update', #'-u', - action='store_true', - default=not ARCHIVING_CONFIG.ONLY_NEW, # when ONLY_NEW=True we skip updating old links - help="Also retry previously skipped/failed links when adding new links", - ) - parser.add_argument( - '--update-all', #'-n', - action='store_true', - default=False, - help="Also update ALL links in index when finished adding new links", - ) - parser.add_argument( - '--index-only', #'-o', - action='store_true', - help="Add the links to the main index without archiving them", - ) - parser.add_argument( - 'urls', - nargs='*', - type=str, - default=None, - help=( - 'URLs or paths to archive e.g.:\n' - ' https://getpocket.com/users/USERNAME/feed/all\n' - ' https://example.com/some/rss/feed.xml\n' - ' https://example.com\n' - ' ~/Downloads/firefox_bookmarks_export.html\n' - ' ~/Desktop/sites_list.csv\n' - ) - ) - parser.add_argument( - "--depth", - action="store", - default=0, - choices=[0, 1], - type=int, - help="Recursively archive all linked pages up to this many hops away" - ) - parser.add_argument( - "--overwrite", - default=False, - action="store_true", - help="Re-archive URLs from scratch, overwriting any existing files" - ) - parser.add_argument( - "--extract", '-e', - type=str, - help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \ - This does not take precedence over the configuration", - default="" - ) - parser.add_argument( - "--parser", - type=str, - help="Parser used to read inputted URLs.", - default="auto", - choices=["auto", *PARSERS.keys()], - ) - parser.add_argument( - "--persona", - type=str, - help="Name of accounts persona to use when archiving.", - default="Default", - ) - parser.add_argument( - "--bg", - default=False, - action="store_true", - help="Enqueue a background worker to complete the crawl instead of running it immediately", - ) - command = parser.parse_args(args or ()) - urls = command.urls - - stdin_urls = '' - if not urls: - stdin_urls = accept_stdin(stdin) - - if (stdin_urls and urls) or (not stdin and not urls): - stderr( - '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n', - color='red', - ) - raise SystemExit(2) - add( - urls=stdin_urls or urls, - depth=command.depth, - tag=command.tag, - update=command.update, - update_all=command.update_all, - index_only=command.index_only, - overwrite=command.overwrite, - extractors=command.extract, - parser=command.parser, - persona=command.persona, - bg=command.bg, - ) + + add(**kwargs) if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) + main() diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py index 5983dd43..2eb2676f 100644 --- a/archivebox/cli/archivebox_config.py +++ b/archivebox/cli/archivebox_config.py @@ -12,7 +12,130 @@ from typing import Optional, List, IO from archivebox.misc.util import docstring from archivebox.config import DATA_DIR from archivebox.misc.logging_util import SmartFormatter, accept_stdin -from ..main import config + + + +# @enforce_types +def config(config_options_str: Optional[str]=None, + config_options: Optional[List[str]]=None, + get: bool=False, + set: bool=False, + search: bool=False, + reset: bool=False, + out_dir: Path=DATA_DIR) -> None: + """Get and set your ArchiveBox project configuration values""" + + from rich import print + + check_data_folder() + if config_options and config_options_str: + stderr( + '[X] You should either pass config values as an arguments ' + 'or via 
stdin, but not both.\n', + color='red', + ) + raise SystemExit(2) + elif config_options_str: + config_options = config_options_str.split('\n') + + FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG() + CONFIGS = archivebox.pm.hook.get_CONFIGS() + + config_options = config_options or [] + + no_args = not (get or set or reset or config_options) + + matching_config = {} + if search: + if config_options: + config_options = [get_real_name(key) for key in config_options] + matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG} + for config_section in CONFIGS.values(): + aliases = config_section.aliases + + for search_key in config_options: + # search all aliases in the section + for alias_key, key in aliases.items(): + if search_key.lower() in alias_key.lower(): + matching_config[key] = config_section.model_dump()[key] + + # search all keys and values in the section + for existing_key, value in config_section.model_dump().items(): + if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower(): + matching_config[existing_key] = value + + print(printable_config(matching_config)) + raise SystemExit(not matching_config) + elif get or no_args: + if config_options: + config_options = [get_real_name(key) for key in config_options] + matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG} + failed_config = [key for key in config_options if key not in FLAT_CONFIG] + if failed_config: + stderr() + stderr('[X] These options failed to get', color='red') + stderr(' {}'.format('\n '.join(config_options))) + raise SystemExit(1) + else: + matching_config = FLAT_CONFIG + + print(printable_config(matching_config)) + raise SystemExit(not matching_config) + elif set: + new_config = {} + failed_options = [] + for line in config_options: + if line.startswith('#') or not line.strip(): + continue + if '=' not in line: + stderr('[X] Config KEY=VALUE must have an = sign in it', color='red') + stderr(f' {line}') + raise SystemExit(2) + + raw_key, val = line.split('=', 1) + raw_key = raw_key.upper().strip() + key = get_real_name(raw_key) + if key != raw_key: + stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow') + + if key in FLAT_CONFIG: + new_config[key] = val.strip() + else: + failed_options.append(line) + + if new_config: + before = FLAT_CONFIG + matching_config = write_config_file(new_config) + after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()} + print(printable_config(matching_config)) + + side_effect_changes = {} + for key, val in after.items(): + if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config): + side_effect_changes[key] = after[key] + # import ipdb; ipdb.set_trace() + + if side_effect_changes: + stderr() + stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow') + print(' {}'.format(printable_config(side_effect_changes, prefix=' '))) + if failed_options: + stderr() + stderr('[X] These options failed to set (check for typos):', color='red') + stderr(' {}'.format('\n '.join(failed_options))) + raise SystemExit(1) + elif reset: + stderr('[X] This command is not implemented yet.', color='red') + stderr(' Please manually remove the relevant lines from your config file:') + raise SystemExit(2) + else: + stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red') + stderr(' 
archivebox config') + stderr(' archivebox config --get SOME_KEY') + stderr(' archivebox config --set SOME_KEY=SOME_VALUE') + raise SystemExit(2) + + @docstring(config.__doc__) diff --git a/archivebox/cli/archivebox_help.py b/archivebox/cli/archivebox_help.py index c12319dd..4b6d68a2 100755 --- a/archivebox/cli/archivebox_help.py +++ b/archivebox/cli/archivebox_help.py @@ -1,32 +1,105 @@ #!/usr/bin/env python3 - __package__ = 'archivebox.cli' __command__ = 'archivebox help' -import sys -import argparse +import os from pathlib import Path -from typing import Optional, List, IO -from archivebox.misc.util import docstring -from archivebox.misc.logging_util import SmartFormatter, reject_stdin -from archivebox.config import DATA_DIR -from ..main import help +import click +from rich import print +from rich.panel import Panel -@docstring(help.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=help.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.parse_args(args or ()) - reject_stdin(__command__, stdin) +def help() -> None: + """Print the ArchiveBox help message and usage""" + + from archivebox.cli import ArchiveBoxGroup + from archivebox.config import CONSTANTS + from archivebox.config.permissions import IN_DOCKER + from archivebox.misc.logging_util import log_cli_command - help(out_dir=Path(pwd) if pwd else DATA_DIR) + log_cli_command('help', [], None, '.') + + COMMANDS_HELP_TEXT = '\n '.join( + f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}' + for cmd in ArchiveBoxGroup.meta_commands.keys() + ) + '\n\n ' + '\n '.join( + f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}' + for cmd in ArchiveBoxGroup.setup_commands.keys() + ) + '\n\n ' + '\n '.join( + f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}' + for cmd in ArchiveBoxGroup.archive_commands.keys() + ) + + DOCKER_USAGE = ''' +[dodger_blue3]Docker Usage:[/dodger_blue3] + [grey53]# using Docker Compose:[/grey53] + [blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] + [grey53]# using Docker:[/grey53] + [blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] +''' if IN_DOCKER else '' + DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else '' + DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else '' + DOCKER_CMD_PREFIX = "[blue]docker ... 
[/blue]" if IN_DOCKER else '' + + print(f'''{DOCKER_USAGE} +[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT} + [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] + +[deep_sky_blue4]Commands:[/deep_sky_blue4] + {COMMANDS_HELP_TEXT} + +[deep_sky_blue4]Documentation:[/deep_sky_blue4] + [link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS} + [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link] + [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link] +''') + + + if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir(): + pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path('~').expanduser()), '~') + EXAMPLE_USAGE = f''' +[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow] + +[violet]Hint:[/violet] [i]Common maintenance tasks:[/i] + [dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# make sure database is up-to-date (safe to run multiple times)[/grey53] + [dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# make sure plugins are up-to-date (wget, chrome, singlefile, etc.)[/grey53] + [dark_green]archivebox[/dark_green] [green]status[/green] [grey53]# get a health checkup report on your collection[/grey53] + [dark_green]archivebox[/dark_green] [green]update[/green] [grey53]# retry any previously failed or interrupted archiving tasks[/grey53] + +[violet]Hint:[/violet] [i]More example usage:[/i] + [dark_green]archivebox[/dark_green] [green]add[/green] --depth=1 "https://example.com/some/page" + [dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title + [dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss" + [dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53] +''' + print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.')) + else: + DATA_SETUP_HELP = '\n' + if IN_DOCKER: + DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n' + DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n' + DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n' + DATA_SETUP_HELP += ' 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n' + DATA_SETUP_HELP += f' 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n' + DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n' + DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... 
[/grey53]\n\n' + DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n' + DATA_SETUP_HELP += ' 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n' + DATA_SETUP_HELP += ' 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n' + DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n' + DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n' + DATA_SETUP_HELP += f' 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n' + print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]')) + + + +@click.command() +@click.option('--help', '-h', is_flag=True, help='Show help') +def main(**kwargs): + """Print the ArchiveBox help message and usage""" + return help() if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) + main() diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index 42c33a37..933495e3 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -5,13 +5,193 @@ __command__ = 'archivebox init' import sys import argparse - +from pathlib import Path from typing import Optional, List, IO + from archivebox.misc.util import docstring from archivebox.config import DATA_DIR from archivebox.misc.logging_util import SmartFormatter, reject_stdin -from ..main import init + + +def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Path=DATA_DIR) -> None: + """Initialize a new ArchiveBox collection in the current directory""" + + from core.models import Snapshot + from rich import print + + # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK): + # print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr) + # print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr) + + is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_DATA_DIR) + existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE) + if is_empty and not existing_index: + print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]') + print('[green]----------------------------------------------------------------------[/green]') + elif existing_index: + # TODO: properly detect and print the existing version in current index as well + print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]') + print('[green]----------------------------------------------------------------------[/green]') + else: + if force: + print('[red][!] 
This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
+            print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
+        else:
+            print(
+                ("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
+                " You must run init in a completely empty directory, or an existing data folder.\n\n"
+                " [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n"
+                " then run 'archivebox init' to pick up where you left off.\n\n"
+                " (Always make sure your data folder is backed up first before updating ArchiveBox)"
+                )
+            )
+            raise SystemExit(2)
+
+    if existing_index:
+        print('\n[green][*] Verifying archive folder structure...[/green]')
+    else:
+        print('\n[green][+] Building archive folder structure...[/green]')
+
+    print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
+    Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
+
+    print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
+
+    # create the .archivebox_id file with a unique ID for this collection
+    from archivebox.config.paths import _get_collection_id
+    _get_collection_id(CONSTANTS.DATA_DIR, force_create=True)
+
+    # create the ArchiveBox.conf file
+    write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})
+
+
+    if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
+        print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
+    else:
+        print('\n[green][+] Building main SQL index and running initial migrations...[/green]')
+
+    for migration_line in apply_migrations(out_dir):
+        sys.stdout.write(f' {migration_line}\n')
+
+    assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
+    print()
+    print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
+
+    # from django.contrib.auth.models import User
+    # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
+    #     print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
+    #     call_command("createsuperuser", interactive=True)
+
+    print()
+    print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
+
+    all_links = Snapshot.objects.none()
+    pending_links: Dict[str, Link] = {}
+
+    if existing_index:
+        all_links = load_main_index(out_dir=out_dir, warn=False)
+        print(f' √ Loaded {all_links.count()} links from existing main index.')
+
+    if quick:
+        print(' > Skipping full snapshot directory check (quick mode)')
+    else:
+        try:
+            # Links in data folders that don't match their timestamp
+            fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+            if fixed:
+                print(f' [yellow]√ Fixed {len(fixed)} data directory locations that didn\'t match their link timestamps.[/yellow]')
+            if cant_fix:
+                print(f' [red]! 
Could not fix {len(cant_fix)} data directory locations due to conflicts with existing folders.[/red]') + + # Links in JSON index but not in main index + orphaned_json_links = { + link.url: link + for link in parse_json_main_index(out_dir) + if not all_links.filter(url=link.url).exists() + } + if orphaned_json_links: + pending_links.update(orphaned_json_links) + print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]') + + # Links in data dir indexes but not in main index + orphaned_data_dir_links = { + link.url: link + for link in parse_json_links_details(out_dir) + if not all_links.filter(url=link.url).exists() + } + if orphaned_data_dir_links: + pending_links.update(orphaned_data_dir_links) + print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]') + + # Links in invalid/duplicate data dirs + invalid_folders = { + folder: link + for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items() + } + if invalid_folders: + print(f' [red]! Skipped adding {len(invalid_folders)} invalid link data directories.[/red]') + print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(DATA_DIR)} {link}' for folder, link in invalid_folders.items())) + print() + print(' [violet]Hint:[/violet] For more information about the link data directories that were skipped, run:') + print(' archivebox status') + print(' archivebox list --status=invalid') + + except (KeyboardInterrupt, SystemExit): + print(file=sys.stderr) + print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr) + print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr) + print(file=sys.stderr) + print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr) + print(' archivebox init --quick', file=sys.stderr) + raise SystemExit(1) + + write_main_index(list(pending_links.values()), out_dir=out_dir) + + print('\n[green]----------------------------------------------------------------------[/green]') + + from django.contrib.auth.models import User + + if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists(): + print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]') + User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD) + + if existing_index: + print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]') + else: + print(f'[green][√] Done. 
A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]') + + json_index = out_dir / CONSTANTS.JSON_INDEX_FILENAME + html_index = out_dir / CONSTANTS.HTML_INDEX_FILENAME + index_name = f"{date.today()}_index_old" + if os.access(json_index, os.F_OK): + json_index.rename(f"{index_name}.json") + if os.access(html_index, os.F_OK): + html_index.rename(f"{index_name}.html") + + CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True) + CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True) + CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True) + + from archivebox.config.common import STORAGE_CONFIG + STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True) + STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True) + + if install: + run_subcommand('install', pwd=out_dir) + + if Snapshot.objects.count() < 25: # hide the hints for experienced users + print() + print(' [violet]Hint:[/violet] To view your archive index, run:') + print(' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]') + print() + print(' To add new links, you can run:') + print(" archivebox add < ~/some/path/to/list_of_links.txt") + print() + print(' For more usage and examples, run:') + print(' archivebox help') @docstring(init.__doc__) diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py index faf02fa3..e975171e 100755 --- a/archivebox/cli/archivebox_install.py +++ b/archivebox/cli/archivebox_install.py @@ -3,6 +3,7 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox install' +import os import sys import argparse from pathlib import Path @@ -11,11 +12,145 @@ from typing import Optional, List, IO from archivebox.misc.util import docstring from archivebox.config import DATA_DIR from archivebox.misc.logging_util import SmartFormatter, reject_stdin -from ..main import install + + +def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, binaries: Optional[List[str]]=None, dry_run: bool=False) -> None: + """Automatically install all ArchiveBox dependencies and extras""" + + # if running as root: + # - run init to create index + lib dir + # - chown -R 911 DATA_DIR + # - install all binaries as root + # - chown -R 911 LIB_DIR + # else: + # - run init to create index + lib dir as current user + # - install all binaries as current user + # - recommend user re-run with sudo if any deps need to be installed as root + + from rich import print + + from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP + from archivebox.config.paths import get_or_create_working_lib_dir + + if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()): + run_subcommand('init', stdin=None, pwd=out_dir) # must init full index because we need a db to store InstalledBinary entries in + + print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]') + + # we never want the data dir to be owned by root, detect owner of existing owner of DATA_DIR to try and guess desired non-root UID + if IS_ROOT: + EUID = os.geteuid() + + # if we have sudo/root permissions, take advantage of them just while installing dependencies + print() + print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue] with [red]sudo[/red] only for dependencies that need it.[/yellow]') + print(f' DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].') + print() + + LIB_DIR = get_or_create_working_lib_dir() + + 
package_manager_names = ', '.join(
+        f'[yellow]{binprovider.name}[/yellow]'
+        for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
+        if not binproviders or (binproviders and binprovider.name in binproviders)
+    )
+    print(f'[+] Setting up package managers {package_manager_names}...')
+    for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
+        if binproviders and binprovider.name not in binproviders:
+            continue
+        try:
+            binprovider.setup()
+        except Exception:
+            # it's ok, installing binaries below will automatically set up package managers as needed
+            # e.g. if user does not have npm available we cannot set it up here yet, but once npm Binary is installed
+            # the next package that depends on npm will automatically call binprovider.setup() during its own install
+            pass
+
+    print()
+
+    for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
+        if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
+            # obviously must already be installed if we are running
+            continue
+
+        if binaries and binary.name not in binaries:
+            continue
+
+        providers = ' [grey53]or[/grey53] '.join(
+            provider.name for provider in binary.binproviders_supported
+            if not binproviders or (binproviders and provider.name in binproviders)
+        )
+        if not providers:
+            continue
+        print(f'[+] Detecting / Installing [yellow]{binary.name.ljust(22)}[/yellow] using [red]{providers}[/red]...')
+        try:
+            with SudoPermission(uid=0, fallback=True):
+                # print(binary.load_or_install(fresh=True).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}))
+                if binproviders:
+                    providers_supported_by_binary = [provider.name for provider in binary.binproviders_supported]
+                    for binprovider_name in binproviders:
+                        if binprovider_name not in providers_supported_by_binary:
+                            continue
+                        try:
+                            if dry_run:
+                                # always show install commands when doing a dry run
+                                sys.stderr.write("\033[2;49;90m") # grey53
+                                result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
+                                sys.stderr.write("\033[00m\n") # reset
+                            else:
+                                loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
+                                result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
+                            if result and result['loaded_version']:
+                                break
+                        except Exception as e:
+                            print(f'[red]:cross_mark: Failed to install {binary.name} using {binprovider_name} as user {ARCHIVEBOX_USER}: {e}[/red]')
+                else:
+                    if dry_run:
+                        sys.stderr.write("\033[2;49;90m") # grey53
+                        binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
+                        sys.stderr.write("\033[00m\n") # reset
+                    else:
+                        loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
+                        result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
+            if IS_ROOT and LIB_DIR:
+                with SudoPermission(uid=0):
+                    if ARCHIVEBOX_USER == 0:
+                        os.system(f'chmod -R 777 "{LIB_DIR.resolve()}"')
+                    else:
+                        os.system(f'chown -R {ARCHIVEBOX_USER} "{LIB_DIR.resolve()}"')
+        except Exception as e:
+            print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
+            if binaries and len(binaries) == 1:
+                # if we are only installing a single binary, raise the exception so the user can see what went wrong
+                raise
+
+
+    from django.contrib.auth import get_user_model
+    User = get_user_model()
+
+    if not 
User.objects.filter(is_superuser=True).exclude(username='system').exists(): + stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green') + stderr(' archivebox manage createsuperuser') + # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) + + print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr) + + from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY + + extra_args = [] + if binproviders: + extra_args.append(f'--binproviders={",".join(binproviders)}') + if binaries: + extra_args.append(f'--binaries={",".join(binaries)}') + + proc = run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version', *extra_args], capture_output=False, cwd=out_dir) + raise SystemExit(proc.returncode) + @docstring(install.__doc__) def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: + parser = argparse.ArgumentParser( prog=__command__, description=install.__doc__, diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py deleted file mode 100644 index e34717bc..00000000 --- a/archivebox/cli/archivebox_list.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python3 - -__package__ = 'archivebox.cli' -__command__ = 'archivebox list' - -import sys -import argparse -from pathlib import Path -from typing import Optional, List, IO - -from archivebox.config import DATA_DIR -from archivebox.misc.util import docstring -from archivebox.misc.logging_util import SmartFormatter, reject_stdin, stderr -from ..main import list_all -from ..index import ( - LINK_FILTERS, - get_indexed_folders, - get_archived_folders, - get_unarchived_folders, - get_present_folders, - get_valid_folders, - get_invalid_folders, - get_duplicate_folders, - get_orphaned_folders, - get_corrupted_folders, - get_unrecognized_folders, -) - - -@docstring(list_all.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=list_all.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - group = parser.add_mutually_exclusive_group() - group.add_argument( - '--csv', #'-c', - type=str, - help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension", - default=None, - ) - group.add_argument( - '--json', #'-j', - action='store_true', - help="Print the output in JSON format with all columns included", - ) - group.add_argument( - '--html', - action='store_true', - help="Print the output in HTML format" - ) - parser.add_argument( - '--with-headers', - action='store_true', - help='Include the headers in the output document' - ) - parser.add_argument( - '--sort', #'-s', - type=str, - help="List the links sorted using the given key, e.g. 
timestamp or updated", - default=None, - ) - parser.add_argument( - '--before', #'-b', - type=float, - help="List only links bookmarked before (less than) the given timestamp", - default=None, - ) - parser.add_argument( - '--after', #'-a', - type=float, - help="List only links bookmarked after (greater than or equal to) the given timestamp", - default=None, - ) - parser.add_argument( - '--status', - type=str, - choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'), - default='indexed', - help=( - 'List only links or data directories that have the given status\n' - f' indexed {get_indexed_folders.__doc__} (the default)\n' - f' archived {get_archived_folders.__doc__}\n' - f' unarchived {get_unarchived_folders.__doc__}\n' - '\n' - f' present {get_present_folders.__doc__}\n' - f' valid {get_valid_folders.__doc__}\n' - f' invalid {get_invalid_folders.__doc__}\n' - '\n' - f' duplicate {get_duplicate_folders.__doc__}\n' - f' orphaned {get_orphaned_folders.__doc__}\n' - f' corrupted {get_corrupted_folders.__doc__}\n' - f' unrecognized {get_unrecognized_folders.__doc__}\n' - ) - ) - parser.add_argument( - '--filter-type', '-t', - type=str, - choices=(*LINK_FILTERS.keys(), 'search'), - default='exact', - help='Type of pattern matching to use when filtering URLs', - ) - parser.add_argument( - 'filter_patterns', - nargs='*', - type=str, - default=None, - help='List only URLs matching these filter patterns' - ) - command = parser.parse_args(args or ()) - reject_stdin(stdin) - - if command.with_headers and not (command.json or command.html or command.csv): - stderr( - '[X] --with-headers can only be used with --json, --html or --csv options\n', - color='red', - ) - raise SystemExit(2) - - matching_folders = list_all( - filter_patterns=command.filter_patterns, - filter_type=command.filter_type, - status=command.status, - after=command.after, - before=command.before, - sort=command.sort, - csv=command.csv, - json=command.json, - html=command.html, - with_headers=command.with_headers, - out_dir=Path(pwd) if pwd else DATA_DIR, - ) - raise SystemExit(not matching_folders) - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_manage.py b/archivebox/cli/archivebox_manage.py index 1ae8e2d5..63ff354b 100644 --- a/archivebox/cli/archivebox_manage.py +++ b/archivebox/cli/archivebox_manage.py @@ -9,7 +9,27 @@ from typing import Optional, List, IO from archivebox.misc.util import docstring from archivebox.config import DATA_DIR -from ..main import manage + + + +# @enforce_types +def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None: + """Run an ArchiveBox Django management command""" + + check_data_folder() + from django.core.management import execute_from_command_line + + if (args and "createsuperuser" in args) and (IN_DOCKER and not SHELL_CONFIG.IS_TTY): + stderr('[!] 
Warning: you need to pass -it to use interactive commands in docker', color='lightyellow') + stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow') + stderr('') + + # import ipdb; ipdb.set_trace() + + execute_from_command_line(['manage.py', *(args or ['help'])]) + + + @docstring(manage.__doc__) diff --git a/archivebox/cli/archivebox_oneshot.py b/archivebox/cli/archivebox_oneshot.py index 75dc53a6..e3ef0b3f 100644 --- a/archivebox/cli/archivebox_oneshot.py +++ b/archivebox/cli/archivebox_oneshot.py @@ -1,73 +1,98 @@ -#!/usr/bin/env python3 +# #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox oneshot' +################## DEPRECATED IN FAVOR OF abx-dl ##################### +# https://github.com/ArchiveBox/abx-dl -import sys -import argparse +# __package__ = 'archivebox.cli' +# __command__ = 'archivebox oneshot' -from pathlib import Path -from typing import List, Optional, IO +# import sys +# import argparse -from archivebox.misc.util import docstring -from archivebox.config import DATA_DIR -from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr -from ..main import oneshot +# from pathlib import Path +# from typing import List, Optional, IO + +# from archivebox.misc.util import docstring +# from archivebox.config import DATA_DIR +# from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr -@docstring(oneshot.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=oneshot.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - 'url', - type=str, - default=None, - help=( - 'URLs or paths to archive e.g.:\n' - ' https://getpocket.com/users/USERNAME/feed/all\n' - ' https://example.com/some/rss/feed.xml\n' - ' https://example.com\n' - ' ~/Downloads/firefox_bookmarks_export.html\n' - ' ~/Desktop/sites_list.csv\n' - ) - ) - parser.add_argument( - "--extract", - type=str, - help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \ - This does not take precedence over the configuration", - default="" - ) - parser.add_argument( - '--out-dir', - type=str, - default=DATA_DIR, - help= "Path to save the single archive folder to, e.g. ./example.com_archive" - ) - command = parser.parse_args(args or ()) - stdin_url = None - url = command.url - if not url: - stdin_url = accept_stdin(stdin) +# @enforce_types +# def oneshot(url: str, extractors: str="", out_dir: Path=DATA_DIR, created_by_id: int | None=None) -> List[Link]: +# """ +# Create a single URL archive folder with an index.json and index.html, and all the archive method outputs. +# You can run this to archive single pages without needing to create a whole collection with archivebox init. 
+# """ +# oneshot_link, _ = parse_links_memory([url]) +# if len(oneshot_link) > 1: +# stderr( +# '[X] You should pass a single url to the oneshot command', +# color='red' +# ) +# raise SystemExit(2) - if (stdin_url and url) or (not stdin and not url): - stderr( - '[X] You must pass a URL/path to add via stdin or CLI arguments.\n', - color='red', - ) - raise SystemExit(2) +# methods = extractors.split(",") if extractors else ignore_methods(['title']) +# archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, created_by_id=created_by_id) +# return oneshot_link + + + + + + +# @docstring(oneshot.__doc__) +# def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: +# parser = argparse.ArgumentParser( +# prog=__command__, +# description=oneshot.__doc__, +# add_help=True, +# formatter_class=SmartFormatter, +# ) +# parser.add_argument( +# 'url', +# type=str, +# default=None, +# help=( +# 'URLs or paths to archive e.g.:\n' +# ' https://getpocket.com/users/USERNAME/feed/all\n' +# ' https://example.com/some/rss/feed.xml\n' +# ' https://example.com\n' +# ' ~/Downloads/firefox_bookmarks_export.html\n' +# ' ~/Desktop/sites_list.csv\n' +# ) +# ) +# parser.add_argument( +# "--extract", +# type=str, +# help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \ +# This does not take precedence over the configuration", +# default="" +# ) +# parser.add_argument( +# '--out-dir', +# type=str, +# default=DATA_DIR, +# help= "Path to save the single archive folder to, e.g. ./example.com_archive" +# ) +# command = parser.parse_args(args or ()) +# stdin_url = None +# url = command.url +# if not url: +# stdin_url = accept_stdin(stdin) + +# if (stdin_url and url) or (not stdin and not url): +# stderr( +# '[X] You must pass a URL/path to add via stdin or CLI arguments.\n', +# color='red', +# ) +# raise SystemExit(2) - oneshot( - url=stdin_url or url, - out_dir=Path(command.out_dir).resolve(), - extractors=command.extract, - ) +# oneshot( +# url=stdin_url or url, +# out_dir=Path(command.out_dir).resolve(), +# extractors=command.extract, +# ) -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +# if __name__ == '__main__': +# main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py index 40c426b4..317dc792 100644 --- a/archivebox/cli/archivebox_remove.py +++ b/archivebox/cli/archivebox_remove.py @@ -8,10 +8,93 @@ import argparse from pathlib import Path from typing import Optional, List, IO +from django.db.models import QuerySet + from archivebox.misc.util import docstring from archivebox.config import DATA_DIR from archivebox.misc.logging_util import SmartFormatter, accept_stdin -from ..main import remove +from archivebox.index.schema import Link + + +def remove(filter_str: Optional[str]=None, + filter_patterns: Optional[list[str]]=None, + filter_type: str='exact', + snapshots: Optional[QuerySet]=None, + after: Optional[float]=None, + before: Optional[float]=None, + yes: bool=False, + delete: bool=False, + out_dir: Path=DATA_DIR) -> list[Link]: + """Remove the specified URLs from the archive""" + + check_data_folder() + + if snapshots is None: + if filter_str and filter_patterns: + stderr( + '[X] You should pass either a pattern as an argument, ' + 'or pass a list of patterns via stdin, but not both.\n', + color='red', + ) + raise SystemExit(2) + elif not (filter_str or filter_patterns): + stderr( + '[X] You should pass either a 
pattern as an argument, ' + 'or pass a list of patterns via stdin.', + color='red', + ) + stderr() + hint(('To remove all urls you can run:', + 'archivebox remove --filter-type=regex ".*"')) + stderr() + raise SystemExit(2) + elif filter_str: + filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')] + + list_kwargs = { + "filter_patterns": filter_patterns, + "filter_type": filter_type, + "after": after, + "before": before, + } + if snapshots: + list_kwargs["snapshots"] = snapshots + + log_list_started(filter_patterns, filter_type) + timer = TimedProgress(360, prefix=' ') + try: + snapshots = list_links(**list_kwargs) + finally: + timer.end() + + + if not snapshots.exists(): + log_removal_finished(0, 0) + raise SystemExit(1) + + + log_links = [link.as_link() for link in snapshots] + log_list_finished(log_links) + log_removal_started(log_links, yes=yes, delete=delete) + + timer = TimedProgress(360, prefix=' ') + try: + for snapshot in snapshots: + if delete: + shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True) + finally: + timer.end() + + to_remove = snapshots.count() + + from .search import flush_search_index + + flush_search_index(snapshots=snapshots) + remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir) + all_snapshots = load_main_index(out_dir=out_dir) + log_removal_finished(all_snapshots.count(), to_remove) + + return all_snapshots @docstring(remove.__doc__) diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py index 4f791421..d2f85c84 100644 --- a/archivebox/cli/archivebox_schedule.py +++ b/archivebox/cli/archivebox_schedule.py @@ -11,7 +11,139 @@ from typing import Optional, List, IO from archivebox.misc.util import docstring from archivebox.config import DATA_DIR from archivebox.misc.logging_util import SmartFormatter, reject_stdin -from ..main import schedule +from archivebox.config.common import ARCHIVING_CONFIG + + +# @enforce_types +def schedule(add: bool=False, + show: bool=False, + clear: bool=False, + foreground: bool=False, + run_all: bool=False, + quiet: bool=False, + every: Optional[str]=None, + tag: str='', + depth: int=0, + overwrite: bool=False, + update: bool=not ARCHIVING_CONFIG.ONLY_NEW, + import_path: Optional[str]=None, + out_dir: Path=DATA_DIR): + """Set ArchiveBox to regularly import URLs at specific times using cron""" + + check_data_folder() + from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY + from archivebox.config.permissions import USER + + Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) + + cron = CronTab(user=True) + cron = dedupe_cron_jobs(cron) + + if clear: + print(cron.remove_all(comment=CRON_COMMENT)) + cron.write() + raise SystemExit(0) + + existing_jobs = list(cron.find_comment(CRON_COMMENT)) + + if every or add: + every = every or 'day' + quoted = lambda s: f'"{s}"' if (s and ' ' in str(s)) else str(s) + cmd = [ + 'cd', + quoted(out_dir), + '&&', + quoted(ARCHIVEBOX_BINARY.load().abspath), + *([ + 'add', + *(['--overwrite'] if overwrite else []), + *(['--update'] if update else []), + *([f'--tag={tag}'] if tag else []), + f'--depth={depth}', + f'"{import_path}"', + ] if import_path else ['update']), + '>>', + quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'), + '2>&1', + + ] + new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT) + + if every in ('minute', 'hour', 'day', 'month', 'year'): + set_every = getattr(new_job.every(), every) + set_every() + elif CronSlices.is_valid(every): + new_job.setall(every) + else: + stderr('{red}[X] Got invalid timeperiod for cron 
task.{reset}'.format(**SHELL_CONFIG.ANSI)) + stderr(' It must be one of minute/hour/day/month') + stderr(' or a quoted cron-format schedule like:') + stderr(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml') + stderr(' archivebox init --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml') + raise SystemExit(1) + + cron = dedupe_cron_jobs(cron) + cron.write() + + total_runs = sum(j.frequency_per_year() for j in cron) + existing_jobs = list(cron.find_comment(CRON_COMMENT)) + + print() + print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI)) + print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs)) + if total_runs > 60 and not quiet: + stderr() + stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI)) + stderr(' Congrats on being an enthusiastic internet archiver! 👌') + stderr() + stderr(' Make sure you have enough storage space available to hold all the data.') + stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.') + stderr('') + elif show: + if existing_jobs: + print('\n'.join(str(cmd) for cmd in existing_jobs)) + else: + stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI)) + stderr(' To schedule a new job, run:') + stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml') + raise SystemExit(0) + + cron = CronTab(user=True) + cron = dedupe_cron_jobs(cron) + existing_jobs = list(cron.find_comment(CRON_COMMENT)) + + if foreground or run_all: + if not existing_jobs: + stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI)) + stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml') + raise SystemExit(1) + + print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI)) + if run_all: + try: + for job in existing_jobs: + sys.stdout.write(f' > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n') + sys.stdout.write(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}') + sys.stdout.flush() + job.run() + sys.stdout.write(f'\r √ {job.command.split("/archivebox ")[-1]}\n') + except KeyboardInterrupt: + print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) + raise SystemExit(1) + + if foreground: + try: + for job in existing_jobs: + print(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}') + for result in cron.run_scheduler(): + print(result) + except KeyboardInterrupt: + print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) + raise SystemExit(1) + + # if CAN_UPGRADE: + # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). 
For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n") + @docstring(schedule.__doc__) diff --git a/archivebox/cli/archivebox_search.py b/archivebox/cli/archivebox_search.py new file mode 100644 index 00000000..06ee293e --- /dev/null +++ b/archivebox/cli/archivebox_search.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 + +__package__ = 'archivebox.cli' +__command__ = 'archivebox search' + +from pathlib import Path +from typing import Optional, List, Iterable + +import rich_click as click +from rich import print + +from django.db.models import QuerySet + +from archivebox.config import DATA_DIR +from archivebox.index import LINK_FILTERS +from archivebox.index.schema import Link +from archivebox.misc.logging import stderr +from archivebox.misc.util import enforce_types, docstring + +STATUS_CHOICES = [ + 'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', + 'duplicate', 'orphaned', 'corrupted', 'unrecognized' +] + + + +def list_links(snapshots: Optional[QuerySet]=None, + filter_patterns: Optional[List[str]]=None, + filter_type: str='substring', + after: Optional[float]=None, + before: Optional[float]=None, + out_dir: Path=DATA_DIR) -> Iterable[Link]: + + from archivebox.index import load_main_index + from archivebox.index import snapshot_filter + + if snapshots: + all_snapshots = snapshots + else: + all_snapshots = load_main_index(out_dir=out_dir) + + if after is not None: + all_snapshots = all_snapshots.filter(timestamp__gte=after) + if before is not None: + all_snapshots = all_snapshots.filter(timestamp__lt=before) + if filter_patterns: + all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type) + + if not all_snapshots: + stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') + + return all_snapshots + + +def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict[str, Link | None]: + + from archivebox.misc.checks import check_data_folder + from archivebox.index import ( + get_indexed_folders, + get_archived_folders, + get_unarchived_folders, + get_present_folders, + get_valid_folders, + get_invalid_folders, + get_duplicate_folders, + get_orphaned_folders, + get_corrupted_folders, + get_unrecognized_folders, + ) + + check_data_folder() + + STATUS_FUNCTIONS = { + "indexed": get_indexed_folders, + "archived": get_archived_folders, + "unarchived": get_unarchived_folders, + "present": get_present_folders, + "valid": get_valid_folders, + "invalid": get_invalid_folders, + "duplicate": get_duplicate_folders, + "orphaned": get_orphaned_folders, + "corrupted": get_corrupted_folders, + "unrecognized": get_unrecognized_folders, + } + + try: + return STATUS_FUNCTIONS[status](links, out_dir=out_dir) + except KeyError: + raise ValueError('Status not recognized.') + + + + +@enforce_types +def search(filter_patterns: list[str] | None=None, + filter_type: str='substring', + status: str='indexed', + before: float | None=None, + after: float | None=None, + sort: str | None=None, + json: bool=False, + html: bool=False, + csv: str | None=None, + with_headers: bool=False): + """List, filter, and export information about archive entries""" + + + if with_headers and not (json or html or csv): + stderr('[X] --with-headers requires --json, --html or --csv\n', color='red') + raise SystemExit(2) + + snapshots = list_links( + filter_patterns=list(filter_patterns) if filter_patterns else None, + filter_type=filter_type, + before=before, + after=after, + ) + + if 
sort: + snapshots = snapshots.order_by(sort) + + folders = list_folders( + links=snapshots, + status=status, + out_dir=DATA_DIR, + ) + + if json: + from archivebox.index.json import generate_json_index_from_links + output = generate_json_index_from_links(folders.values(), with_headers) + elif html: + from archivebox.index.html import generate_index_from_links + output = generate_index_from_links(folders.values(), with_headers) + elif csv: + from archivebox.index.csv import links_to_csv + output = links_to_csv(folders.values(), csv.split(','), with_headers) + else: + from archivebox.misc.logging_util import printable_folders + output = printable_folders(folders, with_headers) + + print(output) + return output + + +@click.command() +@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs') +@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status') +@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp') +@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp') +@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at') +@click.option('--json', '-J', is_flag=True, help='Print output in JSON format') +@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)') +@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title') +@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output') +@click.help_option('--help', '-h') +@click.argument('filter_patterns', nargs=-1) +@docstring(search.__doc__) +def main(**kwargs): + return search(**kwargs) + + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index 4f1e2a60..470e0d8b 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -12,7 +12,81 @@ from archivebox.misc.util import docstring from archivebox.config import DATA_DIR from archivebox.config.common import SERVER_CONFIG from archivebox.misc.logging_util import SmartFormatter, reject_stdin -from ..main import server + + + +# @enforce_types +def server(runserver_args: Optional[List[str]]=None, + reload: bool=False, + debug: bool=False, + init: bool=False, + quick_init: bool=False, + createsuperuser: bool=False, + daemonize: bool=False, + out_dir: Path=DATA_DIR) -> None: + """Run the ArchiveBox HTTP server""" + + from rich import print + + runserver_args = runserver_args or [] + + if init: + run_subcommand('init', stdin=None, pwd=out_dir) + print() + elif quick_init: + run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir) + print() + + if createsuperuser: + run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) + print() + + + check_data_folder() + + from django.core.management import call_command + from django.contrib.auth.models import User + + if not User.objects.filter(is_superuser=True).exclude(username='system').exists(): + print() + # print('[yellow][!] 
No admin accounts exist, you must create one to be able to log in to the Admin UI![/yellow]') + print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:') + print(' [green]archivebox manage createsuperuser[/green]') + print() + + + host = '127.0.0.1' + port = '8000' + + try: + host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0] + if ':' in host_and_port: + host, port = host_and_port.split(':') + else: + if '.' in host_and_port: + host = host_and_port + else: + port = host_and_port + except IndexError: + pass + + print('[green][+] Starting ArchiveBox webserver...[/green]') + print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') + print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]') + print(' > Writing ArchiveBox error log to ./logs/errors.log') + + if SHELL_CONFIG.DEBUG: + if not reload: + runserver_args.append('--noreload') # '--insecure' + call_command("runserver", *runserver_args) + else: + from workers.supervisord_util import start_server_workers + + print() + start_server_workers(host=host, port=port, daemonize=False) + print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]") + + @docstring(server.__doc__) def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: diff --git a/archivebox/cli/archivebox_shell.py b/archivebox/cli/archivebox_shell.py index 3b035c82..a7d90a51 100644 --- a/archivebox/cli/archivebox_shell.py +++ b/archivebox/cli/archivebox_shell.py @@ -11,7 +11,19 @@ from typing import Optional, List, IO from archivebox.misc.util import docstring from archivebox.config import DATA_DIR from archivebox.misc.logging_util import SmartFormatter, reject_stdin -from ..main import shell + + + +#@enforce_types +def shell(out_dir: Path=DATA_DIR) -> None: + """Enter an interactive ArchiveBox Django shell""" + + check_data_folder() + + from django.core.management import call_command + call_command("shell_plus") + + @docstring(shell.__doc__) diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index 017c1ea1..9b80d1d8 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -8,10 +8,114 @@ import argparse from pathlib import Path from typing import Optional, List, IO +from rich import print + from archivebox.misc.util import docstring from archivebox.config import DATA_DIR from archivebox.misc.logging_util import SmartFormatter, reject_stdin -from ..main import status + + + + +# @enforce_types +def status(out_dir: Path=DATA_DIR) -> None: + """Print out some info and statistics about the archive collection""" + + check_data_folder() + + from core.models import Snapshot + from django.contrib.auth import get_user_model + User = get_user_model() + + print('{green}[*] Scanning archive main index...{reset}'.format(**SHELL_CONFIG.ANSI)) + print(SHELL_CONFIG.ANSI['lightyellow'], f' {out_dir}/*', SHELL_CONFIG.ANSI['reset']) + num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.') + size = printable_filesize(num_bytes) + print(f' Index size: {size} across {num_files} files') + print() + + links = load_main_index(out_dir=out_dir) 
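For context on the index-size figures printed just above: a minimal sketch of what a get_dir_size-style helper could look like. The real implementation lives in archivebox.misc.system and is not shown in this diff; the signature and the meaning of the pattern argument are assumed from the call site get_dir_size(out_dir, recursive=False, pattern='index.').

from pathlib import Path

def get_dir_size(path: Path, recursive: bool = True, pattern: str | None = None) -> tuple[int, int, int]:
    """Return (total_bytes, num_dirs, num_files) under path, optionally filtered by a filename substring."""
    num_bytes, num_dirs, num_files = 0, 0, 0
    entries = path.rglob('*') if recursive else path.glob('*')
    for entry in entries:
        if pattern and pattern not in entry.name:
            continue
        if entry.is_dir():
            num_dirs += 1
        elif entry.is_file():
            num_files += 1
            num_bytes += entry.stat().st_size
    return num_bytes, num_dirs, num_files

printable_filesize(num_bytes) then presumably just renders that byte count as a human-readable string before it is printed in the status report.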
+ num_sql_links = links.count() + num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir)) + print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})') + print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)') + print() + print('{green}[*] Scanning archive data directories...{reset}'.format(**SHELL_CONFIG.ANSI)) + print(SHELL_CONFIG.ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', SHELL_CONFIG.ANSI['reset']) + num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR) + size = printable_filesize(num_bytes) + print(f' Size: {size} across {num_files} files in {num_dirs} directories') + print(SHELL_CONFIG.ANSI['black']) + num_indexed = len(get_indexed_folders(links, out_dir=out_dir)) + num_archived = len(get_archived_folders(links, out_dir=out_dir)) + num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir)) + print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})') + print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})') + print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})') + + num_present = len(get_present_folders(links, out_dir=out_dir)) + num_valid = len(get_valid_folders(links, out_dir=out_dir)) + print() + print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})') + print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})') + + duplicate = get_duplicate_folders(links, out_dir=out_dir) + orphaned = get_orphaned_folders(links, out_dir=out_dir) + corrupted = get_corrupted_folders(links, out_dir=out_dir) + unrecognized = get_unrecognized_folders(links, out_dir=out_dir) + num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized}) + print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})') + print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})') + print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})') + print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})') + print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})') + + print(SHELL_CONFIG.ANSI['reset']) + + if num_indexed: + print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**SHELL_CONFIG.ANSI)) + print(' archivebox list --status= (e.g. 
indexed, corrupted, archived, etc.)') + + if orphaned: + print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**SHELL_CONFIG.ANSI)) + print(' archivebox init') + + if num_invalid: + print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**SHELL_CONFIG.ANSI)) + print(' archivebox init') + + print() + print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**SHELL_CONFIG.ANSI)) + print(SHELL_CONFIG.ANSI['lightyellow'], f' {CONSTANTS.LOGS_DIR}/*', SHELL_CONFIG.ANSI['reset']) + users = get_admins().values_list('username', flat=True) + print(f' UI users {len(users)}: {", ".join(users)}') + last_login = User.objects.order_by('last_login').last() + if last_login: + print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}') + last_downloaded = Snapshot.objects.order_by('downloaded_at').last() + if last_downloaded: + print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}') + + if not users: + print() + print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**SHELL_CONFIG.ANSI)) + print(' archivebox manage createsuperuser') + + print() + for snapshot in links.order_by('-downloaded_at')[:10]: + if not snapshot.downloaded_at: + continue + print( + SHELL_CONFIG.ANSI['black'], + ( + f' > {str(snapshot.downloaded_at)[:16]} ' + f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] ' + f'"{snapshot.title}": {snapshot.url}' + )[:SHELL_CONFIG.TERM_WIDTH], + SHELL_CONFIG.ANSI['reset'], + ) + print(SHELL_CONFIG.ANSI['black'], ' ...', SHELL_CONFIG.ANSI['reset']) + @docstring(status.__doc__) diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 3fc3d116..9694b6e6 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -24,7 +24,92 @@ from archivebox.index import ( from archivebox.misc.logging_util import SmartFormatter, accept_stdin # from ..main import update + + + +# LEGACY VERSION: +# @enforce_types +# def update(resume: Optional[float]=None, +# only_new: bool=ARCHIVING_CONFIG.ONLY_NEW, +# index_only: bool=False, +# overwrite: bool=False, +# filter_patterns_str: Optional[str]=None, +# filter_patterns: Optional[List[str]]=None, +# filter_type: Optional[str]=None, +# status: Optional[str]=None, +# after: Optional[str]=None, +# before: Optional[str]=None, +# extractors: str="", +# out_dir: Path=DATA_DIR) -> List[Link]: +# """Import any new links from subscriptions and retry any previously failed/skipped links""" + +# from core.models import ArchiveResult +# from .search import index_links +# # from workers.supervisord_util import start_cli_workers + + +# check_data_folder() +# # start_cli_workers() +# new_links: List[Link] = [] # TODO: Remove input argument: only_new + +# extractors = extractors.split(",") if extractors else [] + +# # Step 1: Filter for selected_links +# print('[*] Finding matching Snapshots to update...') +# print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...') +# matching_snapshots = list_links( +# filter_patterns=filter_patterns, +# filter_type=filter_type, +# before=before, +# after=after, +# ) +# print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...') +# matching_folders = list_folders( +# links=matching_snapshots, +# status=status, +# 
out_dir=out_dir, +# ) +# all_links = (link for link in matching_folders.values() if link) +# print(' - Sorting by most unfinished -> least unfinished + date archived...') +# all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp)) + +# if index_only: +# for link in all_links: +# write_link_details(link, out_dir=out_dir, skip_sql_index=True) +# index_links(all_links, out_dir=out_dir) +# return all_links + +# # Step 2: Run the archive methods for each link +# to_archive = new_links if only_new else all_links +# if resume: +# to_archive = [ +# link for link in to_archive +# if link.timestamp >= str(resume) +# ] +# if not to_archive: +# stderr('') +# stderr(f'[√] Nothing found to resume after {resume}', color='green') +# return all_links + +# archive_kwargs = { +# "out_dir": out_dir, +# } +# if extractors: +# archive_kwargs["methods"] = extractors + + +# archive_links(to_archive, overwrite=overwrite, **archive_kwargs) + +# # Step 4: Re-write links index with updated titles, icons, and resources +# all_links = load_main_index(out_dir=out_dir) +# return all_links + + + + + def update(): + """Import any new links from subscriptions and retry any previously failed/skipped links""" from archivebox.config.django import setup_django setup_django() diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index 4d55227d..a1831290 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -1,61 +1,207 @@ #!/usr/bin/env python3 __package__ = 'archivebox.cli' -__command__ = 'archivebox version' import sys -import argparse -from pathlib import Path -from typing import Optional, List, IO +from typing import Iterable -# from archivebox.misc.util import docstring -from archivebox.config import DATA_DIR, VERSION -from archivebox.misc.logging_util import SmartFormatter, reject_stdin +import rich_click as click + +from archivebox.misc.util import docstring, enforce_types -# @docstring(version.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - """Print the ArchiveBox version and dependency information""" - parser = argparse.ArgumentParser( - prog=__command__, - description="Print the ArchiveBox version and dependency information", # version.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--quiet', '-q', - action='store_true', - help='Only print ArchiveBox version number and nothing else.', - ) - parser.add_argument( - '--binproviders', '-p', - type=str, - help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)', - default=None, - ) - parser.add_argument( - '--binaries', '-b', - type=str, - help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... 
(all)', - default=None, - ) - command = parser.parse_args(args or ()) - reject_stdin(__command__, stdin) +@enforce_types +def version(quiet: bool=False, + binproviders: Iterable[str]=(), + binaries: Iterable[str]=()) -> list[str]: + """Print the ArchiveBox version, debug metadata, and installed dependency versions""" - # for speed reasons, check if quiet flag was set and just return simple version immediately if so - if command.quiet: - print(VERSION) - return + # fast path for just getting the version and exiting, dont do any slower imports + from archivebox.config.version import VERSION + print(VERSION) + if quiet or '--version' in sys.argv: + return [] - # otherwise do big expensive import to get the full version - from ..main import version - version( - quiet=command.quiet, - out_dir=Path(pwd) if pwd else DATA_DIR, - binproviders=command.binproviders.split(',') if command.binproviders else None, - binaries=command.binaries.split(',') if command.binaries else None, + # Only do slower imports when getting full version info + import os + import platform + from pathlib import Path + + from rich.panel import Panel + from rich.console import Console + from abx_pkg import Binary + + import abx + import archivebox + from archivebox.config import CONSTANTS, DATA_DIR + from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER + from archivebox.config.paths import get_data_locations, get_code_locations + from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG + from archivebox.misc.logging_util import printable_folder_status + + from abx_plugin_default_binproviders import apt, brew, env + + console = Console() + prnt = console.print + + LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED + + # 0.7.1 + # ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365 + # IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython + # FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644 + # DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False + + p = platform.uname() + COMMIT_HASH = get_COMMIT_HASH() + prnt( + '[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION), + f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}', + f'BUILD_TIME={get_BUILD_TIME()}', ) + prnt( + f'IN_DOCKER={IN_DOCKER}', + f'IN_QEMU={SHELL_CONFIG.IN_QEMU}', + f'ARCH={p.machine}', + f'OS={p.system}', + f'PLATFORM={platform.platform()}', + f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''), + ) + OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount + DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat() + prnt( + f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}', + f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}', + f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}', + f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}', + f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}', + ) + prnt( + f'DEBUG={SHELL_CONFIG.DEBUG}', + f'IS_TTY={SHELL_CONFIG.IS_TTY}', + f'SUDO={CONSTANTS.IS_ROOT}', + f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}', + f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}', + f'LDAP={LDAP_ENABLED}', + #f'DB=django.db.backends.sqlite3 
(({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually + ) + prnt() + + if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)): + PANEL_TEXT = '\n'.join(( + # '', + # f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]', + '', + '[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...', + ' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.', + '', + ' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]', + '', + )) + prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]')) + prnt() + return [] + + prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]') + failures = [] + BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES()) + for name, binary in list(BINARIES.items()): + if binary.name == 'archivebox': + continue + + # skip if the binary is not in the requested list of binaries + if binaries and binary.name not in binaries: + continue + + # skip if the binary is not supported by any of the requested binproviders + if binproviders and binary.binproviders_supported and not any(provider.name in binproviders for provider in binary.binproviders_supported): + continue + + err = None + try: + loaded_bin = binary.load() + except Exception as e: + err = e + loaded_bin = binary + provider_summary = f'[dark_sea_green3]{loaded_bin.binprovider.name.ljust(10)}[/dark_sea_green3]' if loaded_bin.binprovider else '[grey23]not found[/grey23] ' + if loaded_bin.abspath: + abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~') + if ' ' in abspath: + abspath = abspath.replace(' ', r'\ ') + else: + abspath = f'[red]{err}[/red]' + prnt('', '[green]√[/green]' if loaded_bin.is_valid else '[red]X[/red]', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(12), provider_summary, abspath, overflow='ignore', crop=False) + if not loaded_bin.is_valid: + failures.append(loaded_bin.name) + + prnt() + prnt('[gold3][i] Package Managers:[/gold3]') + BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()) + for name, binprovider in list(BINPROVIDERS.items()): + err = None + + if binproviders and binprovider.name not in binproviders: + continue + + # TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN + loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew]) + + abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~') + abspath = None + if loaded_bin.abspath: + abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~') + if ' ' in abspath: + abspath = abspath.replace(' ', r'\ ') + + PATH = str(binprovider.PATH).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~') + ownership_summary = f'UID=[blue]{str(binprovider.EUID).ljust(4)}[/blue]' + provider_summary = f'[dark_sea_green3]{str(abspath).ljust(52)}[/dark_sea_green3]' if abspath else f'[grey23]{"not 
available".ljust(52)}[/grey23]' + prnt('', '[green]√[/green]' if binprovider.is_valid else '[grey53]-[/grey53]', '', binprovider.name.ljust(11), provider_summary, ownership_summary, f'PATH={PATH}', overflow='ellipsis', soft_wrap=True) + + if not (binaries or binproviders): + # dont show source code / data dir info if we just want to get version info for a binary or binprovider + + prnt() + prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]') + for name, path in get_code_locations().items(): + prnt(printable_folder_status(name, path), overflow='ignore', crop=False) + + prnt() + if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK): + prnt('[bright_yellow][i] Data locations:[/bright_yellow]') + for name, path in get_data_locations().items(): + prnt(printable_folder_status(name, path), overflow='ignore', crop=False) + + from archivebox.misc.checks import check_data_dir_permissions + + check_data_dir_permissions() + else: + prnt() + prnt('[red][i] Data locations:[/red] (not in a data directory)') + + prnt() + + if failures: + prnt('[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]') + prnt(f' [red]{", ".join(failures)}[/red]') + prnt() + prnt('[violet]Hint:[/violet] To install missing binaries automatically, run:') + prnt(' [green]archivebox install[/green]') + prnt() + return failures + + +@click.command() +@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)') +@click.option('--binproviders', '-p', help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)') +@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... 
(all)') +@docstring(version.__doc__) +def main(**kwargs): + failures = version(**kwargs) + if failures: + raise SystemExit(1) if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) + main() diff --git a/archivebox/config/django.py b/archivebox/config/django.py index 3fb85eb5..77169ee3 100644 --- a/archivebox/config/django.py +++ b/archivebox/config/django.py @@ -60,7 +60,7 @@ def setup_django(check_db=False, in_memory_db=False) -> None: return with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS: - INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=False) + INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=True) from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission diff --git a/archivebox/config/paths.py b/archivebox/config/paths.py index 8edb0759..a6d2b2bd 100644 --- a/archivebox/config/paths.py +++ b/archivebox/config/paths.py @@ -142,7 +142,7 @@ def create_and_chown_dir(dir_path: Path) -> None: os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &') @cache -def get_or_create_working_tmp_dir(autofix=True, quiet=False): +def get_or_create_working_tmp_dir(autofix=True, quiet=True): from archivebox import CONSTANTS from archivebox.config.common import STORAGE_CONFIG from archivebox.misc.checks import check_tmp_dir @@ -165,7 +165,7 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=False): pass if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True): if autofix and STORAGE_CONFIG.TMP_DIR != candidate: - STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet) + STORAGE_CONFIG.update_in_place(TMP_DIR=candidate) return candidate if not quiet: @@ -193,7 +193,7 @@ def get_or_create_working_lib_dir(autofix=True, quiet=False): pass if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True): if autofix and STORAGE_CONFIG.LIB_DIR != candidate: - STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet) + STORAGE_CONFIG.update_in_place(LIB_DIR=candidate) return candidate if not quiet: diff --git a/archivebox/config/permissions.py b/archivebox/config/permissions.py index 98a624c6..08d81ce6 100644 --- a/archivebox/config/permissions.py +++ b/archivebox/config/permissions.py @@ -36,6 +36,8 @@ HOSTNAME: str = max([socket.gethostname(), platform.node()], key=len) IS_ROOT = RUNNING_AS_UID == 0 IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') +# IN_DOCKER_COMPOSE = # TODO: figure out a way to detect if running in docker compose + FALLBACK_UID = RUNNING_AS_UID or SUDO_UID FALLBACK_GID = RUNNING_AS_GID or SUDO_GID diff --git a/archivebox/config/views.py b/archivebox/config/views.py index 2a13d498..2f70b382 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -303,7 +303,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext: "Exit Status": [], } - from workers.supervisor_util import get_existing_supervisord_process + from workers.supervisord_util import get_existing_supervisord_process supervisor = get_existing_supervisord_process() if supervisor is None: @@ -373,7 +373,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext: def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: assert request.user.is_superuser, "Must be a superuser to view configuration settings." 
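Since several call sites in archivebox/config/views.py are being repointed from workers.supervisor_util to workers.supervisord_util in this changeset, a small compatibility shim like the following could smooth the rename while both spellings exist in the wild. This is purely illustrative and not part of the diff; the module and symbol names are taken from the import lines shown here.

# Hypothetical compatibility wrapper for the supervisor_util -> supervisord_util rename.
# Not part of this changeset; shown only to illustrate how mixed checkouts could keep working.
try:
    from workers.supervisord_util import (
        get_existing_supervisord_process,
        get_worker,
        get_sock_file,
        CONFIG_FILE_NAME,
    )
except ImportError:
    # older code paths may still ship the pre-rename module name
    from workers.supervisor_util import (  # type: ignore
        get_existing_supervisord_process,
        get_worker,
        get_sock_file,
        CONFIG_FILE_NAME,
    )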
- from workers.supervisor_util import get_existing_supervisord_process, get_worker, get_sock_file, CONFIG_FILE_NAME + from workers.supervisord_util import get_existing_supervisord_process, get_worker, get_sock_file, CONFIG_FILE_NAME SOCK_FILE = get_sock_file() CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index 4b9b66b3..b821e9c7 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -21,7 +21,6 @@ from archivebox.misc.logging_util import printable_filesize from archivebox.search.admin import SearchResultsAdminMixin from archivebox.index.html import snapshot_icons from archivebox.extractors import archive_links -from archivebox.main import remove from archivebox.base_models.admin import ABIDModelAdmin from archivebox.workers.tasks import bg_archive_links, bg_add @@ -321,7 +320,9 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): description="☠️ Delete" ) def delete_snapshots(self, request, queryset): + from archivebox.cli.archivebox_remove import remove remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR) + messages.success( request, mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."), diff --git a/archivebox/main.py b/archivebox/main.py deleted file mode 100755 index 74f8813c..00000000 --- a/archivebox/main.py +++ /dev/null @@ -1,1526 +0,0 @@ -__package__ = 'archivebox' - -import os -import sys -import shutil -import platform - -from typing import Dict, List, Optional, Iterable, IO, Union -from pathlib import Path -from datetime import date, datetime - -from crontab import CronTab, CronSlices - -from django.db.models import QuerySet -from django.utils import timezone - -from abx_pkg import Binary - -import abx -import archivebox -from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR -from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG -from archivebox.config.permissions import SudoPermission, IN_DOCKER -from archivebox.config.collection import write_config_file, load_all_config, get_real_name -from archivebox.misc.checks import check_data_folder -from archivebox.misc.util import enforce_types # type: ignore -from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT -from archivebox.misc.system import run as run_shell -from archivebox.misc.logging import stderr, hint -from archivebox.misc.logging_util import ( - TimedProgress, - log_importing_started, - log_crawl_started, - log_removal_started, - log_removal_finished, - log_list_started, - log_list_finished, - printable_config, - printable_folders, - printable_filesize, - printable_folder_status, -) - - -from .cli import ( - CLI_SUBCOMMANDS, - run_subcommand, - display_first, - meta_cmds, - setup_cmds, - archive_cmds, -) -from .parsers import ( - save_text_as_source, - save_file_as_source, - parse_links_memory, -) -from .index.schema import Link -from .index import ( - load_main_index, - parse_links_from_source, - dedupe_links, - write_main_index, - snapshot_filter, - get_indexed_folders, - get_archived_folders, - get_unarchived_folders, - get_present_folders, - get_valid_folders, - get_invalid_folders, - get_duplicate_folders, - get_orphaned_folders, - get_corrupted_folders, - get_unrecognized_folders, - fix_invalid_folder_locations, - write_link_details, -) -from 
.index.json import ( - parse_json_main_index, - parse_json_links_details, - generate_json_index_from_links, -) -from .index.sql import ( - get_admins, - apply_migrations, - remove_from_sql_main_index, -) -from .index.html import generate_index_from_links -from .index.csv import links_to_csv -from .extractors import archive_links, archive_link, ignore_methods - - -@enforce_types -def help(out_dir: Path=DATA_DIR) -> None: - """Print the ArchiveBox help message and usage""" - - from rich import print - from rich.panel import Panel - - all_subcommands = CLI_SUBCOMMANDS - COMMANDS_HELP_TEXT = '\n '.join( - f'[green]{cmd.ljust(20)}[/green] {func.__doc__}' - for cmd, func in all_subcommands.items() - if cmd in meta_cmds - ) + '\n\n ' + '\n '.join( - f'[green]{cmd.ljust(20)}[/green] {func.__doc__}' - for cmd, func in all_subcommands.items() - if cmd in setup_cmds - ) + '\n\n ' + '\n '.join( - f'[green]{cmd.ljust(20)}[/green] {func.__doc__}' - for cmd, func in all_subcommands.items() - if cmd in archive_cmds - ) + '\n\n ' + '\n '.join( - f'[green]{cmd.ljust(20)}[/green] {func.__doc__}' - for cmd, func in all_subcommands.items() - if cmd not in display_first - ) - - DOCKER_USAGE = ''' -[dodger_blue3]Docker Usage:[/dodger_blue3] - [grey53]# using Docker Compose:[/grey53] - [blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] - - [grey53]# using Docker:[/grey53] - [blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] -''' if IN_DOCKER else '' - DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else '' - DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else '' - DOCKER_CMD_PREFIX = "[blue]docker ... 
[/blue]" if IN_DOCKER else '' - - print(f'''{DOCKER_USAGE} -[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT} - [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] - -[deep_sky_blue4]Commands:[/deep_sky_blue4] - {COMMANDS_HELP_TEXT} - -[deep_sky_blue4]Documentation:[/deep_sky_blue4] - [link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS} - [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link] - [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link] -''') - - - if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir(): - pretty_out_dir = str(out_dir).replace(str(Path('~').expanduser()), '~') - EXAMPLE_USAGE = f''' -[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow] - -[violet]Hint:[/violet] [i]Common maintenance tasks:[/i] - [dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# make sure database is up-to-date (safe to run multiple times)[/grey53] - [dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# make sure plugins are up-to-date (wget, chrome, singlefile, etc.)[/grey53] - [dark_green]archivebox[/dark_green] [green]status[/green] [grey53]# get a health checkup report on your collection[/grey53] - [dark_green]archivebox[/dark_green] [green]update[/green] [grey53]# retry any previously failed or interrupted archiving tasks[/grey53] - -[violet]Hint:[/violet] [i]More example usage:[/i] - [dark_green]archivebox[/dark_green] [green]add[/green] --depth=1 "https://example.com/some/page" - [dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title - [dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss" - [dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53] -''' - print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.')) - else: - DATA_SETUP_HELP = '\n' - if IN_DOCKER: - DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n' - DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n' - DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n' - DATA_SETUP_HELP += ' 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n' - DATA_SETUP_HELP += f' 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n' - DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n' - DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... 
[/grey53]\n\n' - DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n' - DATA_SETUP_HELP += ' 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n' - DATA_SETUP_HELP += ' 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n' - DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n' - DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n' - DATA_SETUP_HELP += f' 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n' - print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]')) - - -@enforce_types -def version(quiet: bool=False, - out_dir: Path=DATA_DIR, - binproviders: Optional[List[str]]=None, - binaries: Optional[List[str]]=None, - ) -> None: - """Print the ArchiveBox version and dependency information""" - - print(VERSION) - if quiet or '--version' in sys.argv: - return - - from rich.panel import Panel - from rich.console import Console - console = Console() - prnt = console.print - - from abx_plugin_default_binproviders import apt, brew, env - - from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME - from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID - from archivebox.config.paths import get_data_locations, get_code_locations - - LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED - - - # 0.7.1 - # ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365 - # IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython - # FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644 - # DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False - - p = platform.uname() - COMMIT_HASH = get_COMMIT_HASH() - prnt( - '[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION), - f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}', - f'BUILD_TIME={get_BUILD_TIME()}', - ) - prnt( - f'IN_DOCKER={IN_DOCKER}', - f'IN_QEMU={SHELL_CONFIG.IN_QEMU}', - f'ARCH={p.machine}', - f'OS={p.system}', - f'PLATFORM={platform.platform()}', - f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''), - ) - OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount - DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat() - prnt( - f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}', - f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}', - f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}', - f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}', - f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}', - ) - prnt( - f'DEBUG={SHELL_CONFIG.DEBUG}', - f'IS_TTY={SHELL_CONFIG.IS_TTY}', - f'SUDO={CONSTANTS.IS_ROOT}', - f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}', - 
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}', - f'LDAP={LDAP_ENABLED}', - #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually - ) - prnt() - - if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)): - PANEL_TEXT = '\n'.join(( - # '', - # f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]', - '', - '[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...', - ' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.', - '', - ' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]', - '', - )) - prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]')) - prnt() - return - - prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]') - failures = [] - BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES()) - for name, binary in list(BINARIES.items()): - if binary.name == 'archivebox': - continue - - # skip if the binary is not in the requested list of binaries - if binaries and binary.name not in binaries: - continue - - # skip if the binary is not supported by any of the requested binproviders - if binproviders and binary.binproviders_supported and not any(provider.name in binproviders for provider in binary.binproviders_supported): - continue - - err = None - try: - loaded_bin = binary.load() - except Exception as e: - err = e - loaded_bin = binary - provider_summary = f'[dark_sea_green3]{loaded_bin.binprovider.name.ljust(10)}[/dark_sea_green3]' if loaded_bin.binprovider else '[grey23]not found[/grey23] ' - if loaded_bin.abspath: - abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~') - if ' ' in abspath: - abspath = abspath.replace(' ', r'\ ') - else: - abspath = f'[red]{err}[/red]' - prnt('', '[green]√[/green]' if loaded_bin.is_valid else '[red]X[/red]', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(12), provider_summary, abspath, overflow='ignore', crop=False) - if not loaded_bin.is_valid: - failures.append(loaded_bin.name) - - prnt() - prnt('[gold3][i] Package Managers:[/gold3]') - BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()) - for name, binprovider in list(BINPROVIDERS.items()): - err = None - - if binproviders and binprovider.name not in binproviders: - continue - - # TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN - loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew]) - - abspath = None - if loaded_bin.abspath: - abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~') - if ' ' in abspath: - abspath = abspath.replace(' ', r'\ ') - - PATH = str(binprovider.PATH).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~') - ownership_summary = f'UID=[blue]{str(binprovider.EUID).ljust(4)}[/blue]' - provider_summary = f'[dark_sea_green3]{str(abspath).ljust(52)}[/dark_sea_green3]' if abspath else f'[grey23]{"not 
available".ljust(52)}[/grey23]' - prnt('', '[green]√[/green]' if binprovider.is_valid else '[grey53]-[/grey53]', '', binprovider.name.ljust(11), provider_summary, ownership_summary, f'PATH={PATH}', overflow='ellipsis', soft_wrap=True) - - if not (binaries or binproviders): - # dont show source code / data dir info if we just want to get version info for a binary or binprovider - - prnt() - prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]') - for name, path in get_code_locations().items(): - prnt(printable_folder_status(name, path), overflow='ignore', crop=False) - - prnt() - if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK): - prnt('[bright_yellow][i] Data locations:[/bright_yellow]') - for name, path in get_data_locations().items(): - prnt(printable_folder_status(name, path), overflow='ignore', crop=False) - - from archivebox.misc.checks import check_data_dir_permissions - - check_data_dir_permissions() - else: - prnt() - prnt('[red][i] Data locations:[/red] (not in a data directory)') - - prnt() - - if failures: - raise SystemExit(1) - raise SystemExit(0) - -@enforce_types -def run(subcommand: str, - subcommand_args: Optional[List[str]], - stdin: Optional[IO]=None, - out_dir: Path=DATA_DIR) -> None: - """Run a given ArchiveBox subcommand with the given list of args""" - run_subcommand( - subcommand=subcommand, - subcommand_args=subcommand_args, - stdin=stdin, - pwd=out_dir, - ) - - -@enforce_types -def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Path=DATA_DIR) -> None: - """Initialize a new ArchiveBox collection in the current directory""" - - from core.models import Snapshot - from rich import print - - # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK): - # print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr) - # print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr) - - is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_DATA_DIR) - existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE) - if is_empty and not existing_index: - print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]') - print('[green]----------------------------------------------------------------------[/green]') - elif existing_index: - # TODO: properly detect and print the existing version in current index as well - print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]') - print('[green]----------------------------------------------------------------------[/green]') - else: - if force: - print('[red][!] 
This folder appears to already have files in it, but no index.sqlite3 is present.[/red]') - print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]') - else: - print( - ("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n" - " You must run init in a completely empty directory, or an existing data folder.\n\n" - " [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n" - " then run and run 'archivebox init' to pick up where you left off.\n\n" - " (Always make sure your data folder is backed up first before updating ArchiveBox)" - ) - ) - raise SystemExit(2) - - if existing_index: - print('\n[green][*] Verifying archive folder structure...[/green]') - else: - print('\n[green][+] Building archive folder structure...[/green]') - - print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...') - Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True) - Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True) - Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) - - print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...') - - # create the .archivebox_id file with a unique ID for this collection - from archivebox.config.paths import _get_collection_id - _get_collection_id(CONSTANTS.DATA_DIR, force_create=True) - - # create the ArchiveBox.conf file - write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY}) - - - if os.access(CONSTANTS.DATABASE_FILE, os.F_OK): - print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]') - else: - print('\n[green][+] Building main SQL index and running initial migrations...[/green]') - - for migration_line in apply_migrations(out_dir): - sys.stdout.write(f' {migration_line}\n') - - assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK) - print() - print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}') - - # from django.contrib.auth.models import User - # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists(): - # print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI)) - # call_command("createsuperuser", interactive=True) - - print() - print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]') - - all_links = Snapshot.objects.none() - pending_links: Dict[str, Link] = {} - - if existing_index: - all_links = load_main_index(out_dir=out_dir, warn=False) - print(f' √ Loaded {all_links.count()} links from existing main index.') - - if quick: - print(' > Skipping full snapshot directory check (quick mode)') - else: - try: - # Links in data folders that dont match their timestamp - fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir) - if fixed: - print(f' [yellow]√ Fixed {len(fixed)} data directory locations that didn\'t match their link timestamps.[/yellow]') - if cant_fix: - print(f' [red]! 
Could not fix {len(cant_fix)} data directory locations due to conflicts with existing folders.[/red]') - - # Links in JSON index but not in main index - orphaned_json_links = { - link.url: link - for link in parse_json_main_index(out_dir) - if not all_links.filter(url=link.url).exists() - } - if orphaned_json_links: - pending_links.update(orphaned_json_links) - print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]') - - # Links in data dir indexes but not in main index - orphaned_data_dir_links = { - link.url: link - for link in parse_json_links_details(out_dir) - if not all_links.filter(url=link.url).exists() - } - if orphaned_data_dir_links: - pending_links.update(orphaned_data_dir_links) - print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]') - - # Links in invalid/duplicate data dirs - invalid_folders = { - folder: link - for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items() - } - if invalid_folders: - print(f' [red]! Skipped adding {len(invalid_folders)} invalid link data directories.[/red]') - print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(DATA_DIR)} {link}' for folder, link in invalid_folders.items())) - print() - print(' [violet]Hint:[/violet] For more information about the link data directories that were skipped, run:') - print(' archivebox status') - print(' archivebox list --status=invalid') - - except (KeyboardInterrupt, SystemExit): - print(file=sys.stderr) - print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr) - print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr) - print(file=sys.stderr) - print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr) - print(' archivebox init --quick', file=sys.stderr) - raise SystemExit(1) - - write_main_index(list(pending_links.values()), out_dir=out_dir) - - print('\n[green]----------------------------------------------------------------------[/green]') - - from django.contrib.auth.models import User - - if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists(): - print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]') - User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD) - - if existing_index: - print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]') - else: - print(f'[green][√] Done. 
A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]') - - json_index = out_dir / CONSTANTS.JSON_INDEX_FILENAME - html_index = out_dir / CONSTANTS.HTML_INDEX_FILENAME - index_name = f"{date.today()}_index_old" - if os.access(json_index, os.F_OK): - json_index.rename(f"{index_name}.json") - if os.access(html_index, os.F_OK): - html_index.rename(f"{index_name}.html") - - CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True) - CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True) - CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True) - - from archivebox.config.common import STORAGE_CONFIG - STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True) - STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True) - - if install: - run_subcommand('install', pwd=out_dir) - - if Snapshot.objects.count() < 25: # hide the hints for experienced users - print() - print(' [violet]Hint:[/violet] To view your archive index, run:') - print(' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]') - print() - print(' To add new links, you can run:') - print(" archivebox add < ~/some/path/to/list_of_links.txt") - print() - print(' For more usage and examples, run:') - print(' archivebox help') - -@enforce_types -def status(out_dir: Path=DATA_DIR) -> None: - """Print out some info and statistics about the archive collection""" - - check_data_folder() - - from core.models import Snapshot - from django.contrib.auth import get_user_model - User = get_user_model() - - print('{green}[*] Scanning archive main index...{reset}'.format(**SHELL_CONFIG.ANSI)) - print(SHELL_CONFIG.ANSI['lightyellow'], f' {out_dir}/*', SHELL_CONFIG.ANSI['reset']) - num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.') - size = printable_filesize(num_bytes) - print(f' Index size: {size} across {num_files} files') - print() - - links = load_main_index(out_dir=out_dir) - num_sql_links = links.count() - num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir)) - print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})') - print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)') - print() - print('{green}[*] Scanning archive data directories...{reset}'.format(**SHELL_CONFIG.ANSI)) - print(SHELL_CONFIG.ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', SHELL_CONFIG.ANSI['reset']) - num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR) - size = printable_filesize(num_bytes) - print(f' Size: {size} across {num_files} files in {num_dirs} directories') - print(SHELL_CONFIG.ANSI['black']) - num_indexed = len(get_indexed_folders(links, out_dir=out_dir)) - num_archived = len(get_archived_folders(links, out_dir=out_dir)) - num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir)) - print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})') - print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})') - print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})') - - num_present = len(get_present_folders(links, out_dir=out_dir)) - num_valid = len(get_valid_folders(links, out_dir=out_dir)) - print() - print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})') - print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})') - - duplicate = 
get_duplicate_folders(links, out_dir=out_dir) - orphaned = get_orphaned_folders(links, out_dir=out_dir) - corrupted = get_corrupted_folders(links, out_dir=out_dir) - unrecognized = get_unrecognized_folders(links, out_dir=out_dir) - num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized}) - print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})') - print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})') - print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})') - print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})') - print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})') - - print(SHELL_CONFIG.ANSI['reset']) - - if num_indexed: - print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**SHELL_CONFIG.ANSI)) - print(' archivebox list --status= (e.g. indexed, corrupted, archived, etc.)') - - if orphaned: - print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**SHELL_CONFIG.ANSI)) - print(' archivebox init') - - if num_invalid: - print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**SHELL_CONFIG.ANSI)) - print(' archivebox init') - - print() - print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**SHELL_CONFIG.ANSI)) - print(SHELL_CONFIG.ANSI['lightyellow'], f' {CONSTANTS.LOGS_DIR}/*', SHELL_CONFIG.ANSI['reset']) - users = get_admins().values_list('username', flat=True) - print(f' UI users {len(users)}: {", ".join(users)}') - last_login = User.objects.order_by('last_login').last() - if last_login: - print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}') - last_downloaded = Snapshot.objects.order_by('downloaded_at').last() - if last_downloaded: - print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}') - - if not users: - print() - print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**SHELL_CONFIG.ANSI)) - print(' archivebox manage createsuperuser') - - print() - for snapshot in links.order_by('-downloaded_at')[:10]: - if not snapshot.downloaded_at: - continue - print( - SHELL_CONFIG.ANSI['black'], - ( - f' > {str(snapshot.downloaded_at)[:16]} ' - f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] ' - f'"{snapshot.title}": {snapshot.url}' - )[:SHELL_CONFIG.TERM_WIDTH], - SHELL_CONFIG.ANSI['reset'], - ) - print(SHELL_CONFIG.ANSI['black'], ' ...', SHELL_CONFIG.ANSI['reset']) - - -@enforce_types -def oneshot(url: str, extractors: str="", out_dir: Path=DATA_DIR, created_by_id: int | None=None) -> List[Link]: - """ - Create a single URL archive folder with an index.json and index.html, and all the archive method outputs. - You can run this to archive single pages without needing to create a whole collection with archivebox init. 
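    For illustration, a minimal sketch of the single-page workflow this docstring describes, using the pre-refactor oneshot() signature shown above; the URL, extractor names, and output directory are assumed example values, not taken from this patch:

        # Hypothetical usage sketch of the legacy archivebox.main.oneshot() helper
        # (this module is being removed by this patch). Assumes a pre-refactor
        # ArchiveBox install is importable; the URL, extractor list, and out_dir
        # below are illustrative placeholders.
        from pathlib import Path
        from archivebox.main import oneshot

        links = oneshot(
            'https://example.com/some/page',          # single URL to archive
            extractors='title,wget',                  # comma-separated extractor methods to run
            out_dir=Path('/tmp/one-page-archive'),    # any writable folder; no `archivebox init` needed
        )
        print([link.url for link in links])           # oneshot() returns the parsed Link list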
- """ - oneshot_link, _ = parse_links_memory([url]) - if len(oneshot_link) > 1: - stderr( - '[X] You should pass a single url to the oneshot command', - color='red' - ) - raise SystemExit(2) - - methods = extractors.split(",") if extractors else ignore_methods(['title']) - archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, created_by_id=created_by_id) - return oneshot_link - -@enforce_types -def add(urls: Union[str, List[str]], - tag: str='', - depth: int=0, - update: bool=not ARCHIVING_CONFIG.ONLY_NEW, - update_all: bool=False, - index_only: bool=False, - overwrite: bool=False, - # duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically - init: bool=False, - extractors: str="", - parser: str="auto", - created_by_id: int | None=None, - out_dir: Path=DATA_DIR) -> List[Link]: - """Add a new URL or list of URLs to your archive""" - - from core.models import Snapshot, Tag - # from workers.supervisor_util import start_cli_workers, tail_worker_logs - # from workers.tasks import bg_archive_link - - - assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' - - extractors = extractors.split(",") if extractors else [] - - if init: - run_subcommand('init', stdin=None, pwd=out_dir) - - # Load list of links from the existing index - check_data_folder() - - # worker = start_cli_workers() - - new_links: List[Link] = [] - all_links = load_main_index(out_dir=out_dir) - - log_importing_started(urls=urls, depth=depth, index_only=index_only) - if isinstance(urls, str): - # save verbatim stdin to sources - write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir) - elif isinstance(urls, list): - # save verbatim args to sources - write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir) - - - new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser) - - # If we're going one level deeper, download each link and look for more links - new_links_depth = [] - if new_links and depth == 1: - log_crawl_started(new_links) - for new_link in new_links: - try: - downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir) - new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url) - except Exception as err: - stderr('[!] 
Failed to get contents of URL {new_link.url}', err, color='red') - - imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) - - new_links = dedupe_links(all_links, imported_links) - - write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id) - all_links = load_main_index(out_dir=out_dir) - - tags = [ - Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0] - for name in tag.split(',') - if name.strip() - ] - if tags: - for link in imported_links: - snapshot = Snapshot.objects.get(url=link.url) - snapshot.tags.add(*tags) - snapshot.tags_str(nocache=True) - snapshot.save() - # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}') - - if index_only: - # mock archive all the links using the fake index_only extractor method in order to update their state - if overwrite: - archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id) - else: - archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id) - else: - # fully run the archive extractor methods for each link - archive_kwargs = { - "out_dir": out_dir, - "created_by_id": created_by_id, - } - if extractors: - archive_kwargs["methods"] = extractors - - stderr() - - ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S') - - if update: - stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green') - archive_links(imported_links, overwrite=overwrite, **archive_kwargs) - elif update_all: - stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green') - archive_links(all_links, overwrite=overwrite, **archive_kwargs) - elif overwrite: - stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green') - archive_links(imported_links, overwrite=True, **archive_kwargs) - elif new_links: - stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green') - archive_links(new_links, overwrite=False, **archive_kwargs) - - # tail_worker_logs(worker['stdout_logfile']) - - # if CAN_UPGRADE: - # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). 
For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n") - - return new_links - -@enforce_types -def remove(filter_str: Optional[str]=None, - filter_patterns: Optional[List[str]]=None, - filter_type: str='exact', - snapshots: Optional[QuerySet]=None, - after: Optional[float]=None, - before: Optional[float]=None, - yes: bool=False, - delete: bool=False, - out_dir: Path=DATA_DIR) -> List[Link]: - """Remove the specified URLs from the archive""" - - check_data_folder() - - if snapshots is None: - if filter_str and filter_patterns: - stderr( - '[X] You should pass either a pattern as an argument, ' - 'or pass a list of patterns via stdin, but not both.\n', - color='red', - ) - raise SystemExit(2) - elif not (filter_str or filter_patterns): - stderr( - '[X] You should pass either a pattern as an argument, ' - 'or pass a list of patterns via stdin.', - color='red', - ) - stderr() - hint(('To remove all urls you can run:', - 'archivebox remove --filter-type=regex ".*"')) - stderr() - raise SystemExit(2) - elif filter_str: - filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')] - - list_kwargs = { - "filter_patterns": filter_patterns, - "filter_type": filter_type, - "after": after, - "before": before, - } - if snapshots: - list_kwargs["snapshots"] = snapshots - - log_list_started(filter_patterns, filter_type) - timer = TimedProgress(360, prefix=' ') - try: - snapshots = list_links(**list_kwargs) - finally: - timer.end() - - - if not snapshots.exists(): - log_removal_finished(0, 0) - raise SystemExit(1) - - - log_links = [link.as_link() for link in snapshots] - log_list_finished(log_links) - log_removal_started(log_links, yes=yes, delete=delete) - - timer = TimedProgress(360, prefix=' ') - try: - for snapshot in snapshots: - if delete: - shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True) - finally: - timer.end() - - to_remove = snapshots.count() - - from .search import flush_search_index - - flush_search_index(snapshots=snapshots) - remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir) - all_snapshots = load_main_index(out_dir=out_dir) - log_removal_finished(all_snapshots.count(), to_remove) - - return all_snapshots - -@enforce_types -def update(resume: Optional[float]=None, - only_new: bool=ARCHIVING_CONFIG.ONLY_NEW, - index_only: bool=False, - overwrite: bool=False, - filter_patterns_str: Optional[str]=None, - filter_patterns: Optional[List[str]]=None, - filter_type: Optional[str]=None, - status: Optional[str]=None, - after: Optional[str]=None, - before: Optional[str]=None, - extractors: str="", - out_dir: Path=DATA_DIR) -> List[Link]: - """Import any new links from subscriptions and retry any previously failed/skipped links""" - - from core.models import ArchiveResult - from .search import index_links - # from workers.supervisor_util import start_cli_workers - - - check_data_folder() - # start_cli_workers() - new_links: List[Link] = [] # TODO: Remove input argument: only_new - - extractors = extractors.split(",") if extractors else [] - - # Step 1: Filter for selected_links - print('[*] Finding matching Snapshots to update...') - print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...') - matching_snapshots = list_links( - filter_patterns=filter_patterns, - filter_type=filter_type, - before=before, - after=after, - ) - print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...') - matching_folders = list_folders( - 
links=matching_snapshots, - status=status, - out_dir=out_dir, - ) - all_links = (link for link in matching_folders.values() if link) - print(' - Sorting by most unfinished -> least unfinished + date archived...') - all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp)) - - if index_only: - for link in all_links: - write_link_details(link, out_dir=out_dir, skip_sql_index=True) - index_links(all_links, out_dir=out_dir) - return all_links - - # Step 2: Run the archive methods for each link - to_archive = new_links if only_new else all_links - if resume: - to_archive = [ - link for link in to_archive - if link.timestamp >= str(resume) - ] - if not to_archive: - stderr('') - stderr(f'[√] Nothing found to resume after {resume}', color='green') - return all_links - - archive_kwargs = { - "out_dir": out_dir, - } - if extractors: - archive_kwargs["methods"] = extractors - - - archive_links(to_archive, overwrite=overwrite, **archive_kwargs) - - # Step 4: Re-write links index with updated titles, icons, and resources - all_links = load_main_index(out_dir=out_dir) - return all_links - -@enforce_types -def list_all(filter_patterns_str: Optional[str]=None, - filter_patterns: Optional[List[str]]=None, - filter_type: str='exact', - status: Optional[str]=None, - after: Optional[float]=None, - before: Optional[float]=None, - sort: Optional[str]=None, - csv: Optional[str]=None, - json: bool=False, - html: bool=False, - with_headers: bool=False, - out_dir: Path=DATA_DIR): - """List, filter, and export information about archive entries""" - - check_data_folder() - - if filter_patterns and filter_patterns_str: - stderr( - '[X] You should either pass filter patterns as an arguments ' - 'or via stdin, but not both.\n', - color='red', - ) - raise SystemExit(2) - elif filter_patterns_str: - filter_patterns = filter_patterns_str.split('\n') - - snapshots = list_links( - filter_patterns=filter_patterns, - filter_type=filter_type, - before=before, - after=after, - ) - - if sort: - snapshots = snapshots.order_by(sort) - - folders = list_folders( - links=snapshots, - status=status, - out_dir=out_dir, - ) - - if json: - output = generate_json_index_from_links(folders.values(), with_headers=with_headers) - elif html: - output = generate_index_from_links(folders.values(), with_headers=with_headers) - elif csv: - output = links_to_csv(folders.values(), cols=csv.split(','), header=with_headers) - else: - output = printable_folders(folders, with_headers=with_headers) - print(output) - return output - - -@enforce_types -def list_links(snapshots: Optional[QuerySet]=None, - filter_patterns: Optional[List[str]]=None, - filter_type: str='exact', - after: Optional[float]=None, - before: Optional[float]=None, - out_dir: Path=DATA_DIR) -> Iterable[Link]: - - check_data_folder() - - if snapshots: - all_snapshots = snapshots - else: - all_snapshots = load_main_index(out_dir=out_dir) - - if after is not None: - all_snapshots = all_snapshots.filter(timestamp__gte=after) - if before is not None: - all_snapshots = all_snapshots.filter(timestamp__lt=before) - if filter_patterns: - all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type) - - if not all_snapshots: - stderr('[!] 
No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') - - return all_snapshots - -@enforce_types -def list_folders(links: List[Link], - status: str, - out_dir: Path=DATA_DIR) -> Dict[str, Optional[Link]]: - - check_data_folder() - - STATUS_FUNCTIONS = { - "indexed": get_indexed_folders, - "archived": get_archived_folders, - "unarchived": get_unarchived_folders, - "present": get_present_folders, - "valid": get_valid_folders, - "invalid": get_invalid_folders, - "duplicate": get_duplicate_folders, - "orphaned": get_orphaned_folders, - "corrupted": get_corrupted_folders, - "unrecognized": get_unrecognized_folders, - } - - try: - return STATUS_FUNCTIONS[status](links, out_dir=out_dir) - except KeyError: - raise ValueError('Status not recognized.') - -@enforce_types -def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, binaries: Optional[List[str]]=None, dry_run: bool=False) -> None: - """Automatically install all ArchiveBox dependencies and extras""" - - # if running as root: - # - run init to create index + lib dir - # - chown -R 911 DATA_DIR - # - install all binaries as root - # - chown -R 911 LIB_DIR - # else: - # - run init to create index + lib dir as current user - # - install all binaries as current user - # - recommend user re-run with sudo if any deps need to be installed as root - - from rich import print - - from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP - from archivebox.config.paths import get_or_create_working_lib_dir - - if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()): - run_subcommand('init', stdin=None, pwd=out_dir) # must init full index because we need a db to store InstalledBinary entries in - - print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]') - - # we never want the data dir to be owned by root, detect owner of existing owner of DATA_DIR to try and guess desired non-root UID - if IS_ROOT: - EUID = os.geteuid() - - # if we have sudo/root permissions, take advantage of them just while installing dependencies - print() - print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue] with [red]sudo[/red] only for dependencies that need it.[/yellow]') - print(f' DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].') - print() - - LIB_DIR = get_or_create_working_lib_dir() - - package_manager_names = ', '.join( - f'[yellow]{binprovider.name}[/yellow]' - for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())) - if not binproviders or (binproviders and binprovider.name in binproviders) - ) - print(f'[+] Setting up package managers {package_manager_names}...') - for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())): - if binproviders and binprovider.name not in binproviders: - continue - try: - binprovider.setup() - except Exception: - # it's ok, installing binaries below will automatically set up package managers as needed - # e.g. 
if user does not have npm available we cannot set it up here yet, but once npm Binary is installed - # the next package that depends on npm will automatically call binprovider.setup() during its own install - pass - - print() - - for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())): - if binary.name in ('archivebox', 'django', 'sqlite', 'python'): - # obviously must already be installed if we are running - continue - - if binaries and binary.name not in binaries: - continue - - providers = ' [grey53]or[/grey53] '.join( - provider.name for provider in binary.binproviders_supported - if not binproviders or (binproviders and provider.name in binproviders) - ) - if not providers: - continue - print(f'[+] Detecting / Installing [yellow]{binary.name.ljust(22)}[/yellow] using [red]{providers}[/red]...') - try: - with SudoPermission(uid=0, fallback=True): - # print(binary.load_or_install(fresh=True).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})) - if binproviders: - providers_supported_by_binary = [provider.name for provider in binary.binproviders_supported] - for binprovider_name in binproviders: - if binprovider_name not in providers_supported_by_binary: - continue - try: - if dry_run: - # always show install commands when doing a dry run - sys.stderr.write("\033[2;49;90m") # grey53 - result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}) - sys.stderr.write("\033[00m\n") # reset - else: - loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False) - result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}) - if result and result['loaded_version']: - break - except Exception as e: - print(f'[red]:cross_mark: Failed to install {binary.name} as using {binprovider_name} as user {ARCHIVEBOX_USER}: {e}[/red]') - else: - if dry_run: - sys.stderr.write("\033[2;49;90m") # grey53 - binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}) - sys.stderr.write("\033[00m\n") # reset - else: - loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run) - result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}) - if IS_ROOT and LIB_DIR: - with SudoPermission(uid=0): - if ARCHIVEBOX_USER == 0: - os.system(f'chmod -R 777 "{LIB_DIR.resolve()}"') - else: - os.system(f'chown -R {ARCHIVEBOX_USER} "{LIB_DIR.resolve()}"') - except Exception as e: - print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]') - if binaries and len(binaries) == 1: - # if we are only installing a single binary, raise the exception so the user can see what went wrong - raise - - - from django.contrib.auth import get_user_model - User = get_user_model() - - if not User.objects.filter(is_superuser=True).exclude(username='system').exists(): - stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green') - stderr(' archivebox manage createsuperuser') - # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) - - print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr) - - from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY - - extra_args = [] - if binproviders: - extra_args.append(f'--binproviders={",".join(binproviders)}') - if binaries: - extra_args.append(f'--binaries={",".join(binaries)}') - - 
proc = run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version', *extra_args], capture_output=False, cwd=out_dir) - raise SystemExit(proc.returncode) - - -# backwards-compatibility: -setup = install - - -@enforce_types -def config(config_options_str: Optional[str]=None, - config_options: Optional[List[str]]=None, - get: bool=False, - set: bool=False, - search: bool=False, - reset: bool=False, - out_dir: Path=DATA_DIR) -> None: - """Get and set your ArchiveBox project configuration values""" - - from rich import print - - check_data_folder() - if config_options and config_options_str: - stderr( - '[X] You should either pass config values as an arguments ' - 'or via stdin, but not both.\n', - color='red', - ) - raise SystemExit(2) - elif config_options_str: - config_options = config_options_str.split('\n') - - FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG() - CONFIGS = archivebox.pm.hook.get_CONFIGS() - - config_options = config_options or [] - - no_args = not (get or set or reset or config_options) - - matching_config = {} - if search: - if config_options: - config_options = [get_real_name(key) for key in config_options] - matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG} - for config_section in CONFIGS.values(): - aliases = config_section.aliases - - for search_key in config_options: - # search all aliases in the section - for alias_key, key in aliases.items(): - if search_key.lower() in alias_key.lower(): - matching_config[key] = config_section.model_dump()[key] - - # search all keys and values in the section - for existing_key, value in config_section.model_dump().items(): - if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower(): - matching_config[existing_key] = value - - print(printable_config(matching_config)) - raise SystemExit(not matching_config) - elif get or no_args: - if config_options: - config_options = [get_real_name(key) for key in config_options] - matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG} - failed_config = [key for key in config_options if key not in FLAT_CONFIG] - if failed_config: - stderr() - stderr('[X] These options failed to get', color='red') - stderr(' {}'.format('\n '.join(config_options))) - raise SystemExit(1) - else: - matching_config = FLAT_CONFIG - - print(printable_config(matching_config)) - raise SystemExit(not matching_config) - elif set: - new_config = {} - failed_options = [] - for line in config_options: - if line.startswith('#') or not line.strip(): - continue - if '=' not in line: - stderr('[X] Config KEY=VALUE must have an = sign in it', color='red') - stderr(f' {line}') - raise SystemExit(2) - - raw_key, val = line.split('=', 1) - raw_key = raw_key.upper().strip() - key = get_real_name(raw_key) - if key != raw_key: - stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow') - - if key in FLAT_CONFIG: - new_config[key] = val.strip() - else: - failed_options.append(line) - - if new_config: - before = FLAT_CONFIG - matching_config = write_config_file(new_config) - after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()} - print(printable_config(matching_config)) - - side_effect_changes = {} - for key, val in after.items(): - if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config): - side_effect_changes[key] = after[key] - # import ipdb; ipdb.set_trace() - - if side_effect_changes: - stderr() - 
stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow') - print(' {}'.format(printable_config(side_effect_changes, prefix=' '))) - if failed_options: - stderr() - stderr('[X] These options failed to set (check for typos):', color='red') - stderr(' {}'.format('\n '.join(failed_options))) - raise SystemExit(1) - elif reset: - stderr('[X] This command is not implemented yet.', color='red') - stderr(' Please manually remove the relevant lines from your config file:') - raise SystemExit(2) - else: - stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red') - stderr(' archivebox config') - stderr(' archivebox config --get SOME_KEY') - stderr(' archivebox config --set SOME_KEY=SOME_VALUE') - raise SystemExit(2) - - -@enforce_types -def schedule(add: bool=False, - show: bool=False, - clear: bool=False, - foreground: bool=False, - run_all: bool=False, - quiet: bool=False, - every: Optional[str]=None, - tag: str='', - depth: int=0, - overwrite: bool=False, - update: bool=not ARCHIVING_CONFIG.ONLY_NEW, - import_path: Optional[str]=None, - out_dir: Path=DATA_DIR): - """Set ArchiveBox to regularly import URLs at specific times using cron""" - - check_data_folder() - from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY - from archivebox.config.permissions import USER - - Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) - - cron = CronTab(user=True) - cron = dedupe_cron_jobs(cron) - - if clear: - print(cron.remove_all(comment=CRON_COMMENT)) - cron.write() - raise SystemExit(0) - - existing_jobs = list(cron.find_comment(CRON_COMMENT)) - - if every or add: - every = every or 'day' - quoted = lambda s: f'"{s}"' if (s and ' ' in str(s)) else str(s) - cmd = [ - 'cd', - quoted(out_dir), - '&&', - quoted(ARCHIVEBOX_BINARY.load().abspath), - *([ - 'add', - *(['--overwrite'] if overwrite else []), - *(['--update'] if update else []), - *([f'--tag={tag}'] if tag else []), - f'--depth={depth}', - f'"{import_path}"', - ] if import_path else ['update']), - '>>', - quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'), - '2>&1', - - ] - new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT) - - if every in ('minute', 'hour', 'day', 'month', 'year'): - set_every = getattr(new_job.every(), every) - set_every() - elif CronSlices.is_valid(every): - new_job.setall(every) - else: - stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI)) - stderr(' It must be one of minute/hour/day/month') - stderr(' or a quoted cron-format schedule like:') - stderr(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml') - stderr(' archivebox init --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml') - raise SystemExit(1) - - cron = dedupe_cron_jobs(cron) - cron.write() - - total_runs = sum(j.frequency_per_year() for j in cron) - existing_jobs = list(cron.find_comment(CRON_COMMENT)) - - print() - print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI)) - print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs)) - if total_runs > 60 and not quiet: - stderr() - stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI)) - stderr(' Congrats on being an enthusiastic internet archiver! 
👌') - stderr() - stderr(' Make sure you have enough storage space available to hold all the data.') - stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.') - stderr('') - elif show: - if existing_jobs: - print('\n'.join(str(cmd) for cmd in existing_jobs)) - else: - stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI)) - stderr(' To schedule a new job, run:') - stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml') - raise SystemExit(0) - - cron = CronTab(user=True) - cron = dedupe_cron_jobs(cron) - existing_jobs = list(cron.find_comment(CRON_COMMENT)) - - if foreground or run_all: - if not existing_jobs: - stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI)) - stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml') - raise SystemExit(1) - - print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI)) - if run_all: - try: - for job in existing_jobs: - sys.stdout.write(f' > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n') - sys.stdout.write(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}') - sys.stdout.flush() - job.run() - sys.stdout.write(f'\r √ {job.command.split("/archivebox ")[-1]}\n') - except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) - raise SystemExit(1) - - if foreground: - try: - for job in existing_jobs: - print(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}') - for result in cron.run_scheduler(): - print(result) - except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) - raise SystemExit(1) - - # if CAN_UPGRADE: - # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n") - - -@enforce_types -def server(runserver_args: Optional[List[str]]=None, - reload: bool=False, - debug: bool=False, - init: bool=False, - quick_init: bool=False, - createsuperuser: bool=False, - daemonize: bool=False, - out_dir: Path=DATA_DIR) -> None: - """Run the ArchiveBox HTTP server""" - - from rich import print - - runserver_args = runserver_args or [] - - if init: - run_subcommand('init', stdin=None, pwd=out_dir) - print() - elif quick_init: - run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir) - print() - - if createsuperuser: - run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) - print() - - - check_data_folder() - - from django.core.management import call_command - from django.contrib.auth.models import User - - if not User.objects.filter(is_superuser=True).exclude(username='system').exists(): - print() - # print('[yellow][!] 
No admin accounts exist, you must create one to be able to log in to the Admin UI![/yellow]') - print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:') - print(' [green]archivebox manage createsuperuser[/green]') - print() - - - host = '127.0.0.1' - port = '8000' - - try: - host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0] - if ':' in host_and_port: - host, port = host_and_port.split(':') - else: - if '.' in host_and_port: - host = host_and_port - else: - port = host_and_port - except IndexError: - pass - - print('[green][+] Starting ArchiveBox webserver...[/green]') - print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') - print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]') - print(' > Writing ArchiveBox error log to ./logs/errors.log') - - if SHELL_CONFIG.DEBUG: - if not reload: - runserver_args.append('--noreload') # '--insecure' - call_command("runserver", *runserver_args) - else: - from workers.supervisor_util import start_server_workers - - print() - start_server_workers(host=host, port=port, daemonize=False) - print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]") - - -@enforce_types -def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None: - """Run an ArchiveBox Django management command""" - - check_data_folder() - from django.core.management import execute_from_command_line - - if (args and "createsuperuser" in args) and (IN_DOCKER and not SHELL_CONFIG.IS_TTY): - stderr('[!] 
Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
-        stderr('    docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
-        stderr('')
-
-    # import ipdb; ipdb.set_trace()
-
-    execute_from_command_line(['manage.py', *(args or ['help'])])
-
-
-@enforce_types
-def shell(out_dir: Path=DATA_DIR) -> None:
-    """Enter an interactive ArchiveBox Django shell"""
-
-    check_data_folder()
-
-    from django.core.management import call_command
-    call_command("shell_plus")
-
diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py
index bf4fae9a..8916bbad 100644
--- a/archivebox/misc/checks.py
+++ b/archivebox/misc/checks.py
@@ -24,7 +24,7 @@ def check_data_folder() -> None:
     from archivebox.config import CONSTANTS
     from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir

-    archive_dir_exists = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()
+    archive_dir_exists = os.path.isdir(ARCHIVE_DIR)
     if not archive_dir_exists:
         print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
         print(f'    {DATA_DIR}', file=sys.stderr)
diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py
index 4996a188..5e688961 100644
--- a/archivebox/misc/logging_util.py
+++ b/archivebox/misc/logging_util.py
@@ -12,7 +12,7 @@ from pathlib import Path
 from datetime import datetime, timezone
 from dataclasses import dataclass

-from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING
+from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING

 if TYPE_CHECKING:
     from ..index.schema import Link, ArchiveResult
@@ -228,7 +228,7 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
     print()


-def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str='.'):
+def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: str | IO | None=None, pwd: str='.'):
     args = ' '.join(subcommand_args)
     version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
         now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
diff --git a/archivebox/misc/shell_welcome_message.py b/archivebox/misc/shell_welcome_message.py
index 26314dc0..5e25050e 100644
--- a/archivebox/misc/shell_welcome_message.py
+++ b/archivebox/misc/shell_welcome_message.py
@@ -20,11 +20,9 @@ from datetime import datetime, timedelta  # noqa
 from django.conf import settings  # noqa

 from archivebox import CONSTANTS  # noqa
-from ..main import *  # noqa
-from ..cli import CLI_SUBCOMMANDS
+from archivebox.cli import *  # noqa

 CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
-CLI_COMMAND_NAMES = ", ".join(CLI_SUBCOMMANDS.keys())

 if __name__ == '__main__':
     # load the rich extension for ipython for pretty printing
@@ -40,7 +38,7 @@ if __name__ == '__main__':
     prnt('[green]import re, os, sys, psutil, subprocess, reqiests, json, pydantic, benedict, django, abx[/]')
     prnt('[yellow4]# ArchiveBox Imports[/]')
     prnt('[yellow4]import archivebox[/]')
-    prnt('[yellow4]from archivebox.main import {}[/]'.format(CLI_COMMAND_NAMES))
+    prnt('[yellow4]from archivebox.cli import *[/]')
     prnt()

     if console.width >= 80:
diff --git a/archivebox/pkgs/abx/abx.py b/archivebox/pkgs/abx/abx.py
index 481bb0ac..32dd4f6e 100644
--- a/archivebox/pkgs/abx/abx.py
+++ b/archivebox/pkgs/abx/abx.py
@@ -459,8 +459,8 @@ def load_plugins(plugins: Iterable[PluginId | ModuleType | Type] | Dict[PluginId
     PLUGINS_TO_LOAD = sorted(PLUGINS_TO_LOAD, key=lambda x: x['order'])

     for plugin_info in PLUGINS_TO_LOAD:
-        if '--version' not in sys.argv and '--help' not in sys.argv:
-            print(f'🧩 Loading plugin: {plugin_info["id"]}...', end='\r', flush=True, file=sys.stderr)
+        # if '--version' not in sys.argv and '--help' not in sys.argv:
+        #     print(f'🧩 Loading plugin: {plugin_info["id"]}...', end='\r', flush=True, file=sys.stderr)
         pm.register(plugin_info['module'])
         LOADED_PLUGINS[plugin_info['id']] = plugin_info
     # print('\x1b[2K', end='\r', flush=True, file=sys.stderr)
diff --git a/archivebox/workers/semaphores.py b/archivebox/workers/semaphores.py
index 1be98ee3..ed583893 100644
--- a/archivebox/workers/semaphores.py
+++ b/archivebox/workers/semaphores.py
@@ -1,103 +1,103 @@
-import uuid
-from functools import wraps
-from django.db import connection, transaction
-from django.utils import timezone
-from huey.exceptions import TaskLockedException
+# import uuid
+# from functools import wraps
+# from django.db import connection, transaction
+# from django.utils import timezone
+# from huey.exceptions import TaskLockedException

-from archivebox.config import CONSTANTS
+# from archivebox.config import CONSTANTS


-class SqliteSemaphore:
-    def __init__(self, db_path, table_name, name, value=1, timeout=None):
-        self.db_path = db_path
-        self.table_name = table_name
-        self.name = name
-        self.value = value
-        self.timeout = timeout or 86400  # Set a max age for lock holders
+# class SqliteSemaphore:
+#     def __init__(self, db_path, table_name, name, value=1, timeout=None):
+#         self.db_path = db_path
+#         self.table_name = table_name
+#         self.name = name
+#         self.value = value
+#         self.timeout = timeout or 86400  # Set a max age for lock holders

-        # Ensure the table exists
-        with connection.cursor() as cursor:
-            cursor.execute(f"""
-            CREATE TABLE IF NOT EXISTS {self.table_name} (
-                id TEXT PRIMARY KEY,
-                name TEXT,
-                timestamp DATETIME
-            )
-            """)
+#         # Ensure the table exists
+#         with connection.cursor() as cursor:
+#             cursor.execute(f"""
+#             CREATE TABLE IF NOT EXISTS {self.table_name} (
+#                 id TEXT PRIMARY KEY,
+#                 name TEXT,
+#                 timestamp DATETIME
+#             )
+#             """)

-    def acquire(self, name=None):
-        name = name or str(uuid.uuid4())
-        now = timezone.now()
-        expiration = now - timezone.timedelta(seconds=self.timeout)
+#     def acquire(self, name=None):
+#         name = name or str(uuid.uuid4())
+#         now = timezone.now()
+#         expiration = now - timezone.timedelta(seconds=self.timeout)

-        with transaction.atomic():
-            # Remove expired locks
-            with connection.cursor() as cursor:
-                cursor.execute(f"""
-                DELETE FROM {self.table_name}
-                WHERE name = %s AND timestamp < %s
-                """, [self.name, expiration])
+#         with transaction.atomic():
+#             # Remove expired locks
+#             with connection.cursor() as cursor:
+#                 cursor.execute(f"""
+#                 DELETE FROM {self.table_name}
+#                 WHERE name = %s AND timestamp < %s
+#                 """, [self.name, expiration])

-            # Try to acquire the lock
-            with connection.cursor() as cursor:
-                cursor.execute(f"""
-                INSERT INTO {self.table_name} (id, name, timestamp)
-                SELECT %s, %s, %s
-                WHERE (
-                    SELECT COUNT(*) FROM {self.table_name}
-                    WHERE name = %s
-                ) < %s
-                """, [name, self.name, now, self.name, self.value])
+#             # Try to acquire the lock
+#             with connection.cursor() as cursor:
+#                 cursor.execute(f"""
+#                 INSERT INTO {self.table_name} (id, name, timestamp)
+#                 SELECT %s, %s, %s
+#                 WHERE (
+#                     SELECT COUNT(*) FROM {self.table_name}
+#                     WHERE name = %s
+#                 ) < %s
+#                 """, [name, self.name, now, self.name, self.value])

-                if cursor.rowcount > 0:
-                    return name
+#                 if cursor.rowcount > 0:
+#                     return name

-            # If we couldn't acquire the lock, remove our attempted entry
-            with connection.cursor() as cursor:
-                cursor.execute(f"""
-                DELETE FROM {self.table_name}
-                WHERE id = %s AND name = %s
-                """, [name, self.name])
+#             # If we couldn't acquire the lock, remove our attempted entry
+#             with connection.cursor() as cursor:
+#                 cursor.execute(f"""
+#                 DELETE FROM {self.table_name}
+#                 WHERE id = %s AND name = %s
+#                 """, [name, self.name])

-        return None
+#         return None

-    def release(self, name):
-        with connection.cursor() as cursor:
-            cursor.execute(f"""
-            DELETE FROM {self.table_name}
-            WHERE id = %s AND name = %s
-            """, [name, self.name])
-        return cursor.rowcount > 0
+#     def release(self, name):
+#         with connection.cursor() as cursor:
+#             cursor.execute(f"""
+#             DELETE FROM {self.table_name}
+#             WHERE id = %s AND name = %s
+#             """, [name, self.name])
+#         return cursor.rowcount > 0


-LOCKS_DB_PATH = CONSTANTS.DATABASE_FILE.parent / 'locks.sqlite3'
+# LOCKS_DB_PATH = CONSTANTS.DATABASE_FILE.parent / 'locks.sqlite3'

-def lock_task_semaphore(db_path, table_name, lock_name, value=1, timeout=None):
-    """
-    Lock which can be acquired multiple times (default = 1).
+# def lock_task_semaphore(db_path, table_name, lock_name, value=1, timeout=None):
+#     """
+#     Lock which can be acquired multiple times (default = 1).

-    NOTE: no provisions are made for blocking, waiting, or notifying. This is
-    just a lock which can be acquired a configurable number of times.
+#     NOTE: no provisions are made for blocking, waiting, or notifying. This is
+#     just a lock which can be acquired a configurable number of times.

-    Example:
+#     Example:

-        # Allow up to 3 workers to run this task concurrently. If the task is
-        # locked, retry up to 2 times with a delay of 60s.
-        @huey.task(retries=2, retry_delay=60)
-        @lock_task_semaphore('path/to/db.sqlite3', 'semaphore_locks', 'my-lock', 3)
-        def my_task():
-            ...
-    """
-    sem = SqliteSemaphore(db_path, table_name, lock_name, value, timeout)
-    def decorator(fn):
-        @wraps(fn)
-        def inner(*args, **kwargs):
-            tid = sem.acquire()
-            if tid is None:
-                raise TaskLockedException(f'unable to acquire lock {lock_name}')
-            try:
-                return fn(*args, **kwargs)
-            finally:
-                sem.release(tid)
-        return inner
-    return decorator
+#         # Allow up to 3 workers to run this task concurrently. If the task is
+#         # locked, retry up to 2 times with a delay of 60s.
+#         @huey.task(retries=2, retry_delay=60)
+#         @lock_task_semaphore('path/to/db.sqlite3', 'semaphore_locks', 'my-lock', 3)
+#         def my_task():
+#             ...
+#     """
+#     sem = SqliteSemaphore(db_path, table_name, lock_name, value, timeout)
+#     def decorator(fn):
+#         @wraps(fn)
+#         def inner(*args, **kwargs):
+#             tid = sem.acquire()
+#             if tid is None:
+#                 raise TaskLockedException(f'unable to acquire lock {lock_name}')
+#             try:
+#                 return fn(*args, **kwargs)
+#             finally:
+#                 sem.release(tid)
+#         return inner
+#     return decorator
diff --git a/archivebox/workers/supervisor_util.py b/archivebox/workers/supervisord_util.py
similarity index 100%
rename from archivebox/workers/supervisor_util.py
rename to archivebox/workers/supervisord_util.py
diff --git a/archivebox/workers/tasks.py b/archivebox/workers/tasks.py
index e6e3adc3..b81ee990 100644
--- a/archivebox/workers/tasks.py
+++ b/archivebox/workers/tasks.py
@@ -8,7 +8,7 @@ from django_huey import db_task, task
 from huey_monitor.models import TaskModel
 from huey_monitor.tqdm import ProcessInfo

-from .supervisor_util import get_or_create_supervisord_process
+from .supervisord_util import get_or_create_supervisord_process

 # @db_task(queue="commands", context=True, schedule=1)
 # def scheduler_tick():
diff --git a/pyproject.toml b/pyproject.toml
index 55b8e56f..6e9adea2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -115,6 +115,8 @@ dependencies = [
     "abx-plugin-mercury>=2024.10.28",
     "abx-plugin-htmltotext>=2024.10.28",
     "python-statemachine>=2.3.6",
+    "click>=8.1.7",
+    "rich-click>=1.8.4",
 ]

 [project.optional-dependencies]
diff --git a/uv.lock b/uv.lock
index ef3ccf90..9b31ed5a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -658,6 +658,7 @@ dependencies = [
     { name = "atomicwrites", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "base32-crockford", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "channels", extra = ["daphne"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "croniter", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "dateparser", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "django", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -688,6 +689,7 @@ dependencies = [
     { name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "rich-argparse", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "rich-click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "sonic-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "supervisor", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -784,6 +786,7 @@ requires-dist = [
     { name = "atomicwrites", specifier = "==1.4.1" },
     { name = "base32-crockford", specifier = "==0.3.0" },
     { name = "channels", extras = ["daphne"], specifier = ">=4.1.0" },
+    { name = "click", specifier = ">=8.1.7" },
     { name = "croniter", specifier = ">=3.0.3" },
     { name = "dateparser", specifier = ">=1.2.0" },
     { name = "django", specifier = ">=5.1.1,<6.0" },
@@ -821,6 +824,7 @@ requires-dist = [
     { name = "requests-tracker", marker = "extra == 'debug'", specifier = ">=0.3.3" },
     { name = "rich", specifier = ">=13.8.0" },
     { name = "rich-argparse", specifier = ">=1.5.2" },
+    { name = "rich-click", specifier = ">=1.8.4" },
     { name = "setuptools", specifier = ">=74.1.0" },
     { name = "sonic-client", specifier = ">=1.0.0" },
     { name = "supervisor", specifier = ">=4.2.5" },
@@ -2806,6 +2810,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/25/45/54b95bb72bb17c27a7252bee5034955020b5869a33918b660ffc29cbf608/rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7", size = 20072 },
 ]

+[[package]]
+name = "rich-click"
+version = "1.8.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/fc/f4/e48dc2850662526a26fb0961aacb0162c6feab934312b109b748ae4efee2/rich_click-1.8.4.tar.gz", hash = "sha256:0f49471f04439269d0e66a6f43120f52d11d594869a2a0be600cfb12eb0616b9", size = 38247 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/84/f3/72f93d8494ee641bde76bfe1208cf4abc44c6f9448673762f6077bc162d6/rich_click-1.8.4-py3-none-any.whl", hash = "sha256:2d2841b3cebe610d5682baa1194beaf78ab00c4fa31931533261b5eba2ee80b7", size = 35071 },
+]
+
 [[package]]
 name = "ruff"
 version = "0.7.4"