From 230bf34e1469808bc27ed5f36dfcf8e0d2966d08 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Nov 2024 05:05:06 -0800 Subject: [PATCH 1/7] restore missing archivebox_config work --- archivebox/cli/archivebox_config.py | 166 +++++++++++----------------- 1 file changed, 62 insertions(+), 104 deletions(-) diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py index 2eb2676f..897af5e0 100644 --- a/archivebox/cli/archivebox_config.py +++ b/archivebox/cli/archivebox_config.py @@ -1,48 +1,36 @@ #!/usr/bin/env python3 __package__ = 'archivebox.cli' -__command__ = 'archivebox config' import sys -import argparse -from pathlib import Path +import rich_click as click +from rich import print +from benedict import benedict -from typing import Optional, List, IO - -from archivebox.misc.util import docstring -from archivebox.config import DATA_DIR -from archivebox.misc.logging_util import SmartFormatter, accept_stdin +from archivebox.misc.util import docstring, enforce_types +from archivebox.misc.toml_util import CustomTOMLEncoder - -# @enforce_types -def config(config_options_str: Optional[str]=None, - config_options: Optional[List[str]]=None, - get: bool=False, - set: bool=False, - search: bool=False, - reset: bool=False, - out_dir: Path=DATA_DIR) -> None: +@enforce_types +def config(*keys, + get: bool=False, + set: bool=False, + search: bool=False, + reset: bool=False, + **kwargs) -> None: """Get and set your ArchiveBox project configuration values""" - from rich import print + import archivebox + from archivebox.misc.checks import check_data_folder + from archivebox.misc.logging_util import printable_config + from archivebox.config.collection import load_all_config, write_config_file, get_real_name check_data_folder() - if config_options and config_options_str: - stderr( - '[X] You should either pass config values as an arguments ' - 'or via stdin, but not both.\n', - color='red', - ) - raise SystemExit(2) - elif config_options_str: - config_options = config_options_str.split('\n') FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG() CONFIGS = archivebox.pm.hook.get_CONFIGS() - config_options = config_options or [] - + config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()]) no_args = not (get or set or reset or config_options) matching_config = {} @@ -51,36 +39,47 @@ def config(config_options_str: Optional[str]=None, config_options = [get_real_name(key) for key in config_options] matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG} for config_section in CONFIGS.values(): - aliases = config_section.aliases + aliases = getattr(config_section, 'aliases', {}) for search_key in config_options: # search all aliases in the section for alias_key, key in aliases.items(): if search_key.lower() in alias_key.lower(): - matching_config[key] = config_section.model_dump()[key] + matching_config[key] = dict(config_section)[key] # search all keys and values in the section - for existing_key, value in config_section.model_dump().items(): + for existing_key, value in dict(config_section).items(): if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower(): matching_config[existing_key] = value print(printable_config(matching_config)) raise SystemExit(not matching_config) + elif get or no_args: if config_options: config_options = [get_real_name(key) for key in config_options] matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG} failed_config = [key for key in config_options if key not in FLAT_CONFIG] if failed_config: - stderr() - stderr('[X] These options failed to get', color='red') - stderr(' {}'.format('\n '.join(config_options))) + print('\n[red][X] These options failed to get[/red]') + print(' {}'.format('\n '.join(config_options))) raise SystemExit(1) else: matching_config = FLAT_CONFIG - print(printable_config(matching_config)) + for config_section in CONFIGS.values(): + if hasattr(config_section, 'toml_section_header'): + print(f'[grey53]\\[{config_section.toml_section_header}][/grey53]') + else: + print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]') + + kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config} + print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n')) + print('[grey53]################################################################[/grey53]') + + raise SystemExit(not matching_config) + elif set: new_config = {} failed_options = [] @@ -88,15 +87,15 @@ def config(config_options_str: Optional[str]=None, if line.startswith('#') or not line.strip(): continue if '=' not in line: - stderr('[X] Config KEY=VALUE must have an = sign in it', color='red') - stderr(f' {line}') + print('[red][X] Config KEY=VALUE must have an = sign in it[/red]') + print(f' {line}') raise SystemExit(2) raw_key, val = line.split('=', 1) raw_key = raw_key.upper().strip() key = get_real_name(raw_key) if key != raw_key: - stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow') + print(f'[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]') if key in FLAT_CONFIG: new_config[key] = val.strip() @@ -113,82 +112,41 @@ def config(config_options_str: Optional[str]=None, for key, val in after.items(): if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config): side_effect_changes[key] = after[key] - # import ipdb; ipdb.set_trace() if side_effect_changes: - stderr() - stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow') - print(' {}'.format(printable_config(side_effect_changes, prefix=' '))) + print(file=sys.stderr) + print('[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]', file=sys.stderr) + print(' {}'.format(printable_config(side_effect_changes, prefix=' ')), file=sys.stderr) + if failed_options: - stderr() - stderr('[X] These options failed to set (check for typos):', color='red') - stderr(' {}'.format('\n '.join(failed_options))) + print() + print('[red][X] These options failed to set (check for typos):[/red]') + print(' {}'.format('\n '.join(failed_options))) raise SystemExit(1) + elif reset: - stderr('[X] This command is not implemented yet.', color='red') - stderr(' Please manually remove the relevant lines from your config file:') + print('[red][X] This command is not implemented yet.[/red]') + print(' Please manually remove the relevant lines from your config file:') raise SystemExit(2) + else: - stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red') - stderr(' archivebox config') - stderr(' archivebox config --get SOME_KEY') - stderr(' archivebox config --set SOME_KEY=SOME_VALUE') + print('[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]') + print(' archivebox config') + print(' archivebox config --get SOME_KEY') + print(' archivebox config --set SOME_KEY=SOME_VALUE') raise SystemExit(2) - - +@click.command() +@click.option('--search', is_flag=True, help='Search config KEYs, VALUEs, and ALIASES for the given term') +@click.option('--get', is_flag=True, help='Get the value for the given config KEYs') +@click.option('--set', is_flag=True, help='Set the given KEY=VALUE config values') +@click.option('--reset', is_flag=True, help='Reset the given KEY config values to their defaults') +@click.argument('KEY=VALUE', nargs=-1, type=str) @docstring(config.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=config.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - group = parser.add_mutually_exclusive_group() - parser.add_argument( - '--search', - action='store_true', - help="Search config KEYs, VALUEs, and ALIASES for the given term", - ) - group.add_argument( - '--get', #'-g', - action='store_true', - help="Get the value for the given config KEYs", - ) - group.add_argument( - '--set', #'-s', - action='store_true', - help="Set the given KEY=VALUE config values", - ) - group.add_argument( - '--reset', #'-s', - action='store_true', - help="Reset the given KEY config values to their defaults", - ) - parser.add_argument( - 'config_options', - nargs='*', - type=str, - help='KEY or KEY=VALUE formatted config values to get or set', - ) - command = parser.parse_args(args or ()) - - config_options_str = '' - if not command.config_options: - config_options_str = accept_stdin(stdin) - - config( - config_options_str=config_options_str, - config_options=command.config_options, - search=command.search, - get=command.get, - set=command.set, - reset=command.reset, - out_dir=Path(pwd) if pwd else DATA_DIR, - ) +def main(**kwargs) -> None: + config(**kwargs) if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) + main() From fe3320eff00dcc41cdc63ac40ad17f8b2b6f86ba Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Nov 2024 05:07:12 -0800 Subject: [PATCH 2/7] restore missing archivebox_remove work --- archivebox/cli/archivebox_remove.py | 141 ++++++++-------------------- 1 file changed, 40 insertions(+), 101 deletions(-) diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py index 317dc792..0f03d686 100644 --- a/archivebox/cli/archivebox_remove.py +++ b/archivebox/cli/archivebox_remove.py @@ -3,53 +3,45 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox remove' -import sys -import argparse +import shutil from pathlib import Path -from typing import Optional, List, IO +from typing import Iterable + +import rich_click as click from django.db.models import QuerySet -from archivebox.misc.util import docstring from archivebox.config import DATA_DIR -from archivebox.misc.logging_util import SmartFormatter, accept_stdin from archivebox.index.schema import Link +from archivebox.config.django import setup_django +from archivebox.index import load_main_index +from archivebox.index.sql import remove_from_sql_main_index +from archivebox.misc.util import enforce_types, docstring +from archivebox.misc.checks import check_data_folder +from archivebox.misc.logging_util import ( + log_list_started, + log_list_finished, + log_removal_started, + log_removal_finished, + TimedProgress, +) -def remove(filter_str: Optional[str]=None, - filter_patterns: Optional[list[str]]=None, - filter_type: str='exact', - snapshots: Optional[QuerySet]=None, - after: Optional[float]=None, - before: Optional[float]=None, - yes: bool=False, - delete: bool=False, - out_dir: Path=DATA_DIR) -> list[Link]: +@enforce_types +def remove(filter_patterns: Iterable[str]=(), + filter_type: str='exact', + snapshots: QuerySet | None=None, + after: float | None=None, + before: float | None=None, + yes: bool=False, + delete: bool=False, + out_dir: Path=DATA_DIR) -> Iterable[Link]: """Remove the specified URLs from the archive""" + setup_django() check_data_folder() - - if snapshots is None: - if filter_str and filter_patterns: - stderr( - '[X] You should pass either a pattern as an argument, ' - 'or pass a list of patterns via stdin, but not both.\n', - color='red', - ) - raise SystemExit(2) - elif not (filter_str or filter_patterns): - stderr( - '[X] You should pass either a pattern as an argument, ' - 'or pass a list of patterns via stdin.', - color='red', - ) - stderr() - hint(('To remove all urls you can run:', - 'archivebox remove --filter-type=regex ".*"')) - stderr() - raise SystemExit(2) - elif filter_str: - filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')] + + from archivebox.cli.archivebox_search import list_links list_kwargs = { "filter_patterns": filter_patterns, @@ -67,12 +59,10 @@ def remove(filter_str: Optional[str]=None, finally: timer.end() - if not snapshots.exists(): log_removal_finished(0, 0) raise SystemExit(1) - log_links = [link.as_link() for link in snapshots] log_list_finished(log_links) log_removal_started(log_links, yes=yes, delete=delete) @@ -87,7 +77,7 @@ def remove(filter_str: Optional[str]=None, to_remove = snapshots.count() - from .search import flush_search_index + from archivebox.search import flush_search_index flush_search_index(snapshots=snapshots) remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir) @@ -97,69 +87,18 @@ def remove(filter_str: Optional[str]=None, return all_snapshots +@click.command() +@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm') +@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index') +@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp') +@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp') +@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs') +@click.argument('filter_patterns', nargs=-1) @docstring(remove.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=remove.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--yes', # '-y', - action='store_true', - help='Remove links instantly without prompting to confirm.', - ) - parser.add_argument( - '--delete', # '-r', - action='store_true', - help=( - "In addition to removing the link from the index, " - "also delete its archived content and metadata folder." - ), - ) - parser.add_argument( - '--before', #'-b', - type=float, - help="List only URLs bookmarked before the given timestamp.", - default=None, - ) - parser.add_argument( - '--after', #'-a', - type=float, - help="List only URLs bookmarked after the given timestamp.", - default=None, - ) - parser.add_argument( - '--filter-type', - type=str, - choices=('exact', 'substring', 'domain', 'regex','tag'), - default='exact', - help='Type of pattern matching to use when filtering URLs', - ) - parser.add_argument( - 'filter_patterns', - nargs='*', - type=str, - help='URLs matching this filter pattern will be removed from the index.' - ) - command = parser.parse_args(args or ()) - - filter_str = None - if not command.filter_patterns: - filter_str = accept_stdin(stdin) +def main(**kwargs): + """Remove the specified URLs from the archive""" + remove(**kwargs) - remove( - filter_str=filter_str, - filter_patterns=command.filter_patterns, - filter_type=command.filter_type, - before=command.before, - after=command.after, - yes=command.yes, - delete=command.delete, - out_dir=Path(pwd) if pwd else DATA_DIR, - ) - if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) + main() From 0f536ff18badbcb453b7ebf6a2150dea9b143dc4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Nov 2024 05:07:55 -0800 Subject: [PATCH 3/7] restore missing archivebox_schedule work --- archivebox/cli/archivebox_schedule.py | 214 ++++++++------------------ 1 file changed, 66 insertions(+), 148 deletions(-) diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py index d2f85c84..561d0d2d 100644 --- a/archivebox/cli/archivebox_schedule.py +++ b/archivebox/cli/archivebox_schedule.py @@ -1,38 +1,43 @@ #!/usr/bin/env python3 __package__ = 'archivebox.cli' -__command__ = 'archivebox schedule' import sys -import argparse from pathlib import Path -from typing import Optional, List, IO -from archivebox.misc.util import docstring -from archivebox.config import DATA_DIR -from archivebox.misc.logging_util import SmartFormatter, reject_stdin +import rich_click as click +from rich import print + +from archivebox.misc.util import enforce_types, docstring +from archivebox.config import DATA_DIR, CONSTANTS from archivebox.config.common import ARCHIVING_CONFIG +from archivebox.config.permissions import USER -# @enforce_types +CRON_COMMENT = 'ArchiveBox' + + +@enforce_types def schedule(add: bool=False, - show: bool=False, - clear: bool=False, - foreground: bool=False, - run_all: bool=False, - quiet: bool=False, - every: Optional[str]=None, - tag: str='', - depth: int=0, - overwrite: bool=False, - update: bool=not ARCHIVING_CONFIG.ONLY_NEW, - import_path: Optional[str]=None, - out_dir: Path=DATA_DIR): + show: bool=False, + clear: bool=False, + foreground: bool=False, + run_all: bool=False, + quiet: bool=False, + every: str | None=None, + tag: str='', + depth: int | str=0, + overwrite: bool=False, + update: bool=not ARCHIVING_CONFIG.ONLY_NEW, + import_path: str | None=None, + out_dir: Path=DATA_DIR) -> None: """Set ArchiveBox to regularly import URLs at specific times using cron""" + + depth = int(depth) - check_data_folder() + from crontab import CronTab, CronSlices + from archivebox.misc.system import dedupe_cron_jobs from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY - from archivebox.config.permissions import USER Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) @@ -65,7 +70,6 @@ def schedule(add: bool=False, '>>', quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'), '2>&1', - ] new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT) @@ -75,50 +79,47 @@ def schedule(add: bool=False, elif CronSlices.is_valid(every): new_job.setall(every) else: - stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI)) - stderr(' It must be one of minute/hour/day/month') - stderr(' or a quoted cron-format schedule like:') - stderr(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml') - stderr(' archivebox init --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml') + print('[red]\\[X] Got invalid timeperiod for cron task.[/red]') + print(' It must be one of minute/hour/day/month') + print(' or a quoted cron-format schedule like:') + print(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml') + print(' archivebox init --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml') raise SystemExit(1) cron = dedupe_cron_jobs(cron) + print(cron) cron.write() total_runs = sum(j.frequency_per_year() for j in cron) - existing_jobs = list(cron.find_comment(CRON_COMMENT)) + existing_jobs = list(cron.find_command('archivebox')) print() - print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI)) + print('[green]\\[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).[/green]'.format(USER, len(existing_jobs))) print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs)) if total_runs > 60 and not quiet: - stderr() - stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI)) - stderr(' Congrats on being an enthusiastic internet archiver! 👌') - stderr() - stderr(' Make sure you have enough storage space available to hold all the data.') - stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.') - stderr('') + print() + print('[yellow]\\[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.[/yellow]'.format(total_runs)) + print(' Congrats on being an enthusiastic internet archiver! 👌') + print() + print(' [violet]Make sure you have enough storage space available to hold all the data.[/violet]') + print(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.') + print() elif show: if existing_jobs: print('\n'.join(str(cmd) for cmd in existing_jobs)) else: - stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI)) - stderr(' To schedule a new job, run:') - stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml') + print('[red]\\[X] There are no ArchiveBox cron jobs scheduled for your user ({}).[/red]'.format(USER)) + print(' To schedule a new job, run:') + print(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml') raise SystemExit(0) - cron = CronTab(user=True) - cron = dedupe_cron_jobs(cron) - existing_jobs = list(cron.find_comment(CRON_COMMENT)) - if foreground or run_all: if not existing_jobs: - stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI)) - stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml') + print('[red]\\[X] You must schedule some jobs first before running in foreground mode.[/red]') + print(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml') raise SystemExit(1) - print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI)) + print('[green]\\[*] Running {} ArchiveBox jobs in foreground task scheduler...[/green]'.format(len(existing_jobs))) if run_all: try: for job in existing_jobs: @@ -128,7 +129,7 @@ def schedule(add: bool=False, job.run() sys.stdout.write(f'\r √ {job.command.split("/archivebox ")[-1]}\n') except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) + print('\n[green]\\[√] Stopped.[/green] (Ctrl+C)') raise SystemExit(1) if foreground: @@ -138,111 +139,28 @@ def schedule(add: bool=False, for result in cron.run_scheduler(): print(result) except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) + print('\n[green]\\[√] Stopped.[/green] (Ctrl+C)') raise SystemExit(1) - # if CAN_UPGRADE: - # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n") - - +@click.command() +@click.option('--quiet', '-q', is_flag=True, help="Don't warn about storage space") +@click.option('--add', is_flag=True, help='Add a new scheduled ArchiveBox update job to cron') +@click.option('--every', type=str, help='Run ArchiveBox once every [timeperiod] (hour/day/month/year or cron format e.g. "0 0 * * *")') +@click.option('--tag', '-t', default='', help='Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3') +@click.option('--depth', type=click.Choice(['0', '1']), default='0', help='Depth to archive to [0] or 1') +@click.option('--overwrite', is_flag=True, help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots') +@click.option('--update', is_flag=True, help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults') +@click.option('--clear', is_flag=True, help='Stop all ArchiveBox scheduled runs (remove cron jobs)') +@click.option('--show', is_flag=True, help='Print a list of currently active ArchiveBox cron jobs') +@click.option('--foreground', '-f', is_flag=True, help='Launch ArchiveBox scheduler as a long-running foreground task instead of using cron') +@click.option('--run-all', is_flag=True, help='Run all the scheduled jobs once immediately, independent of their configured schedules') +@click.argument('import_path', required=False) @docstring(schedule.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=schedule.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--quiet', '-q', - action='store_true', - help=("Don't warn about storage space."), - ) - group = parser.add_mutually_exclusive_group() - group.add_argument( - '--add', # '-a', - action='store_true', - help='Add a new scheduled ArchiveBox update job to cron', - ) - parser.add_argument( - '--every', # '-e', - type=str, - default=None, - help='Run ArchiveBox once every [timeperiod] (hour/day/month/year or cron format e.g. "0 0 * * *")', - ) - parser.add_argument( - '--tag', '-t', - type=str, - default='', - help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3", - ) - parser.add_argument( - '--depth', # '-d', - type=int, - choices=[0, 1], - default=0, - help='Depth to archive to [0] or 1, see "add" command help for more info', - ) - parser.add_argument( - '--overwrite', - action='store_true', - help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots', - ) - parser.add_argument( - '--update', - action='store_true', - help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults', - ) - group.add_argument( - '--clear', # '-c' - action='store_true', - help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"), - ) - group.add_argument( - '--show', # '-s' - action='store_true', - help=("Print a list of currently active ArchiveBox cron jobs"), - ) - group.add_argument( - '--foreground', '-f', - action='store_true', - help=("Launch ArchiveBox scheduler as a long-running foreground task " - "instead of using cron."), - ) - group.add_argument( - '--run-all', # '-a', - action='store_true', - help=("Run all the scheduled jobs once immediately, independent of " - "their configured schedules, can be used together with --foreground"), - ) - parser.add_argument( - 'import_path', - nargs='?', - type=str, - default=None, - help=("Check this path and import any new links on every run " - "(can be either local file or remote URL)"), - ) - command = parser.parse_args(args or ()) - reject_stdin(__command__, stdin) - - schedule( - add=command.add, - show=command.show, - clear=command.clear, - foreground=command.foreground, - run_all=command.run_all, - quiet=command.quiet, - every=command.every, - tag=command.tag, - depth=command.depth, - overwrite=command.overwrite, - update=command.update, - import_path=command.import_path, - out_dir=Path(pwd) if pwd else DATA_DIR, - ) +def main(**kwargs): + """Set ArchiveBox to regularly import URLs at specific times using cron""" + schedule(**kwargs) if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) + main() From 52446b86baad6915ca81b7017bbb63363c78a729 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Nov 2024 05:08:41 -0800 Subject: [PATCH 4/7] restore missing archivebox_status work --- archivebox/cli/archivebox_status.py | 98 ++++++++++++++--------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index 9b80d1d8..2785f5c8 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -1,34 +1,44 @@ #!/usr/bin/env python3 __package__ = 'archivebox.cli' -__command__ = 'archivebox status' -import sys -import argparse from pathlib import Path -from typing import Optional, List, IO +import rich_click as click from rich import print -from archivebox.misc.util import docstring -from archivebox.config import DATA_DIR -from archivebox.misc.logging_util import SmartFormatter, reject_stdin +from archivebox.misc.util import enforce_types, docstring +from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR +from archivebox.config.common import SHELL_CONFIG +from archivebox.index.json import parse_json_links_details +from archivebox.index import ( + load_main_index, + get_indexed_folders, + get_archived_folders, + get_invalid_folders, + get_unarchived_folders, + get_present_folders, + get_valid_folders, + get_duplicate_folders, + get_orphaned_folders, + get_corrupted_folders, + get_unrecognized_folders, +) +from archivebox.misc.system import get_dir_size +from archivebox.misc.logging_util import printable_filesize - - -# @enforce_types +@enforce_types def status(out_dir: Path=DATA_DIR) -> None: """Print out some info and statistics about the archive collection""" - check_data_folder() - - from core.models import Snapshot from django.contrib.auth import get_user_model + from archivebox.index.sql import get_admins + from core.models import Snapshot User = get_user_model() - print('{green}[*] Scanning archive main index...{reset}'.format(**SHELL_CONFIG.ANSI)) - print(SHELL_CONFIG.ANSI['lightyellow'], f' {out_dir}/*', SHELL_CONFIG.ANSI['reset']) + print('[green]\\[*] Scanning archive main index...[/green]') + print(f'[yellow] {out_dir}/*[/yellow]') num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.') size = printable_filesize(num_bytes) print(f' Index size: {size} across {num_files} files') @@ -40,12 +50,12 @@ def status(out_dir: Path=DATA_DIR) -> None: print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})') print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)') print() - print('{green}[*] Scanning archive data directories...{reset}'.format(**SHELL_CONFIG.ANSI)) - print(SHELL_CONFIG.ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', SHELL_CONFIG.ANSI['reset']) + print('[green]\\[*] Scanning archive data directories...[/green]') + print(f'[yellow] {ARCHIVE_DIR}/*[/yellow]') num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR) size = printable_filesize(num_bytes) print(f' Size: {size} across {num_files} files in {num_dirs} directories') - print(SHELL_CONFIG.ANSI['black']) + num_indexed = len(get_indexed_folders(links, out_dir=out_dir)) num_archived = len(get_archived_folders(links, out_dir=out_dir)) num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir)) @@ -57,36 +67,34 @@ def status(out_dir: Path=DATA_DIR) -> None: num_valid = len(get_valid_folders(links, out_dir=out_dir)) print() print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})') - print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})') + print(f' > [green]valid:[/green] {num_valid}'.ljust(36), f' ({get_valid_folders.__doc__})') duplicate = get_duplicate_folders(links, out_dir=out_dir) orphaned = get_orphaned_folders(links, out_dir=out_dir) corrupted = get_corrupted_folders(links, out_dir=out_dir) unrecognized = get_unrecognized_folders(links, out_dir=out_dir) num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized}) - print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})') + print(f' > [red]invalid:[/red] {num_invalid}'.ljust(36), f' ({get_invalid_folders.__doc__})') print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})') print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})') print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})') print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})') - - print(SHELL_CONFIG.ANSI['reset']) if num_indexed: - print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**SHELL_CONFIG.ANSI)) - print(' archivebox list --status= (e.g. indexed, corrupted, archived, etc.)') + print(' [violet]Hint:[/violet] You can list link data directories by status like so:') + print(' [green]archivebox list --status= (e.g. indexed, corrupted, archived, etc.)[/green]') if orphaned: - print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**SHELL_CONFIG.ANSI)) - print(' archivebox init') + print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:') + print(' [green]archivebox init[/green]') if num_invalid: - print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**SHELL_CONFIG.ANSI)) - print(' archivebox init') + print(' [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:') + print(' [green]archivebox init[/green]') print() - print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**SHELL_CONFIG.ANSI)) - print(SHELL_CONFIG.ANSI['lightyellow'], f' {CONSTANTS.LOGS_DIR}/*', SHELL_CONFIG.ANSI['reset']) + print('[green]\\[*] Scanning recent archive changes and user logins:[/green]') + print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]') users = get_admins().values_list('username', flat=True) print(f' UI users {len(users)}: {", ".join(users)}') last_login = User.objects.order_by('last_login').last() @@ -98,39 +106,31 @@ def status(out_dir: Path=DATA_DIR) -> None: if not users: print() - print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**SHELL_CONFIG.ANSI)) - print(' archivebox manage createsuperuser') + print(' [violet]Hint:[/violet] You can create an admin user by running:') + print(' [green]archivebox manage createsuperuser[/green]') print() for snapshot in links.order_by('-downloaded_at')[:10]: if not snapshot.downloaded_at: continue print( - SHELL_CONFIG.ANSI['black'], + '[grey53] ' + ( f' > {str(snapshot.downloaded_at)[:16]} ' f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] ' f'"{snapshot.title}": {snapshot.url}' - )[:SHELL_CONFIG.TERM_WIDTH], - SHELL_CONFIG.ANSI['reset'], + )[:SHELL_CONFIG.TERM_WIDTH] + + '[grey53]', ) - print(SHELL_CONFIG.ANSI['black'], ' ...', SHELL_CONFIG.ANSI['reset']) - + print('[grey53] ...') +@click.command() @docstring(status.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=status.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.parse_args(args or ()) - reject_stdin(__command__, stdin) - - status(out_dir=Path(pwd) if pwd else DATA_DIR) +def main(**kwargs): + """Print out some info and statistics about the archive collection""" + status(**kwargs) if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) + main() From f8e2f7c753c9807821113b2488f644b766bde308 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Nov 2024 05:09:19 -0800 Subject: [PATCH 5/7] restore missing archivebox_update work --- archivebox/cli/archivebox_update.py | 196 +++++++++------------------- 1 file changed, 65 insertions(+), 131 deletions(-) diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 9694b6e6..97185ff7 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -1,13 +1,13 @@ #!/usr/bin/env python3 __package__ = 'archivebox.cli' -__command__ = 'archivebox update' -import sys -import argparse -from typing import List, Optional, IO -from archivebox.misc.util import docstring +import rich_click as click + +from typing import Iterable + +from archivebox.misc.util import enforce_types, docstring from archivebox.index import ( LINK_FILTERS, get_indexed_folders, @@ -21,8 +21,66 @@ from archivebox.index import ( get_corrupted_folders, get_unrecognized_folders, ) -from archivebox.misc.logging_util import SmartFormatter, accept_stdin -# from ..main import update + + +@enforce_types +def update(filter_patterns: Iterable[str]=(), + only_new: bool=False, + index_only: bool=False, + resume: float | None=None, + overwrite: bool=False, + before: float | None=None, + after: float | None=None, + status: str='indexed', + filter_type: str='exact', + extract: str="") -> None: + """Import any new links from subscriptions and retry any previously failed/skipped links""" + + from archivebox.config.django import setup_django + setup_django() + + from workers.orchestrator import Orchestrator + orchestrator = Orchestrator(exit_on_idle=False) + orchestrator.start() + + +@click.command() +@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating") +@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content") +@click.option('--resume', type=float, help='Resume the update process from a given timestamp') +@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)') +@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp") +@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp") +@click.option('--status', type=click.Choice([ + 'indexed', 'archived', 'unarchived', + 'present', 'valid', 'invalid', + 'duplicate', 'orphaned', 'corrupted', 'unrecognized' +]), default='indexed', help=f''' +Update only links or data directories that have the given status: + indexed {get_indexed_folders.__doc__} (the default) + archived {get_archived_folders.__doc__} + unarchived {get_unarchived_folders.__doc__} + + present {get_present_folders.__doc__} + valid {get_valid_folders.__doc__} + invalid {get_invalid_folders.__doc__} + + duplicate {get_duplicate_folders.__doc__} + orphaned {get_orphaned_folders.__doc__} + corrupted {get_corrupted_folders.__doc__} + unrecognized {get_unrecognized_folders.__doc__} +''') +@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs') +@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...') +@click.argument('filter_patterns', nargs=-1) +@docstring(update.__doc__) +def main(**kwargs): + """Import any new links from subscriptions and retry any previously failed/skipped links""" + update(**kwargs) + + +if __name__ == '__main__': + main() @@ -103,127 +161,3 @@ from archivebox.misc.logging_util import SmartFormatter, accept_stdin # # Step 4: Re-write links index with updated titles, icons, and resources # all_links = load_main_index(out_dir=out_dir) # return all_links - - - - - -def update(): - """Import any new links from subscriptions and retry any previously failed/skipped links""" - from archivebox.config.django import setup_django - setup_django() - - from workers.orchestrator import Orchestrator - orchestrator = Orchestrator(exit_on_idle=False) - orchestrator.start() - - -@docstring(update.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=update.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--only-new', #'-n', - action='store_true', - help="Don't attempt to retry previously skipped/failed links when updating", - ) - parser.add_argument( - '--index-only', #'-o', - action='store_true', - help="Update the main index without archiving any content", - ) - parser.add_argument( - '--resume', #'-r', - type=float, - help='Resume the update process from a given timestamp', - default=None, - ) - parser.add_argument( - '--overwrite', #'-x', - action='store_true', - help='Ignore existing archived content and overwrite with new versions (DANGEROUS)', - ) - parser.add_argument( - '--before', #'-b', - type=float, - help="Update only links bookmarked before the given timestamp.", - default=None, - ) - parser.add_argument( - '--after', #'-a', - type=float, - help="Update only links bookmarked after the given timestamp.", - default=None, - ) - parser.add_argument( - '--status', - type=str, - choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'), - default='indexed', - help=( - 'Update only links or data directories that have the given status\n' - f' indexed {get_indexed_folders.__doc__} (the default)\n' - f' archived {get_archived_folders.__doc__}\n' - f' unarchived {get_unarchived_folders.__doc__}\n' - '\n' - f' present {get_present_folders.__doc__}\n' - f' valid {get_valid_folders.__doc__}\n' - f' invalid {get_invalid_folders.__doc__}\n' - '\n' - f' duplicate {get_duplicate_folders.__doc__}\n' - f' orphaned {get_orphaned_folders.__doc__}\n' - f' corrupted {get_corrupted_folders.__doc__}\n' - f' unrecognized {get_unrecognized_folders.__doc__}\n' - ) - ) - parser.add_argument( - '--filter-type', '-t', - type=str, - choices=(*LINK_FILTERS.keys(), 'search'), - default='exact', - help='Type of pattern matching to use when filtering URLs', - ) - parser.add_argument( - 'filter_patterns', - nargs='*', - type=str, - default=None, - help='Update only URLs matching these filter patterns.' - ) - parser.add_argument( - "--extract", - type=str, - help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \ - This does not take precedence over the configuration", - default="" - ) - command = parser.parse_args(args or ()) - - filter_patterns_str = None - if not command.filter_patterns: - filter_patterns_str = accept_stdin(stdin) - - update() - - # update( - # resume=command.resume, - # only_new=command.only_new, - # index_only=command.index_only, - # overwrite=command.overwrite, - # filter_patterns_str=filter_patterns_str, - # filter_patterns=command.filter_patterns, - # filter_type=command.filter_type, - # status=command.status, - # after=command.after, - # before=command.before, - # out_dir=Path(pwd) if pwd else DATA_DIR, - # extractors=command.extract, - # ) - - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) From 6b47510f70f68d33cc7e57d81cc302c4a5507a0e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Nov 2024 05:24:12 -0800 Subject: [PATCH 6/7] always pre-setup binproviders --- .../abx_plugin_default_binproviders.py | 3 +++ .../pkgs/abx-plugin-npm/abx_plugin_npm/binproviders.py | 3 +++ .../pkgs/abx-plugin-pip/abx_plugin_pip/binproviders.py | 5 +++++ .../abx_plugin_playwright/binproviders.py | 1 + .../abx_plugin_puppeteer/binproviders.py | 2 +- 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/archivebox/pkgs/abx-plugin-default-binproviders/abx_plugin_default_binproviders.py b/archivebox/pkgs/abx-plugin-default-binproviders/abx_plugin_default_binproviders.py index 9dca52ef..53017bb7 100644 --- a/archivebox/pkgs/abx-plugin-default-binproviders/abx_plugin_default_binproviders.py +++ b/archivebox/pkgs/abx-plugin-default-binproviders/abx_plugin_default_binproviders.py @@ -12,6 +12,9 @@ from abx_pkg import ( apt = APT_BINPROVIDER = AptProvider() brew = BREW_BINPROVIDER = BrewProvider() env = ENV_BINPROVIDER = EnvProvider() +apt.setup() +brew.setup() +env.setup() @abx.hookimpl(tryfirst=True) diff --git a/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/binproviders.py b/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/binproviders.py index 400c97c2..e2b0dd70 100644 --- a/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/binproviders.py +++ b/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/binproviders.py @@ -36,3 +36,6 @@ class LibNpmBinProvider(NpmProvider): SYS_NPM_BINPROVIDER = SystemNpmBinProvider() LIB_NPM_BINPROVIDER = LibNpmBinProvider() npm = LIB_NPM_BINPROVIDER + +LIB_NPM_BINPROVIDER.setup() +SYS_NPM_BINPROVIDER.setup() diff --git a/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/binproviders.py b/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/binproviders.py index 44e2c6b2..3a036eea 100644 --- a/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/binproviders.py +++ b/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/binproviders.py @@ -68,6 +68,11 @@ VENV_PIP_BINPROVIDER = VenvPipBinProvider() LIB_PIP_BINPROVIDER = LibPipBinProvider() pip = LIB_PIP_BINPROVIDER +SYS_PIP_BINPROVIDER.setup() +PIPX_PIP_BINPROVIDER.setup() +VENV_PIP_BINPROVIDER.setup() +LIB_PIP_BINPROVIDER.setup() + # ensure python libraries are importable from these locations (if archivebox wasnt executed from one of these then they wont already be in sys.path) assert VENV_PIP_BINPROVIDER.pip_venv is not None assert LIB_PIP_BINPROVIDER.pip_venv is not None diff --git a/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/binproviders.py b/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/binproviders.py index 467e938c..1938e08f 100644 --- a/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/binproviders.py +++ b/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/binproviders.py @@ -164,3 +164,4 @@ class PlaywrightBinProvider(BinProvider): return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip() PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider() +PLAYWRIGHT_BINPROVIDER.setup() diff --git a/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/binproviders.py b/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/binproviders.py index e65855ae..c502b22d 100644 --- a/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/binproviders.py +++ b/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/binproviders.py @@ -115,7 +115,7 @@ class PuppeteerBinProvider(BinProvider): return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip() PUPPETEER_BINPROVIDER = PuppeteerBinProvider() - +PUPPETEER_BINPROVIDER.setup() # ALTERNATIVE INSTALL METHOD using Ansible: # install_playbook = self.plugin_dir / 'install_puppeteer.yml' From b852951c5850440e475229868645f5286d8e0465 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 19 Nov 2024 05:27:35 -0800 Subject: [PATCH 7/7] fix cli loading edge case where setup_django wasnt running when it should --- archivebox/cli/__init__.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 3527d63c..18aa277c 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -51,6 +51,10 @@ class ArchiveBoxGroup(click.Group): 'export': 'search', } + @classmethod + def get_canonical_name(cls, cmd_name): + return cls.renamed_commands.get(cmd_name, cmd_name) + def get_command(self, ctx, cmd_name): # handle renamed commands @@ -92,18 +96,26 @@ class ArchiveBoxGroup(click.Group): def cli(ctx, help=False): """ArchiveBox: The self-hosted internet archive""" + subcommand = ArchiveBoxGroup.get_canonical_name(ctx.invoked_subcommand) + # if --help is passed or no subcommand is given, show custom help message if help or ctx.invoked_subcommand is None: ctx.invoke(ctx.command.get_command(ctx, 'help')) # if the subcommand is in the archive_commands dict and is not 'manage', # then we need to set up the django environment and check that we're in a valid data folder - if ctx.invoked_subcommand in ArchiveBoxGroup.archive_commands and ctx.invoked_subcommand != 'manage': + if subcommand in ArchiveBoxGroup.archive_commands: # print('SETUP DJANGO AND CHECK DATA FOLDER') - from archivebox.config.django import setup_django - from archivebox.misc.checks import check_data_folder - setup_django() - check_data_folder() + try: + from archivebox.config.django import setup_django + from archivebox.misc.checks import check_data_folder + setup_django() + check_data_folder() + except Exception as e: + print(f'[red][X] Error setting up Django or checking data folder: {e}[/red]', file=sys.stderr) + if subcommand not in ('manage', 'shell'): # not all management commands need django to be setup beforehand + raise + def main(args=None, prog_name=None): # show `docker run archivebox xyz` in help messages if running in docker