move main funcs into cli files and switch to using click for CLI

Nick Sweeting 2024-11-19 00:18:51 -08:00
parent 569081a9eb
commit 328eb98a38
35 changed files with 1885 additions and 2296 deletions

archivebox/__init__.py

@@ -51,6 +51,7 @@ from .pkgs import load_vendored_pkgs  # noqa
 load_vendored_pkgs()
 # print('DONE LOADING VENDORED LIBRARIES')

+# print('LOADING ABX PLUGIN SPECIFICATIONS')
 # Load ABX Plugin Specifications + Default Implementations
 import abx                      # noqa
 import abx_spec_archivebox      # noqa
@@ -74,7 +75,7 @@ abx.pm.register(abx_spec_searchbackend.PLUGIN_SPEC())
 # Cast to ArchiveBoxPluginSpec to enable static type checking of pm.hook.call() methods
 abx.pm = cast(abx.ABXPluginManager[abx_spec_archivebox.ArchiveBoxPluginSpec], abx.pm)
 pm = abx.pm
-
+# print('DONE LOADING ABX PLUGIN SPECIFICATIONS')

 # Load all pip-installed ABX-compatible plugins
 ABX_ECOSYSTEM_PLUGINS = abx.get_pip_installed_plugins(group='abx')
@@ -94,7 +95,9 @@ USER_PLUGINS = abx.find_plugins_in_dir(Path(os.getcwd()) / 'user_plugins')

 # Import all plugins and register them with ABX Plugin Manager
 ALL_PLUGINS = {**ABX_ECOSYSTEM_PLUGINS, **ARCHIVEBOX_BUILTIN_PLUGINS, **USER_PLUGINS}
+# print('LOADING ALL PLUGINS')
 LOADED_PLUGINS = abx.load_plugins(ALL_PLUGINS)
+# print('DONE LOADING ALL PLUGINS')

 # Setup basic config, constants, paths, and version
 from .config.constants import CONSTANTS  # noqa
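
Aside: the `abx.get_pip_installed_plugins(group='abx')` call above discovers plugins via Python package entry points. A minimal standalone sketch of that discovery mechanism (not ArchiveBox's actual implementation, just the stdlib pattern it builds on):

from importlib.metadata import entry_points

def find_plugins(group: str = 'abx') -> dict:
    # collect every installed entry point registered under `group`,
    # mapping plugin name -> the loaded module/object it points to
    # (keyword-filtered entry_points() requires Python 3.10+)
    return {ep.name: ep.load() for ep in entry_points(group=group)}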

archivebox/__main__.py

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""This is the main entry point for the ArchiveBox CLI."""
+"""This is the entrypoint for python -m archivebox ..."""
 __package__ = 'archivebox'

 import archivebox  # noqa  # make sure monkey patches are applied before anything else
@@ -15,5 +15,4 @@ ASCII_LOGO_MINI = r"""
  /_/   \_\_|  \___|_| |_|_| \_/ \___|____/ \___/_/\_\
 """

-if __name__ == '__main__':
-    main(args=sys.argv[1:], stdin=sys.stdin)
+main(args=sys.argv[1:], stdin=sys.stdin)

archivebox/api/v1_cli.py

@@ -6,13 +6,6 @@ from enum import Enum

 from ninja import Router, Schema

-from archivebox.main import (
-    add,
-    remove,
-    update,
-    list_all,
-    schedule,
-)
 from archivebox.misc.util import ansi_to_html
 from archivebox.config.common import ARCHIVING_CONFIG
@@ -60,13 +53,11 @@ class AddCommandSchema(Schema):
     urls: List[str]
     tag: str = ""
     depth: int = 0
-    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
-    update_all: bool = False
-    index_only: bool = False
-    overwrite: bool = False
-    init: bool = False
-    extractors: str = ""
     parser: str = "auto"
+    extract: str = ""
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
+    overwrite: bool = False
+    index_only: bool = False

 class UpdateCommandSchema(Schema):
     resume: Optional[float] = 0
@@ -93,7 +84,7 @@ class ScheduleCommandSchema(Schema):
 class ListCommandSchema(Schema):
     filter_patterns: Optional[List[str]] = ['https://example.com']
     filter_type: str = FilterTypeChoices.substring
-    status: Optional[StatusChoices] = StatusChoices.indexed
+    status: StatusChoices = StatusChoices.indexed
     after: Optional[float] = 0
     before: Optional[float] = 999999999999999
     sort: str = 'bookmarked_at'
@@ -115,16 +106,16 @@ class RemoveCommandSchema(Schema):

 @router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
 def cli_add(request, args: AddCommandSchema):
+    from archivebox.cli.archivebox_add import add
+
     result = add(
         urls=args.urls,
         tag=args.tag,
         depth=args.depth,
         update=args.update,
-        update_all=args.update_all,
         index_only=args.index_only,
         overwrite=args.overwrite,
-        init=args.init,
-        extractors=args.extractors,
+        extract=args.extract,
         parser=args.parser,
     )
@@ -139,6 +130,8 @@ def cli_add(request, args: AddCommandSchema):

 @router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
 def cli_update(request, args: UpdateCommandSchema):
+    from archivebox.cli.archivebox_update import update
+
     result = update(
         resume=args.resume,
         only_new=args.only_new,
@@ -162,6 +155,8 @@ def cli_update(request, args: UpdateCommandSchema):

 @router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
 def cli_schedule(request, args: ScheduleCommandSchema):
+    from archivebox.cli.archivebox_schedule import schedule
+
     result = schedule(
         import_path=args.import_path,
         add=args.add,
@@ -184,9 +179,11 @@ def cli_schedule(request, args: ScheduleCommandSchema):

-@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns] (use this endpoint with ?filter_type=search to search for snapshots)')
-def cli_list(request, args: ListCommandSchema):
-    result = list_all(
+@router.post("/search", response=CLICommandResponseSchema, summary='archivebox search [args] [filter_patterns]')
+def cli_search(request, args: ListCommandSchema):
+    from archivebox.cli.archivebox_search import search
+
+    result = search(
         filter_patterns=args.filter_patterns,
         filter_type=args.filter_type,
         status=args.status,
@@ -221,6 +218,8 @@ def cli_list(request, args: ListCommandSchema):

 @router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
 def cli_remove(request, args: RemoveCommandSchema):
+    from archivebox.cli.archivebox_remove import remove
+
     result = remove(
         yes=True,  # no way to interactively ask for confirmation via API, so we force yes
         delete=args.delete,
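
Note the pattern in each endpoint above: the command implementation (`add`, `update`, `schedule`, `search`, `remove`) is now imported inside the view body rather than at module scope. A minimal sketch of why that helps, using a hypothetical `slow_cli_module`:

def endpoint(urls):
    # deferred import: the CLI module (and everything it pulls in, e.g. Django
    # setup or parser registries) is only loaded when the endpoint is actually
    # called, which also avoids circular imports between the api and cli packages
    from slow_cli_module import add
    return add(urls=urls)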

archivebox/cli/__init__.py

@@ -1,264 +1,117 @@
 __package__ = 'archivebox.cli'
 __command__ = 'archivebox'

 import os
 import sys
-import argparse
-import threading
-from time import sleep
-from collections.abc import Mapping
-
-from rich import print
-from typing import Optional, List, IO, Union, Iterable
-from pathlib import Path
 from importlib import import_module

-BUILTIN_LIST = list
-
-CLI_DIR = Path(__file__).resolve().parent
-
-# rewrite setup -> install for backwards compatibility
-if len(sys.argv) > 1 and sys.argv[1] == 'setup':
-    from rich import print
-    print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead')
-    sys.argv[1] = 'install'
+import rich_click as click
+from rich import print
+
+from archivebox.config.version import VERSION

 if '--debug' in sys.argv:
     os.environ['DEBUG'] = 'True'
     sys.argv.remove('--debug')

-# def list_subcommands() -> Dict[str, str]:
-#     """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
-#     COMMANDS = []
-#     for filename in os.listdir(CLI_DIR):
-#         if is_cli_module(filename):
-#             subcommand = filename.replace('archivebox_', '').replace('.py', '')
-#             module = import_module('.archivebox_{}'.format(subcommand), __package__)
-#             assert is_valid_cli_module(module, subcommand)
-#             COMMANDS.append((subcommand, module.main.__doc__))
-#             globals()[subcommand] = module.main
-#     display_order = lambda cmd: (
-#         display_first.index(cmd[0])
-#         if cmd[0] in display_first else
-#         100 + len(cmd[0])
-#     )
-#     return dict(sorted(COMMANDS, key=display_order))
-
-# just define it statically, it's much faster:
-SUBCOMMAND_MODULES = {
-    'help': 'archivebox_help',
-    'version': 'archivebox_version',
-
-    'init': 'archivebox_init',
-    'install': 'archivebox_install',
-    ##############################################
-    'config': 'archivebox_config',
-    'add': 'archivebox_add',
-    'remove': 'archivebox_remove',
-    'update': 'archivebox_update',
-    'list': 'archivebox_list',
-    'status': 'archivebox_status',
-
-    'schedule': 'archivebox_schedule',
-    'server': 'archivebox_server',
-    'shell': 'archivebox_shell',
-    'manage': 'archivebox_manage',
-
-    # 'oneshot': 'archivebox_oneshot',
-}
-
-# every imported command module must have these properties in order to be valid
-required_attrs = ('__package__', '__command__', 'main')
-
-# basic checks to make sure imported files are valid subcommands
-is_cli_module = lambda fname: fname.startswith('archivebox_') and fname.endswith('.py')
-is_valid_cli_module = lambda module, subcommand: (
-    all(hasattr(module, attr) for attr in required_attrs)
-    and module.__command__.split(' ')[-1] == subcommand
-)
-
-class LazySubcommands(Mapping):
-    def keys(self):
-        return SUBCOMMAND_MODULES.keys()
-
-    def values(self):
-        return [self[key] for key in self.keys()]
-
-    def items(self):
-        return [(key, self[key]) for key in self.keys()]
-
-    def __getitem__(self, key):
-        module = import_module(f'.{SUBCOMMAND_MODULES[key]}', __package__)
-        assert is_valid_cli_module(module, key)
-        return module.main
-
-    def __iter__(self):
-        return iter(SUBCOMMAND_MODULES.keys())
-
-    def __len__(self):
-        return len(SUBCOMMAND_MODULES)
-
-CLI_SUBCOMMANDS = LazySubcommands()
-
-# these common commands will appear sorted before any others for ease-of-use
-meta_cmds = ('help', 'version')    # dont require valid data folder at all
-setup_cmds = ('init', 'setup', 'install')   # require valid data folder, but dont require DB present in it yet
-archive_cmds = ('add', 'remove', 'update', 'list', 'status', 'schedule', 'server', 'shell', 'manage')  # require valid data folder + existing db present
-fake_db = ("oneshot",)     # use fake in-memory db
-
-display_first = (*meta_cmds, *setup_cmds, *archive_cmds)
-
-IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler')  # threads we dont have to wait for before exiting
-
-
-def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int:
-    """
-    Block until the specified threads exit. e.g. pass thread_names=('default_hook_handler',) to wait for webhooks.
-    Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes.
-    """
-    wait_for_all: bool = thread_names == ()
-
-    thread_matches = lambda thread, ptns: any(ptn in repr(thread) for ptn in ptns)
-
-    should_wait = lambda thread: (
-        not thread_matches(thread, ignore_names)
-        and (wait_for_all or thread_matches(thread, thread_names)))
-
-    for tries in range(timeout):
-        all_threads = [*threading.enumerate()]
-        blocking_threads = [*filter(should_wait, all_threads)]
-        threads_summary = ', '.join(repr(t) for t in blocking_threads)
-        if blocking_threads:
-            sleep(1)
-            if tries == 5:  # only show stderr message if we need to wait more than 5s
-                print(
-                    f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...',
-                    threads_summary,
-                    file=sys.stderr,
-                )
-        else:
-            return tries
-
-    raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
-
-
-def run_subcommand(subcommand: str,
-                   subcommand_args: List[str] | None = None,
-                   stdin: Optional[IO]=None,
-                   pwd: Union[Path, str, None]=None) -> None:
-    """Run a given ArchiveBox subcommand with the given list of args"""
-    subcommand_args = subcommand_args or []
-
-    from archivebox.misc.checks import check_migrations
-    from archivebox.config.django import setup_django
-
-    # print('DATA_DIR is', DATA_DIR)
-    # print('pwd is', os.getcwd())
-
-    cmd_requires_db = (subcommand in archive_cmds)
-    init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
-
-    check_db = cmd_requires_db and not init_pending
-
-    setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)
-
-    for ignore_pattern in ('help', '-h', '--help', 'version', '--version'):
-        if ignore_pattern in sys.argv[:4]:
-            cmd_requires_db = False
-            break
-
-    if subcommand in archive_cmds:
-        if cmd_requires_db:
-            check_migrations()
-
-    module = import_module('.archivebox_{}'.format(subcommand), __package__)
-    module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
-
-    # wait for webhooks, signals, and other background jobs to finish before exit
-    wait_for_bg_threads_to_exit(timeout=60)
-
-
-class NotProvided:
-    def __len__(self):
-        return 0
-    def __bool__(self):
-        return False
-    def __repr__(self):
-        return '<not provided>'
-
-Omitted = Union[None, NotProvided]
-OMITTED = NotProvided()
-
-
-def main(args: List[str] | Omitted=OMITTED, stdin: IO | Omitted=OMITTED, pwd: str | None=None) -> None:
-    # print('STARTING CLI MAIN ENTRYPOINT')
-    args = sys.argv[1:] if args is OMITTED else args
-    stdin = sys.stdin if stdin is OMITTED else stdin
-
-    parser = argparse.ArgumentParser(
-        prog=__command__,
-        description='ArchiveBox: The self-hosted internet archive',
-        add_help=False,
-    )
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument(
-        '--help', '-h',
-        action='store_true',
-        help=CLI_SUBCOMMANDS['help'].__doc__,
-    )
-    group.add_argument(
-        '--version',
-        action='store_true',
-        help=CLI_SUBCOMMANDS['version'].__doc__,
-    )
-    group.add_argument(
-        "subcommand",
-        type=str,
-        help="The name of the subcommand to run",
-        nargs='?',
-        choices=CLI_SUBCOMMANDS.keys(),
-        default=None,
-    )
-    parser.add_argument(
-        "subcommand_args",
-        help="Arguments for the subcommand",
-        nargs=argparse.REMAINDER,
-    )
-    command = parser.parse_args(args or ())
-
-    if command.version:
-        command.subcommand = 'version'
-    elif command.help or command.subcommand is None:
-        command.subcommand = 'help'
-
-    if command.subcommand not in ('version',):
-        from archivebox.misc.logging_util import log_cli_command
-        log_cli_command(
-            subcommand=command.subcommand,
-            subcommand_args=command.subcommand_args,
-            stdin=stdin or None,
-        )
+class ArchiveBoxGroup(click.Group):
+    """lazy loading click group for archivebox commands"""
+    meta_commands = {
+        'help': 'archivebox.cli.archivebox_help.main',
+        'version': 'archivebox.cli.archivebox_version.main',
+    }
+    setup_commands = {
+        'init': 'archivebox.cli.archivebox_init.main',
+        'install': 'archivebox.cli.archivebox_install.main',
+    }
+    archive_commands = {
+        'add': 'archivebox.cli.archivebox_add.main',
+        'remove': 'archivebox.cli.archivebox_remove.main',
+        'update': 'archivebox.cli.archivebox_update.main',
+        'search': 'archivebox.cli.archivebox_search.main',
+        'status': 'archivebox.cli.archivebox_status.main',
+        'config': 'archivebox.cli.archivebox_config.main',
+        'schedule': 'archivebox.cli.archivebox_schedule.main',
+        'server': 'archivebox.cli.archivebox_server.main',
+        'shell': 'archivebox.cli.archivebox_shell.main',
+        'manage': 'archivebox.cli.archivebox_manage.main',
+    }
+    all_subcommands = {
+        **meta_commands,
+        **setup_commands,
+        **archive_commands,
+    }
+    renamed_commands = {
+        'setup': 'install',
+        'list': 'search',
+        'import': 'add',
+        'archive': 'add',
+        'export': 'search',
+    }
+
+    def get_command(self, ctx, cmd_name):
+        # handle renamed commands
+        if cmd_name in self.renamed_commands:
+            new_name = self.renamed_commands[cmd_name]
+            print(f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`')
+            cmd_name = new_name
+            ctx.invoked_subcommand = cmd_name
+
+        # handle lazy loading of commands
+        if cmd_name in self.all_subcommands:
+            return self._lazy_load(cmd_name)
+
+        # fall-back to using click's default command lookup
+        return super().get_command(ctx, cmd_name)
+
+    @classmethod
+    def _lazy_load(cls, cmd_name):
+        import_path = cls.all_subcommands[cmd_name]
+        modname, funcname = import_path.rsplit('.', 1)
+        # print(f'LAZY LOADING {import_path}')
+        mod = import_module(modname)
+        func = getattr(mod, funcname)
+        if not hasattr(func, '__doc__'):
+            raise ValueError(f'lazy loading of {import_path} failed - no docstring found on method')
+        # if not isinstance(cmd, click.BaseCommand):
+        #     raise ValueError(f'lazy loading of {import_path} failed - not a click command')
+        return func
+
+
+@click.group(cls=ArchiveBoxGroup, invoke_without_command=True)
+@click.option('--help', '-h', is_flag=True, help='Show help')
+@click.version_option(version=VERSION, package_name='archivebox', message='%(version)s')
+@click.pass_context
+def cli(ctx, help=False):
+    """ArchiveBox: The self-hosted internet archive"""
+    if help or ctx.invoked_subcommand is None:
+        ctx.invoke(ctx.command.get_command(ctx, 'help'))
+
+    if ctx.invoked_subcommand in ArchiveBoxGroup.archive_commands:
+        # print('SETUP DJANGO AND CHECK DATA FOLDER')
+        from archivebox.config.django import setup_django
+        from archivebox.misc.checks import check_data_folder
+        setup_django()
+        check_data_folder()
+
+
+def main(args=None, prog_name=None):
+    # show `docker run archivebox xyz` in help messages if running in docker
+    IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
+    prog_name = prog_name or ('docker compose run archivebox' if IN_DOCKER else 'archivebox')

     try:
-        run_subcommand(
-            subcommand=command.subcommand,
-            subcommand_args=command.subcommand_args,
-            stdin=stdin or None,
-        )
+        cli(args=args, prog_name=prog_name)
     except KeyboardInterrupt:
         print('\n\n[red][X] Got CTRL+C. Exiting...[/red]')
+
+
+if __name__ == '__main__':
+    main()
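
The core of the new CLI is the lazy-loading `click.Group` above: subcommand names map to dotted `module.function` paths that are only imported on invocation. A stripped-down sketch of the same pattern, assuming each target function is itself a click command (the `mypkg.cli.greet.main` path is hypothetical):

from importlib import import_module
import click

class LazyGroup(click.Group):
    # hypothetical registry: command name -> dotted path to a click command
    subcommands = {'greet': 'mypkg.cli.greet.main'}

    def get_command(self, ctx, cmd_name):
        if cmd_name in self.subcommands:
            modname, funcname = self.subcommands[cmd_name].rsplit('.', 1)
            return getattr(import_module(modname), funcname)  # imported only now
        return super().get_command(ctx, cmd_name)

@click.group(cls=LazyGroup)
def cli():
    """Example CLI whose subcommands import lazily."""

This keeps `archivebox --help` fast, because none of the subcommand modules (or the heavy Django machinery they pull in) load until a command is actually invoked.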

archivebox/cli/archivebox_add.py

@@ -4,10 +4,10 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox add'

 import sys
-import argparse
-from typing import IO, TYPE_CHECKING
+from typing import TYPE_CHECKING
+import rich_click as click

 from django.utils import timezone
 from django.db.models import QuerySet
@@ -18,7 +18,6 @@ from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.config.django import setup_django
 from archivebox.config.permissions import USER, HOSTNAME
 from archivebox.misc.checks import check_data_folder
-from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr
 from archivebox.parsers import PARSERS
@@ -29,22 +28,142 @@ if TYPE_CHECKING:

 ORCHESTRATOR = None

+# OLD VERSION:
+# def add(urls: Union[str, List[str]],
+#         tag: str='',
+#         depth: int=0,
+#         update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
+#         update_all: bool=False,
+#         index_only: bool=False,
+#         overwrite: bool=False,
+#         # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
+#         init: bool=False,
+#         extractors: str="",
+#         parser: str="auto",
+#         created_by_id: int | None=None,
+#         out_dir: Path=DATA_DIR) -> List[Link]:
+#     """Add a new URL or list of URLs to your archive"""
+
+#     from core.models import Snapshot, Tag
+#     # from workers.supervisord_util import start_cli_workers, tail_worker_logs
+#     # from workers.tasks import bg_archive_link
+
+#     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
+#     extractors = extractors.split(",") if extractors else []
+
+#     if init:
+#         run_subcommand('init', stdin=None, pwd=out_dir)
+
+#     # Load list of links from the existing index
+#     check_data_folder()
+
+#     # worker = start_cli_workers()
+
+#     new_links: List[Link] = []
+#     all_links = load_main_index(out_dir=out_dir)
+
+#     log_importing_started(urls=urls, depth=depth, index_only=index_only)
+#     if isinstance(urls, str):
+#         # save verbatim stdin to sources
+#         write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
+#     elif isinstance(urls, list):
+#         # save verbatim args to sources
+#         write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
+
+#     new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
+
+#     # If we're going one level deeper, download each link and look for more links
+#     new_links_depth = []
+#     if new_links and depth == 1:
+#         log_crawl_started(new_links)
+#         for new_link in new_links:
+#             try:
+#                 downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
+#                 new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+#             except Exception as err:
+#                 stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')
+
+#     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
+
+#     new_links = dedupe_links(all_links, imported_links)
+
+#     write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
+#     all_links = load_main_index(out_dir=out_dir)
+
+#     tags = [
+#         Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
+#         for name in tag.split(',')
+#         if name.strip()
+#     ]
+#     if tags:
+#         for link in imported_links:
+#             snapshot = Snapshot.objects.get(url=link.url)
+#             snapshot.tags.add(*tags)
+#             snapshot.tags_str(nocache=True)
+#             snapshot.save()
+#         # print(f'    √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
+
+#     if index_only:
+#         # mock archive all the links using the fake index_only extractor method in order to update their state
+#         if overwrite:
+#             archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
+#         else:
+#             archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
+#     else:
+#         # fully run the archive extractor methods for each link
+#         archive_kwargs = {
+#             "out_dir": out_dir,
+#             "created_by_id": created_by_id,
+#         }
+#         if extractors:
+#             archive_kwargs["methods"] = extractors
+
+#         stderr()
+
+#         ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
+#         if update:
+#             stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
+#             archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
+#         elif update_all:
+#             stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
+#             archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+#         elif overwrite:
+#             stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
+#             archive_links(imported_links, overwrite=True, **archive_kwargs)
+#         elif new_links:
+#             stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
+#             archive_links(new_links, overwrite=False, **archive_kwargs)
+
+#     # tail_worker_logs(worker['stdout_logfile'])
+
+#     # if CAN_UPGRADE:
+#     #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
+
+#     return new_links
+
 def add(urls: str | list[str],
-        tag: str='',
         depth: int=0,
-        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
-        update_all: bool=False,
-        index_only: bool=False,
-        overwrite: bool=False,
-        extractors: str="",
+        tag: str='',
         parser: str="auto",
+        extract: str="",
         persona: str='Default',
+        overwrite: bool=False,
+        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
+        index_only: bool=False,
        bg: bool=False,
        created_by_id: int | None=None) -> QuerySet['Snapshot']:
     """Add a new URL or list of URLs to your archive"""

     global ORCHESTRATOR

+    depth = int(depth)
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

     # 0. setup abx, django, check_data_folder
@@ -56,7 +175,6 @@ def add(urls: str | list[str],

     from archivebox.base_models.models import get_or_create_system_user_pk
     created_by_id = created_by_id or get_or_create_system_user_pk()
-
     # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
@@ -72,7 +190,7 @@ def add(urls: str | list[str],
         'ONLY_NEW': not update,
         'INDEX_ONLY': index_only,
         'OVERWRITE': overwrite,
-        'EXTRACTORS': extractors,
+        'EXTRACTORS': extract,
         'DEFAULT_PERSONA': persona or 'Default',
     })

     # 3. create a new Crawl pointing to the Seed
@@ -91,118 +209,23 @@ def add(urls: str | list[str],
     return crawl.snapshot_set.all()


-def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=None) -> None:
-    """Add a new URL or list of URLs to your archive"""
-    parser = argparse.ArgumentParser(
-        prog=__command__,
-        description=add.__doc__,
-        add_help=True,
-        formatter_class=SmartFormatter,
-    )
-    parser.add_argument(
-        '--tag', '-t',
-        type=str,
-        default='',
-        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
-    )
-    parser.add_argument(
-        '--update', #'-u',
-        action='store_true',
-        default=not ARCHIVING_CONFIG.ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
-        help="Also retry previously skipped/failed links when adding new links",
-    )
-    parser.add_argument(
-        '--update-all', #'-n',
-        action='store_true',
-        default=False,
-        help="Also update ALL links in index when finished adding new links",
-    )
-    parser.add_argument(
-        '--index-only', #'-o',
-        action='store_true',
-        help="Add the links to the main index without archiving them",
-    )
-    parser.add_argument(
-        'urls',
-        nargs='*',
-        type=str,
-        default=None,
-        help=(
-            'URLs or paths to archive e.g.:\n'
-            '    https://getpocket.com/users/USERNAME/feed/all\n'
-            '    https://example.com/some/rss/feed.xml\n'
-            '    https://example.com\n'
-            '    ~/Downloads/firefox_bookmarks_export.html\n'
-            '    ~/Desktop/sites_list.csv\n'
-        )
-    )
-    parser.add_argument(
-        "--depth",
-        action="store",
-        default=0,
-        choices=[0, 1],
-        type=int,
-        help="Recursively archive all linked pages up to this many hops away"
-    )
-    parser.add_argument(
-        "--overwrite",
-        default=False,
-        action="store_true",
-        help="Re-archive URLs from scratch, overwriting any existing files"
-    )
-    parser.add_argument(
-        "--extract", '-e',
-        type=str,
-        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
-              This does not take precedence over the configuration",
-        default=""
-    )
-    parser.add_argument(
-        "--parser",
-        type=str,
-        help="Parser used to read inputted URLs.",
-        default="auto",
-        choices=["auto", *PARSERS.keys()],
-    )
-    parser.add_argument(
-        "--persona",
-        type=str,
-        help="Name of accounts persona to use when archiving.",
-        default="Default",
-    )
-    parser.add_argument(
-        "--bg",
-        default=False,
-        action="store_true",
-        help="Enqueue a background worker to complete the crawl instead of running it immediately",
-    )
-    command = parser.parse_args(args or ())
-    urls = command.urls
-
-    stdin_urls = ''
-    if not urls:
-        stdin_urls = accept_stdin(stdin)
-
-    if (stdin_urls and urls) or (not stdin and not urls):
-        stderr(
-            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
-            color='red',
-        )
-        raise SystemExit(2)
-    add(
-        urls=stdin_urls or urls,
-        depth=command.depth,
-        tag=command.tag,
-        update=command.update,
-        update_all=command.update_all,
-        index_only=command.index_only,
-        overwrite=command.overwrite,
-        extractors=command.extract,
-        parser=command.parser,
-        persona=command.persona,
-        bg=command.bg,
-    )
+@click.command()
+@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
+@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
+@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
+@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
+@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
+@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
+@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
+@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
+# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
+@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
+@click.argument('urls', nargs=-1, type=click.Path())
+def main(**kwargs):
+    """Add a new URL or list of URLs to your archive"""
+    add(**kwargs)


 if __name__ == '__main__':
-    main(args=sys.argv[1:], stdin=sys.stdin)
+    main()
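
Since `main` is now a plain click command, it can be exercised in-process with click's test runner (rich_click commands subclass click's, so this should work the same way). A hedged example, not taken from the commit itself:

from click.testing import CliRunner
from archivebox.cli.archivebox_add import main

runner = CliRunner()
# --index-only avoids running the full extractor pipeline in this example
result = runner.invoke(main, ['--index-only', '--tag', 'demo', 'https://example.com'])
print(result.exit_code, result.output)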

archivebox/cli/archivebox_config.py

@@ -12,7 +12,130 @@ from typing import Optional, List, IO

 from archivebox.misc.util import docstring
 from archivebox.config import DATA_DIR
 from archivebox.misc.logging_util import SmartFormatter, accept_stdin
-from ..main import config
+
+
+# @enforce_types
+def config(config_options_str: Optional[str]=None,
+           config_options: Optional[List[str]]=None,
+           get: bool=False,
+           set: bool=False,
+           search: bool=False,
+           reset: bool=False,
+           out_dir: Path=DATA_DIR) -> None:
+    """Get and set your ArchiveBox project configuration values"""
+
+    from rich import print
+
+    check_data_folder()
+    if config_options and config_options_str:
+        stderr(
+            '[X] You should either pass config values as an arguments '
+            'or via stdin, but not both.\n',
+            color='red',
+        )
+        raise SystemExit(2)
+    elif config_options_str:
+        config_options = config_options_str.split('\n')
+
+    FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
+    CONFIGS = archivebox.pm.hook.get_CONFIGS()
+
+    config_options = config_options or []
+
+    no_args = not (get or set or reset or config_options)
+
+    matching_config = {}
+    if search:
+        if config_options:
+            config_options = [get_real_name(key) for key in config_options]
+            matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
+            for config_section in CONFIGS.values():
+                aliases = config_section.aliases
+
+                for search_key in config_options:
+                    # search all aliases in the section
+                    for alias_key, key in aliases.items():
+                        if search_key.lower() in alias_key.lower():
+                            matching_config[key] = config_section.model_dump()[key]
+
+                    # search all keys and values in the section
+                    for existing_key, value in config_section.model_dump().items():
+                        if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower():
+                            matching_config[existing_key] = value
+
+        print(printable_config(matching_config))
+        raise SystemExit(not matching_config)
+
+    elif get or no_args:
+        if config_options:
+            config_options = [get_real_name(key) for key in config_options]
+            matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
+            failed_config = [key for key in config_options if key not in FLAT_CONFIG]
+            if failed_config:
+                stderr()
+                stderr('[X] These options failed to get', color='red')
+                stderr('    {}'.format('\n    '.join(config_options)))
+                raise SystemExit(1)
+        else:
+            matching_config = FLAT_CONFIG
+
+        print(printable_config(matching_config))
+        raise SystemExit(not matching_config)
+
+    elif set:
+        new_config = {}
+        failed_options = []
+        for line in config_options:
+            if line.startswith('#') or not line.strip():
+                continue
+            if '=' not in line:
+                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
+                stderr(f'    {line}')
+                raise SystemExit(2)
+
+            raw_key, val = line.split('=', 1)
+            raw_key = raw_key.upper().strip()
+            key = get_real_name(raw_key)
+            if key != raw_key:
+                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
+
+            if key in FLAT_CONFIG:
+                new_config[key] = val.strip()
+            else:
+                failed_options.append(line)
+
+        if new_config:
+            before = FLAT_CONFIG
+            matching_config = write_config_file(new_config)
+            after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
+            print(printable_config(matching_config))
+
+            side_effect_changes = {}
+            for key, val in after.items():
+                if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
+                    side_effect_changes[key] = after[key]
+                    # import ipdb; ipdb.set_trace()
+
+            if side_effect_changes:
+                stderr()
+                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
+                print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))
+        if failed_options:
+            stderr()
+            stderr('[X] These options failed to set (check for typos):', color='red')
+            stderr('    {}'.format('\n    '.join(failed_options)))
+            raise SystemExit(1)
+
+    elif reset:
+        stderr('[X] This command is not implemented yet.', color='red')
+        stderr('    Please manually remove the relevant lines from your config file:')
+        raise SystemExit(2)
+
+    else:
+        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
+        stderr('        archivebox config')
+        stderr('        archivebox config --get SOME_KEY')
+        stderr('        archivebox config --set SOME_KEY=SOME_VALUE')
+        raise SystemExit(2)
+
+
 @docstring(config.__doc__)
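
One subtlety in the `--set` branch above: lines are split with `split('=', 1)`, so only the first `=` separates key from value, and values may themselves contain `=`. For example:

# comments and blank lines are skipped; only the first '=' splits key from value
line = 'CHROME_ARGS=--no-sandbox --proxy-server=http://127.0.0.1:8080'
raw_key, val = line.split('=', 1)
print(raw_key.upper().strip())   # CHROME_ARGS
print(val.strip())               # --no-sandbox --proxy-server=http://127.0.0.1:8080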

archivebox/cli/archivebox_help.py

@@ -1,32 +1,105 @@
 #!/usr/bin/env python3

 __package__ = 'archivebox.cli'
 __command__ = 'archivebox help'

-import sys
-import argparse
+import os
 from pathlib import Path
-from typing import Optional, List, IO

-from archivebox.misc.util import docstring
-from archivebox.misc.logging_util import SmartFormatter, reject_stdin
-from archivebox.config import DATA_DIR
-from ..main import help
+import click
+from rich import print
+from rich.panel import Panel

-@docstring(help.__doc__)
-def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
-    parser = argparse.ArgumentParser(
-        prog=__command__,
-        description=help.__doc__,
-        add_help=True,
-        formatter_class=SmartFormatter,
-    )
-    parser.parse_args(args or ())
-    reject_stdin(__command__, stdin)
-
-    help(out_dir=Path(pwd) if pwd else DATA_DIR)
+def help() -> None:
+    """Print the ArchiveBox help message and usage"""
+
+    from archivebox.cli import ArchiveBoxGroup
+    from archivebox.config import CONSTANTS
+    from archivebox.config.permissions import IN_DOCKER
+    from archivebox.misc.logging_util import log_cli_command
+
+    log_cli_command('help', [], None, '.')
+
+    COMMANDS_HELP_TEXT = '\n    '.join(
+        f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
+        for cmd in ArchiveBoxGroup.meta_commands.keys()
+    ) + '\n\n    ' + '\n    '.join(
+        f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
+        for cmd in ArchiveBoxGroup.setup_commands.keys()
+    ) + '\n\n    ' + '\n    '.join(
+        f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
+        for cmd in ArchiveBoxGroup.archive_commands.keys()
+    )
+
+    DOCKER_USAGE = '''
+[dodger_blue3]Docker Usage:[/dodger_blue3]
+    [grey53]# using Docker Compose:[/grey53]
+    [blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
+
+    [grey53]# using Docker:[/grey53]
+    [blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
+''' if IN_DOCKER else ''
+    DOCKER_DOCS = '\n    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
+    DOCKER_OUTSIDE_HINT = "\n    [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
+    DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''
+
+    print(f'''{DOCKER_USAGE}
+[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
+    [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
+
+[deep_sky_blue4]Commands:[/deep_sky_blue4]
+    {COMMANDS_HELP_TEXT}
+
+[deep_sky_blue4]Documentation:[/deep_sky_blue4]
+    [link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS}
+    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link]
+    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link]
+''')
+
+    if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir():
+        pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path('~').expanduser()), '~')
+        EXAMPLE_USAGE = f'''
+[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]
+
+[violet]Hint:[/violet] [i]Common maintenance tasks:[/i]
+    [dark_green]archivebox[/dark_green] [green]init[/green]      [grey53]# make sure database is up-to-date (safe to run multiple times)[/grey53]
+    [dark_green]archivebox[/dark_green] [green]install[/green]   [grey53]# make sure plugins are up-to-date (wget, chrome, singlefile, etc.)[/grey53]
+    [dark_green]archivebox[/dark_green] [green]status[/green]    [grey53]# get a health checkup report on your collection[/grey53]
+    [dark_green]archivebox[/dark_green] [green]update[/green]    [grey53]# retry any previously failed or interrupted archiving tasks[/grey53]
+
+[violet]Hint:[/violet] [i]More example usage:[/i]
+    [dark_green]archivebox[/dark_green] [green]add[/green] --depth=1 "https://example.com/some/page"
+    [dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title
+    [dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss"
+    [dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue]            [grey53]# Start the Web UI / API server[/grey53]
+'''
+        print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
+    else:
+        DATA_SETUP_HELP = '\n'
+        if IN_DOCKER:
+            DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
+            DATA_SETUP_HELP += '    docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
+        DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
+        DATA_SETUP_HELP += '    1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
+        DATA_SETUP_HELP += f'    2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n'
+        DATA_SETUP_HELP += f'    3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
+        DATA_SETUP_HELP += f'    4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n'
+        DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n'
+        DATA_SETUP_HELP += '    1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
+        DATA_SETUP_HELP += '    2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n'
+        DATA_SETUP_HELP += f'    3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n'
+        DATA_SETUP_HELP += f'    4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
+        DATA_SETUP_HELP += f'    5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n'
+        print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
+
+
+@click.command()
+@click.option('--help', '-h', is_flag=True, help='Show help')
+def main(**kwargs):
+    """Print the ArchiveBox help message and usage"""
+    return help()


 if __name__ == '__main__':
-    main(args=sys.argv[1:], stdin=sys.stdin)
+    main()
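
The new help screen is plain rich markup: strings with `[color]...[/color]` tags passed to `rich.print`, framed in a `rich.panel.Panel`. A minimal standalone example of the same primitives:

from rich import print
from rich.panel import Panel

print(Panel(
    '[violet]Hint:[/violet] run [dark_green]archivebox[/dark_green] [green]init[/green] first',
    expand=False,
    border_style='grey53',
    title='[green3]Example panel[/green3]',
))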

archivebox/cli/archivebox_init.py

@@ -5,13 +5,193 @@ __command__ = 'archivebox init'

 import sys
 import argparse
+from pathlib import Path
 from typing import Optional, List, IO

 from archivebox.misc.util import docstring
 from archivebox.config import DATA_DIR
 from archivebox.misc.logging_util import SmartFormatter, reject_stdin
-from ..main import init
+
+
+def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Path=DATA_DIR) -> None:
+    """Initialize a new ArchiveBox collection in the current directory"""
+
+    from core.models import Snapshot
+    from rich import print
+
+    # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
+    #     print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
+    #     print("[red]    You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)
+
+    is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_DATA_DIR)
+    existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE)
+    if is_empty and not existing_index:
+        print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
+        print('[green]----------------------------------------------------------------------[/green]')
+    elif existing_index:
+        # TODO: properly detect and print the existing version in current index as well
+        print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]')
+        print('[green]----------------------------------------------------------------------[/green]')
+    else:
+        if force:
+            print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
+            print('[red]    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
+        else:
+            print(
+                ("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
+                 "    You must run init in a completely empty directory, or an existing data folder.\n\n"
+                 "    [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first,\n"
+                 "    then run 'archivebox init' to pick up where you left off.\n\n"
+                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
+                )
+            )
+            raise SystemExit(2)
+
+    if existing_index:
+        print('\n[green][*] Verifying archive folder structure...[/green]')
+    else:
+        print('\n[green][+] Building archive folder structure...[/green]')
+
+    print(f'    + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
+    Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
+
+    print(f'    + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
+
+    # create the .archivebox_id file with a unique ID for this collection
+    from archivebox.config.paths import _get_collection_id
+    _get_collection_id(CONSTANTS.DATA_DIR, force_create=True)
+
+    # create the ArchiveBox.conf file
+    write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})
+
+    if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
+        print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
+    else:
+        print('\n[green][+] Building main SQL index and running initial migrations...[/green]')
+
+    for migration_line in apply_migrations(out_dir):
+        sys.stdout.write(f'    {migration_line}\n')
+
+    assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
+    print()
+    print(f'    √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
+
+    # from django.contrib.auth.models import User
+    # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
+    #     print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
+    #     call_command("createsuperuser", interactive=True)
+
+    print()
+    print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
+
+    all_links = Snapshot.objects.none()
+    pending_links: Dict[str, Link] = {}
+
+    if existing_index:
+        all_links = load_main_index(out_dir=out_dir, warn=False)
+        print(f'    √ Loaded {all_links.count()} links from existing main index.')
+
+    if quick:
+        print('    > Skipping full snapshot directory check (quick mode)')
+    else:
+        try:
+            # Links in data folders that dont match their timestamp
+            fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+            if fixed:
+                print(f'    [yellow]√ Fixed {len(fixed)} data directory locations that didn\'t match their link timestamps.[/yellow]')
+            if cant_fix:
+                print(f'    [red]! Could not fix {len(cant_fix)} data directory locations due to conflicts with existing folders.[/red]')
+
+            # Links in JSON index but not in main index
+            orphaned_json_links = {
+                link.url: link
+                for link in parse_json_main_index(out_dir)
+                if not all_links.filter(url=link.url).exists()
+            }
+            if orphaned_json_links:
+                pending_links.update(orphaned_json_links)
+                print(f'    [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')
+
+            # Links in data dir indexes but not in main index
+            orphaned_data_dir_links = {
+                link.url: link
+                for link in parse_json_links_details(out_dir)
+                if not all_links.filter(url=link.url).exists()
+            }
+            if orphaned_data_dir_links:
+                pending_links.update(orphaned_data_dir_links)
+                print(f'    [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
+
+            # Links in invalid/duplicate data dirs
+            invalid_folders = {
+                folder: link
+                for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
+            }
+            if invalid_folders:
+                print(f'    [red]! Skipped adding {len(invalid_folders)} invalid link data directories.[/red]')
+                print('        X ' + '\n        X '.join(f'./{Path(folder).relative_to(DATA_DIR)} {link}' for folder, link in invalid_folders.items()))
+                print()
+                print('        [violet]Hint:[/violet] For more information about the link data directories that were skipped, run:')
+                print('            archivebox status')
+                print('            archivebox list --status=invalid')
+        except (KeyboardInterrupt, SystemExit):
+            print(file=sys.stderr)
+            print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr)
+            print('    Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr)
+            print(file=sys.stderr)
+            print('    [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
+            print('        archivebox init --quick', file=sys.stderr)
+            raise SystemExit(1)
+
+        write_main_index(list(pending_links.values()), out_dir=out_dir)
+
+    print('\n[green]----------------------------------------------------------------------[/green]')
+
+    from django.contrib.auth.models import User
+    if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
+        print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]')
+        User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
+
+    if existing_index:
+        print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]')
+    else:
+        print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]')
+
+    json_index = out_dir / CONSTANTS.JSON_INDEX_FILENAME
+    html_index = out_dir / CONSTANTS.HTML_INDEX_FILENAME
+    index_name = f"{date.today()}_index_old"
+    if os.access(json_index, os.F_OK):
+        json_index.rename(f"{index_name}.json")
+    if os.access(html_index, os.F_OK):
+        html_index.rename(f"{index_name}.html")
+
+    CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
+    CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
+    CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
+
+    from archivebox.config.common import STORAGE_CONFIG
+    STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
+    STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
+
+    if install:
+        run_subcommand('install', pwd=out_dir)
+
+    if Snapshot.objects.count() < 25:     # hide the hints for experienced users
+        print()
+        print('    [violet]Hint:[/violet] To view your archive index, run:')
+        print('        archivebox server  # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]')
+        print()
+        print('    To add new links, you can run:')
+        print("        archivebox add < ~/some/path/to/list_of_links.txt")
+        print()
+        print('    For more usage and examples, run:')
+        print('        archivebox help')
+
+
 @docstring(init.__doc__)

archivebox/cli/archivebox_install.py

@ -3,6 +3,7 @@
__package__ = 'archivebox.cli' __package__ = 'archivebox.cli'
__command__ = 'archivebox install' __command__ = 'archivebox install'
import os
import sys import sys
import argparse import argparse
from pathlib import Path from pathlib import Path
@ -11,11 +12,145 @@ from typing import Optional, List, IO
from archivebox.misc.util import docstring from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import install
def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, binaries: Optional[List[str]]=None, dry_run: bool=False) -> None:
"""Automatically install all ArchiveBox dependencies and extras"""
# if running as root:
# - run init to create index + lib dir
# - chown -R 911 DATA_DIR
# - install all binaries as root
# - chown -R 911 LIB_DIR
# else:
# - run init to create index + lib dir as current user
# - install all binaries as current user
# - recommend user re-run with sudo if any deps need to be installed as root
from rich import print
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.config.paths import get_or_create_working_lib_dir
if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
run_subcommand('init', stdin=None, pwd=out_dir) # must init full index because we need a db to store InstalledBinary entries in
print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')
# we never want the data dir to be owned by root, detect owner of existing owner of DATA_DIR to try and guess desired non-root UID
if IS_ROOT:
EUID = os.geteuid()
# if we have sudo/root permissions, take advantage of them just while installing dependencies
print()
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue] with [red]sudo[/red] only for dependencies that need it.[/yellow]')
print(f' DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
print()
LIB_DIR = get_or_create_working_lib_dir()
package_manager_names = ', '.join(
f'[yellow]{binprovider.name}[/yellow]'
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
if not binproviders or (binproviders and binprovider.name in binproviders)
)
print(f'[+] Setting up package managers {package_manager_names}...')
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
if binproviders and binprovider.name not in binproviders:
continue
try:
binprovider.setup()
except Exception:
# it's ok, installing binaries below will automatically set up package managers as needed
# e.g. if user does not have npm available we cannot set it up here yet, but once npm Binary is installed
# the next package that depends on npm will automatically call binprovider.setup() during its own install
pass
print()
for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
# obviously must already be installed if we are running
continue
if binaries and binary.name not in binaries:
continue
providers = ' [grey53]or[/grey53] '.join(
provider.name for provider in binary.binproviders_supported
if not binproviders or (binproviders and provider.name in binproviders)
)
if not providers:
continue
print(f'[+] Detecting / Installing [yellow]{binary.name.ljust(22)}[/yellow] using [red]{providers}[/red]...')
try:
with SudoPermission(uid=0, fallback=True):
# print(binary.load_or_install(fresh=True).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}))
if binproviders:
providers_supported_by_binary = [provider.name for provider in binary.binproviders_supported]
for binprovider_name in binproviders:
if binprovider_name not in providers_supported_by_binary:
continue
try:
if dry_run:
# always show install commands when doing a dry run
sys.stderr.write("\033[2;49;90m") # grey53
result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
sys.stderr.write("\033[00m\n") # reset
else:
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
if result and result['loaded_version']:
break
except Exception as e:
print(f'[red]:cross_mark: Failed to install {binary.name} as using {binprovider_name} as user {ARCHIVEBOX_USER}: {e}[/red]')
else:
if dry_run:
sys.stderr.write("\033[2;49;90m") # grey53
binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
sys.stderr.write("\033[00m\n") # reset
else:
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
if IS_ROOT and LIB_DIR:
with SudoPermission(uid=0):
if ARCHIVEBOX_USER == 0:
os.system(f'chmod -R 777 "{LIB_DIR.resolve()}"')
else:
os.system(f'chown -R {ARCHIVEBOX_USER} "{LIB_DIR.resolve()}"')
except Exception as e:
print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
if binaries and len(binaries) == 1:
# if we are only installing a single binary, raise the exception so the user can see what went wrong
raise
from django.contrib.auth import get_user_model
User = get_user_model()
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
stderr(' archivebox manage createsuperuser')
# run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
extra_args = []
if binproviders:
extra_args.append(f'--binproviders={",".join(binproviders)}')
if binaries:
extra_args.append(f'--binaries={",".join(binaries)}')
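# e.g. when filters were passed, this re-runs something like (binary/provider names are just examples):
#   archivebox version --binproviders=apt,brew --binaries=wget,curl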
proc = run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version', *extra_args], capture_output=False, cwd=out_dir)
raise SystemExit(proc.returncode)
@docstring(install.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=install.__doc__,

View file

@ -1,139 +0,0 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox list'
import sys
import argparse
from pathlib import Path
from typing import Optional, List, IO
from archivebox.config import DATA_DIR
from archivebox.misc.util import docstring
from archivebox.misc.logging_util import SmartFormatter, reject_stdin, stderr
from ..main import list_all
from ..index import (
LINK_FILTERS,
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)
@docstring(list_all.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=list_all.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
'--csv', #'-c',
type=str,
help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
default=None,
)
group.add_argument(
'--json', #'-j',
action='store_true',
help="Print the output in JSON format with all columns included",
)
group.add_argument(
'--html',
action='store_true',
help="Print the output in HTML format"
)
parser.add_argument(
'--with-headers',
action='store_true',
help='Include the headers in the output document'
)
parser.add_argument(
'--sort', #'-s',
type=str,
help="List the links sorted using the given key, e.g. timestamp or updated",
default=None,
)
parser.add_argument(
'--before', #'-b',
type=float,
help="List only links bookmarked before (less than) the given timestamp",
default=None,
)
parser.add_argument(
'--after', #'-a',
type=float,
help="List only links bookmarked after (greater than or equal to) the given timestamp",
default=None,
)
parser.add_argument(
'--status',
type=str,
choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
default='indexed',
help=(
'List only links or data directories that have the given status\n'
f' indexed {get_indexed_folders.__doc__} (the default)\n'
f' archived {get_archived_folders.__doc__}\n'
f' unarchived {get_unarchived_folders.__doc__}\n'
'\n'
f' present {get_present_folders.__doc__}\n'
f' valid {get_valid_folders.__doc__}\n'
f' invalid {get_invalid_folders.__doc__}\n'
'\n'
f' duplicate {get_duplicate_folders.__doc__}\n'
f' orphaned {get_orphaned_folders.__doc__}\n'
f' corrupted {get_corrupted_folders.__doc__}\n'
f' unrecognized {get_unrecognized_folders.__doc__}\n'
)
)
parser.add_argument(
'--filter-type', '-t',
type=str,
choices=(*LINK_FILTERS.keys(), 'search'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
'filter_patterns',
nargs='*',
type=str,
default=None,
help='List only URLs matching these filter patterns'
)
command = parser.parse_args(args or ())
reject_stdin(stdin)
if command.with_headers and not (command.json or command.html or command.csv):
stderr(
'[X] --with-headers can only be used with --json, --html or --csv options\n',
color='red',
)
raise SystemExit(2)
matching_folders = list_all(
filter_patterns=command.filter_patterns,
filter_type=command.filter_type,
status=command.status,
after=command.after,
before=command.before,
sort=command.sort,
csv=command.csv,
json=command.json,
html=command.html,
with_headers=command.with_headers,
out_dir=Path(pwd) if pwd else DATA_DIR,
)
raise SystemExit(not matching_folders)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -9,7 +9,27 @@ from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from ..main import manage
# @enforce_types
def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
"""Run an ArchiveBox Django management command"""
check_data_folder()
from django.core.management import execute_from_command_line
if (args and "createsuperuser" in args) and (IN_DOCKER and not SHELL_CONFIG.IS_TTY):
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
stderr('')
# import ipdb; ipdb.set_trace()
execute_from_command_line(['manage.py', *(args or ['help'])])
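# Usage sketch -- any Django management command is proxied straight through, e.g.:
#   archivebox manage help
#   archivebox manage createsuperuser
#   archivebox manage migrate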
@docstring(manage.__doc__)

View file

@ -1,73 +1,98 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox oneshot'
import sys
import argparse
from pathlib import Path
from typing import List, Optional, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr
from ..main import oneshot
@docstring(oneshot.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=oneshot.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'url',
type=str,
default=None,
help=(
'URLs or paths to archive e.g.:\n'
' https://getpocket.com/users/USERNAME/feed/all\n'
' https://example.com/some/rss/feed.xml\n'
' https://example.com\n'
' ~/Downloads/firefox_bookmarks_export.html\n'
' ~/Desktop/sites_list.csv\n'
)
)
parser.add_argument(
"--extract",
type=str,
help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
This does not take precedence over the configuration",
default=""
)
parser.add_argument(
'--out-dir',
type=str,
default=DATA_DIR,
help="Path to save the single archive folder to, e.g. ./example.com_archive"
)
command = parser.parse_args(args or ())
stdin_url = None
url = command.url
if not url:
stdin_url = accept_stdin(stdin)
if (stdin_url and url) or (not stdin and not url):
stderr(
'[X] You must pass a URL/path to add via stdin or CLI arguments.\n',
color='red',
)
raise SystemExit(2)
oneshot(
url=stdin_url or url,
out_dir=Path(command.out_dir).resolve(),
extractors=command.extract,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)
# #!/usr/bin/env python3
################## DEPRECATED IN FAVOR OF abx-dl #####################
# https://github.com/ArchiveBox/abx-dl
# __package__ = 'archivebox.cli'
# __command__ = 'archivebox oneshot'
# import sys
# import argparse
# from pathlib import Path
# from typing import List, Optional, IO
# from archivebox.misc.util import docstring
# from archivebox.config import DATA_DIR
# from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr
# @enforce_types
# def oneshot(url: str, extractors: str="", out_dir: Path=DATA_DIR, created_by_id: int | None=None) -> List[Link]:
# """
# Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
# You can run this to archive single pages without needing to create a whole collection with archivebox init.
# """
# oneshot_link, _ = parse_links_memory([url])
# if len(oneshot_link) > 1:
# stderr(
# '[X] You should pass a single url to the oneshot command',
# color='red'
# )
# raise SystemExit(2)
# methods = extractors.split(",") if extractors else ignore_methods(['title'])
# archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, created_by_id=created_by_id)
# return oneshot_link
# @docstring(oneshot.__doc__)
# def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
# parser = argparse.ArgumentParser(
# prog=__command__,
# description=oneshot.__doc__,
# add_help=True,
# formatter_class=SmartFormatter,
# )
# parser.add_argument(
# 'url',
# type=str,
# default=None,
# help=(
# 'URLs or paths to archive e.g.:\n'
# ' https://getpocket.com/users/USERNAME/feed/all\n'
# ' https://example.com/some/rss/feed.xml\n'
# ' https://example.com\n'
# ' ~/Downloads/firefox_bookmarks_export.html\n'
# ' ~/Desktop/sites_list.csv\n'
# )
# )
# parser.add_argument(
# "--extract",
# type=str,
# help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
# This does not take precedence over the configuration",
# default=""
# )
# parser.add_argument(
# '--out-dir',
# type=str,
# default=DATA_DIR,
# help= "Path to save the single archive folder to, e.g. ./example.com_archive"
# )
# command = parser.parse_args(args or ())
# stdin_url = None
# url = command.url
# if not url:
# stdin_url = accept_stdin(stdin)
# if (stdin_url and url) or (not stdin and not url):
# stderr(
# '[X] You must pass a URL/path to add via stdin or CLI arguments.\n',
# color='red',
# )
# raise SystemExit(2)
# oneshot(
# url=stdin_url or url,
# out_dir=Path(command.out_dir).resolve(),
# extractors=command.extract,
# )
# if __name__ == '__main__':
# main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -8,10 +8,93 @@ import argparse
from pathlib import Path
from typing import Optional, List, IO
from django.db.models import QuerySet
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
from ..main import remove
from archivebox.index.schema import Link
def remove(filter_str: Optional[str]=None,
filter_patterns: Optional[list[str]]=None,
filter_type: str='exact',
snapshots: Optional[QuerySet]=None,
after: Optional[float]=None,
before: Optional[float]=None,
yes: bool=False,
delete: bool=False,
out_dir: Path=DATA_DIR) -> list[Link]:
"""Remove the specified URLs from the archive"""
check_data_folder()
if snapshots is None:
if filter_str and filter_patterns:
stderr(
'[X] You should pass either a pattern as an argument, '
'or pass a list of patterns via stdin, but not both.\n',
color='red',
)
raise SystemExit(2)
elif not (filter_str or filter_patterns):
stderr(
'[X] You should pass either a pattern as an argument, '
'or pass a list of patterns via stdin.',
color='red',
)
stderr()
hint(('To remove all urls you can run:',
'archivebox remove --filter-type=regex ".*"'))
stderr()
raise SystemExit(2)
elif filter_str:
filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
list_kwargs = {
"filter_patterns": filter_patterns,
"filter_type": filter_type,
"after": after,
"before": before,
}
if snapshots:
list_kwargs["snapshots"] = snapshots
log_list_started(filter_patterns, filter_type)
timer = TimedProgress(360, prefix=' ')
try:
snapshots = list_links(**list_kwargs)
finally:
timer.end()
if not snapshots.exists():
log_removal_finished(0, 0)
raise SystemExit(1)
log_links = [link.as_link() for link in snapshots]
log_list_finished(log_links)
log_removal_started(log_links, yes=yes, delete=delete)
timer = TimedProgress(360, prefix=' ')
try:
for snapshot in snapshots:
if delete:
shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
finally:
timer.end()
to_remove = snapshots.count()
from .search import flush_search_index
flush_search_index(snapshots=snapshots)
remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
all_snapshots = load_main_index(out_dir=out_dir)
log_removal_finished(all_snapshots.count(), to_remove)
return all_snapshots
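# Usage sketch (run from inside a collection DATA_DIR; the URL is just an example):
#   remove(filter_str='https://example.com/page', yes=True, delete=True)
# or the equivalent CLI call:
#   archivebox remove --yes --delete https://example.com/page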
@docstring(remove.__doc__)

View file

@ -11,7 +11,139 @@ from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import schedule
from archivebox.config.common import ARCHIVING_CONFIG
# @enforce_types
def schedule(add: bool=False,
show: bool=False,
clear: bool=False,
foreground: bool=False,
run_all: bool=False,
quiet: bool=False,
every: Optional[str]=None,
tag: str='',
depth: int=0,
overwrite: bool=False,
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
import_path: Optional[str]=None,
out_dir: Path=DATA_DIR):
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
check_data_folder()
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
from archivebox.config.permissions import USER
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
cron = CronTab(user=True)
cron = dedupe_cron_jobs(cron)
if clear:
print(cron.remove_all(comment=CRON_COMMENT))
cron.write()
raise SystemExit(0)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
if every or add:
every = every or 'day'
quoted = lambda s: f'"{s}"' if (s and ' ' in str(s)) else str(s)
cmd = [
'cd',
quoted(out_dir),
'&&',
quoted(ARCHIVEBOX_BINARY.load().abspath),
*([
'add',
*(['--overwrite'] if overwrite else []),
*(['--update'] if update else []),
*([f'--tag={tag}'] if tag else []),
f'--depth={depth}',
f'"{import_path}"',
] if import_path else ['update']),
'>>',
quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
'2>&1',
]
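# The assembled crontab entry ends up looking roughly like this (paths are illustrative):
#   cd "/data" && "/usr/local/bin/archivebox" add --depth=1 "https://example.com/feed.xml" >> /data/logs/schedule.log 2>&1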
new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
if every in ('minute', 'hour', 'day', 'month', 'year'):
set_every = getattr(new_job.every(), every)
set_every()
elif CronSlices.is_valid(every):
new_job.setall(every)
else:
stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI))
stderr(' It must be one of minute/hour/day/month/year')
stderr(' or a quoted cron-format schedule like:')
stderr(' archivebox schedule --every=day --depth=1 https://example.com/some/rss/feed.xml')
stderr(' archivebox schedule --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml')
raise SystemExit(1)
cron = dedupe_cron_jobs(cron)
cron.write()
total_runs = sum(j.frequency_per_year() for j in cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
print()
print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
if total_runs > 60 and not quiet:
stderr()
stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI))
stderr(' Congrats on being an enthusiastic internet archiver! 👌')
stderr()
stderr(' Make sure you have enough storage space available to hold all the data.')
stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
stderr('')
elif show:
if existing_jobs:
print('\n'.join(str(cmd) for cmd in existing_jobs))
else:
stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI))
stderr(' To schedule a new job, run:')
stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
raise SystemExit(0)
cron = CronTab(user=True)
cron = dedupe_cron_jobs(cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
if foreground or run_all:
if not existing_jobs:
stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
raise SystemExit(1)
print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI))
if run_all:
try:
for job in existing_jobs:
sys.stdout.write(f' > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n')
sys.stdout.write(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
sys.stdout.flush()
job.run()
sys.stdout.write(f'\r{job.command.split("/archivebox ")[-1]}\n')
except KeyboardInterrupt:
print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
raise SystemExit(1)
if foreground:
try:
for job in existing_jobs:
print(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
for result in cron.run_scheduler():
print(result)
except KeyboardInterrupt:
print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
raise SystemExit(1)
# if CAN_UPGRADE:
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
@docstring(schedule.__doc__)

View file

@ -0,0 +1,164 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox search'
from pathlib import Path
from typing import Optional, List, Iterable
import rich_click as click
from rich import print
from django.db.models import QuerySet
from archivebox.config import DATA_DIR
from archivebox.index import LINK_FILTERS
from archivebox.index.schema import Link
from archivebox.misc.logging import stderr
from archivebox.misc.util import enforce_types, docstring
STATUS_CHOICES = [
'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
]
def list_links(snapshots: Optional[QuerySet]=None,
filter_patterns: Optional[List[str]]=None,
filter_type: str='substring',
after: Optional[float]=None,
before: Optional[float]=None,
out_dir: Path=DATA_DIR) -> Iterable[Link]:
from archivebox.index import load_main_index
from archivebox.index import snapshot_filter
if snapshots:
all_snapshots = snapshots
else:
all_snapshots = load_main_index(out_dir=out_dir)
if after is not None:
all_snapshots = all_snapshots.filter(timestamp__gte=after)
if before is not None:
all_snapshots = all_snapshots.filter(timestamp__lt=before)
if filter_patterns:
all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
if not all_snapshots:
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
return all_snapshots
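# Usage sketch (assumes a valid collection in DATA_DIR; the pattern and timestamp are examples):
#   snapshots = list_links(filter_patterns=['example.com'], filter_type='substring', after=1700000000.0)
#   for snapshot in snapshots:
#       print(snapshot.url)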
def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict[str, Link | None]:
from archivebox.misc.checks import check_data_folder
from archivebox.index import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)
check_data_folder()
STATUS_FUNCTIONS = {
"indexed": get_indexed_folders,
"archived": get_archived_folders,
"unarchived": get_unarchived_folders,
"present": get_present_folders,
"valid": get_valid_folders,
"invalid": get_invalid_folders,
"duplicate": get_duplicate_folders,
"orphaned": get_orphaned_folders,
"corrupted": get_corrupted_folders,
"unrecognized": get_unrecognized_folders,
}
try:
return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
except KeyError:
raise ValueError('Status not recognized.')
@enforce_types
def search(filter_patterns: list[str] | None=None,
filter_type: str='substring',
status: str='indexed',
before: float | None=None,
after: float | None=None,
sort: str | None=None,
json: bool=False,
html: bool=False,
csv: str | None=None,
with_headers: bool=False):
"""List, filter, and export information about archive entries"""
if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
raise SystemExit(2)
snapshots = list_links(
filter_patterns=list(filter_patterns) if filter_patterns else None,
filter_type=filter_type,
before=before,
after=after,
)
if sort:
snapshots = snapshots.order_by(sort)
folders = list_folders(
links=snapshots,
status=status,
out_dir=DATA_DIR,
)
if json:
from archivebox.index.json import generate_json_index_from_links
output = generate_json_index_from_links(folders.values(), with_headers)
elif html:
from archivebox.index.html import generate_index_from_links
output = generate_index_from_links(folders.values(), with_headers)
elif csv:
from archivebox.index.csv import links_to_csv
output = links_to_csv(folders.values(), csv.split(','), with_headers)
else:
from archivebox.misc.logging_util import printable_folders
output = printable_folders(folders, with_headers)
print(output)
return output
@click.command()
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
@click.help_option('--help', '-h')
@click.argument('filter_patterns', nargs=-1)
@docstring(search.__doc__)
def main(**kwargs):
return search(**kwargs)
if __name__ == '__main__':
main()
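# Example invocations of the command defined above (field names are illustrative):
#   archivebox search --status=archived --csv=timestamp,url --with-headers example.com
#   archivebox search --filter-type=regex --json '.*\.pdf$'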

View file

@ -12,7 +12,81 @@ from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import server
# @enforce_types
def server(runserver_args: Optional[List[str]]=None,
reload: bool=False,
debug: bool=False,
init: bool=False,
quick_init: bool=False,
createsuperuser: bool=False,
daemonize: bool=False,
out_dir: Path=DATA_DIR) -> None:
"""Run the ArchiveBox HTTP server"""
from rich import print
runserver_args = runserver_args or []
if init:
run_subcommand('init', stdin=None, pwd=out_dir)
print()
elif quick_init:
run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
print()
if createsuperuser:
run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
print()
check_data_folder()
from django.core.management import call_command
from django.contrib.auth.models import User
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
print()
# print('[yellow][!] No admin accounts exist, you must create one to be able to log in to the Admin UI![/yellow]')
print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
print(' [green]archivebox manage createsuperuser[/green]')
print()
host = '127.0.0.1'
port = '8000'
try:
host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
if ':' in host_and_port:
host, port = host_and_port.split(':')
else:
if '.' in host_and_port:
host = host_and_port
else:
port = host_and_port
except IndexError:
pass
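# e.g. runserver_args=['0.0.0.0:8000'] -> host='0.0.0.0', port='8000';
#      ['192.168.1.10'] -> host only; ['8080'] -> port only (illustrative inputs)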
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if SHELL_CONFIG.DEBUG:
if not reload:
runserver_args.append('--noreload') # '--insecure'
call_command("runserver", *runserver_args)
else:
from workers.supervisord_util import start_server_workers
print()
start_server_workers(host=host, port=port, daemonize=False)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
@docstring(server.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:

View file

@ -11,7 +11,19 @@ from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import shell
#@enforce_types
def shell(out_dir: Path=DATA_DIR) -> None:
"""Enter an interactive ArchiveBox Django shell"""
check_data_folder()
from django.core.management import call_command
call_command("shell_plus")
@docstring(shell.__doc__)

View file

@ -8,10 +8,114 @@ import argparse
from pathlib import Path
from typing import Optional, List, IO
from rich import print
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import status
# @enforce_types
def status(out_dir: Path=DATA_DIR) -> None:
"""Print out some info and statistics about the archive collection"""
check_data_folder()
from core.models import Snapshot
from django.contrib.auth import get_user_model
User = get_user_model()
print('{green}[*] Scanning archive main index...{reset}'.format(**SHELL_CONFIG.ANSI))
print(SHELL_CONFIG.ANSI['lightyellow'], f' {out_dir}/*', SHELL_CONFIG.ANSI['reset'])
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
size = printable_filesize(num_bytes)
print(f' Index size: {size} across {num_files} files')
print()
links = load_main_index(out_dir=out_dir)
num_sql_links = links.count()
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
print()
print('{green}[*] Scanning archive data directories...{reset}'.format(**SHELL_CONFIG.ANSI))
print(SHELL_CONFIG.ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', SHELL_CONFIG.ANSI['reset'])
num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
print(SHELL_CONFIG.ANSI['black'])
num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
num_archived = len(get_archived_folders(links, out_dir=out_dir))
num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
num_present = len(get_present_folders(links, out_dir=out_dir))
num_valid = len(get_valid_folders(links, out_dir=out_dir))
print()
print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
duplicate = get_duplicate_folders(links, out_dir=out_dir)
orphaned = get_orphaned_folders(links, out_dir=out_dir)
corrupted = get_corrupted_folders(links, out_dir=out_dir)
unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
print(SHELL_CONFIG.ANSI['reset'])
if num_indexed:
print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**SHELL_CONFIG.ANSI))
print(' archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)')
if orphaned:
print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**SHELL_CONFIG.ANSI))
print(' archivebox init')
if num_invalid:
print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**SHELL_CONFIG.ANSI))
print(' archivebox init')
print()
print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**SHELL_CONFIG.ANSI))
print(SHELL_CONFIG.ANSI['lightyellow'], f' {CONSTANTS.LOGS_DIR}/*', SHELL_CONFIG.ANSI['reset'])
users = get_admins().values_list('username', flat=True)
print(f' UI users {len(users)}: {", ".join(users)}')
last_login = User.objects.order_by('last_login').last()
if last_login:
print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}')
last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
if last_downloaded:
print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')
if not users:
print()
print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**SHELL_CONFIG.ANSI))
print(' archivebox manage createsuperuser')
print()
for snapshot in links.order_by('-downloaded_at')[:10]:
if not snapshot.downloaded_at:
continue
print(
SHELL_CONFIG.ANSI['black'],
(
f' > {str(snapshot.downloaded_at)[:16]} '
f'[{snapshot.num_outputs} {("X", "")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
f'"{snapshot.title}": {snapshot.url}'
)[:SHELL_CONFIG.TERM_WIDTH],
SHELL_CONFIG.ANSI['reset'],
)
print(SHELL_CONFIG.ANSI['black'], ' ...', SHELL_CONFIG.ANSI['reset'])
@docstring(status.__doc__)

View file

@ -24,7 +24,92 @@ from archivebox.index import (
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
# from ..main import update
# LEGACY VERSION:
# @enforce_types
# def update(resume: Optional[float]=None,
# only_new: bool=ARCHIVING_CONFIG.ONLY_NEW,
# index_only: bool=False,
# overwrite: bool=False,
# filter_patterns_str: Optional[str]=None,
# filter_patterns: Optional[List[str]]=None,
# filter_type: Optional[str]=None,
# status: Optional[str]=None,
# after: Optional[str]=None,
# before: Optional[str]=None,
# extractors: str="",
# out_dir: Path=DATA_DIR) -> List[Link]:
# """Import any new links from subscriptions and retry any previously failed/skipped links"""
# from core.models import ArchiveResult
# from .search import index_links
# # from workers.supervisord_util import start_cli_workers
# check_data_folder()
# # start_cli_workers()
# new_links: List[Link] = [] # TODO: Remove input argument: only_new
# extractors = extractors.split(",") if extractors else []
# # Step 1: Filter for selected_links
# print('[*] Finding matching Snapshots to update...')
# print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
# matching_snapshots = list_links(
# filter_patterns=filter_patterns,
# filter_type=filter_type,
# before=before,
# after=after,
# )
# print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
# matching_folders = list_folders(
# links=matching_snapshots,
# status=status,
# out_dir=out_dir,
# )
# all_links = (link for link in matching_folders.values() if link)
# print(' - Sorting by most unfinished -> least unfinished + date archived...')
# all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
# if index_only:
# for link in all_links:
# write_link_details(link, out_dir=out_dir, skip_sql_index=True)
# index_links(all_links, out_dir=out_dir)
# return all_links
# # Step 2: Run the archive methods for each link
# to_archive = new_links if only_new else all_links
# if resume:
# to_archive = [
# link for link in to_archive
# if link.timestamp >= str(resume)
# ]
# if not to_archive:
# stderr('')
# stderr(f'[√] Nothing found to resume after {resume}', color='green')
# return all_links
# archive_kwargs = {
# "out_dir": out_dir,
# }
# if extractors:
# archive_kwargs["methods"] = extractors
# archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
# # Step 4: Re-write links index with updated titles, icons, and resources
# all_links = load_main_index(out_dir=out_dir)
# return all_links
def update():
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
from archivebox.config.django import setup_django
setup_django()

View file

@ -1,61 +1,207 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox version'
import sys
import argparse
from pathlib import Path
from typing import Optional, List, IO
# from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR, VERSION
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
# @docstring(version.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
"""Print the ArchiveBox version and dependency information"""
parser = argparse.ArgumentParser(
prog=__command__,
description="Print the ArchiveBox version and dependency information",
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'--quiet', '-q',
action='store_true',
help='Only print ArchiveBox version number and nothing else.',
)
parser.add_argument(
'--binproviders', '-p',
type=str,
help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)',
default=None,
)
parser.add_argument(
'--binaries', '-b',
type=str,
help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)',
default=None,
)
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
# for speed reasons, check if quiet flag was set and just return simple version immediately if so
if command.quiet:
print(VERSION)
return
# otherwise do big expensive import to get the full version
from ..main import version
version(
quiet=command.quiet,
out_dir=Path(pwd) if pwd else DATA_DIR,
binproviders=command.binproviders.split(',') if command.binproviders else None,
binaries=command.binaries.split(',') if command.binaries else None,
)
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
import sys
from typing import Iterable
import rich_click as click
from archivebox.misc.util import docstring, enforce_types
@enforce_types
def version(quiet: bool=False,
binproviders: Iterable[str]=(),
binaries: Iterable[str]=()) -> list[str]:
"""Print the ArchiveBox version, debug metadata, and installed dependency versions"""
# version.__doc__,
# fast path for just getting the version and exiting, dont do any slower imports
from archivebox.config.version import VERSION
print(VERSION)
if quiet or '--version' in sys.argv:
return []
# Only do slower imports when getting full version info
import os
import platform
from pathlib import Path
from rich.panel import Panel
from rich.console import Console
from abx_pkg import Binary
import abx
import archivebox
from archivebox.config import CONSTANTS, DATA_DIR
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
from archivebox.config.paths import get_data_locations, get_code_locations
from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.misc.logging_util import printable_folder_status
from abx_plugin_default_binproviders import apt, brew, env
console = Console()
prnt = console.print
LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED
# 0.7.1
# ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
# IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
# FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
# DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
p = platform.uname()
COMMIT_HASH = get_COMMIT_HASH()
prnt(
'[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
f'BUILD_TIME={get_BUILD_TIME()}',
) )
prnt(
f'IN_DOCKER={IN_DOCKER}',
f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
f'ARCH={p.machine}',
f'OS={p.system}',
f'PLATFORM={platform.platform()}',
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
)
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
)
prnt(
f'DEBUG={SHELL_CONFIG.DEBUG}',
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
f'SUDO={CONSTANTS.IS_ROOT}',
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
f'LDAP={LDAP_ENABLED}',
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
)
prnt()
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
PANEL_TEXT = '\n'.join((
# '',
# f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
'',
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
'',
' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
'',
))
prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
prnt()
return []
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
failures = []
BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
for name, binary in list(BINARIES.items()):
if binary.name == 'archivebox':
continue
# skip if the binary is not in the requested list of binaries
if binaries and binary.name not in binaries:
continue
# skip if the binary is not supported by any of the requested binproviders
if binproviders and binary.binproviders_supported and not any(provider.name in binproviders for provider in binary.binproviders_supported):
continue
err = None
try:
loaded_bin = binary.load()
except Exception as e:
err = e
loaded_bin = binary
provider_summary = f'[dark_sea_green3]{loaded_bin.binprovider.name.ljust(10)}[/dark_sea_green3]' if loaded_bin.binprovider else '[grey23]not found[/grey23] '
if loaded_bin.abspath:
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
if ' ' in abspath:
abspath = abspath.replace(' ', r'\ ')
else:
abspath = f'[red]{err}[/red]'
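# e.g. (hypothetical paths): /home/me/data/lib/bin/wget -> ./lib/bin/wget,
#      /home/me/.local/bin/wget -> ~/.local/bin/wget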
prnt('', '[green]√[/green]' if loaded_bin.is_valid else '[red]X[/red]', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(12), provider_summary, abspath, overflow='ignore', crop=False)
if not loaded_bin.is_valid:
failures.append(loaded_bin.name)
prnt()
prnt('[gold3][i] Package Managers:[/gold3]')
BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())
for name, binprovider in list(BINPROVIDERS.items()):
err = None
if binproviders and binprovider.name not in binproviders:
continue
# TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN
loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
abspath = None
if loaded_bin.abspath:
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
if ' ' in abspath:
abspath = abspath.replace(' ', r'\ ')
PATH = str(binprovider.PATH).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
ownership_summary = f'UID=[blue]{str(binprovider.EUID).ljust(4)}[/blue]'
provider_summary = f'[dark_sea_green3]{str(abspath).ljust(52)}[/dark_sea_green3]' if abspath else f'[grey23]{"not available".ljust(52)}[/grey23]'
prnt('', '[green]√[/green]' if binprovider.is_valid else '[grey53]-[/grey53]', '', binprovider.name.ljust(11), provider_summary, ownership_summary, f'PATH={PATH}', overflow='ellipsis', soft_wrap=True)
if not (binaries or binproviders):
# dont show source code / data dir info if we just want to get version info for a binary or binprovider
prnt()
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
for name, path in get_code_locations().items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
prnt()
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
for name, path in get_data_locations().items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
from archivebox.misc.checks import check_data_dir_permissions
check_data_dir_permissions()
else:
prnt()
prnt('[red][i] Data locations:[/red] (not in a data directory)')
prnt()
if failures:
prnt('[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]')
prnt(f' [red]{", ".join(failures)}[/red]')
prnt()
prnt('[violet]Hint:[/violet] To install missing binaries automatically, run:')
prnt(' [green]archivebox install[/green]')
prnt()
return failures
@click.command()
@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
@click.option('--binproviders', '-p', help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)')
@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
@docstring(version.__doc__)
def main(**kwargs):
failures = version(**kwargs)
if failures:
raise SystemExit(1)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)
main()

View file

@ -60,7 +60,7 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
return
with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=False)
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=True)
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission

View file

@ -142,7 +142,7 @@ def create_and_chown_dir(dir_path: Path) -> None:
os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &')
@cache
def get_or_create_working_tmp_dir(autofix=True, quiet=False):
def get_or_create_working_tmp_dir(autofix=True, quiet=True):
from archivebox import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.checks import check_tmp_dir
@ -165,7 +165,7 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=False):
pass
if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
if autofix and STORAGE_CONFIG.TMP_DIR != candidate:
STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet)
STORAGE_CONFIG.update_in_place(TMP_DIR=candidate)
return candidate
if not quiet:
@ -193,7 +193,7 @@ def get_or_create_working_lib_dir(autofix=True, quiet=False):
pass
if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True):
if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet)
STORAGE_CONFIG.update_in_place(LIB_DIR=candidate)
return candidate
if not quiet:

View file

@ -36,6 +36,8 @@ HOSTNAME: str = max([socket.gethostname(), platform.node()], key=len)
IS_ROOT = RUNNING_AS_UID == 0
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
# IN_DOCKER_COMPOSE = # TODO: figure out a way to detect if running in docker compose
FALLBACK_UID = RUNNING_AS_UID or SUDO_UID
FALLBACK_GID = RUNNING_AS_GID or SUDO_GID

View file

@ -303,7 +303,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
"Exit Status": [], "Exit Status": [],
} }
from workers.supervisor_util import get_existing_supervisord_process from workers.supervisord_util import get_existing_supervisord_process
supervisor = get_existing_supervisord_process() supervisor = get_existing_supervisord_process()
if supervisor is None: if supervisor is None:
@ -373,7 +373,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert request.user.is_superuser, "Must be a superuser to view configuration settings."
from workers.supervisor_util import get_existing_supervisord_process, get_worker, get_sock_file, CONFIG_FILE_NAME
from workers.supervisord_util import get_existing_supervisord_process, get_worker, get_sock_file, CONFIG_FILE_NAME
SOCK_FILE = get_sock_file()
CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME

View file

@ -21,7 +21,6 @@ from archivebox.misc.logging_util import printable_filesize
from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.index.html import snapshot_icons
from archivebox.extractors import archive_links
from archivebox.main import remove
from archivebox.base_models.admin import ABIDModelAdmin
from archivebox.workers.tasks import bg_archive_links, bg_add
@ -321,7 +320,9 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
description="☠️ Delete" description="☠️ Delete"
) )
def delete_snapshots(self, request, queryset): def delete_snapshots(self, request, queryset):
from archivebox.cli.archivebox_remove import remove
remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR) remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR)
messages.success( messages.success(
request, request,
mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."), mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),

File diff suppressed because it is too large

View file

@ -24,7 +24,7 @@ def check_data_folder() -> None:
from archivebox.config import CONSTANTS
from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir
archive_dir_exists = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()
archive_dir_exists = os.path.isdir(ARCHIVE_DIR)
if not archive_dir_exists:
print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
print(f' {DATA_DIR}', file=sys.stderr)

View file

@ -12,7 +12,7 @@ from pathlib import Path
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING
from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING
if TYPE_CHECKING:
from ..index.schema import Link, ArchiveResult
@ -228,7 +228,7 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
print()
def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str='.'):
def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: str | IO | None=None, pwd: str='.'):
args = ' '.join(subcommand_args)
version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),

View file

@ -20,11 +20,9 @@ from datetime import datetime, timedelta # noqa
from django.conf import settings # noqa
from archivebox import CONSTANTS # noqa
from ..main import * # noqa
from archivebox.cli import * # noqa
from ..cli import CLI_SUBCOMMANDS
CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
CLI_COMMAND_NAMES = ", ".join(CLI_SUBCOMMANDS.keys())
if __name__ == '__main__':
# load the rich extension for ipython for pretty printing
@ -40,7 +38,7 @@ if __name__ == '__main__':
prnt('[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django, abx[/]')
prnt('[yellow4]# ArchiveBox Imports[/]')
prnt('[yellow4]import archivebox[/]')
prnt('[yellow4]from archivebox.main import {}[/]'.format(CLI_COMMAND_NAMES))
prnt('[yellow4]from archivebox.cli import *[/]')
prnt()
if console.width >= 80:

View file

@ -459,8 +459,8 @@ def load_plugins(plugins: Iterable[PluginId | ModuleType | Type] | Dict[PluginId
PLUGINS_TO_LOAD = sorted(PLUGINS_TO_LOAD, key=lambda x: x['order'])
for plugin_info in PLUGINS_TO_LOAD:
if '--version' not in sys.argv and '--help' not in sys.argv:
# if '--version' not in sys.argv and '--help' not in sys.argv:
print(f'🧩 Loading plugin: {plugin_info["id"]}...', end='\r', flush=True, file=sys.stderr)
# print(f'🧩 Loading plugin: {plugin_info["id"]}...', end='\r', flush=True, file=sys.stderr)
pm.register(plugin_info['module'])
LOADED_PLUGINS[plugin_info['id']] = plugin_info
# print('\x1b[2K', end='\r', flush=True, file=sys.stderr)

View file

@@ -1,103 +1,103 @@
(every line of this file was commented out in this commit; the original contents are shown below, and the new side is identical except each line gains a `# ` prefix)

import uuid
from functools import wraps

from django.db import connection, transaction
from django.utils import timezone

from huey.exceptions import TaskLockedException

from archivebox.config import CONSTANTS


class SqliteSemaphore:
    def __init__(self, db_path, table_name, name, value=1, timeout=None):
        self.db_path = db_path
        self.table_name = table_name
        self.name = name
        self.value = value
        self.timeout = timeout or 86400  # Set a max age for lock holders

        # Ensure the table exists
        with connection.cursor() as cursor:
            cursor.execute(f"""
                CREATE TABLE IF NOT EXISTS {self.table_name} (
                    id TEXT PRIMARY KEY,
                    name TEXT,
                    timestamp DATETIME
                )
            """)

    def acquire(self, name=None):
        name = name or str(uuid.uuid4())
        now = timezone.now()
        expiration = now - timezone.timedelta(seconds=self.timeout)

        with transaction.atomic():
            # Remove expired locks
            with connection.cursor() as cursor:
                cursor.execute(f"""
                    DELETE FROM {self.table_name}
                    WHERE name = %s AND timestamp < %s
                """, [self.name, expiration])

            # Try to acquire the lock
            with connection.cursor() as cursor:
                cursor.execute(f"""
                    INSERT INTO {self.table_name} (id, name, timestamp)
                    SELECT %s, %s, %s
                    WHERE (
                        SELECT COUNT(*) FROM {self.table_name}
                        WHERE name = %s
                    ) < %s
                """, [name, self.name, now, self.name, self.value])

                if cursor.rowcount > 0:
                    return name

            # If we couldn't acquire the lock, remove our attempted entry
            with connection.cursor() as cursor:
                cursor.execute(f"""
                    DELETE FROM {self.table_name}
                    WHERE id = %s AND name = %s
                """, [name, self.name])

        return None

    def release(self, name):
        with connection.cursor() as cursor:
            cursor.execute(f"""
                DELETE FROM {self.table_name}
                WHERE id = %s AND name = %s
            """, [name, self.name])
            return cursor.rowcount > 0


LOCKS_DB_PATH = CONSTANTS.DATABASE_FILE.parent / 'locks.sqlite3'


def lock_task_semaphore(db_path, table_name, lock_name, value=1, timeout=None):
    """
    Lock which can be acquired multiple times (default = 1).

    NOTE: no provisions are made for blocking, waiting, or notifying. This is
    just a lock which can be acquired a configurable number of times.

    Example:
        # Allow up to 3 workers to run this task concurrently. If the task is
        # locked, retry up to 2 times with a delay of 60s.
        @huey.task(retries=2, retry_delay=60)
        @lock_task_semaphore('path/to/db.sqlite3', 'semaphore_locks', 'my-lock', 3)
        def my_task():
            ...
    """
    sem = SqliteSemaphore(db_path, table_name, lock_name, value, timeout)

    def decorator(fn):
        @wraps(fn)
        def inner(*args, **kwargs):
            tid = sem.acquire()
            if tid is None:
                raise TaskLockedException(f'unable to acquire lock {lock_name}')
            try:
                return fn(*args, **kwargs)
            finally:
                sem.release(tid)
        return inner
    return decorator
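A minimal sketch of exercising the (now commented-out) semaphore directly, outside of huey, assuming Django is already configured and the class above is importable; the table and lock names here are hypothetical. One design quirk worth noting from the code above: the constructor accepts a db_path, but every query actually runs on Django's default connection.

# hedged sketch: assumes django.setup() has run; names are illustrative only
sem = SqliteSemaphore(LOCKS_DB_PATH, 'semaphore_locks', 'chrome-sessions', value=2)

holder_id = sem.acquire()       # uuid token on success, None if 2 holders already exist
if holder_id is not None:
    try:
        ...                     # do the rate-limited work here
    finally:
        sem.release(holder_id)  # always return the slot, even on error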


@@ -8,7 +8,7 @@ from django_huey import db_task, task
 from huey_monitor.models import TaskModel
 from huey_monitor.tqdm import ProcessInfo

-from .supervisor_util import get_or_create_supervisord_process
+from .supervisord_util import get_or_create_supervisord_process

 # @db_task(queue="commands", context=True, schedule=1)
 # def scheduler_tick():
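For context, the imports kept by this hunk are typically combined along the lines of the sketch below; the queue name matches the commented-out decorator above, but the task itself is hypothetical and the progress calls follow huey-monitor's documented ProcessInfo API:

from django_huey import db_task
from huey_monitor.tqdm import ProcessInfo

@db_task(queue='commands', context=True)   # context=True passes the huey Task in as `task`
def example_archive_task(url, task=None):
    info = ProcessInfo(task, desc='archiving', total=1)  # records progress on the TaskModel row
    ...                                                  # do the actual work
    info.update(n=1)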


@@ -115,6 +115,8 @@ dependencies = [
     "abx-plugin-mercury>=2024.10.28",
     "abx-plugin-htmltotext>=2024.10.28",
     "python-statemachine>=2.3.6",
+    "click>=8.1.7",
+    "rich-click>=1.8.4",
 ]

 [project.optional-dependencies]
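Since these two new dependencies are the heart of the "switch to using click" in this commit, a minimal sketch of the intended pattern may help; rich_click is a drop-in wrapper around click that adds rich-formatted --help output, though the subcommand shown here is illustrative rather than ArchiveBox's real CLI:

import rich_click as click  # drop-in replacement: same click API, rich-formatted help

@click.group()
def cli():
    """ArchiveBox-style CLI entrypoint (hypothetical)."""

@cli.command()
@click.argument('urls', nargs=-1)
@click.option('--depth', type=int, default=0, help='Recursively archive linked pages')
def add(urls, depth):
    """Add URLs to the archive."""
    for url in urls:
        click.echo(f'adding {url} (depth={depth})')

if __name__ == '__main__':
    cli()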

uv.lock

@@ -658,6 +658,7 @@ dependencies = [
     { name = "atomicwrites", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "base32-crockford", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "channels", extra = ["daphne"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "croniter", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "dateparser", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "django", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -688,6 +689,7 @@ dependencies = [
     { name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "rich-argparse", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "rich-click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "sonic-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "supervisor", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -784,6 +786,7 @@ requires-dist = [
     { name = "atomicwrites", specifier = "==1.4.1" },
     { name = "base32-crockford", specifier = "==0.3.0" },
     { name = "channels", extras = ["daphne"], specifier = ">=4.1.0" },
+    { name = "click", specifier = ">=8.1.7" },
     { name = "croniter", specifier = ">=3.0.3" },
     { name = "dateparser", specifier = ">=1.2.0" },
     { name = "django", specifier = ">=5.1.1,<6.0" },
@@ -821,6 +824,7 @@ requires-dist = [
     { name = "requests-tracker", marker = "extra == 'debug'", specifier = ">=0.3.3" },
     { name = "rich", specifier = ">=13.8.0" },
     { name = "rich-argparse", specifier = ">=1.5.2" },
+    { name = "rich-click", specifier = ">=1.8.4" },
     { name = "setuptools", specifier = ">=74.1.0" },
     { name = "sonic-client", specifier = ">=1.0.0" },
     { name = "supervisor", specifier = ">=4.2.5" },
@@ -2806,6 +2810,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/25/45/54b95bb72bb17c27a7252bee5034955020b5869a33918b660ffc29cbf608/rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7", size = 20072 },
 ]

+[[package]]
+name = "rich-click"
+version = "1.8.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/fc/f4/e48dc2850662526a26fb0961aacb0162c6feab934312b109b748ae4efee2/rich_click-1.8.4.tar.gz", hash = "sha256:0f49471f04439269d0e66a6f43120f52d11d594869a2a0be600cfb12eb0616b9", size = 38247 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/84/f3/72f93d8494ee641bde76bfe1208cf4abc44c6f9448673762f6077bc162d6/rich_click-1.8.4-py3-none-any.whl", hash = "sha256:2d2841b3cebe610d5682baa1194beaf78ab00c4fa31931533261b5eba2ee80b7", size = 35071 },
+]
+
 [[package]]
 name = "ruff"
 version = "0.7.4"