Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2024-11-21 11:43:07 +00:00)

Commit message: move main funcs into cli files and switch to using click for CLI
Parent: 569081a9eb
Commit: 328eb98a38
35 changed files with 1885 additions and 2296 deletions
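The core pattern introduced by this commit is a click Group that imports each subcommand module only when it is actually invoked. The snippet below is a minimal, self-contained sketch of that lazy-loading idea, not the project's actual code; names like COMMAND_MODULES and demo are made up for illustration, and it assumes each referenced module exposes a `main` that is itself a click command:

    import rich_click as click
    from importlib import import_module

    # hypothetical mapping of subcommand name -> dotted import path
    COMMAND_MODULES = {'add': 'mypkg.cli_add', 'version': 'mypkg.cli_version'}

    class LazyGroup(click.Group):
        def get_command(self, ctx, cmd_name):
            # import the module only when the subcommand is requested,
            # so startup and `--help` stay fast
            if cmd_name in COMMAND_MODULES:
                return getattr(import_module(COMMAND_MODULES[cmd_name]), 'main')
            return super().get_command(ctx, cmd_name)

    @click.group(cls=LazyGroup)
    def demo():
        """Example entrypoint using the same lazy-dispatch approach."""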
@@ -51,6 +1,7 @@ from .pkgs import load_vendored_pkgs  # noqa
load_vendored_pkgs()
# print('DONE LOADING VENDORED LIBRARIES')

# print('LOADING ABX PLUGIN SPECIFICATIONS')
# Load ABX Plugin Specifications + Default Implementations
import abx  # noqa
import abx_spec_archivebox  # noqa

@@ -74,7 +75,7 @@ abx.pm.register(abx_spec_searchbackend.PLUGIN_SPEC())
# Cast to ArchiveBoxPluginSpec to enable static type checking of pm.hook.call() methods
abx.pm = cast(abx.ABXPluginManager[abx_spec_archivebox.ArchiveBoxPluginSpec], abx.pm)
pm = abx.pm

# print('DONE LOADING ABX PLUGIN SPECIFICATIONS')

# Load all pip-installed ABX-compatible plugins
ABX_ECOSYSTEM_PLUGINS = abx.get_pip_installed_plugins(group='abx')

@@ -94,7 +95,9 @@ USER_PLUGINS = abx.find_plugins_in_dir(Path(os.getcwd()) / 'user_plugins')

# Import all plugins and register them with ABX Plugin Manager
ALL_PLUGINS = {**ABX_ECOSYSTEM_PLUGINS, **ARCHIVEBOX_BUILTIN_PLUGINS, **USER_PLUGINS}
# print('LOADING ALL PLUGINS')
LOADED_PLUGINS = abx.load_plugins(ALL_PLUGINS)
# print('DONE LOADING ALL PLUGINS')

# Setup basic config, constants, paths, and version
from .config.constants import CONSTANTS  # noqa

@@ -1,5 +1,5 @@
#!/usr/bin/env python3
"""This is the main entry point for the ArchiveBox CLI."""
"""This is the entrypoint for python -m archivebox ..."""
__package__ = 'archivebox'

import archivebox  # noqa  # make sure monkey patches are applied before anything else

@@ -15,5 +15,4 @@ ASCII_LOGO_MINI = r"""
/_/ \_\_| \___|_| |_|_| \_/ \___|____/ \___/_/\_\
"""

if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
    main(args=sys.argv[1:], stdin=sys.stdin)

@@ -6,13 +6,6 @@ from enum import Enum

from ninja import Router, Schema

from archivebox.main import (
    add,
    remove,
    update,
    list_all,
    schedule,
)
from archivebox.misc.util import ansi_to_html
from archivebox.config.common import ARCHIVING_CONFIG

@@ -60,13 +53,11 @@ class AddCommandSchema(Schema):
    urls: List[str]
    tag: str = ""
    depth: int = 0
    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
    update_all: bool = False
    index_only: bool = False
    overwrite: bool = False
    init: bool = False
    extractors: str = ""
    parser: str = "auto"
    extract: str = ""
    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
    overwrite: bool = False
    index_only: bool = False

class UpdateCommandSchema(Schema):
    resume: Optional[float] = 0

@@ -93,7 +84,7 @@ class ScheduleCommandSchema(Schema):
class ListCommandSchema(Schema):
    filter_patterns: Optional[List[str]] = ['https://example.com']
    filter_type: str = FilterTypeChoices.substring
    status: Optional[StatusChoices] = StatusChoices.indexed
    status: StatusChoices = StatusChoices.indexed
    after: Optional[float] = 0
    before: Optional[float] = 999999999999999
    sort: str = 'bookmarked_at'

@@ -115,16 +106,16 @@ class RemoveCommandSchema(Schema):

@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
def cli_add(request, args: AddCommandSchema):
    from archivebox.cli.archivebox_add import add

    result = add(
        urls=args.urls,
        tag=args.tag,
        depth=args.depth,
        update=args.update,
        update_all=args.update_all,
        index_only=args.index_only,
        overwrite=args.overwrite,
        init=args.init,
        extractors=args.extractors,
        extract=args.extract,
        parser=args.parser,
    )

@@ -139,6 +130,8 @@ def cli_add(request, args: AddCommandSchema):

@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
def cli_update(request, args: UpdateCommandSchema):
    from archivebox.cli.archivebox_update import update

    result = update(
        resume=args.resume,
        only_new=args.only_new,

@@ -162,6 +155,8 @@ def cli_update(request, args: UpdateCommandSchema):

@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
def cli_schedule(request, args: ScheduleCommandSchema):
    from archivebox.cli.archivebox_schedule import schedule

    result = schedule(
        import_path=args.import_path,
        add=args.add,

@@ -184,9 +179,11 @@ def cli_schedule(request, args: ScheduleCommandSchema):

@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns] (use this endpoint with ?filter_type=search to search for snapshots)')
def cli_list(request, args: ListCommandSchema):
    result = list_all(
@router.post("/search", response=CLICommandResponseSchema, summary='archivebox search [args] [filter_patterns]')
def cli_search(request, args: ListCommandSchema):
    from archivebox.cli.archivebox_search import search

    result = search(
        filter_patterns=args.filter_patterns,
        filter_type=args.filter_type,
        status=args.status,

@@ -221,6 +218,8 @@ def cli_list(request, args: ListCommandSchema):

@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
def cli_remove(request, args: RemoveCommandSchema):
    from archivebox.cli.archivebox_remove import remove

    result = remove(
        yes=True,  # no way to interactively ask for confirmation via API, so we force yes
        delete=args.delete,

@@ -1,264 +1,117 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox'

import os
import sys
import argparse
import threading

from time import sleep
from collections.abc import Mapping

from rich import print

from typing import Optional, List, IO, Union, Iterable
from pathlib import Path

from importlib import import_module

BUILTIN_LIST = list
import rich_click as click
from rich import print

from archivebox.config.version import VERSION

CLI_DIR = Path(__file__).resolve().parent

# rewrite setup -> install for backwards compatibility
if len(sys.argv) > 1 and sys.argv[1] == 'setup':
    from rich import print
    print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead')
    sys.argv[1] = 'install'

if '--debug' in sys.argv:
    os.environ['DEBUG'] = 'True'
    sys.argv.remove('--debug')

# def list_subcommands() -> Dict[str, str]:
#     """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
#     COMMANDS = []
#     for filename in os.listdir(CLI_DIR):
#         if is_cli_module(filename):
#             subcommand = filename.replace('archivebox_', '').replace('.py', '')
#             module = import_module('.archivebox_{}'.format(subcommand), __package__)
#             assert is_valid_cli_module(module, subcommand)
#             COMMANDS.append((subcommand, module.main.__doc__))
#             globals()[subcommand] = module.main
#     display_order = lambda cmd: (
#         display_first.index(cmd[0])
#         if cmd[0] in display_first else
#         100 + len(cmd[0])
#     )
#     return dict(sorted(COMMANDS, key=display_order))

# just define it statically, it's much faster:
SUBCOMMAND_MODULES = {
    'help': 'archivebox_help',
    'version': 'archivebox_version',

    'init': 'archivebox_init',
    'install': 'archivebox_install',
    ##############################################
    'config': 'archivebox_config',
    'add': 'archivebox_add',
    'remove': 'archivebox_remove',
    'update': 'archivebox_update',
    'list': 'archivebox_list',
    'status': 'archivebox_status',

    'schedule': 'archivebox_schedule',
    'server': 'archivebox_server',
    'shell': 'archivebox_shell',
    'manage': 'archivebox_manage',

    # 'oneshot': 'archivebox_oneshot',
}

# every imported command module must have these properties in order to be valid
required_attrs = ('__package__', '__command__', 'main')

# basic checks to make sure imported files are valid subcommands
is_cli_module = lambda fname: fname.startswith('archivebox_') and fname.endswith('.py')
is_valid_cli_module = lambda module, subcommand: (
    all(hasattr(module, attr) for attr in required_attrs)
    and module.__command__.split(' ')[-1] == subcommand
)

class LazySubcommands(Mapping):
    def keys(self):
        return SUBCOMMAND_MODULES.keys()

    def values(self):
        return [self[key] for key in self.keys()]

    def items(self):
        return [(key, self[key]) for key in self.keys()]

    def __getitem__(self, key):
        module = import_module(f'.{SUBCOMMAND_MODULES[key]}', __package__)
        assert is_valid_cli_module(module, key)
        return module.main

    def __iter__(self):
        return iter(SUBCOMMAND_MODULES.keys())

    def __len__(self):
        return len(SUBCOMMAND_MODULES)

CLI_SUBCOMMANDS = LazySubcommands()
class ArchiveBoxGroup(click.Group):
    """lazy loading click group for archivebox commands"""
    meta_commands = {
        'help': 'archivebox.cli.archivebox_help.main',
        'version': 'archivebox.cli.archivebox_version.main',
    }
    setup_commands = {
        'init': 'archivebox.cli.archivebox_init.main',
        'install': 'archivebox.cli.archivebox_install.main',
    }
    archive_commands = {
        'add': 'archivebox.cli.archivebox_add.main',
        'remove': 'archivebox.cli.archivebox_remove.main',
        'update': 'archivebox.cli.archivebox_update.main',
        'search': 'archivebox.cli.archivebox_search.main',
        'status': 'archivebox.cli.archivebox_status.main',
        'config': 'archivebox.cli.archivebox_config.main',
        'schedule': 'archivebox.cli.archivebox_schedule.main',
        'server': 'archivebox.cli.archivebox_server.main',
        'shell': 'archivebox.cli.archivebox_shell.main',
        'manage': 'archivebox.cli.archivebox_manage.main',
    }
    all_subcommands = {
        **meta_commands,
        **setup_commands,
        **archive_commands,
    }
    renamed_commands = {
        'setup': 'install',
        'list': 'search',
        'import': 'add',
        'archive': 'add',
        'export': 'search',
    }

# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version')                               # dont require valid data folder at all
setup_cmds = ('init', 'setup', 'install')                     # require valid data folder, but dont require DB present in it yet
archive_cmds = ('add', 'remove', 'update', 'list', 'status', 'schedule', 'server', 'shell', 'manage')   # require valid data folder + existing db present
fake_db = ("oneshot",)                                        # use fake in-memory db
    def get_command(self, ctx, cmd_name):
        # handle renamed commands
        if cmd_name in self.renamed_commands:
            new_name = self.renamed_commands[cmd_name]
            print(f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`')
            cmd_name = new_name
            ctx.invoked_subcommand = cmd_name

display_first = (*meta_cmds, *setup_cmds, *archive_cmds)
        # handle lazy loading of commands
        if cmd_name in self.all_subcommands:
            return self._lazy_load(cmd_name)

        # fall-back to using click's default command lookup
        return super().get_command(ctx, cmd_name)

    @classmethod
    def _lazy_load(cls, cmd_name):
        import_path = cls.all_subcommands[cmd_name]
        modname, funcname = import_path.rsplit('.', 1)

        # print(f'LAZY LOADING {import_path}')
        mod = import_module(modname)
        func = getattr(mod, funcname)

        if not hasattr(func, '__doc__'):
            raise ValueError(f'lazy loading of {import_path} failed - no docstring found on method')

        # if not isinstance(cmd, click.BaseCommand):
        #     raise ValueError(f'lazy loading of {import_path} failed - not a click command')

        return func

IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler')  # threads we dont have to wait for before exiting
@click.group(cls=ArchiveBoxGroup, invoke_without_command=True)
@click.option('--help', '-h', is_flag=True, help='Show help')
@click.version_option(version=VERSION, package_name='archivebox', message='%(version)s')
@click.pass_context
def cli(ctx, help=False):
    """ArchiveBox: The self-hosted internet archive"""

    if help or ctx.invoked_subcommand is None:
        ctx.invoke(ctx.command.get_command(ctx, 'help'))

def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int:
    """
    Block until the specified threads exit. e.g. pass thread_names=('default_hook_handler',) to wait for webhooks.
    Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes.
    """

    wait_for_all: bool = thread_names == ()

    thread_matches = lambda thread, ptns: any(ptn in repr(thread) for ptn in ptns)

    should_wait = lambda thread: (
        not thread_matches(thread, ignore_names)
        and (wait_for_all or thread_matches(thread, thread_names)))

    for tries in range(timeout):
        all_threads = [*threading.enumerate()]
        blocking_threads = [*filter(should_wait, all_threads)]
        threads_summary = ', '.join(repr(t) for t in blocking_threads)
        if blocking_threads:
            sleep(1)
            if tries == 5:  # only show stderr message if we need to wait more than 5s
                print(
                    f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...',
                    threads_summary,
                    file=sys.stderr,
                )
        else:
            return tries

    raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')

def run_subcommand(subcommand: str,
                   subcommand_args: List[str] | None = None,
                   stdin: Optional[IO]=None,
                   pwd: Union[Path, str, None]=None) -> None:
    """Run a given ArchiveBox subcommand with the given list of args"""

    subcommand_args = subcommand_args or []

    from archivebox.misc.checks import check_migrations
    if ctx.invoked_subcommand in ArchiveBoxGroup.archive_commands:
        # print('SETUP DJANGO AND CHECK DATA FOLDER')
        from archivebox.config.django import setup_django
        from archivebox.misc.checks import check_data_folder
        setup_django()
        check_data_folder()

    # print('DATA_DIR is', DATA_DIR)
    # print('pwd is', os.getcwd())

    cmd_requires_db = (subcommand in archive_cmds)
    init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args

    check_db = cmd_requires_db and not init_pending

    setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)

    for ignore_pattern in ('help', '-h', '--help', 'version', '--version'):
        if ignore_pattern in sys.argv[:4]:
            cmd_requires_db = False
            break

    if subcommand in archive_cmds:
        if cmd_requires_db:
            check_migrations()

    module = import_module('.archivebox_{}'.format(subcommand), __package__)
    module.main(args=subcommand_args, stdin=stdin, pwd=pwd)  # type: ignore

    # wait for webhooks, signals, and other background jobs to finish before exit
    wait_for_bg_threads_to_exit(timeout=60)

class NotProvided:
    def __len__(self):
        return 0
    def __bool__(self):
        return False
    def __repr__(self):
        return '<not provided>'

Omitted = Union[None, NotProvided]

OMITTED = NotProvided()

def main(args: List[str] | Omitted=OMITTED, stdin: IO | Omitted=OMITTED, pwd: str | None=None) -> None:
    # print('STARTING CLI MAIN ENTRYPOINT')

    args = sys.argv[1:] if args is OMITTED else args
    stdin = sys.stdin if stdin is OMITTED else stdin

    parser = argparse.ArgumentParser(
        prog=__command__,
        description='ArchiveBox: The self-hosted internet archive',
        add_help=False,
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--help', '-h',
        action='store_true',
        help=CLI_SUBCOMMANDS['help'].__doc__,
    )
    group.add_argument(
        '--version',
        action='store_true',
        help=CLI_SUBCOMMANDS['version'].__doc__,
    )
    group.add_argument(
        "subcommand",
        type=str,
        help="The name of the subcommand to run",
        nargs='?',
        choices=CLI_SUBCOMMANDS.keys(),
        default=None,
    )
    parser.add_argument(
        "subcommand_args",
        help="Arguments for the subcommand",
        nargs=argparse.REMAINDER,
    )
    command = parser.parse_args(args or ())

    if command.version:
        command.subcommand = 'version'
    elif command.help or command.subcommand is None:
        command.subcommand = 'help'

    if command.subcommand not in ('version',):
        from archivebox.misc.logging_util import log_cli_command

        log_cli_command(
            subcommand=command.subcommand,
            subcommand_args=command.subcommand_args,
            stdin=stdin or None,
        )
def main(args=None, prog_name=None):
    # show `docker run archivebox xyz` in help messages if running in docker
    IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
    prog_name = prog_name or ('docker compose run archivebox' if IN_DOCKER else 'archivebox')

    try:
        run_subcommand(
            subcommand=command.subcommand,
            subcommand_args=command.subcommand_args,
            stdin=stdin or None,
        )
        cli(args=args, prog_name=prog_name)
    except KeyboardInterrupt:
        print('\n\n[red][X] Got CTRL+C. Exiting...[/red]')

if __name__ == '__main__':
    main()

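The entrypoint above keeps the old idea of blocking until background jobs (webhooks, signal handlers) have finished before the process exits. Below is a minimal standalone sketch of that thread-waiting technique; the names IGNORED and wait_for_threads are made up here and this is not the project's exact implementation:

    import sys
    import threading
    from time import sleep

    # assumed set of threads that never need waiting on
    IGNORED = ('MainThread', 'ThreadPoolExecutor')

    def wait_for_threads(timeout: int = 60) -> None:
        # poll once per second until only ignored threads remain
        pending = []
        for _ in range(timeout):
            pending = [t for t in threading.enumerate()
                       if not any(name in repr(t) for name in IGNORED)]
            if not pending:
                return
            sleep(1)
        print(f'Still waiting on background threads: {pending}', file=sys.stderr)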
@@ -4,10 +4,10 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox add'

import sys
import argparse

from typing import IO, TYPE_CHECKING
from typing import TYPE_CHECKING

import rich_click as click

from django.utils import timezone
from django.db.models import QuerySet

@@ -18,7 +18,6 @@ from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.django import setup_django
from archivebox.config.permissions import USER, HOSTNAME
from archivebox.misc.checks import check_data_folder
from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr
from archivebox.parsers import PARSERS

@@ -29,22 +28,142 @@ if TYPE_CHECKING:
ORCHESTRATOR = None

# OLD VERSION:
# def add(urls: Union[str, List[str]],
#         tag: str='',
#         depth: int=0,
#         update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
#         update_all: bool=False,
#         index_only: bool=False,
#         overwrite: bool=False,
#         # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
#         init: bool=False,
#         extractors: str="",
#         parser: str="auto",
#         created_by_id: int | None=None,
#         out_dir: Path=DATA_DIR) -> List[Link]:
#     """Add a new URL or list of URLs to your archive"""

#     from core.models import Snapshot, Tag
#     # from workers.supervisord_util import start_cli_workers, tail_worker_logs
#     # from workers.tasks import bg_archive_link

#     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

#     extractors = extractors.split(",") if extractors else []

#     if init:
#         run_subcommand('init', stdin=None, pwd=out_dir)

#     # Load list of links from the existing index
#     check_data_folder()

#     # worker = start_cli_workers()

#     new_links: List[Link] = []
#     all_links = load_main_index(out_dir=out_dir)

#     log_importing_started(urls=urls, depth=depth, index_only=index_only)
#     if isinstance(urls, str):
#         # save verbatim stdin to sources
#         write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
#     elif isinstance(urls, list):
#         # save verbatim args to sources
#         write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)

#     new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)

#     # If we're going one level deeper, download each link and look for more links
#     new_links_depth = []
#     if new_links and depth == 1:
#         log_crawl_started(new_links)
#         for new_link in new_links:
#             try:
#                 downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
#                 new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
#             except Exception as err:
#                 stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')

#     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())

#     new_links = dedupe_links(all_links, imported_links)

#     write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
#     all_links = load_main_index(out_dir=out_dir)

#     tags = [
#         Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
#         for name in tag.split(',')
#         if name.strip()
#     ]
#     if tags:
#         for link in imported_links:
#             snapshot = Snapshot.objects.get(url=link.url)
#             snapshot.tags.add(*tags)
#             snapshot.tags_str(nocache=True)
#             snapshot.save()
#         # print(f'    √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')

#     if index_only:
#         # mock archive all the links using the fake index_only extractor method in order to update their state
#         if overwrite:
#             archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
#         else:
#             archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
#     else:
#         # fully run the archive extractor methods for each link
#         archive_kwargs = {
#             "out_dir": out_dir,
#             "created_by_id": created_by_id,
#         }
#         if extractors:
#             archive_kwargs["methods"] = extractors

#         stderr()

#         ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

#         if update:
#             stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
#             archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
#         elif update_all:
#             stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
#             archive_links(all_links, overwrite=overwrite, **archive_kwargs)
#         elif overwrite:
#             stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
#             archive_links(imported_links, overwrite=True, **archive_kwargs)
#         elif new_links:
#             stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
#             archive_links(new_links, overwrite=False, **archive_kwargs)

#     # tail_worker_logs(worker['stdout_logfile'])

#     # if CAN_UPGRADE:
#     #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")

#     return new_links

def add(urls: str | list[str],
        tag: str='',
        depth: int=0,
        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
        update_all: bool=False,
        index_only: bool=False,
        overwrite: bool=False,
        extractors: str="",
        tag: str='',
        parser: str="auto",
        extract: str="",
        persona: str='Default',
        overwrite: bool=False,
        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
        index_only: bool=False,
        bg: bool=False,
        created_by_id: int | None=None) -> QuerySet['Snapshot']:
    """Add a new URL or list of URLs to your archive"""

    global ORCHESTRATOR

    depth = int(depth)

    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

    # 0. setup abx, django, check_data_folder

@@ -56,7 +175,6 @@ def add(urls: str | list[str],
    from archivebox.base_models.models import get_or_create_system_user_pk

    created_by_id = created_by_id or get_or_create_system_user_pk()

    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt

@@ -72,7 +190,7 @@ def add(urls: str | list[str],
        'ONLY_NEW': not update,
        'INDEX_ONLY': index_only,
        'OVERWRITE': overwrite,
        'EXTRACTORS': extractors,
        'EXTRACTORS': extract,
        'DEFAULT_PERSONA': persona or 'Default',
    })
    # 3. create a new Crawl pointing to the Seed

@@ -91,118 +209,23 @@ def add(urls: str | list[str],
    return crawl.snapshot_set.all()

def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=None) -> None:
@click.command()
@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
@click.argument('urls', nargs=-1, type=click.Path())
def main(**kwargs):
    """Add a new URL or list of URLs to your archive"""
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=add.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    parser.add_argument(
        '--tag', '-t',
        type=str,
        default='',
        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
    )
    parser.add_argument(
        '--update', #'-u',
        action='store_true',
        default=not ARCHIVING_CONFIG.ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
        help="Also retry previously skipped/failed links when adding new links",
    )
    parser.add_argument(
        '--update-all', #'-n',
        action='store_true',
        default=False,
        help="Also update ALL links in index when finished adding new links",
    )
    parser.add_argument(
        '--index-only', #'-o',
        action='store_true',
        help="Add the links to the main index without archiving them",
    )
    parser.add_argument(
        'urls',
        nargs='*',
        type=str,
        default=None,
        help=(
            'URLs or paths to archive e.g.:\n'
            '    https://getpocket.com/users/USERNAME/feed/all\n'
            '    https://example.com/some/rss/feed.xml\n'
            '    https://example.com\n'
            '    ~/Downloads/firefox_bookmarks_export.html\n'
            '    ~/Desktop/sites_list.csv\n'
        )
    )
    parser.add_argument(
        "--depth",
        action="store",
        default=0,
        choices=[0, 1],
        type=int,
        help="Recursively archive all linked pages up to this many hops away"
    )
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Re-archive URLs from scratch, overwriting any existing files"
    )
    parser.add_argument(
        "--extract", '-e',
        type=str,
        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
             This does not take precedence over the configuration",
        default=""
    )
    parser.add_argument(
        "--parser",
        type=str,
        help="Parser used to read inputted URLs.",
        default="auto",
        choices=["auto", *PARSERS.keys()],
    )
    parser.add_argument(
        "--persona",
        type=str,
        help="Name of accounts persona to use when archiving.",
        default="Default",
    )
    parser.add_argument(
        "--bg",
        default=False,
        action="store_true",
        help="Enqueue a background worker to complete the crawl instead of running it immediately",
    )
    command = parser.parse_args(args or ())
    urls = command.urls

    stdin_urls = ''
    if not urls:
        stdin_urls = accept_stdin(stdin)

    if (stdin_urls and urls) or (not stdin and not urls):
        stderr(
            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
            color='red',
        )
        raise SystemExit(2)
    add(
        urls=stdin_urls or urls,
        depth=command.depth,
        tag=command.tag,
        update=command.update,
        update_all=command.update_all,
        index_only=command.index_only,
        overwrite=command.overwrite,
        extractors=command.extract,
        parser=command.parser,
        persona=command.persona,
        bg=command.bg,
    )
    add(**kwargs)

if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
    main()

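This file shows the general shape of the argparse-to-click migration used across the commit: each `parser.add_argument(...)` block becomes a `@click.option(...)` decorator and the parsed namespace becomes `**kwargs` forwarded straight to the underlying function. A rough, hypothetical sketch of that pattern (the names do_add and demo_add are illustrative, not ArchiveBox APIs):

    import rich_click as click

    def do_add(urls, depth='0', tag=''):
        """Stand-in for the function that would do the actual work."""
        print(urls, depth, tag)

    @click.command()
    @click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0')
    @click.option('--tag', '-t', default='')
    @click.argument('urls', nargs=-1)
    def demo_add(**kwargs):
        """CLI wrapper that just forwards parsed options to the function."""
        do_add(**kwargs)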
@@ -12,7 +12,130 @@ from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
from ..main import config

# @enforce_types
def config(config_options_str: Optional[str]=None,
           config_options: Optional[List[str]]=None,
           get: bool=False,
           set: bool=False,
           search: bool=False,
           reset: bool=False,
           out_dir: Path=DATA_DIR) -> None:
    """Get and set your ArchiveBox project configuration values"""

    from rich import print

    check_data_folder()
    if config_options and config_options_str:
        stderr(
            '[X] You should either pass config values as an arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif config_options_str:
        config_options = config_options_str.split('\n')

    FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
    CONFIGS = archivebox.pm.hook.get_CONFIGS()

    config_options = config_options or []

    no_args = not (get or set or reset or config_options)

    matching_config = {}
    if search:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
            for config_section in CONFIGS.values():
                aliases = config_section.aliases

                for search_key in config_options:
                    # search all aliases in the section
                    for alias_key, key in aliases.items():
                        if search_key.lower() in alias_key.lower():
                            matching_config[key] = config_section.model_dump()[key]

                    # search all keys and values in the section
                    for existing_key, value in config_section.model_dump().items():
                        if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower():
                            matching_config[existing_key] = value

        print(printable_config(matching_config))
        raise SystemExit(not matching_config)
    elif get or no_args:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
            failed_config = [key for key in config_options if key not in FLAT_CONFIG]
            if failed_config:
                stderr()
                stderr('[X] These options failed to get', color='red')
                stderr('    {}'.format('\n    '.join(config_options)))
                raise SystemExit(1)
        else:
            matching_config = FLAT_CONFIG

        print(printable_config(matching_config))
        raise SystemExit(not matching_config)
    elif set:
        new_config = {}
        failed_options = []
        for line in config_options:
            if line.startswith('#') or not line.strip():
                continue
            if '=' not in line:
                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
                stderr(f'    {line}')
                raise SystemExit(2)

            raw_key, val = line.split('=', 1)
            raw_key = raw_key.upper().strip()
            key = get_real_name(raw_key)
            if key != raw_key:
                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')

            if key in FLAT_CONFIG:
                new_config[key] = val.strip()
            else:
                failed_options.append(line)

        if new_config:
            before = FLAT_CONFIG
            matching_config = write_config_file(new_config)
            after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
            print(printable_config(matching_config))

            side_effect_changes = {}
            for key, val in after.items():
                if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
                    side_effect_changes[key] = after[key]
            # import ipdb; ipdb.set_trace()

            if side_effect_changes:
                stderr()
                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
                print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))
        if failed_options:
            stderr()
            stderr('[X] These options failed to set (check for typos):', color='red')
            stderr('    {}'.format('\n    '.join(failed_options)))
            raise SystemExit(1)
    elif reset:
        stderr('[X] This command is not implemented yet.', color='red')
        stderr('    Please manually remove the relevant lines from your config file:')
        raise SystemExit(2)
    else:
        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
        stderr('        archivebox config')
        stderr('        archivebox config --get SOME_KEY')
        stderr('        archivebox config --set SOME_KEY=SOME_VALUE')
        raise SystemExit(2)

@docstring(config.__doc__)

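The `--set` branch above accepts KEY=VALUE pairs, skips comments and blank lines, normalizes the key name, and rejects malformed input. A simplified standalone sketch of that parsing step, assuming a made-up KNOWN_KEYS set in place of the real FLAT_CONFIG lookup and alias handling:

    # hypothetical subset of recognized config keys
    KNOWN_KEYS = {'ONLY_NEW', 'TIMEOUT'}

    def parse_config_lines(lines: list[str]) -> dict[str, str]:
        new_config = {}
        for line in lines:
            if line.startswith('#') or not line.strip():
                continue                                        # skip comments and blanks
            if '=' not in line:
                raise ValueError(f'expected KEY=VALUE, got: {line!r}')
            key, val = line.split('=', 1)
            key = key.upper().strip()                           # normalize the key name
            if key in KNOWN_KEYS:
                new_config[key] = val.strip()
        return new_config

    # example: parse_config_lines(['ONLY_NEW=True', '# comment', 'TIMEOUT=60'])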
@@ -1,32 +1,105 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox help'

import sys
import argparse
import os
from pathlib import Path
from typing import Optional, List, IO

from archivebox.misc.util import docstring
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from archivebox.config import DATA_DIR
from ..main import help
import click
from rich import print
from rich.panel import Panel

@docstring(help.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=help.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
def help() -> None:
    """Print the ArchiveBox help message and usage"""

    from archivebox.cli import ArchiveBoxGroup
    from archivebox.config import CONSTANTS
    from archivebox.config.permissions import IN_DOCKER
    from archivebox.misc.logging_util import log_cli_command

    log_cli_command('help', [], None, '.')

    COMMANDS_HELP_TEXT = '\n    '.join(
        f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
        for cmd in ArchiveBoxGroup.meta_commands.keys()
    ) + '\n\n    ' + '\n    '.join(
        f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
        for cmd in ArchiveBoxGroup.setup_commands.keys()
    ) + '\n\n    ' + '\n    '.join(
        f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
        for cmd in ArchiveBoxGroup.archive_commands.keys()
    )
    parser.parse_args(args or ())
    reject_stdin(__command__, stdin)

    help(out_dir=Path(pwd) if pwd else DATA_DIR)
    DOCKER_USAGE = '''
[dodger_blue3]Docker Usage:[/dodger_blue3]
    [grey53]# using Docker Compose:[/grey53]
    [blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]

    [grey53]# using Docker:[/grey53]
    [blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
''' if IN_DOCKER else ''
    DOCKER_DOCS = '\n    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
    DOCKER_OUTSIDE_HINT = "\n    [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
    DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''

    print(f'''{DOCKER_USAGE}
[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
    [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]

[deep_sky_blue4]Commands:[/deep_sky_blue4]
    {COMMANDS_HELP_TEXT}

[deep_sky_blue4]Documentation:[/deep_sky_blue4]
    [link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS}
    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link]
    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link]
''')

    if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir():
        pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path('~').expanduser()), '~')
        EXAMPLE_USAGE = f'''
[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]

[violet]Hint:[/violet] [i]Common maintenance tasks:[/i]
    [dark_green]archivebox[/dark_green] [green]init[/green]      [grey53]# make sure database is up-to-date (safe to run multiple times)[/grey53]
    [dark_green]archivebox[/dark_green] [green]install[/green]   [grey53]# make sure plugins are up-to-date (wget, chrome, singlefile, etc.)[/grey53]
    [dark_green]archivebox[/dark_green] [green]status[/green]    [grey53]# get a health checkup report on your collection[/grey53]
    [dark_green]archivebox[/dark_green] [green]update[/green]    [grey53]# retry any previously failed or interrupted archiving tasks[/grey53]

[violet]Hint:[/violet] [i]More example usage:[/i]
    [dark_green]archivebox[/dark_green] [green]add[/green] --depth=1 "https://example.com/some/page"
    [dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title
    [dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss"
    [dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue]             [grey53]# Start the Web UI / API server[/grey53]
'''
        print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
    else:
        DATA_SETUP_HELP = '\n'
        if IN_DOCKER:
            DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
            DATA_SETUP_HELP += '    docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
        DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
        DATA_SETUP_HELP += '    1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
        DATA_SETUP_HELP += f'    2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n'
        DATA_SETUP_HELP += f'    3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
        DATA_SETUP_HELP += f'    4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n'
        DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n'
        DATA_SETUP_HELP += '    1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
        DATA_SETUP_HELP += '    2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n'
        DATA_SETUP_HELP += f'    3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n'
        DATA_SETUP_HELP += f'    4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
        DATA_SETUP_HELP += f'    5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n'
        print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))

@click.command()
@click.option('--help', '-h', is_flag=True, help='Show help')
def main(**kwargs):
    """Print the ArchiveBox help message and usage"""
    return help()

if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
    main()

@@ -5,13 +5,193 @@ __command__ = 'archivebox init'

import sys
import argparse

from pathlib import Path
from typing import Optional, List, IO

from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import init

def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Path=DATA_DIR) -> None:
    """Initialize a new ArchiveBox collection in the current directory"""

    from core.models import Snapshot
    from rich import print

    # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
    #     print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
    #     print("[red]    You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)

    is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_DATA_DIR)
    existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE)
    if is_empty and not existing_index:
        print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
        print('[green]----------------------------------------------------------------------[/green]')
    elif existing_index:
        # TODO: properly detect and print the existing version in current index as well
        print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]')
        print('[green]----------------------------------------------------------------------[/green]')
    else:
        if force:
            print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
            print('[red]    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
        else:
            print(
                ("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
                "    You must run init in a completely empty directory, or an existing data folder.\n\n"
                "    [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n"
                "    then run and run 'archivebox init' to pick up where you left off.\n\n"
                "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
                )
            )
            raise SystemExit(2)

    if existing_index:
        print('\n[green][*] Verifying archive folder structure...[/green]')
    else:
        print('\n[green][+] Building archive folder structure...[/green]')

    print(f'    + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
    Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
    Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)

    print(f'    + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')

    # create the .archivebox_id file with a unique ID for this collection
    from archivebox.config.paths import _get_collection_id
    _get_collection_id(CONSTANTS.DATA_DIR, force_create=True)

    # create the ArchiveBox.conf file
    write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})

    if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
        print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
    else:
        print('\n[green][+] Building main SQL index and running initial migrations...[/green]')

    for migration_line in apply_migrations(out_dir):
        sys.stdout.write(f'    {migration_line}\n')

    assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
    print()
    print(f'    √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')

    # from django.contrib.auth.models import User
    # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
    #     print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
    #     call_command("createsuperuser", interactive=True)

    print()
    print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')

    all_links = Snapshot.objects.none()
    pending_links: Dict[str, Link] = {}

    if existing_index:
        all_links = load_main_index(out_dir=out_dir, warn=False)
        print(f'    √ Loaded {all_links.count()} links from existing main index.')

    if quick:
        print('    > Skipping full snapshot directory check (quick mode)')
    else:
        try:
            # Links in data folders that dont match their timestamp
            fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
            if fixed:
                print(f'    [yellow]√ Fixed {len(fixed)} data directory locations that didn\'t match their link timestamps.[/yellow]')
            if cant_fix:
                print(f'    [red]! Could not fix {len(cant_fix)} data directory locations due to conflicts with existing folders.[/red]')

            # Links in JSON index but not in main index
            orphaned_json_links = {
                link.url: link
                for link in parse_json_main_index(out_dir)
                if not all_links.filter(url=link.url).exists()
            }
            if orphaned_json_links:
                pending_links.update(orphaned_json_links)
                print(f'    [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')

            # Links in data dir indexes but not in main index
            orphaned_data_dir_links = {
                link.url: link
                for link in parse_json_links_details(out_dir)
                if not all_links.filter(url=link.url).exists()
            }
            if orphaned_data_dir_links:
                pending_links.update(orphaned_data_dir_links)
                print(f'    [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')

            # Links in invalid/duplicate data dirs
            invalid_folders = {
                folder: link
                for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
            }
            if invalid_folders:
                print(f'    [red]! Skipped adding {len(invalid_folders)} invalid link data directories.[/red]')
                print('        X ' + '\n        X '.join(f'./{Path(folder).relative_to(DATA_DIR)} {link}' for folder, link in invalid_folders.items()))
                print()
                print('        [violet]Hint:[/violet] For more information about the link data directories that were skipped, run:')
                print('            archivebox status')
                print('            archivebox list --status=invalid')

        except (KeyboardInterrupt, SystemExit):
            print(file=sys.stderr)
            print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr)
            print('    Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr)
            print(file=sys.stderr)
            print('    [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
            print('        archivebox init --quick', file=sys.stderr)
            raise SystemExit(1)

        write_main_index(list(pending_links.values()), out_dir=out_dir)

    print('\n[green]----------------------------------------------------------------------[/green]')

    from django.contrib.auth.models import User

    if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
        print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]')
        User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)

    if existing_index:
        print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]')
    else:
        print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]')

    json_index = out_dir / CONSTANTS.JSON_INDEX_FILENAME
    html_index = out_dir / CONSTANTS.HTML_INDEX_FILENAME
    index_name = f"{date.today()}_index_old"
    if os.access(json_index, os.F_OK):
        json_index.rename(f"{index_name}.json")
    if os.access(html_index, os.F_OK):
        html_index.rename(f"{index_name}.html")

    CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
    CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
    CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)

    from archivebox.config.common import STORAGE_CONFIG
    STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
    STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)

    if install:
        run_subcommand('install', pwd=out_dir)

    if Snapshot.objects.count() < 25:  # hide the hints for experienced users
        print()
        print('    [violet]Hint:[/violet] To view your archive index, run:')
        print('        archivebox server  # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]')
        print()
        print('    To add new links, you can run:')
        print("        archivebox add < ~/some/path/to/list_of_links.txt")
        print()
        print('    For more usage and examples, run:')
        print('        archivebox help')

@docstring(init.__doc__)

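Several of these command files wrap a plain function with a click `main()` and then copy the function's docstring onto it via the `@docstring(...)` decorator, so `--help` shows the same text as the function. The decorator's implementation is not shown in this diff; a plausible minimal version might look like the sketch below (an assumption, not the actual archivebox.misc.util code):

    def docstring(text):
        """Return a decorator that sets the wrapped function's __doc__ to `text`."""
        def decorator(func):
            func.__doc__ = text
            return func
        return decorator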
@ -3,6 +3,7 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox install'

import os
import sys
import argparse
from pathlib import Path

@ -11,11 +12,145 @@ from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import install


def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, binaries: Optional[List[str]]=None, dry_run: bool=False) -> None:
    """Automatically install all ArchiveBox dependencies and extras"""

    # if running as root:
    #    - run init to create index + lib dir
    #    - chown -R 911 DATA_DIR
    #    - install all binaries as root
    #    - chown -R 911 LIB_DIR
    # else:
    #    - run init to create index + lib dir as current user
    #    - install all binaries as current user
    #    - recommend user re-run with sudo if any deps need to be installed as root

    from rich import print

    from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
    from archivebox.config.paths import get_or_create_working_lib_dir

    if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
        run_subcommand('init', stdin=None, pwd=out_dir)  # must init full index because we need a db to store InstalledBinary entries in

    print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')

    # we never want the data dir to be owned by root; detect the owner of the existing DATA_DIR to guess the desired non-root UID
    if IS_ROOT:
        EUID = os.geteuid()

        # if we have sudo/root permissions, take advantage of them just while installing dependencies
        print()
        print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue] with [red]sudo[/red] only for dependencies that need it.[/yellow]')
        print(f'    DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
        print()

    LIB_DIR = get_or_create_working_lib_dir()

    package_manager_names = ', '.join(
        f'[yellow]{binprovider.name}[/yellow]'
        for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
        if not binproviders or (binproviders and binprovider.name in binproviders)
    )
    print(f'[+] Setting up package managers {package_manager_names}...')
    for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
        if binproviders and binprovider.name not in binproviders:
            continue
        try:
            binprovider.setup()
        except Exception:
            # it's ok, installing binaries below will automatically set up package managers as needed
            # e.g. if user does not have npm available we cannot set it up here yet, but once npm Binary is installed
            # the next package that depends on npm will automatically call binprovider.setup() during its own install
            pass

    print()

    for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
        if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
            # obviously must already be installed if we are running
            continue

        if binaries and binary.name not in binaries:
            continue

        providers = ' [grey53]or[/grey53] '.join(
            provider.name for provider in binary.binproviders_supported
            if not binproviders or (binproviders and provider.name in binproviders)
        )
        if not providers:
            continue
        print(f'[+] Detecting / Installing [yellow]{binary.name.ljust(22)}[/yellow] using [red]{providers}[/red]...')
        try:
            with SudoPermission(uid=0, fallback=True):
                # print(binary.load_or_install(fresh=True).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}))
                if binproviders:
                    providers_supported_by_binary = [provider.name for provider in binary.binproviders_supported]
                    for binprovider_name in binproviders:
                        if binprovider_name not in providers_supported_by_binary:
                            continue
                        try:
                            if dry_run:
                                # always show install commands when doing a dry run
                                sys.stderr.write("\033[2;49;90m")  # grey53
                                result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
                                sys.stderr.write("\033[00m\n")  # reset
                            else:
                                loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
                                result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
                            if result and result['loaded_version']:
                                break
                        except Exception as e:
                            print(f'[red]:cross_mark: Failed to install {binary.name} using {binprovider_name} as user {ARCHIVEBOX_USER}: {e}[/red]')
                else:
                    if dry_run:
                        sys.stderr.write("\033[2;49;90m")  # grey53
                        binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
                        sys.stderr.write("\033[00m\n")  # reset
                    else:
                        loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
                        result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
            if IS_ROOT and LIB_DIR:
                with SudoPermission(uid=0):
                    if ARCHIVEBOX_USER == 0:
                        os.system(f'chmod -R 777 "{LIB_DIR.resolve()}"')
                    else:
                        os.system(f'chown -R {ARCHIVEBOX_USER} "{LIB_DIR.resolve()}"')
        except Exception as e:
            print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
            if binaries and len(binaries) == 1:
                # if we are only installing a single binary, raise the exception so the user can see what went wrong
                raise

    from django.contrib.auth import get_user_model
    User = get_user_model()

    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
        stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
        stderr('    archivebox manage createsuperuser')
        # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)

    print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)

    from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY

    extra_args = []
    if binproviders:
        extra_args.append(f'--binproviders={",".join(binproviders)}')
    if binaries:
        extra_args.append(f'--binaries={",".join(binaries)}')

    proc = run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version', *extra_args], capture_output=False, cwd=out_dir)
    raise SystemExit(proc.returncode)


@docstring(install.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=install.__doc__,
|
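For reference, a hedged sketch of driving the new install() helper directly from Python rather than via the CLI. The module path archivebox.cli.archivebox_install and the provider names are assumptions inferred from the __command__ header and the --binproviders help text elsewhere in this commit; it assumes the working directory is an already-initialized collection DATA_DIR.

# Hypothetical usage sketch (not part of this diff): dry-run install of two binaries.
from pathlib import Path
from archivebox.cli.archivebox_install import install   # assumed module path

install(
    out_dir=Path('.').resolve(),          # current dir should be an initialized collection
    binproviders=['apt', 'sys_pip'],      # provider names taken from the --binproviders help text
    binaries=['wget', 'curl'],            # restrict to two binaries instead of installing everything
    dry_run=True,                         # print the install commands instead of running them
)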
@ -1,139 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox list'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.misc.util import docstring
|
||||
from archivebox.misc.logging_util import SmartFormatter, reject_stdin, stderr
|
||||
from ..main import list_all
|
||||
from ..index import (
|
||||
LINK_FILTERS,
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_unarchived_folders,
|
||||
get_present_folders,
|
||||
get_valid_folders,
|
||||
get_invalid_folders,
|
||||
get_duplicate_folders,
|
||||
get_orphaned_folders,
|
||||
get_corrupted_folders,
|
||||
get_unrecognized_folders,
|
||||
)
|
||||
|
||||
|
||||
@docstring(list_all.__doc__)
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=list_all.__doc__,
|
||||
add_help=True,
|
||||
formatter_class=SmartFormatter,
|
||||
)
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
group.add_argument(
|
||||
'--csv', #'-c',
|
||||
type=str,
|
||||
help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
|
||||
default=None,
|
||||
)
|
||||
group.add_argument(
|
||||
'--json', #'-j',
|
||||
action='store_true',
|
||||
help="Print the output in JSON format with all columns included",
|
||||
)
|
||||
group.add_argument(
|
||||
'--html',
|
||||
action='store_true',
|
||||
help="Print the output in HTML format"
|
||||
)
|
||||
parser.add_argument(
|
||||
'--with-headers',
|
||||
action='store_true',
|
||||
help='Include the headers in the output document'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--sort', #'-s',
|
||||
type=str,
|
||||
help="List the links sorted using the given key, e.g. timestamp or updated",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--before', #'-b',
|
||||
type=float,
|
||||
help="List only links bookmarked before (less than) the given timestamp",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--after', #'-a',
|
||||
type=float,
|
||||
help="List only links bookmarked after (greater than or equal to) the given timestamp",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--status',
|
||||
type=str,
|
||||
choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
|
||||
default='indexed',
|
||||
help=(
|
||||
'List only links or data directories that have the given status\n'
|
||||
f' indexed {get_indexed_folders.__doc__} (the default)\n'
|
||||
f' archived {get_archived_folders.__doc__}\n'
|
||||
f' unarchived {get_unarchived_folders.__doc__}\n'
|
||||
'\n'
|
||||
f' present {get_present_folders.__doc__}\n'
|
||||
f' valid {get_valid_folders.__doc__}\n'
|
||||
f' invalid {get_invalid_folders.__doc__}\n'
|
||||
'\n'
|
||||
f' duplicate {get_duplicate_folders.__doc__}\n'
|
||||
f' orphaned {get_orphaned_folders.__doc__}\n'
|
||||
f' corrupted {get_corrupted_folders.__doc__}\n'
|
||||
f' unrecognized {get_unrecognized_folders.__doc__}\n'
|
||||
)
|
||||
)
|
||||
parser.add_argument(
|
||||
'--filter-type', '-t',
|
||||
type=str,
|
||||
choices=(*LINK_FILTERS.keys(), 'search'),
|
||||
default='exact',
|
||||
help='Type of pattern matching to use when filtering URLs',
|
||||
)
|
||||
parser.add_argument(
|
||||
'filter_patterns',
|
||||
nargs='*',
|
||||
type=str,
|
||||
default=None,
|
||||
help='List only URLs matching these filter patterns'
|
||||
)
|
||||
command = parser.parse_args(args or ())
|
||||
reject_stdin(stdin)
|
||||
|
||||
if command.with_headers and not (command.json or command.html or command.csv):
|
||||
stderr(
|
||||
'[X] --with-headers can only be used with --json, --html or --csv options\n',
|
||||
color='red',
|
||||
)
|
||||
raise SystemExit(2)
|
||||
|
||||
matching_folders = list_all(
|
||||
filter_patterns=command.filter_patterns,
|
||||
filter_type=command.filter_type,
|
||||
status=command.status,
|
||||
after=command.after,
|
||||
before=command.before,
|
||||
sort=command.sort,
|
||||
csv=command.csv,
|
||||
json=command.json,
|
||||
html=command.html,
|
||||
with_headers=command.with_headers,
|
||||
out_dir=Path(pwd) if pwd else DATA_DIR,
|
||||
)
|
||||
raise SystemExit(not matching_folders)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
|
@ -9,7 +9,27 @@ from typing import Optional, List, IO
|
|||
|
||||
from archivebox.misc.util import docstring
|
||||
from archivebox.config import DATA_DIR
|
||||
from ..main import manage
|
||||
|
||||
|
||||
|
||||
# @enforce_types
|
||||
def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
|
||||
"""Run an ArchiveBox Django management command"""
|
||||
|
||||
check_data_folder()
|
||||
from django.core.management import execute_from_command_line
|
||||
|
||||
if (args and "createsuperuser" in args) and (IN_DOCKER and not SHELL_CONFIG.IS_TTY):
|
||||
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
|
||||
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
|
||||
stderr('')
|
||||
|
||||
# import ipdb; ipdb.set_trace()
|
||||
|
||||
execute_from_command_line(['manage.py', *(args or ['help'])])
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@docstring(manage.__doc__)
|
||||
|
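A minimal sketch of what the relocated manage() wrapper above amounts to; the module path archivebox.cli.archivebox_manage is an assumption based on the surrounding file headers.

# Hypothetical usage sketch (not part of this diff): manage() forwards its args to Django.
from archivebox.cli.archivebox_manage import manage   # assumed module path

manage(args=['migrate'])   # equivalent to: archivebox manage migrate  ->  manage.py migrate
manage(args=None)          # falls back to: manage.py help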
|
|
@ -1,73 +1,98 @@
|
|||
#!/usr/bin/env python3
|
||||
# #!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox oneshot'
|
||||
################## DEPRECATED IN FAVOR OF abx-dl #####################
|
||||
# https://github.com/ArchiveBox/abx-dl
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
# __package__ = 'archivebox.cli'
|
||||
# __command__ = 'archivebox oneshot'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, IO
|
||||
# import sys
|
||||
# import argparse
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr
|
||||
from ..main import oneshot
|
||||
# from pathlib import Path
|
||||
# from typing import List, Optional, IO
|
||||
|
||||
# from archivebox.misc.util import docstring
|
||||
# from archivebox.config import DATA_DIR
|
||||
# from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr
|
||||
|
||||
|
||||
@docstring(oneshot.__doc__)
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=oneshot.__doc__,
|
||||
add_help=True,
|
||||
formatter_class=SmartFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
'url',
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'URLs or paths to archive e.g.:\n'
|
||||
' https://getpocket.com/users/USERNAME/feed/all\n'
|
||||
' https://example.com/some/rss/feed.xml\n'
|
||||
' https://example.com\n'
|
||||
' ~/Downloads/firefox_bookmarks_export.html\n'
|
||||
' ~/Desktop/sites_list.csv\n'
|
||||
)
|
||||
)
|
||||
parser.add_argument(
|
||||
"--extract",
|
||||
type=str,
|
||||
help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
|
||||
This does not take precedence over the configuration",
|
||||
default=""
|
||||
)
|
||||
parser.add_argument(
|
||||
'--out-dir',
|
||||
type=str,
|
||||
default=DATA_DIR,
|
||||
help= "Path to save the single archive folder to, e.g. ./example.com_archive"
|
||||
)
|
||||
command = parser.parse_args(args or ())
|
||||
stdin_url = None
|
||||
url = command.url
|
||||
if not url:
|
||||
stdin_url = accept_stdin(stdin)
|
||||
# @enforce_types
|
||||
# def oneshot(url: str, extractors: str="", out_dir: Path=DATA_DIR, created_by_id: int | None=None) -> List[Link]:
|
||||
# """
|
||||
# Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
|
||||
# You can run this to archive single pages without needing to create a whole collection with archivebox init.
|
||||
# """
|
||||
# oneshot_link, _ = parse_links_memory([url])
|
||||
# if len(oneshot_link) > 1:
|
||||
# stderr(
|
||||
# '[X] You should pass a single url to the oneshot command',
|
||||
# color='red'
|
||||
# )
|
||||
# raise SystemExit(2)
|
||||
|
||||
if (stdin_url and url) or (not stdin and not url):
|
||||
stderr(
|
||||
'[X] You must pass a URL/path to add via stdin or CLI arguments.\n',
|
||||
color='red',
|
||||
)
|
||||
raise SystemExit(2)
|
||||
|
||||
oneshot(
|
||||
url=stdin_url or url,
|
||||
out_dir=Path(command.out_dir).resolve(),
|
||||
extractors=command.extract,
|
||||
)
|
||||
# methods = extractors.split(",") if extractors else ignore_methods(['title'])
|
||||
# archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, created_by_id=created_by_id)
|
||||
# return oneshot_link
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
||||
|
||||
|
||||
|
||||
# @docstring(oneshot.__doc__)
|
||||
# def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
# parser = argparse.ArgumentParser(
|
||||
# prog=__command__,
|
||||
# description=oneshot.__doc__,
|
||||
# add_help=True,
|
||||
# formatter_class=SmartFormatter,
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# 'url',
|
||||
# type=str,
|
||||
# default=None,
|
||||
# help=(
|
||||
# 'URLs or paths to archive e.g.:\n'
|
||||
# ' https://getpocket.com/users/USERNAME/feed/all\n'
|
||||
# ' https://example.com/some/rss/feed.xml\n'
|
||||
# ' https://example.com\n'
|
||||
# ' ~/Downloads/firefox_bookmarks_export.html\n'
|
||||
# ' ~/Desktop/sites_list.csv\n'
|
||||
# )
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# "--extract",
|
||||
# type=str,
|
||||
# help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
|
||||
# This does not take precedence over the configuration",
|
||||
# default=""
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# '--out-dir',
|
||||
# type=str,
|
||||
# default=DATA_DIR,
|
||||
# help= "Path to save the single archive folder to, e.g. ./example.com_archive"
|
||||
# )
|
||||
# command = parser.parse_args(args or ())
|
||||
# stdin_url = None
|
||||
# url = command.url
|
||||
# if not url:
|
||||
# stdin_url = accept_stdin(stdin)
|
||||
|
||||
# if (stdin_url and url) or (not stdin and not url):
|
||||
# stderr(
|
||||
# '[X] You must pass a URL/path to add via stdin or CLI arguments.\n',
|
||||
# color='red',
|
||||
# )
|
||||
# raise SystemExit(2)
|
||||
|
||||
# oneshot(
|
||||
# url=stdin_url or url,
|
||||
# out_dir=Path(command.out_dir).resolve(),
|
||||
# extractors=command.extract,
|
||||
# )
|
||||
|
||||
|
||||
# if __name__ == '__main__':
|
||||
# main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@ -8,10 +8,93 @@ import argparse
|
|||
from pathlib import Path
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
|
||||
from ..main import remove
|
||||
from archivebox.index.schema import Link
|
||||
|
||||
|
||||
def remove(filter_str: Optional[str]=None,
|
||||
filter_patterns: Optional[list[str]]=None,
|
||||
filter_type: str='exact',
|
||||
snapshots: Optional[QuerySet]=None,
|
||||
after: Optional[float]=None,
|
||||
before: Optional[float]=None,
|
||||
yes: bool=False,
|
||||
delete: bool=False,
|
||||
out_dir: Path=DATA_DIR) -> list[Link]:
|
||||
"""Remove the specified URLs from the archive"""
|
||||
|
||||
check_data_folder()
|
||||
|
||||
if snapshots is None:
|
||||
if filter_str and filter_patterns:
|
||||
stderr(
|
||||
'[X] You should pass either a pattern as an argument, '
|
||||
'or pass a list of patterns via stdin, but not both.\n',
|
||||
color='red',
|
||||
)
|
||||
raise SystemExit(2)
|
||||
elif not (filter_str or filter_patterns):
|
||||
stderr(
|
||||
'[X] You should pass either a pattern as an argument, '
|
||||
'or pass a list of patterns via stdin.',
|
||||
color='red',
|
||||
)
|
||||
stderr()
|
||||
hint(('To remove all urls you can run:',
|
||||
'archivebox remove --filter-type=regex ".*"'))
|
||||
stderr()
|
||||
raise SystemExit(2)
|
||||
elif filter_str:
|
||||
filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
|
||||
|
||||
list_kwargs = {
|
||||
"filter_patterns": filter_patterns,
|
||||
"filter_type": filter_type,
|
||||
"after": after,
|
||||
"before": before,
|
||||
}
|
||||
if snapshots:
|
||||
list_kwargs["snapshots"] = snapshots
|
||||
|
||||
log_list_started(filter_patterns, filter_type)
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
try:
|
||||
snapshots = list_links(**list_kwargs)
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
|
||||
if not snapshots.exists():
|
||||
log_removal_finished(0, 0)
|
||||
raise SystemExit(1)
|
||||
|
||||
|
||||
log_links = [link.as_link() for link in snapshots]
|
||||
log_list_finished(log_links)
|
||||
log_removal_started(log_links, yes=yes, delete=delete)
|
||||
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
try:
|
||||
for snapshot in snapshots:
|
||||
if delete:
|
||||
shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
to_remove = snapshots.count()
|
||||
|
||||
from .search import flush_search_index
|
||||
|
||||
flush_search_index(snapshots=snapshots)
|
||||
remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
|
||||
all_snapshots = load_main_index(out_dir=out_dir)
|
||||
log_removal_finished(all_snapshots.count(), to_remove)
|
||||
|
||||
return all_snapshots
|
||||
|
||||
|
||||
@docstring(remove.__doc__)
|
||||
|
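A hedged sketch of calling the relocated remove() helper directly, using the signature shown above; the module path and the example pattern are assumptions, not part of this diff.

# Hypothetical usage sketch (not part of this diff): delete all snapshots matching a substring.
from archivebox.cli.archivebox_remove import remove   # assumed module path

remove(
    filter_patterns=['example.com'],   # placeholder pattern
    filter_type='substring',
    yes=True,                          # skip the interactive confirmation prompt
    delete=True,                       # also rmtree each snapshot's archive folder on disk
)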
|
|
@ -11,7 +11,139 @@ from typing import Optional, List, IO
|
|||
from archivebox.misc.util import docstring
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
|
||||
from ..main import schedule
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
# @enforce_types
|
||||
def schedule(add: bool=False,
|
||||
show: bool=False,
|
||||
clear: bool=False,
|
||||
foreground: bool=False,
|
||||
run_all: bool=False,
|
||||
quiet: bool=False,
|
||||
every: Optional[str]=None,
|
||||
tag: str='',
|
||||
depth: int=0,
|
||||
overwrite: bool=False,
|
||||
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
import_path: Optional[str]=None,
|
||||
out_dir: Path=DATA_DIR):
|
||||
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
|
||||
|
||||
check_data_folder()
|
||||
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
|
||||
from archivebox.config.permissions import USER
|
||||
|
||||
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
|
||||
|
||||
cron = CronTab(user=True)
|
||||
cron = dedupe_cron_jobs(cron)
|
||||
|
||||
if clear:
|
||||
print(cron.remove_all(comment=CRON_COMMENT))
|
||||
cron.write()
|
||||
raise SystemExit(0)
|
||||
|
||||
existing_jobs = list(cron.find_comment(CRON_COMMENT))
|
||||
|
||||
if every or add:
|
||||
every = every or 'day'
|
||||
quoted = lambda s: f'"{s}"' if (s and ' ' in str(s)) else str(s)
|
||||
cmd = [
|
||||
'cd',
|
||||
quoted(out_dir),
|
||||
'&&',
|
||||
quoted(ARCHIVEBOX_BINARY.load().abspath),
|
||||
*([
|
||||
'add',
|
||||
*(['--overwrite'] if overwrite else []),
|
||||
*(['--update'] if update else []),
|
||||
*([f'--tag={tag}'] if tag else []),
|
||||
f'--depth={depth}',
|
||||
f'"{import_path}"',
|
||||
] if import_path else ['update']),
|
||||
'>>',
|
||||
quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
|
||||
'2>&1',
|
||||
|
||||
]
|
||||
new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
|
||||
|
||||
if every in ('minute', 'hour', 'day', 'month', 'year'):
|
||||
set_every = getattr(new_job.every(), every)
|
||||
set_every()
|
||||
elif CronSlices.is_valid(every):
|
||||
new_job.setall(every)
|
||||
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI))
            stderr('    It must be one of minute/hour/day/month/year')
            stderr('    or a quoted cron-format schedule like:')
            stderr('        archivebox schedule --every=day --depth=1 https://example.com/some/rss/feed.xml')
            stderr('        archivebox schedule --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)
|
||||
|
||||
cron = dedupe_cron_jobs(cron)
|
||||
cron.write()
|
||||
|
||||
total_runs = sum(j.frequency_per_year() for j in cron)
|
||||
existing_jobs = list(cron.find_comment(CRON_COMMENT))
|
||||
|
||||
print()
|
||||
print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
|
||||
print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
|
||||
if total_runs > 60 and not quiet:
|
||||
stderr()
|
||||
stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI))
|
||||
stderr(' Congrats on being an enthusiastic internet archiver! 👌')
|
||||
stderr()
|
||||
stderr(' Make sure you have enough storage space available to hold all the data.')
|
||||
stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
|
||||
stderr('')
|
||||
elif show:
|
||||
if existing_jobs:
|
||||
print('\n'.join(str(cmd) for cmd in existing_jobs))
|
||||
else:
|
||||
stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI))
|
||||
stderr(' To schedule a new job, run:')
|
||||
stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
|
||||
raise SystemExit(0)
|
||||
|
||||
cron = CronTab(user=True)
|
||||
cron = dedupe_cron_jobs(cron)
|
||||
existing_jobs = list(cron.find_comment(CRON_COMMENT))
|
||||
|
||||
if foreground or run_all:
|
||||
if not existing_jobs:
|
||||
stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||
stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
|
||||
raise SystemExit(1)
|
||||
|
||||
print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI))
|
||||
if run_all:
|
||||
try:
|
||||
for job in existing_jobs:
|
||||
sys.stdout.write(f' > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n')
|
||||
sys.stdout.write(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
|
||||
sys.stdout.flush()
|
||||
job.run()
|
||||
sys.stdout.write(f'\r √ {job.command.split("/archivebox ")[-1]}\n')
|
||||
except KeyboardInterrupt:
|
||||
print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||
raise SystemExit(1)
|
||||
|
||||
if foreground:
|
||||
try:
|
||||
for job in existing_jobs:
|
||||
print(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
|
||||
for result in cron.run_scheduler():
|
||||
print(result)
|
||||
except KeyboardInterrupt:
|
||||
print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||
raise SystemExit(1)
|
||||
|
||||
# if CAN_UPGRADE:
|
||||
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
|
||||
|
||||
|
||||
@docstring(schedule.__doc__)
|
||||
|
|
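To make the cmd construction in schedule() above concrete, here is an illustrative sketch of roughly what ' '.join(cmd) produces for --every=day --depth=1 with an import path, assuming ONLY_NEW is disabled (so --update is included); the binary and data paths are placeholders, not actual defaults.

# Illustrative only (not part of this diff): approximate crontab command generated above.
example_cron_command = ' '.join([
    'cd', '/data', '&&',
    '/usr/local/bin/archivebox',                   # placeholder ARCHIVEBOX_BINARY path
    'add', '--update', '--depth=1',
    '"https://example.com/some/rss/feed.xml"',     # import_path is always wrapped in quotes
    '>>', '/data/logs/schedule.log', '2>&1',
])
print(example_cron_command)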
archivebox/cli/archivebox_search.py (new file, 164 lines)

@ -0,0 +1,164 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox search'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.index import LINK_FILTERS
|
||||
from archivebox.index.schema import Link
|
||||
from archivebox.misc.logging import stderr
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
|
||||
STATUS_CHOICES = [
|
||||
'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
|
||||
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
|
||||
]
|
||||
|
||||
|
||||
|
||||
def list_links(snapshots: Optional[QuerySet]=None,
|
||||
filter_patterns: Optional[List[str]]=None,
|
||||
filter_type: str='substring',
|
||||
after: Optional[float]=None,
|
||||
before: Optional[float]=None,
|
||||
out_dir: Path=DATA_DIR) -> Iterable[Link]:
|
||||
|
||||
from archivebox.index import load_main_index
|
||||
from archivebox.index import snapshot_filter
|
||||
|
||||
if snapshots:
|
||||
all_snapshots = snapshots
|
||||
else:
|
||||
all_snapshots = load_main_index(out_dir=out_dir)
|
||||
|
||||
if after is not None:
|
||||
all_snapshots = all_snapshots.filter(timestamp__gte=after)
|
||||
if before is not None:
|
||||
all_snapshots = all_snapshots.filter(timestamp__lt=before)
|
||||
if filter_patterns:
|
||||
all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
|
||||
|
||||
if not all_snapshots:
|
||||
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
|
||||
|
||||
return all_snapshots
|
||||
|
||||
|
||||
def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict[str, Link | None]:
|
||||
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
from archivebox.index import (
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_unarchived_folders,
|
||||
get_present_folders,
|
||||
get_valid_folders,
|
||||
get_invalid_folders,
|
||||
get_duplicate_folders,
|
||||
get_orphaned_folders,
|
||||
get_corrupted_folders,
|
||||
get_unrecognized_folders,
|
||||
)
|
||||
|
||||
check_data_folder()
|
||||
|
||||
STATUS_FUNCTIONS = {
|
||||
"indexed": get_indexed_folders,
|
||||
"archived": get_archived_folders,
|
||||
"unarchived": get_unarchived_folders,
|
||||
"present": get_present_folders,
|
||||
"valid": get_valid_folders,
|
||||
"invalid": get_invalid_folders,
|
||||
"duplicate": get_duplicate_folders,
|
||||
"orphaned": get_orphaned_folders,
|
||||
"corrupted": get_corrupted_folders,
|
||||
"unrecognized": get_unrecognized_folders,
|
||||
}
|
||||
|
||||
try:
|
||||
return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
|
||||
except KeyError:
|
||||
raise ValueError('Status not recognized.')
|
||||
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def search(filter_patterns: list[str] | None=None,
|
||||
filter_type: str='substring',
|
||||
status: str='indexed',
|
||||
before: float | None=None,
|
||||
after: float | None=None,
|
||||
sort: str | None=None,
|
||||
json: bool=False,
|
||||
html: bool=False,
|
||||
csv: str | None=None,
|
||||
with_headers: bool=False):
|
||||
"""List, filter, and export information about archive entries"""
|
||||
|
||||
|
||||
if with_headers and not (json or html or csv):
|
||||
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
|
||||
raise SystemExit(2)
|
||||
|
||||
snapshots = list_links(
|
||||
filter_patterns=list(filter_patterns) if filter_patterns else None,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
)
|
||||
|
||||
if sort:
|
||||
snapshots = snapshots.order_by(sort)
|
||||
|
||||
folders = list_folders(
|
||||
links=snapshots,
|
||||
status=status,
|
||||
out_dir=DATA_DIR,
|
||||
)
|
||||
|
||||
if json:
|
||||
from archivebox.index.json import generate_json_index_from_links
|
||||
output = generate_json_index_from_links(folders.values(), with_headers)
|
||||
elif html:
|
||||
from archivebox.index.html import generate_index_from_links
|
||||
output = generate_index_from_links(folders.values(), with_headers)
|
||||
elif csv:
|
||||
from archivebox.index.csv import links_to_csv
|
||||
output = links_to_csv(folders.values(), csv.split(','), with_headers)
|
||||
else:
|
||||
from archivebox.misc.logging_util import printable_folders
|
||||
output = printable_folders(folders, with_headers)
|
||||
|
||||
print(output)
|
||||
return output
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
|
||||
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
|
||||
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
|
||||
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
|
||||
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
|
||||
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
|
||||
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
|
||||
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
|
||||
@click.help_option('--help', '-h')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@docstring(search.__doc__)
|
||||
def main(**kwargs):
|
||||
return search(**kwargs)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
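Since the new search command is a standard click command, it can also be exercised without a shell via click's test runner; this is a hedged sketch that assumes it is run from inside an initialized collection (module path assumed from the file header above).

# Hypothetical usage sketch (not part of this diff): invoking the click command programmatically.
from click.testing import CliRunner
from archivebox.cli.archivebox_search import main   # assumed module path

runner = CliRunner()
result = runner.invoke(main, ['--status', 'archived', '--csv', 'timestamp,url', 'example.com'])
print(result.exit_code)
print(result.output)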
@ -12,7 +12,81 @@ from archivebox.misc.util import docstring
|
|||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
|
||||
from ..main import server
|
||||
|
||||
|
||||
|
||||
# @enforce_types
|
||||
def server(runserver_args: Optional[List[str]]=None,
|
||||
reload: bool=False,
|
||||
debug: bool=False,
|
||||
init: bool=False,
|
||||
quick_init: bool=False,
|
||||
createsuperuser: bool=False,
|
||||
daemonize: bool=False,
|
||||
out_dir: Path=DATA_DIR) -> None:
|
||||
"""Run the ArchiveBox HTTP server"""
|
||||
|
||||
from rich import print
|
||||
|
||||
runserver_args = runserver_args or []
|
||||
|
||||
if init:
|
||||
run_subcommand('init', stdin=None, pwd=out_dir)
|
||||
print()
|
||||
elif quick_init:
|
||||
run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
|
||||
print()
|
||||
|
||||
if createsuperuser:
|
||||
run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
|
||||
print()
|
||||
|
||||
|
||||
check_data_folder()
|
||||
|
||||
from django.core.management import call_command
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
print()
|
||||
# print('[yellow][!] No admin accounts exist, you must create one to be able to log in to the Admin UI![/yellow]')
|
||||
print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
|
||||
print(' [green]archivebox manage createsuperuser[/green]')
|
||||
print()
|
||||
|
||||
|
||||
host = '127.0.0.1'
|
||||
port = '8000'
|
||||
|
||||
try:
|
||||
host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
|
||||
if ':' in host_and_port:
|
||||
host, port = host_and_port.split(':')
|
||||
else:
|
||||
if '.' in host_and_port:
|
||||
host = host_and_port
|
||||
else:
|
||||
port = host_and_port
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
print('[green][+] Starting ArchiveBox webserver...[/green]')
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
print(' > Writing ArchiveBox error log to ./logs/errors.log')
|
||||
|
||||
if SHELL_CONFIG.DEBUG:
|
||||
if not reload:
|
||||
runserver_args.append('--noreload') # '--insecure'
|
||||
call_command("runserver", *runserver_args)
|
||||
else:
|
||||
from workers.supervisord_util import start_server_workers
|
||||
|
||||
print()
|
||||
start_server_workers(host=host, port=port, daemonize=False)
|
||||
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
|
||||
|
||||
|
||||
|
||||
@docstring(server.__doc__)
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
|
|
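The host/port parsing in server() above accepts either HOST:PORT, a bare host, or a bare port from the runserver args; here is a small self-contained restatement of that same logic for clarity (the helper name is ours, not part of the diff).

# Illustrative restatement (not part of this diff) of the runserver_args host/port parsing above.
def parse_host_port(runserver_args, host='127.0.0.1', port='8000'):
    try:
        host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
        if ':' in host_and_port:
            host, port = host_and_port.split(':')
        else:
            if '.' in host_and_port:
                host = host_and_port
            else:
                port = host_and_port
    except IndexError:
        pass
    return host, port

assert parse_host_port(['0.0.0.0:8001']) == ('0.0.0.0', '8001')
assert parse_host_port(['9000']) == ('127.0.0.1', '9000')
assert parse_host_port(['--noreload']) == ('127.0.0.1', '8000')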
|
@ -11,7 +11,19 @@ from typing import Optional, List, IO
|
|||
from archivebox.misc.util import docstring
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
|
||||
from ..main import shell
|
||||
|
||||
|
||||
|
||||
#@enforce_types
|
||||
def shell(out_dir: Path=DATA_DIR) -> None:
|
||||
"""Enter an interactive ArchiveBox Django shell"""
|
||||
|
||||
check_data_folder()
|
||||
|
||||
from django.core.management import call_command
|
||||
call_command("shell_plus")
|
||||
|
||||
|
||||
|
||||
|
||||
@docstring(shell.__doc__)
|
||||
|
|
|
@ -8,10 +8,114 @@ import argparse
|
|||
from pathlib import Path
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from rich import print
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
|
||||
from ..main import status
|
||||
|
||||
|
||||
|
||||
|
||||
# @enforce_types
|
||||
def status(out_dir: Path=DATA_DIR) -> None:
|
||||
"""Print out some info and statistics about the archive collection"""
|
||||
|
||||
check_data_folder()
|
||||
|
||||
from core.models import Snapshot
|
||||
from django.contrib.auth import get_user_model
|
||||
User = get_user_model()
|
||||
|
||||
print('{green}[*] Scanning archive main index...{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||
print(SHELL_CONFIG.ANSI['lightyellow'], f' {out_dir}/*', SHELL_CONFIG.ANSI['reset'])
|
||||
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
|
||||
size = printable_filesize(num_bytes)
|
||||
print(f' Index size: {size} across {num_files} files')
|
||||
print()
|
||||
|
||||
links = load_main_index(out_dir=out_dir)
|
||||
num_sql_links = links.count()
|
||||
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
|
||||
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
|
||||
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
|
||||
print()
|
||||
print('{green}[*] Scanning archive data directories...{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||
print(SHELL_CONFIG.ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', SHELL_CONFIG.ANSI['reset'])
|
||||
num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
|
||||
size = printable_filesize(num_bytes)
|
||||
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
|
||||
print(SHELL_CONFIG.ANSI['black'])
|
||||
num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
|
||||
num_archived = len(get_archived_folders(links, out_dir=out_dir))
|
||||
num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
|
||||
print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
|
||||
print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
|
||||
print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
|
||||
|
||||
num_present = len(get_present_folders(links, out_dir=out_dir))
|
||||
num_valid = len(get_valid_folders(links, out_dir=out_dir))
|
||||
print()
|
||||
print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
|
||||
print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
|
||||
|
||||
duplicate = get_duplicate_folders(links, out_dir=out_dir)
|
||||
orphaned = get_orphaned_folders(links, out_dir=out_dir)
|
||||
corrupted = get_corrupted_folders(links, out_dir=out_dir)
|
||||
unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
|
||||
num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
|
||||
print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
|
||||
print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
|
||||
print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
|
||||
print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
|
||||
print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
|
||||
|
||||
print(SHELL_CONFIG.ANSI['reset'])
|
||||
|
||||
if num_indexed:
|
||||
print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**SHELL_CONFIG.ANSI))
|
||||
print(' archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)')
|
||||
|
||||
if orphaned:
|
||||
print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**SHELL_CONFIG.ANSI))
|
||||
print(' archivebox init')
|
||||
|
||||
if num_invalid:
|
||||
print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**SHELL_CONFIG.ANSI))
|
||||
print(' archivebox init')
|
||||
|
||||
print()
|
||||
print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||
print(SHELL_CONFIG.ANSI['lightyellow'], f' {CONSTANTS.LOGS_DIR}/*', SHELL_CONFIG.ANSI['reset'])
|
||||
users = get_admins().values_list('username', flat=True)
|
||||
print(f' UI users {len(users)}: {", ".join(users)}')
|
||||
last_login = User.objects.order_by('last_login').last()
|
||||
if last_login:
|
||||
print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}')
|
||||
last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
|
||||
if last_downloaded:
|
||||
print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')
|
||||
|
||||
if not users:
|
||||
print()
|
||||
print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**SHELL_CONFIG.ANSI))
|
||||
print(' archivebox manage createsuperuser')
|
||||
|
||||
print()
|
||||
for snapshot in links.order_by('-downloaded_at')[:10]:
|
||||
if not snapshot.downloaded_at:
|
||||
continue
|
||||
print(
|
||||
SHELL_CONFIG.ANSI['black'],
|
||||
(
|
||||
f' > {str(snapshot.downloaded_at)[:16]} '
|
||||
f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
|
||||
f'"{snapshot.title}": {snapshot.url}'
|
||||
)[:SHELL_CONFIG.TERM_WIDTH],
|
||||
SHELL_CONFIG.ANSI['reset'],
|
||||
)
|
||||
print(SHELL_CONFIG.ANSI['black'], ' ...', SHELL_CONFIG.ANSI['reset'])
|
||||
|
||||
|
||||
|
||||
@docstring(status.__doc__)
|
||||
|
|
|
@ -24,7 +24,92 @@ from archivebox.index import (
|
|||
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
|
||||
# from ..main import update
|
||||
|
||||
|
||||
|
||||
|
||||
# LEGACY VERSION:
|
||||
# @enforce_types
|
||||
# def update(resume: Optional[float]=None,
|
||||
# only_new: bool=ARCHIVING_CONFIG.ONLY_NEW,
|
||||
# index_only: bool=False,
|
||||
# overwrite: bool=False,
|
||||
# filter_patterns_str: Optional[str]=None,
|
||||
# filter_patterns: Optional[List[str]]=None,
|
||||
# filter_type: Optional[str]=None,
|
||||
# status: Optional[str]=None,
|
||||
# after: Optional[str]=None,
|
||||
# before: Optional[str]=None,
|
||||
# extractors: str="",
|
||||
# out_dir: Path=DATA_DIR) -> List[Link]:
|
||||
# """Import any new links from subscriptions and retry any previously failed/skipped links"""
|
||||
|
||||
# from core.models import ArchiveResult
|
||||
# from .search import index_links
|
||||
# # from workers.supervisord_util import start_cli_workers
|
||||
|
||||
|
||||
# check_data_folder()
|
||||
# # start_cli_workers()
|
||||
# new_links: List[Link] = [] # TODO: Remove input argument: only_new
|
||||
|
||||
# extractors = extractors.split(",") if extractors else []
|
||||
|
||||
# # Step 1: Filter for selected_links
|
||||
# print('[*] Finding matching Snapshots to update...')
|
||||
# print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
|
||||
# matching_snapshots = list_links(
|
||||
# filter_patterns=filter_patterns,
|
||||
# filter_type=filter_type,
|
||||
# before=before,
|
||||
# after=after,
|
||||
# )
|
||||
# print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
|
||||
# matching_folders = list_folders(
|
||||
# links=matching_snapshots,
|
||||
# status=status,
|
||||
# out_dir=out_dir,
|
||||
# )
|
||||
# all_links = (link for link in matching_folders.values() if link)
|
||||
# print(' - Sorting by most unfinished -> least unfinished + date archived...')
|
||||
# all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
|
||||
|
||||
# if index_only:
|
||||
# for link in all_links:
|
||||
# write_link_details(link, out_dir=out_dir, skip_sql_index=True)
|
||||
# index_links(all_links, out_dir=out_dir)
|
||||
# return all_links
|
||||
|
||||
# # Step 2: Run the archive methods for each link
|
||||
# to_archive = new_links if only_new else all_links
|
||||
# if resume:
|
||||
# to_archive = [
|
||||
# link for link in to_archive
|
||||
# if link.timestamp >= str(resume)
|
||||
# ]
|
||||
# if not to_archive:
|
||||
# stderr('')
|
||||
# stderr(f'[√] Nothing found to resume after {resume}', color='green')
|
||||
# return all_links
|
||||
|
||||
# archive_kwargs = {
|
||||
# "out_dir": out_dir,
|
||||
# }
|
||||
# if extractors:
|
||||
# archive_kwargs["methods"] = extractors
|
||||
|
||||
|
||||
# archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
|
||||
|
||||
# # Step 4: Re-write links index with updated titles, icons, and resources
|
||||
# all_links = load_main_index(out_dir=out_dir)
|
||||
# return all_links
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def update():
|
||||
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
|
|
|
@ -1,61 +1,207 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox version'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, IO
|
||||
from typing import Iterable
|
||||
|
||||
# from archivebox.misc.util import docstring
|
||||
from archivebox.config import DATA_DIR, VERSION
|
||||
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
# @docstring(version.__doc__)
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
"""Print the ArchiveBox version and dependency information"""
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description="Print the ArchiveBox version and dependency information", # version.__doc__,
|
||||
add_help=True,
|
||||
formatter_class=SmartFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--quiet', '-q',
|
||||
action='store_true',
|
||||
help='Only print ArchiveBox version number and nothing else.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--binproviders', '-p',
|
||||
type=str,
|
||||
help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)',
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--binaries', '-b',
|
||||
type=str,
|
||||
help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)',
|
||||
default=None,
|
||||
)
|
||||
command = parser.parse_args(args or ())
|
||||
reject_stdin(__command__, stdin)
|
||||
@enforce_types
|
||||
def version(quiet: bool=False,
|
||||
binproviders: Iterable[str]=(),
|
||||
binaries: Iterable[str]=()) -> list[str]:
|
||||
"""Print the ArchiveBox version, debug metadata, and installed dependency versions"""
|
||||
|
||||
# for speed reasons, check if quiet flag was set and just return simple version immediately if so
|
||||
if command.quiet:
|
||||
# fast path for just getting the version and exiting, dont do any slower imports
|
||||
from archivebox.config.version import VERSION
|
||||
print(VERSION)
|
||||
return
|
||||
if quiet or '--version' in sys.argv:
|
||||
return []
|
||||
|
||||
# otherwise do big expensive import to get the full version
|
||||
from ..main import version
|
||||
version(
|
||||
quiet=command.quiet,
|
||||
out_dir=Path(pwd) if pwd else DATA_DIR,
|
||||
binproviders=command.binproviders.split(',') if command.binproviders else None,
|
||||
binaries=command.binaries.split(',') if command.binaries else None,
|
||||
# Only do slower imports when getting full version info
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
|
||||
from rich.panel import Panel
|
||||
from rich.console import Console
|
||||
from abx_pkg import Binary
|
||||
|
||||
import abx
|
||||
import archivebox
|
||||
from archivebox.config import CONSTANTS, DATA_DIR
|
||||
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
|
||||
from archivebox.config.paths import get_data_locations, get_code_locations
|
||||
from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
|
||||
from archivebox.misc.logging_util import printable_folder_status
|
||||
|
||||
from abx_plugin_default_binproviders import apt, brew, env
|
||||
|
||||
console = Console()
|
||||
prnt = console.print
|
||||
|
||||
LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED
|
||||
|
||||
# 0.7.1
|
||||
# ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
|
||||
# IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
|
||||
# FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
|
||||
# DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
|
||||
|
||||
p = platform.uname()
|
||||
COMMIT_HASH = get_COMMIT_HASH()
|
||||
prnt(
|
||||
'[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
|
||||
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
|
||||
f'BUILD_TIME={get_BUILD_TIME()}',
|
||||
)
|
||||
prnt(
|
||||
f'IN_DOCKER={IN_DOCKER}',
|
||||
f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
|
||||
f'ARCH={p.machine}',
|
||||
f'OS={p.system}',
|
||||
f'PLATFORM={platform.platform()}',
|
||||
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
|
||||
)
|
||||
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
|
||||
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
|
||||
prnt(
|
||||
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
|
||||
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
|
||||
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
|
||||
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||
)
|
||||
prnt(
|
||||
f'DEBUG={SHELL_CONFIG.DEBUG}',
|
||||
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
|
||||
f'SUDO={CONSTANTS.IS_ROOT}',
|
||||
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
|
||||
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
|
||||
f'LDAP={LDAP_ENABLED}',
|
||||
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
|
||||
)
|
||||
prnt()
|
||||
|
||||
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
|
||||
PANEL_TEXT = '\n'.join((
|
||||
# '',
|
||||
# f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
|
||||
'',
|
||||
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
|
||||
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
|
||||
'',
|
||||
' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
|
||||
'',
|
||||
))
|
||||
prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
|
||||
prnt()
|
||||
return []
|
||||
|
||||
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
|
||||
failures = []
|
||||
BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
|
||||
for name, binary in list(BINARIES.items()):
|
||||
if binary.name == 'archivebox':
|
||||
continue
|
||||
|
||||
# skip if the binary is not in the requested list of binaries
|
||||
if binaries and binary.name not in binaries:
|
||||
continue
|
||||
|
||||
# skip if the binary is not supported by any of the requested binproviders
|
||||
if binproviders and binary.binproviders_supported and not any(provider.name in binproviders for provider in binary.binproviders_supported):
|
||||
continue
|
||||
|
||||
err = None
|
||||
try:
|
||||
loaded_bin = binary.load()
|
||||
except Exception as e:
|
||||
err = e
|
||||
loaded_bin = binary
|
||||
provider_summary = f'[dark_sea_green3]{loaded_bin.binprovider.name.ljust(10)}[/dark_sea_green3]' if loaded_bin.binprovider else '[grey23]not found[/grey23] '
|
||||
if loaded_bin.abspath:
|
||||
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
if ' ' in abspath:
abspath = abspath.replace(' ', r'\ ')
else:
abspath = f'[red]{err}[/red]'
prnt('', '[green]√[/green]' if loaded_bin.is_valid else '[red]X[/red]', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(12), provider_summary, abspath, overflow='ignore', crop=False)
if not loaded_bin.is_valid:
failures.append(loaded_bin.name)

prnt()
prnt('[gold3][i] Package Managers:[/gold3]')
BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())
for name, binprovider in list(BINPROVIDERS.items()):
err = None

if binproviders and binprovider.name not in binproviders:
continue

# TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN
loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])

abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
abspath = None
if loaded_bin.abspath:
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
if ' ' in abspath:
abspath = abspath.replace(' ', r'\ ')

PATH = str(binprovider.PATH).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
ownership_summary = f'UID=[blue]{str(binprovider.EUID).ljust(4)}[/blue]'
provider_summary = f'[dark_sea_green3]{str(abspath).ljust(52)}[/dark_sea_green3]' if abspath else f'[grey23]{"not available".ljust(52)}[/grey23]'
prnt('', '[green]√[/green]' if binprovider.is_valid else '[grey53]-[/grey53]', '', binprovider.name.ljust(11), provider_summary, ownership_summary, f'PATH={PATH}', overflow='ellipsis', soft_wrap=True)

if not (binaries or binproviders):
# dont show source code / data dir info if we just want to get version info for a binary or binprovider

prnt()
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
for name, path in get_code_locations().items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)

prnt()
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
for name, path in get_data_locations().items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)

from archivebox.misc.checks import check_data_dir_permissions

check_data_dir_permissions()
else:
prnt()
prnt('[red][i] Data locations:[/red] (not in a data directory)')

prnt()

if failures:
prnt('[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]')
prnt(f' [red]{", ".join(failures)}[/red]')
prnt()
prnt('[violet]Hint:[/violet] To install missing binaries automatically, run:')
prnt(' [green]archivebox install[/green]')
prnt()
return failures


@click.command()
@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
@click.option('--binproviders', '-p', help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)')
@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
@docstring(version.__doc__)
def main(**kwargs):
failures = version(**kwargs)
if failures:
raise SystemExit(1)


if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)
main()
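For context on the per-subcommand CLI layout shown above: each subcommand module now exposes a click-decorated main() that wraps the underlying function. A minimal standalone sketch of that pattern follows; the detect_missing_binaries function and its options are illustrative placeholders, not the real ArchiveBox code:

import click

def detect_missing_binaries(quiet: bool = False, binaries: str | None = None) -> list:
    """Stand-in for the real version() logic: return names of binaries that failed detection."""
    wanted = (binaries or 'curl,wget,git').split(',')
    return [name for name in wanted if name == 'example-missing-binary']

@click.command()
@click.option('--quiet', '-q', is_flag=True, help='Only print the version number and nothing else')
@click.option('--binaries', '-b', help='Comma-separated list of binaries to check')
def main(**kwargs):
    # click parses sys.argv, validates the options, and passes them in as keyword args
    failures = detect_missing_binaries(**kwargs)
    if failures:
        raise SystemExit(1)   # non-zero exit status so shells and CI can detect the failure

if __name__ == '__main__':
    main()   # unlike the old argparse entrypoints, click reads sys.argv itself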
@ -60,7 +60,7 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
return

with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=False)
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=True)

from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
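The hunk above only flips the startup progress bar from hidden (visible=False) to visible. For reference, rich's Progress API works roughly like this; the step count and description are made up for the example:

from rich.progress import Progress

with Progress(transient=True) as progress:                       # transient=True clears the bar once finished
    task = progress.add_task("[green]Loading modules...", total=25, visible=True)
    for _ in range(25):
        progress.update(task, advance=1)                          # advance one step per module imported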
@ -142,7 +142,7 @@ def create_and_chown_dir(dir_path: Path) -> None:
os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &')

@cache
def get_or_create_working_tmp_dir(autofix=True, quiet=False):
def get_or_create_working_tmp_dir(autofix=True, quiet=True):
from archivebox import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.checks import check_tmp_dir
@ -165,7 +165,7 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=False):
pass
if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
if autofix and STORAGE_CONFIG.TMP_DIR != candidate:
STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet)
STORAGE_CONFIG.update_in_place(TMP_DIR=candidate)
return candidate

if not quiet:
@ -193,7 +193,7 @@ def get_or_create_working_lib_dir(autofix=True, quiet=False):
pass
if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True):
if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet)
STORAGE_CONFIG.update_in_place(LIB_DIR=candidate)
return candidate

if not quiet:
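Both directory helpers above share the same shape: walk a list of candidate locations, keep the first one that passes the writability check, and persist it back into STORAGE_CONFIG. A rough standalone sketch of that idea; the candidate paths and the helper name are invented for illustration:

import tempfile
from pathlib import Path

def pick_writable_dir(candidates: list) -> Path | None:
    """Return the first candidate directory that can actually be created and written to."""
    for candidate in candidates:
        try:
            candidate.mkdir(parents=True, exist_ok=True)
            probe = candidate / '.write_test'
            probe.touch()
            probe.unlink()
            return candidate
        except OSError:
            continue                      # permission denied / read-only FS: try the next candidate
    return None

# e.g. prefer a tmp dir inside the data dir, then fall back to the system-wide tmp dir
working_tmp = pick_writable_dir([Path('data/tmp'), Path(tempfile.gettempdir()) / 'archivebox'])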
@ -36,6 +36,8 @@ HOSTNAME: str = max([socket.gethostname(), platform.node()], key=len)

IS_ROOT = RUNNING_AS_UID == 0
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
# IN_DOCKER_COMPOSE = # TODO: figure out a way to detect if running in docker compose


FALLBACK_UID = RUNNING_AS_UID or SUDO_UID
FALLBACK_GID = RUNNING_AS_GID or SUDO_GID
@ -303,7 +303,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
"Exit Status": [],
}

from workers.supervisor_util import get_existing_supervisord_process
from workers.supervisord_util import get_existing_supervisord_process

supervisor = get_existing_supervisord_process()
if supervisor is None:
@ -373,7 +373,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert request.user.is_superuser, "Must be a superuser to view configuration settings."

from workers.supervisor_util import get_existing_supervisord_process, get_worker, get_sock_file, CONFIG_FILE_NAME
from workers.supervisord_util import get_existing_supervisord_process, get_worker, get_sock_file, CONFIG_FILE_NAME

SOCK_FILE = get_sock_file()
CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME
@ -21,7 +21,6 @@ from archivebox.misc.logging_util import printable_filesize
from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.index.html import snapshot_icons
from archivebox.extractors import archive_links
from archivebox.main import remove

from archivebox.base_models.admin import ABIDModelAdmin
from archivebox.workers.tasks import bg_archive_links, bg_add
@ -321,7 +320,9 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
description="☠️ Delete"
)
def delete_snapshots(self, request, queryset):
from archivebox.cli.archivebox_remove import remove
remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR)

messages.success(
request,
mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
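The delete_snapshots action above now pulls remove() from archivebox.cli.archivebox_remove instead of the old archivebox.main module. The general shape of such an admin action, simplified with placeholder wording and the out_dir argument omitted, so this is a sketch rather than the exact ArchiveBox code:

from django.contrib import admin, messages

class SnapshotAdminSketch(admin.ModelAdmin):
    actions = ['delete_snapshots']

    @admin.action(description='☠️ Delete')
    def delete_snapshots(self, request, queryset):
        count = queryset.count()                            # count before the rows disappear
        from archivebox.cli.archivebox_remove import remove
        remove(snapshots=queryset, yes=True, delete=True)   # same code path the CLI uses
        messages.success(request, f'Deleted {count} Snapshots.')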
1526 archivebox/main.py
File diff suppressed because it is too large
@ -24,7 +24,7 @@ def check_data_folder() -> None:
from archivebox.config import CONSTANTS
from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir

archive_dir_exists = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()
archive_dir_exists = os.path.isdir(ARCHIVE_DIR)
if not archive_dir_exists:
print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
print(f' {DATA_DIR}', file=sys.stderr)
@ -12,7 +12,7 @@ from pathlib import Path

from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING
from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING

if TYPE_CHECKING:
from ..index.schema import Link, ArchiveResult
@ -228,7 +228,7 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
print()


def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str='.'):
def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: str | IO | None=None, pwd: str='.'):
args = ' '.join(subcommand_args)
version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
@ -20,11 +20,9 @@ from datetime import datetime, timedelta # noqa
from django.conf import settings # noqa

from archivebox import CONSTANTS # noqa
from ..main import * # noqa
from ..cli import CLI_SUBCOMMANDS
from archivebox.cli import * # noqa

CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
CLI_COMMAND_NAMES = ", ".join(CLI_SUBCOMMANDS.keys())

if __name__ == '__main__':
# load the rich extension for ipython for pretty printing
@ -40,7 +38,7 @@ if __name__ == '__main__':
prnt('[green]import re, os, sys, psutil, subprocess, reqiests, json, pydantic, benedict, django, abx[/]')
prnt('[yellow4]# ArchiveBox Imports[/]')
prnt('[yellow4]import archivebox[/]')
prnt('[yellow4]from archivebox.main import {}[/]'.format(CLI_COMMAND_NAMES))
prnt('[yellow4]from archivebox.cli import *[/]')
prnt()

if console.width >= 80:
@ -459,8 +459,8 @@ def load_plugins(plugins: Iterable[PluginId | ModuleType | Type] | Dict[PluginId
PLUGINS_TO_LOAD = sorted(PLUGINS_TO_LOAD, key=lambda x: x['order'])

for plugin_info in PLUGINS_TO_LOAD:
if '--version' not in sys.argv and '--help' not in sys.argv:
print(f'🧩 Loading plugin: {plugin_info["id"]}...', end='\r', flush=True, file=sys.stderr)
# if '--version' not in sys.argv and '--help' not in sys.argv:
# print(f'🧩 Loading plugin: {plugin_info["id"]}...', end='\r', flush=True, file=sys.stderr)
pm.register(plugin_info['module'])
LOADED_PLUGINS[plugin_info['id']] = plugin_info
# print('\x1b[2K', end='\r', flush=True, file=sys.stderr)
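The registration loop above sits on top of a pluggy-style plugin manager, so it is essentially pm.register() called once per discovered plugin in a deterministic order. A minimal self-contained illustration of the same mechanism; the project name, hookspec, and plugin below are invented for the example and are not the abx API:

import pluggy

hookspec = pluggy.HookspecMarker('example')
hookimpl = pluggy.HookimplMarker('example')

class Spec:
    @hookspec
    def get_extractors(self): ...

class WgetPlugin:
    @hookimpl
    def get_extractors(self):
        return ['wget']

pm = pluggy.PluginManager('example')
pm.add_hookspecs(Spec)

# mirror the sorted-by-order registration loop from the hunk above
plugins_to_load = sorted([{'id': 'wget', 'order': 10, 'module': WgetPlugin()}], key=lambda p: p['order'])
for plugin_info in plugins_to_load:
    pm.register(plugin_info['module'])

print(pm.hook.get_extractors())   # -> [['wget']]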
@ -1,103 +1,103 @@
import uuid
from functools import wraps
from django.db import connection, transaction
from django.utils import timezone
from huey.exceptions import TaskLockedException
# import uuid
# from functools import wraps
# from django.db import connection, transaction
# from django.utils import timezone
# from huey.exceptions import TaskLockedException

from archivebox.config import CONSTANTS
# from archivebox.config import CONSTANTS

class SqliteSemaphore:
def __init__(self, db_path, table_name, name, value=1, timeout=None):
self.db_path = db_path
self.table_name = table_name
self.name = name
self.value = value
self.timeout = timeout or 86400 # Set a max age for lock holders
# class SqliteSemaphore:
# def __init__(self, db_path, table_name, name, value=1, timeout=None):
# self.db_path = db_path
# self.table_name = table_name
# self.name = name
# self.value = value
# self.timeout = timeout or 86400 # Set a max age for lock holders

# Ensure the table exists
with connection.cursor() as cursor:
cursor.execute(f"""
CREATE TABLE IF NOT EXISTS {self.table_name} (
id TEXT PRIMARY KEY,
name TEXT,
timestamp DATETIME
)
""")
# # Ensure the table exists
# with connection.cursor() as cursor:
# cursor.execute(f"""
# CREATE TABLE IF NOT EXISTS {self.table_name} (
# id TEXT PRIMARY KEY,
# name TEXT,
# timestamp DATETIME
# )
# """)

def acquire(self, name=None):
name = name or str(uuid.uuid4())
now = timezone.now()
expiration = now - timezone.timedelta(seconds=self.timeout)
# def acquire(self, name=None):
# name = name or str(uuid.uuid4())
# now = timezone.now()
# expiration = now - timezone.timedelta(seconds=self.timeout)

with transaction.atomic():
# Remove expired locks
with connection.cursor() as cursor:
cursor.execute(f"""
DELETE FROM {self.table_name}
WHERE name = %s AND timestamp < %s
""", [self.name, expiration])
# with transaction.atomic():
# # Remove expired locks
# with connection.cursor() as cursor:
# cursor.execute(f"""
# DELETE FROM {self.table_name}
# WHERE name = %s AND timestamp < %s
# """, [self.name, expiration])

# Try to acquire the lock
with connection.cursor() as cursor:
cursor.execute(f"""
INSERT INTO {self.table_name} (id, name, timestamp)
SELECT %s, %s, %s
WHERE (
SELECT COUNT(*) FROM {self.table_name}
WHERE name = %s
) < %s
""", [name, self.name, now, self.name, self.value])
# # Try to acquire the lock
# with connection.cursor() as cursor:
# cursor.execute(f"""
# INSERT INTO {self.table_name} (id, name, timestamp)
# SELECT %s, %s, %s
# WHERE (
# SELECT COUNT(*) FROM {self.table_name}
# WHERE name = %s
# ) < %s
# """, [name, self.name, now, self.name, self.value])

if cursor.rowcount > 0:
return name
# if cursor.rowcount > 0:
# return name

# If we couldn't acquire the lock, remove our attempted entry
with connection.cursor() as cursor:
cursor.execute(f"""
DELETE FROM {self.table_name}
WHERE id = %s AND name = %s
""", [name, self.name])
# # If we couldn't acquire the lock, remove our attempted entry
# with connection.cursor() as cursor:
# cursor.execute(f"""
# DELETE FROM {self.table_name}
# WHERE id = %s AND name = %s
# """, [name, self.name])

return None
# return None

def release(self, name):
with connection.cursor() as cursor:
cursor.execute(f"""
DELETE FROM {self.table_name}
WHERE id = %s AND name = %s
""", [name, self.name])
return cursor.rowcount > 0
# def release(self, name):
# with connection.cursor() as cursor:
# cursor.execute(f"""
# DELETE FROM {self.table_name}
# WHERE id = %s AND name = %s
# """, [name, self.name])
# return cursor.rowcount > 0


LOCKS_DB_PATH = CONSTANTS.DATABASE_FILE.parent / 'locks.sqlite3'
# LOCKS_DB_PATH = CONSTANTS.DATABASE_FILE.parent / 'locks.sqlite3'


def lock_task_semaphore(db_path, table_name, lock_name, value=1, timeout=None):
"""
Lock which can be acquired multiple times (default = 1).
# def lock_task_semaphore(db_path, table_name, lock_name, value=1, timeout=None):
# """
# Lock which can be acquired multiple times (default = 1).

NOTE: no provisions are made for blocking, waiting, or notifying. This is
just a lock which can be acquired a configurable number of times.
# NOTE: no provisions are made for blocking, waiting, or notifying. This is
# just a lock which can be acquired a configurable number of times.

Example:
# Example:

# Allow up to 3 workers to run this task concurrently. If the task is
# locked, retry up to 2 times with a delay of 60s.
@huey.task(retries=2, retry_delay=60)
@lock_task_semaphore('path/to/db.sqlite3', 'semaphore_locks', 'my-lock', 3)
def my_task():
...
"""
sem = SqliteSemaphore(db_path, table_name, lock_name, value, timeout)
def decorator(fn):
@wraps(fn)
def inner(*args, **kwargs):
tid = sem.acquire()
if tid is None:
raise TaskLockedException(f'unable to acquire lock {lock_name}')
try:
return fn(*args, **kwargs)
finally:
sem.release(tid)
return inner
return decorator
# # Allow up to 3 workers to run this task concurrently. If the task is
# # locked, retry up to 2 times with a delay of 60s.
# @huey.task(retries=2, retry_delay=60)
# @lock_task_semaphore('path/to/db.sqlite3', 'semaphore_locks', 'my-lock', 3)
# def my_task():
# ...
# """
# sem = SqliteSemaphore(db_path, table_name, lock_name, value, timeout)
# def decorator(fn):
# @wraps(fn)
# def inner(*args, **kwargs):
# tid = sem.acquire()
# if tid is None:
# raise TaskLockedException(f'unable to acquire lock {lock_name}')
# try:
# return fn(*args, **kwargs)
# finally:
# sem.release(tid)
# return inner
# return decorator
@ -8,7 +8,7 @@ from django_huey import db_task, task
from huey_monitor.models import TaskModel
from huey_monitor.tqdm import ProcessInfo

from .supervisor_util import get_or_create_supervisord_process
from .supervisord_util import get_or_create_supervisord_process

# @db_task(queue="commands", context=True, schedule=1)
# def scheduler_tick():
@ -115,6 +115,8 @@ dependencies = [
"abx-plugin-mercury>=2024.10.28",
"abx-plugin-htmltotext>=2024.10.28",
"python-statemachine>=2.3.6",
"click>=8.1.7",
"rich-click>=1.8.4",
]

[project.optional-dependencies]
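click and rich-click are the two new runtime dependencies added above. rich-click is designed as a drop-in wrapper around click, so in most modules adopting it only means changing the import. A minimal illustration, not ArchiveBox's actual CLI code:

import rich_click as click        # same decorators and API as plain click, but rich-formatted --help output

@click.command()
@click.option('--count', default=1, show_default=True, help='Number of greetings to print')
def hello(count: int):
    for _ in range(count):
        click.echo('Hello from a rich-click command!')

if __name__ == '__main__':
    hello()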
18 uv.lock
@ -658,6 +658,7 @@ dependencies = [
{ name = "atomicwrites", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "base32-crockford", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "channels", extra = ["daphne"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "croniter", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "dateparser", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "django", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@ -688,6 +689,7 @@ dependencies = [
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "rich-argparse", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "rich-click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "sonic-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "supervisor", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@ -784,6 +786,7 @@ requires-dist = [
{ name = "atomicwrites", specifier = "==1.4.1" },
{ name = "base32-crockford", specifier = "==0.3.0" },
{ name = "channels", extras = ["daphne"], specifier = ">=4.1.0" },
{ name = "click", specifier = ">=8.1.7" },
{ name = "croniter", specifier = ">=3.0.3" },
{ name = "dateparser", specifier = ">=1.2.0" },
{ name = "django", specifier = ">=5.1.1,<6.0" },
@ -821,6 +824,7 @@ requires-dist = [
{ name = "requests-tracker", marker = "extra == 'debug'", specifier = ">=0.3.3" },
{ name = "rich", specifier = ">=13.8.0" },
{ name = "rich-argparse", specifier = ">=1.5.2" },
{ name = "rich-click", specifier = ">=1.8.4" },
{ name = "setuptools", specifier = ">=74.1.0" },
{ name = "sonic-client", specifier = ">=1.0.0" },
{ name = "supervisor", specifier = ">=4.2.5" },
@ -2806,6 +2810,20 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/25/45/54b95bb72bb17c27a7252bee5034955020b5869a33918b660ffc29cbf608/rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7", size = 20072 },
]

[[package]]
name = "rich-click"
version = "1.8.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/fc/f4/e48dc2850662526a26fb0961aacb0162c6feab934312b109b748ae4efee2/rich_click-1.8.4.tar.gz", hash = "sha256:0f49471f04439269d0e66a6f43120f52d11d594869a2a0be600cfb12eb0616b9", size = 38247 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/84/f3/72f93d8494ee641bde76bfe1208cf4abc44c6f9448673762f6077bc162d6/rich_click-1.8.4-py3-none-any.whl", hash = "sha256:2d2841b3cebe610d5682baa1194beaf78ab00c4fa31931533261b5eba2ee80b7", size = 35071 },
]

[[package]]
name = "ruff"
version = "0.7.4"