Mirror of https://github.com/ArchiveBox/ArchiveBox
Synced 2024-11-21 19:53:06 +00:00

Commit 328eb98a38 (parent 569081a9eb): move main funcs into cli files and switch to using click for CLI
35 changed files with 1885 additions and 2296 deletions
@@ -51,6 +51,7 @@ from .pkgs import load_vendored_pkgs # noqa
 load_vendored_pkgs()
 # print('DONE LOADING VENDORED LIBRARIES')

+# print('LOADING ABX PLUGIN SPECIFICATIONS')
 # Load ABX Plugin Specifications + Default Implementations
 import abx  # noqa
 import abx_spec_archivebox  # noqa

@@ -74,7 +75,7 @@ abx.pm.register(abx_spec_searchbackend.PLUGIN_SPEC())
 # Cast to ArchiveBoxPluginSpec to enable static type checking of pm.hook.call() methods
 abx.pm = cast(abx.ABXPluginManager[abx_spec_archivebox.ArchiveBoxPluginSpec], abx.pm)
 pm = abx.pm
-
+# print('DONE LOADING ABX PLUGIN SPECIFICATIONS')

 # Load all pip-installed ABX-compatible plugins
 ABX_ECOSYSTEM_PLUGINS = abx.get_pip_installed_plugins(group='abx')

@@ -94,7 +95,9 @@ USER_PLUGINS = abx.find_plugins_in_dir(Path(os.getcwd()) / 'user_plugins')

 # Import all plugins and register them with ABX Plugin Manager
 ALL_PLUGINS = {**ABX_ECOSYSTEM_PLUGINS, **ARCHIVEBOX_BUILTIN_PLUGINS, **USER_PLUGINS}
+# print('LOADING ALL PLUGINS')
 LOADED_PLUGINS = abx.load_plugins(ALL_PLUGINS)
+# print('DONE LOADING ALL PLUGINS')

 # Setup basic config, constants, paths, and version
 from .config.constants import CONSTANTS  # noqa
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""This is the main entry point for the ArchiveBox CLI."""
+"""This is the entrypoint for python -m archivebox ..."""
 __package__ = 'archivebox'

 import archivebox  # noqa # make sure monkey patches are applied before anything else

@@ -15,5 +15,4 @@ ASCII_LOGO_MINI = r"""
 /_/ \_\_| \___|_| |_|_| \_/ \___|____/ \___/_/\_\
 """

-if __name__ == '__main__':
 main(args=sys.argv[1:], stdin=sys.stdin)
@@ -6,13 +6,6 @@ from enum import Enum

 from ninja import Router, Schema

-from archivebox.main import (
-    add,
-    remove,
-    update,
-    list_all,
-    schedule,
-)
 from archivebox.misc.util import ansi_to_html
 from archivebox.config.common import ARCHIVING_CONFIG

@@ -60,13 +53,11 @@ class AddCommandSchema(Schema):
     urls: List[str]
     tag: str = ""
     depth: int = 0
-    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
-    update_all: bool = False
-    index_only: bool = False
-    overwrite: bool = False
-    init: bool = False
-    extractors: str = ""
     parser: str = "auto"
+    extract: str = ""
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
+    overwrite: bool = False
+    index_only: bool = False


 class UpdateCommandSchema(Schema):
     resume: Optional[float] = 0
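With the reordered schema above, API callers now send extract where they previously sent extractors, and the update_all/init fields are gone. A minimal sketch of a request body matching the new AddCommandSchema (the host, port, and /api/v1/cli prefix are assumptions about how this router is mounted, and auth is omitted):

    import json
    import urllib.request

    payload = {
        "urls": ["https://example.com"],
        "tag": "docs,example",
        "depth": 0,
        "parser": "auto",
        "extract": "title,favicon,screenshot",   # renamed from "extractors" in this commit
        "update": False,
        "overwrite": False,
        "index_only": False,
    }
    req = urllib.request.Request(
        "http://127.0.0.1:8000/api/v1/cli/add",   # assumed mount point for this router
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # urllib.request.urlopen(req)   # requires a running server and valid API credentials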
@@ -93,7 +84,7 @@ class ScheduleCommandSchema(Schema):
 class ListCommandSchema(Schema):
     filter_patterns: Optional[List[str]] = ['https://example.com']
     filter_type: str = FilterTypeChoices.substring
-    status: Optional[StatusChoices] = StatusChoices.indexed
+    status: StatusChoices = StatusChoices.indexed
     after: Optional[float] = 0
     before: Optional[float] = 999999999999999
     sort: str = 'bookmarked_at'
@@ -115,16 +106,16 @@ class RemoveCommandSchema(Schema):

 @router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
 def cli_add(request, args: AddCommandSchema):
+    from archivebox.cli.archivebox_add import add

     result = add(
         urls=args.urls,
         tag=args.tag,
         depth=args.depth,
         update=args.update,
-        update_all=args.update_all,
         index_only=args.index_only,
         overwrite=args.overwrite,
-        init=args.init,
-        extractors=args.extractors,
+        extract=args.extract,
         parser=args.parser,
     )

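Because each endpoint now defers to the matching archivebox.cli.archivebox_* function, the router can also be exercised in isolation with django-ninja's test client. A rough sketch, assuming Django settings are configured and an ArchiveBox data dir is already initialized:

    from ninja.testing import TestClient

    client = TestClient(router)   # the Router defined in this module
    response = client.post("/add", json={"urls": ["https://example.com"], "extract": "title"})
    print(response.status_code, response.json())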
@@ -139,6 +130,8 @@ def cli_add(request, args: AddCommandSchema):

 @router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
 def cli_update(request, args: UpdateCommandSchema):
+    from archivebox.cli.archivebox_update import update
+
     result = update(
         resume=args.resume,
         only_new=args.only_new,
@@ -162,6 +155,8 @@ def cli_update(request, args: UpdateCommandSchema):

 @router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
 def cli_schedule(request, args: ScheduleCommandSchema):
+    from archivebox.cli.archivebox_schedule import schedule
+
     result = schedule(
         import_path=args.import_path,
         add=args.add,
@@ -184,9 +179,11 @@ def cli_schedule(request, args: ScheduleCommandSchema):


-@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns] (use this endpoint with ?filter_type=search to search for snapshots)')
-def cli_list(request, args: ListCommandSchema):
-    result = list_all(
+@router.post("/search", response=CLICommandResponseSchema, summary='archivebox search [args] [filter_patterns]')
+def cli_search(request, args: ListCommandSchema):
+    from archivebox.cli.archivebox_search import search
+
+    result = search(
         filter_patterns=args.filter_patterns,
         filter_type=args.filter_type,
         status=args.status,
@@ -221,6 +218,8 @@ def cli_list(request, args: ListCommandSchema):

 @router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
 def cli_remove(request, args: RemoveCommandSchema):
+    from archivebox.cli.archivebox_remove import remove
+
     result = remove(
         yes=True, # no way to interactively ask for confirmation via API, so we force yes
         delete=args.delete,
@ -1,264 +1,117 @@
|
||||||
__package__ = 'archivebox.cli'
|
__package__ = 'archivebox.cli'
|
||||||
__command__ = 'archivebox'
|
__command__ = 'archivebox'
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import argparse
|
|
||||||
import threading
|
|
||||||
|
|
||||||
from time import sleep
|
|
||||||
from collections.abc import Mapping
|
|
||||||
|
|
||||||
from rich import print
|
|
||||||
|
|
||||||
from typing import Optional, List, IO, Union, Iterable
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from importlib import import_module
|
from importlib import import_module
|
||||||
|
|
||||||
BUILTIN_LIST = list
|
import rich_click as click
|
||||||
|
|
||||||
CLI_DIR = Path(__file__).resolve().parent
|
|
||||||
|
|
||||||
# rewrite setup -> install for backwards compatibility
|
|
||||||
if len(sys.argv) > 1 and sys.argv[1] == 'setup':
|
|
||||||
from rich import print
|
from rich import print
|
||||||
print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead')
|
|
||||||
sys.argv[1] = 'install'
|
from archivebox.config.version import VERSION
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if '--debug' in sys.argv:
|
if '--debug' in sys.argv:
|
||||||
os.environ['DEBUG'] = 'True'
|
os.environ['DEBUG'] = 'True'
|
||||||
sys.argv.remove('--debug')
|
sys.argv.remove('--debug')
|
||||||
|
|
||||||
|
|
||||||
# def list_subcommands() -> Dict[str, str]:
|
class ArchiveBoxGroup(click.Group):
|
||||||
# """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
|
"""lazy loading click group for archivebox commands"""
|
||||||
# COMMANDS = []
|
meta_commands = {
|
||||||
# for filename in os.listdir(CLI_DIR):
|
'help': 'archivebox.cli.archivebox_help.main',
|
||||||
# if is_cli_module(filename):
|
'version': 'archivebox.cli.archivebox_version.main',
|
||||||
# subcommand = filename.replace('archivebox_', '').replace('.py', '')
|
}
|
||||||
# module = import_module('.archivebox_{}'.format(subcommand), __package__)
|
setup_commands = {
|
||||||
# assert is_valid_cli_module(module, subcommand)
|
'init': 'archivebox.cli.archivebox_init.main',
|
||||||
# COMMANDS.append((subcommand, module.main.__doc__))
|
'install': 'archivebox.cli.archivebox_install.main',
|
||||||
# globals()[subcommand] = module.main
|
}
|
||||||
# display_order = lambda cmd: (
|
archive_commands = {
|
||||||
# display_first.index(cmd[0])
|
'add': 'archivebox.cli.archivebox_add.main',
|
||||||
# if cmd[0] in display_first else
|
'remove': 'archivebox.cli.archivebox_remove.main',
|
||||||
# 100 + len(cmd[0])
|
'update': 'archivebox.cli.archivebox_update.main',
|
||||||
# )
|
'search': 'archivebox.cli.archivebox_search.main',
|
||||||
# return dict(sorted(COMMANDS, key=display_order))
|
'status': 'archivebox.cli.archivebox_status.main',
|
||||||
|
'config': 'archivebox.cli.archivebox_config.main',
|
||||||
# just define it statically, it's much faster:
|
'schedule': 'archivebox.cli.archivebox_schedule.main',
|
||||||
SUBCOMMAND_MODULES = {
|
'server': 'archivebox.cli.archivebox_server.main',
|
||||||
'help': 'archivebox_help',
|
'shell': 'archivebox.cli.archivebox_shell.main',
|
||||||
'version': 'archivebox_version' ,
|
'manage': 'archivebox.cli.archivebox_manage.main',
|
||||||
|
}
|
||||||
'init': 'archivebox_init',
|
all_subcommands = {
|
||||||
'install': 'archivebox_install',
|
**meta_commands,
|
||||||
##############################################
|
**setup_commands,
|
||||||
'config': 'archivebox_config',
|
**archive_commands,
|
||||||
'add': 'archivebox_add',
|
}
|
||||||
'remove': 'archivebox_remove',
|
renamed_commands = {
|
||||||
'update': 'archivebox_update',
|
'setup': 'install',
|
||||||
'list': 'archivebox_list',
|
'list': 'search',
|
||||||
'status': 'archivebox_status',
|
'import': 'add',
|
||||||
|
'archive': 'add',
|
||||||
'schedule': 'archivebox_schedule',
|
'export': 'search',
|
||||||
'server': 'archivebox_server',
|
|
||||||
'shell': 'archivebox_shell',
|
|
||||||
'manage': 'archivebox_manage',
|
|
||||||
|
|
||||||
# 'oneshot': 'archivebox_oneshot',
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# every imported command module must have these properties in order to be valid
|
|
||||||
required_attrs = ('__package__', '__command__', 'main')
|
|
||||||
|
|
||||||
# basic checks to make sure imported files are valid subcommands
|
def get_command(self, ctx, cmd_name):
|
||||||
is_cli_module = lambda fname: fname.startswith('archivebox_') and fname.endswith('.py')
|
# handle renamed commands
|
||||||
is_valid_cli_module = lambda module, subcommand: (
|
if cmd_name in self.renamed_commands:
|
||||||
all(hasattr(module, attr) for attr in required_attrs)
|
new_name = self.renamed_commands[cmd_name]
|
||||||
and module.__command__.split(' ')[-1] == subcommand
|
print(f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`')
|
||||||
)
|
cmd_name = new_name
|
||||||
|
ctx.invoked_subcommand = cmd_name
|
||||||
|
|
||||||
class LazySubcommands(Mapping):
|
# handle lazy loading of commands
|
||||||
def keys(self):
|
if cmd_name in self.all_subcommands:
|
||||||
return SUBCOMMAND_MODULES.keys()
|
return self._lazy_load(cmd_name)
|
||||||
|
|
||||||
def values(self):
|
# fall-back to using click's default command lookup
|
||||||
return [self[key] for key in self.keys()]
|
return super().get_command(ctx, cmd_name)
|
||||||
|
|
||||||
def items(self):
|
@classmethod
|
||||||
return [(key, self[key]) for key in self.keys()]
|
def _lazy_load(cls, cmd_name):
|
||||||
|
import_path = cls.all_subcommands[cmd_name]
|
||||||
|
modname, funcname = import_path.rsplit('.', 1)
|
||||||
|
|
||||||
def __getitem__(self, key):
|
# print(f'LAZY LOADING {import_path}')
|
||||||
module = import_module(f'.{SUBCOMMAND_MODULES[key]}', __package__)
|
mod = import_module(modname)
|
||||||
assert is_valid_cli_module(module, key)
|
func = getattr(mod, funcname)
|
||||||
return module.main
|
|
||||||
|
|
||||||
def __iter__(self):
|
if not hasattr(func, '__doc__'):
|
||||||
return iter(SUBCOMMAND_MODULES.keys())
|
raise ValueError(f'lazy loading of {import_path} failed - no docstring found on method')
|
||||||
|
|
||||||
def __len__(self):
|
# if not isinstance(cmd, click.BaseCommand):
|
||||||
return len(SUBCOMMAND_MODULES)
|
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
|
||||||
|
|
||||||
CLI_SUBCOMMANDS = LazySubcommands()
|
return func
|
||||||
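The new ArchiveBoxGroup replaces the old LazySubcommands mapping: each subcommand is declared as a dotted import path and only imported when it is actually invoked. A stripped-down sketch of the same idea using plain click (the real class additionally handles renamed commands, uses rich_click, and covers the full command table):

    import click
    from importlib import import_module

    class LazyGroup(click.Group):
        # map each subcommand name to the dotted path of its click entrypoint
        subcommands = {
            'version': 'archivebox.cli.archivebox_version.main',
            'add':     'archivebox.cli.archivebox_add.main',
        }

        def get_command(self, ctx, cmd_name):
            if cmd_name in self.subcommands:
                modname, funcname = self.subcommands[cmd_name].rsplit('.', 1)
                return getattr(import_module(modname), funcname)   # import only on first use
            return super().get_command(ctx, cmd_name)

    @click.group(cls=LazyGroup)
    def cli():
        """ArchiveBox: The self-hosted internet archive"""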
|
|
||||||
|
|
||||||
# these common commands will appear sorted before any others for ease-of-use
|
@click.group(cls=ArchiveBoxGroup, invoke_without_command=True)
|
||||||
meta_cmds = ('help', 'version') # dont require valid data folder at all
|
@click.option('--help', '-h', is_flag=True, help='Show help')
|
||||||
setup_cmds = ('init', 'setup', 'install') # require valid data folder, but dont require DB present in it yet
|
@click.version_option(version=VERSION, package_name='archivebox', message='%(version)s')
|
||||||
archive_cmds = ('add', 'remove', 'update', 'list', 'status', 'schedule', 'server', 'shell', 'manage') # require valid data folder + existing db present
|
@click.pass_context
|
||||||
fake_db = ("oneshot",) # use fake in-memory db
|
def cli(ctx, help=False):
|
||||||
|
"""ArchiveBox: The self-hosted internet archive"""
|
||||||
|
|
||||||
display_first = (*meta_cmds, *setup_cmds, *archive_cmds)
|
if help or ctx.invoked_subcommand is None:
|
||||||
|
ctx.invoke(ctx.command.get_command(ctx, 'help'))
|
||||||
|
|
||||||
|
if ctx.invoked_subcommand in ArchiveBoxGroup.archive_commands:
|
||||||
IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler') # threads we dont have to wait for before exiting
|
# print('SETUP DJANGO AND CHECK DATA FOLDER')
|
||||||
|
|
||||||
|
|
||||||
def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int:
|
|
||||||
"""
|
|
||||||
Block until the specified threads exit. e.g. pass thread_names=('default_hook_handler',) to wait for webhooks.
|
|
||||||
Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes.
|
|
||||||
"""
|
|
||||||
|
|
||||||
wait_for_all: bool = thread_names == ()
|
|
||||||
|
|
||||||
thread_matches = lambda thread, ptns: any(ptn in repr(thread) for ptn in ptns)
|
|
||||||
|
|
||||||
should_wait = lambda thread: (
|
|
||||||
not thread_matches(thread, ignore_names)
|
|
||||||
and (wait_for_all or thread_matches(thread, thread_names)))
|
|
||||||
|
|
||||||
for tries in range(timeout):
|
|
||||||
all_threads = [*threading.enumerate()]
|
|
||||||
blocking_threads = [*filter(should_wait, all_threads)]
|
|
||||||
threads_summary = ', '.join(repr(t) for t in blocking_threads)
|
|
||||||
if blocking_threads:
|
|
||||||
sleep(1)
|
|
||||||
if tries == 5: # only show stderr message if we need to wait more than 5s
|
|
||||||
print(
|
|
||||||
f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...',
|
|
||||||
threads_summary,
|
|
||||||
file=sys.stderr,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
return tries
|
|
||||||
|
|
||||||
raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def run_subcommand(subcommand: str,
|
|
||||||
subcommand_args: List[str] | None = None,
|
|
||||||
stdin: Optional[IO]=None,
|
|
||||||
pwd: Union[Path, str, None]=None) -> None:
|
|
||||||
"""Run a given ArchiveBox subcommand with the given list of args"""
|
|
||||||
|
|
||||||
subcommand_args = subcommand_args or []
|
|
||||||
|
|
||||||
from archivebox.misc.checks import check_migrations
|
|
||||||
from archivebox.config.django import setup_django
|
from archivebox.config.django import setup_django
|
||||||
|
from archivebox.misc.checks import check_data_folder
|
||||||
|
setup_django()
|
||||||
|
check_data_folder()
|
||||||
|
|
||||||
# print('DATA_DIR is', DATA_DIR)
|
def main(args=None, prog_name=None):
|
||||||
# print('pwd is', os.getcwd())
|
# show `docker run archivebox xyz` in help messages if running in docker
|
||||||
|
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||||
cmd_requires_db = (subcommand in archive_cmds)
|
prog_name = prog_name or ('docker compose run archivebox' if IN_DOCKER else 'archivebox')
|
||||||
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
|
|
||||||
|
|
||||||
check_db = cmd_requires_db and not init_pending
|
|
||||||
|
|
||||||
setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)
|
|
||||||
|
|
||||||
for ignore_pattern in ('help', '-h', '--help', 'version', '--version'):
|
|
||||||
if ignore_pattern in sys.argv[:4]:
|
|
||||||
cmd_requires_db = False
|
|
||||||
break
|
|
||||||
|
|
||||||
if subcommand in archive_cmds:
|
|
||||||
if cmd_requires_db:
|
|
||||||
check_migrations()
|
|
||||||
|
|
||||||
module = import_module('.archivebox_{}'.format(subcommand), __package__)
|
|
||||||
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore
|
|
||||||
|
|
||||||
# wait for webhooks, signals, and other background jobs to finish before exit
|
|
||||||
wait_for_bg_threads_to_exit(timeout=60)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class NotProvided:
|
|
||||||
def __len__(self):
|
|
||||||
return 0
|
|
||||||
def __bool__(self):
|
|
||||||
return False
|
|
||||||
def __repr__(self):
|
|
||||||
return '<not provided>'
|
|
||||||
|
|
||||||
Omitted = Union[None, NotProvided]
|
|
||||||
|
|
||||||
OMITTED = NotProvided()
|
|
||||||
|
|
||||||
|
|
||||||
def main(args: List[str] | Omitted=OMITTED, stdin: IO | Omitted=OMITTED, pwd: str | None=None) -> None:
|
|
||||||
# print('STARTING CLI MAIN ENTRYPOINT')
|
|
||||||
|
|
||||||
args = sys.argv[1:] if args is OMITTED else args
|
|
||||||
stdin = sys.stdin if stdin is OMITTED else stdin
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
prog=__command__,
|
|
||||||
description='ArchiveBox: The self-hosted internet archive',
|
|
||||||
add_help=False,
|
|
||||||
)
|
|
||||||
group = parser.add_mutually_exclusive_group()
|
|
||||||
group.add_argument(
|
|
||||||
'--help', '-h',
|
|
||||||
action='store_true',
|
|
||||||
help=CLI_SUBCOMMANDS['help'].__doc__,
|
|
||||||
)
|
|
||||||
group.add_argument(
|
|
||||||
'--version',
|
|
||||||
action='store_true',
|
|
||||||
help=CLI_SUBCOMMANDS['version'].__doc__,
|
|
||||||
)
|
|
||||||
group.add_argument(
|
|
||||||
"subcommand",
|
|
||||||
type=str,
|
|
||||||
help= "The name of the subcommand to run",
|
|
||||||
nargs='?',
|
|
||||||
choices=CLI_SUBCOMMANDS.keys(),
|
|
||||||
default=None,
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"subcommand_args",
|
|
||||||
help="Arguments for the subcommand",
|
|
||||||
nargs=argparse.REMAINDER,
|
|
||||||
)
|
|
||||||
command = parser.parse_args(args or ())
|
|
||||||
|
|
||||||
if command.version:
|
|
||||||
command.subcommand = 'version'
|
|
||||||
elif command.help or command.subcommand is None:
|
|
||||||
command.subcommand = 'help'
|
|
||||||
|
|
||||||
if command.subcommand not in ('version',):
|
|
||||||
from archivebox.misc.logging_util import log_cli_command
|
|
||||||
|
|
||||||
log_cli_command(
|
|
||||||
subcommand=command.subcommand,
|
|
||||||
subcommand_args=command.subcommand_args,
|
|
||||||
stdin=stdin or None,
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
run_subcommand(
|
cli(args=args, prog_name=prog_name)
|
||||||
subcommand=command.subcommand,
|
|
||||||
subcommand_args=command.subcommand_args,
|
|
||||||
stdin=stdin or None,
|
|
||||||
)
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print('\n\n[red][X] Got CTRL+C. Exiting...[/red]')
|
print('\n\n[red][X] Got CTRL+C. Exiting...[/red]')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
|
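With ArchiveBoxGroup in place, main() only picks a prog name (prefixed for Docker when IN_DOCKER is set) and passes the argv list to cli(), which lazily resolves the requested subcommand. A small usage sketch (the URL is illustrative, and both calls assume you are inside an initialized data dir):

    from archivebox.cli import main

    # same as running `archivebox add --depth=1 https://example.com` from a shell
    main(args=['add', '--depth=1', 'https://example.com'], prog_name='archivebox')

    # renamed commands still resolve: 'list' prints a hint and dispatches to 'search'
    main(args=['list'])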
|
@@ -4,10 +4,10 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox add'

 import sys
-import argparse

-from typing import IO, TYPE_CHECKING
+from typing import TYPE_CHECKING

+import rich_click as click

 from django.utils import timezone
 from django.db.models import QuerySet

@@ -18,7 +18,6 @@ from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.config.django import setup_django
 from archivebox.config.permissions import USER, HOSTNAME
 from archivebox.misc.checks import check_data_folder
-from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr
 from archivebox.parsers import PARSERS
|
|
||||||
|
|
||||||
|
@ -29,22 +28,142 @@ if TYPE_CHECKING:
|
||||||
ORCHESTRATOR = None
|
ORCHESTRATOR = None
|
||||||
|
|
||||||
|
|
||||||
|
# OLD VERSION:
|
||||||
|
# def add(urls: Union[str, List[str]],
|
||||||
|
# tag: str='',
|
||||||
|
# depth: int=0,
|
||||||
|
# update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
|
||||||
|
# update_all: bool=False,
|
||||||
|
# index_only: bool=False,
|
||||||
|
# overwrite: bool=False,
|
||||||
|
# # duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
|
||||||
|
# init: bool=False,
|
||||||
|
# extractors: str="",
|
||||||
|
# parser: str="auto",
|
||||||
|
# created_by_id: int | None=None,
|
||||||
|
# out_dir: Path=DATA_DIR) -> List[Link]:
|
||||||
|
# """Add a new URL or list of URLs to your archive"""
|
||||||
|
|
||||||
|
# from core.models import Snapshot, Tag
|
||||||
|
# # from workers.supervisord_util import start_cli_workers, tail_worker_logs
|
||||||
|
# # from workers.tasks import bg_archive_link
|
||||||
|
|
||||||
|
|
||||||
|
# assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
|
||||||
|
|
||||||
|
# extractors = extractors.split(",") if extractors else []
|
||||||
|
|
||||||
|
# if init:
|
||||||
|
# run_subcommand('init', stdin=None, pwd=out_dir)
|
||||||
|
|
||||||
|
# # Load list of links from the existing index
|
||||||
|
# check_data_folder()
|
||||||
|
|
||||||
|
# # worker = start_cli_workers()
|
||||||
|
|
||||||
|
# new_links: List[Link] = []
|
||||||
|
# all_links = load_main_index(out_dir=out_dir)
|
||||||
|
|
||||||
|
# log_importing_started(urls=urls, depth=depth, index_only=index_only)
|
||||||
|
# if isinstance(urls, str):
|
||||||
|
# # save verbatim stdin to sources
|
||||||
|
# write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
|
||||||
|
# elif isinstance(urls, list):
|
||||||
|
# # save verbatim args to sources
|
||||||
|
# write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
|
||||||
|
|
||||||
|
|
||||||
|
# new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
|
||||||
|
|
||||||
|
# # If we're going one level deeper, download each link and look for more links
|
||||||
|
# new_links_depth = []
|
||||||
|
# if new_links and depth == 1:
|
||||||
|
# log_crawl_started(new_links)
|
||||||
|
# for new_link in new_links:
|
||||||
|
# try:
|
||||||
|
# downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
|
||||||
|
# new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
|
||||||
|
# except Exception as err:
|
||||||
|
# stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')
|
||||||
|
|
||||||
|
# imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
|
||||||
|
|
||||||
|
# new_links = dedupe_links(all_links, imported_links)
|
||||||
|
|
||||||
|
# write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
|
||||||
|
# all_links = load_main_index(out_dir=out_dir)
|
||||||
|
|
||||||
|
# tags = [
|
||||||
|
# Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
|
||||||
|
# for name in tag.split(',')
|
||||||
|
# if name.strip()
|
||||||
|
# ]
|
||||||
|
# if tags:
|
||||||
|
# for link in imported_links:
|
||||||
|
# snapshot = Snapshot.objects.get(url=link.url)
|
||||||
|
# snapshot.tags.add(*tags)
|
||||||
|
# snapshot.tags_str(nocache=True)
|
||||||
|
# snapshot.save()
|
||||||
|
# # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
|
||||||
|
|
||||||
|
# if index_only:
|
||||||
|
# # mock archive all the links using the fake index_only extractor method in order to update their state
|
||||||
|
# if overwrite:
|
||||||
|
# archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
|
||||||
|
# else:
|
||||||
|
# archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
|
||||||
|
# else:
|
||||||
|
# # fully run the archive extractor methods for each link
|
||||||
|
# archive_kwargs = {
|
||||||
|
# "out_dir": out_dir,
|
||||||
|
# "created_by_id": created_by_id,
|
||||||
|
# }
|
||||||
|
# if extractors:
|
||||||
|
# archive_kwargs["methods"] = extractors
|
||||||
|
|
||||||
|
# stderr()
|
||||||
|
|
||||||
|
# ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
|
||||||
|
# if update:
|
||||||
|
# stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
|
||||||
|
# archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
|
||||||
|
# elif update_all:
|
||||||
|
# stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
|
||||||
|
# archive_links(all_links, overwrite=overwrite, **archive_kwargs)
|
||||||
|
# elif overwrite:
|
||||||
|
# stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
|
||||||
|
# archive_links(imported_links, overwrite=True, **archive_kwargs)
|
||||||
|
# elif new_links:
|
||||||
|
# stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
|
||||||
|
# archive_links(new_links, overwrite=False, **archive_kwargs)
|
||||||
|
|
||||||
|
# # tail_worker_logs(worker['stdout_logfile'])
|
||||||
|
|
||||||
|
# # if CAN_UPGRADE:
|
||||||
|
# # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||||
|
|
||||||
|
# return new_links
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
 def add(urls: str | list[str],
-        tag: str='',
         depth: int=0,
-        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
-        update_all: bool=False,
-        index_only: bool=False,
-        overwrite: bool=False,
-        extractors: str="",
+        tag: str='',
         parser: str="auto",
+        extract: str="",
         persona: str='Default',
+        overwrite: bool=False,
+        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
+        index_only: bool=False,
         bg: bool=False,
         created_by_id: int | None=None) -> QuerySet['Snapshot']:
     """Add a new URL or list of URLs to your archive"""

     global ORCHESTRATOR

+    depth = int(depth)
+
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

     # 0. setup abx, django, check_data_folder
|
@ -56,7 +175,6 @@ def add(urls: str | list[str],
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||||
|
|
||||||
# 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
|
# 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
|
||||||
|
@ -72,7 +190,7 @@ def add(urls: str | list[str],
|
||||||
'ONLY_NEW': not update,
|
'ONLY_NEW': not update,
|
||||||
'INDEX_ONLY': index_only,
|
'INDEX_ONLY': index_only,
|
||||||
'OVERWRITE': overwrite,
|
'OVERWRITE': overwrite,
|
||||||
'EXTRACTORS': extractors,
|
'EXTRACTORS': extract,
|
||||||
'DEFAULT_PERSONA': persona or 'Default',
|
'DEFAULT_PERSONA': persona or 'Default',
|
||||||
})
|
})
|
||||||
# 3. create a new Crawl pointing to the Seed
|
# 3. create a new Crawl pointing to the Seed
|
||||||
|
@ -91,118 +209,23 @@ def add(urls: str | list[str],
|
||||||
return crawl.snapshot_set.all()
|
return crawl.snapshot_set.all()
|
||||||
|
|
||||||
|
|
||||||
def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=None) -> None:
|
@click.command()
|
||||||
|
@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
|
||||||
|
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
|
||||||
|
@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
|
||||||
|
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
|
||||||
|
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
|
||||||
|
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
|
||||||
|
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
|
||||||
|
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
|
||||||
|
# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
|
||||||
|
@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
|
||||||
|
@click.argument('urls', nargs=-1, type=click.Path())
|
||||||
|
def main(**kwargs):
|
||||||
"""Add a new URL or list of URLs to your archive"""
|
"""Add a new URL or list of URLs to your archive"""
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
prog=__command__,
|
|
||||||
description=add.__doc__,
|
|
||||||
add_help=True,
|
|
||||||
formatter_class=SmartFormatter,
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--tag', '-t',
|
|
||||||
type=str,
|
|
||||||
default='',
|
|
||||||
help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--update', #'-u',
|
|
||||||
action='store_true',
|
|
||||||
default=not ARCHIVING_CONFIG.ONLY_NEW, # when ONLY_NEW=True we skip updating old links
|
|
||||||
help="Also retry previously skipped/failed links when adding new links",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--update-all', #'-n',
|
|
||||||
action='store_true',
|
|
||||||
default=False,
|
|
||||||
help="Also update ALL links in index when finished adding new links",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--index-only', #'-o',
|
|
||||||
action='store_true',
|
|
||||||
help="Add the links to the main index without archiving them",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'urls',
|
|
||||||
nargs='*',
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help=(
|
|
||||||
'URLs or paths to archive e.g.:\n'
|
|
||||||
' https://getpocket.com/users/USERNAME/feed/all\n'
|
|
||||||
' https://example.com/some/rss/feed.xml\n'
|
|
||||||
' https://example.com\n'
|
|
||||||
' ~/Downloads/firefox_bookmarks_export.html\n'
|
|
||||||
' ~/Desktop/sites_list.csv\n'
|
|
||||||
)
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--depth",
|
|
||||||
action="store",
|
|
||||||
default=0,
|
|
||||||
choices=[0, 1],
|
|
||||||
type=int,
|
|
||||||
help="Recursively archive all linked pages up to this many hops away"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--overwrite",
|
|
||||||
default=False,
|
|
||||||
action="store_true",
|
|
||||||
help="Re-archive URLs from scratch, overwriting any existing files"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--extract", '-e',
|
|
||||||
type=str,
|
|
||||||
help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
|
|
||||||
This does not take precedence over the configuration",
|
|
||||||
default=""
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--parser",
|
|
||||||
type=str,
|
|
||||||
help="Parser used to read inputted URLs.",
|
|
||||||
default="auto",
|
|
||||||
choices=["auto", *PARSERS.keys()],
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--persona",
|
|
||||||
type=str,
|
|
||||||
help="Name of accounts persona to use when archiving.",
|
|
||||||
default="Default",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--bg",
|
|
||||||
default=False,
|
|
||||||
action="store_true",
|
|
||||||
help="Enqueue a background worker to complete the crawl instead of running it immediately",
|
|
||||||
)
|
|
||||||
command = parser.parse_args(args or ())
|
|
||||||
urls = command.urls
|
|
||||||
|
|
||||||
stdin_urls = ''
|
add(**kwargs)
|
||||||
if not urls:
|
|
||||||
stdin_urls = accept_stdin(stdin)
|
|
||||||
|
|
||||||
if (stdin_urls and urls) or (not stdin and not urls):
|
|
||||||
stderr(
|
|
||||||
'[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
|
|
||||||
color='red',
|
|
||||||
)
|
|
||||||
raise SystemExit(2)
|
|
||||||
add(
|
|
||||||
urls=stdin_urls or urls,
|
|
||||||
depth=command.depth,
|
|
||||||
tag=command.tag,
|
|
||||||
update=command.update,
|
|
||||||
update_all=command.update_all,
|
|
||||||
index_only=command.index_only,
|
|
||||||
overwrite=command.overwrite,
|
|
||||||
extractors=command.extract,
|
|
||||||
parser=command.parser,
|
|
||||||
persona=command.persona,
|
|
||||||
bg=command.bg,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
main()
|
||||||
|
|
|
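Since main is now a click command rather than a hand-rolled argparse parser, it can be invoked programmatically with click's test runner. A minimal sketch (assumes an initialized collection, since add() sets up Django and writes to the data dir):

    from click.testing import CliRunner
    from archivebox.cli.archivebox_add import main

    runner = CliRunner()
    result = runner.invoke(main, ['--depth=0', '--tag=docs', '--index-only', 'https://example.com'])
    print(result.exit_code)
    print(result.output)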
@ -12,7 +12,130 @@ from typing import Optional, List, IO
|
||||||
from archivebox.misc.util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from archivebox.config import DATA_DIR
|
from archivebox.config import DATA_DIR
|
||||||
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
|
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
|
||||||
from ..main import config
|
|
||||||
|
|
||||||
|
|
||||||
|
# @enforce_types
|
||||||
|
def config(config_options_str: Optional[str]=None,
|
||||||
|
config_options: Optional[List[str]]=None,
|
||||||
|
get: bool=False,
|
||||||
|
set: bool=False,
|
||||||
|
search: bool=False,
|
||||||
|
reset: bool=False,
|
||||||
|
out_dir: Path=DATA_DIR) -> None:
|
||||||
|
"""Get and set your ArchiveBox project configuration values"""
|
||||||
|
|
||||||
|
from rich import print
|
||||||
|
|
||||||
|
check_data_folder()
|
||||||
|
if config_options and config_options_str:
|
||||||
|
stderr(
|
||||||
|
'[X] You should either pass config values as an arguments '
|
||||||
|
'or via stdin, but not both.\n',
|
||||||
|
color='red',
|
||||||
|
)
|
||||||
|
raise SystemExit(2)
|
||||||
|
elif config_options_str:
|
||||||
|
config_options = config_options_str.split('\n')
|
||||||
|
|
||||||
|
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
|
||||||
|
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||||
|
|
||||||
|
config_options = config_options or []
|
||||||
|
|
||||||
|
no_args = not (get or set or reset or config_options)
|
||||||
|
|
||||||
|
matching_config = {}
|
||||||
|
if search:
|
||||||
|
if config_options:
|
||||||
|
config_options = [get_real_name(key) for key in config_options]
|
||||||
|
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
|
||||||
|
for config_section in CONFIGS.values():
|
||||||
|
aliases = config_section.aliases
|
||||||
|
|
||||||
|
for search_key in config_options:
|
||||||
|
# search all aliases in the section
|
||||||
|
for alias_key, key in aliases.items():
|
||||||
|
if search_key.lower() in alias_key.lower():
|
||||||
|
matching_config[key] = config_section.model_dump()[key]
|
||||||
|
|
||||||
|
# search all keys and values in the section
|
||||||
|
for existing_key, value in config_section.model_dump().items():
|
||||||
|
if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower():
|
||||||
|
matching_config[existing_key] = value
|
||||||
|
|
||||||
|
print(printable_config(matching_config))
|
||||||
|
raise SystemExit(not matching_config)
|
||||||
|
elif get or no_args:
|
||||||
|
if config_options:
|
||||||
|
config_options = [get_real_name(key) for key in config_options]
|
||||||
|
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
|
||||||
|
failed_config = [key for key in config_options if key not in FLAT_CONFIG]
|
||||||
|
if failed_config:
|
||||||
|
stderr()
|
||||||
|
stderr('[X] These options failed to get', color='red')
|
||||||
|
stderr(' {}'.format('\n '.join(config_options)))
|
||||||
|
raise SystemExit(1)
|
||||||
|
else:
|
||||||
|
matching_config = FLAT_CONFIG
|
||||||
|
|
||||||
|
print(printable_config(matching_config))
|
||||||
|
raise SystemExit(not matching_config)
|
||||||
|
elif set:
|
||||||
|
new_config = {}
|
||||||
|
failed_options = []
|
||||||
|
for line in config_options:
|
||||||
|
if line.startswith('#') or not line.strip():
|
||||||
|
continue
|
||||||
|
if '=' not in line:
|
||||||
|
stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
|
||||||
|
stderr(f' {line}')
|
||||||
|
raise SystemExit(2)
|
||||||
|
|
||||||
|
raw_key, val = line.split('=', 1)
|
||||||
|
raw_key = raw_key.upper().strip()
|
||||||
|
key = get_real_name(raw_key)
|
||||||
|
if key != raw_key:
|
||||||
|
stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
|
||||||
|
|
||||||
|
if key in FLAT_CONFIG:
|
||||||
|
new_config[key] = val.strip()
|
||||||
|
else:
|
||||||
|
failed_options.append(line)
|
||||||
|
|
||||||
|
if new_config:
|
||||||
|
before = FLAT_CONFIG
|
||||||
|
matching_config = write_config_file(new_config)
|
||||||
|
after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
|
||||||
|
print(printable_config(matching_config))
|
||||||
|
|
||||||
|
side_effect_changes = {}
|
||||||
|
for key, val in after.items():
|
||||||
|
if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
|
||||||
|
side_effect_changes[key] = after[key]
|
||||||
|
# import ipdb; ipdb.set_trace()
|
||||||
|
|
||||||
|
if side_effect_changes:
|
||||||
|
stderr()
|
||||||
|
stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
|
||||||
|
print(' {}'.format(printable_config(side_effect_changes, prefix=' ')))
|
||||||
|
if failed_options:
|
||||||
|
stderr()
|
||||||
|
stderr('[X] These options failed to set (check for typos):', color='red')
|
||||||
|
stderr(' {}'.format('\n '.join(failed_options)))
|
||||||
|
raise SystemExit(1)
|
||||||
|
elif reset:
|
||||||
|
stderr('[X] This command is not implemented yet.', color='red')
|
||||||
|
stderr(' Please manually remove the relevant lines from your config file:')
|
||||||
|
raise SystemExit(2)
|
||||||
|
else:
|
||||||
|
stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
|
||||||
|
stderr(' archivebox config')
|
||||||
|
stderr(' archivebox config --get SOME_KEY')
|
||||||
|
stderr(' archivebox config --set SOME_KEY=SOME_VALUE')
|
||||||
|
raise SystemExit(2)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@docstring(config.__doc__)
|
@docstring(config.__doc__)
|
||||||
|
|
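Because the config logic now lives beside its command as a plain function, it can be driven directly as well as through `archivebox config`. A rough sketch (the module path is assumed to be archivebox/cli/archivebox_config.py to match the other archivebox_* files, and the key name is just an example):

    from archivebox.cli.archivebox_config import config   # assumed module path

    # each call behaves like the CLI command, printing results and ending with SystemExit
    config(get=True, config_options=['ONLY_NEW'])           # print one resolved value
    # config(set=True, config_options=['ONLY_NEW=False'])   # persist a change to ArchiveBox.conf
    # config()                                              # no args: print the whole flat config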
|
@ -1,32 +1,105 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
__package__ = 'archivebox.cli'
|
__package__ = 'archivebox.cli'
|
||||||
__command__ = 'archivebox help'
|
__command__ = 'archivebox help'
|
||||||
|
|
||||||
import sys
|
import os
|
||||||
import argparse
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, List, IO
|
|
||||||
|
|
||||||
from archivebox.misc.util import docstring
|
import click
|
||||||
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
|
from rich import print
|
||||||
from archivebox.config import DATA_DIR
|
from rich.panel import Panel
|
||||||
from ..main import help
|
|
||||||
|
|
||||||
|
|
||||||
@docstring(help.__doc__)
|
def help() -> None:
|
||||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
"""Print the ArchiveBox help message and usage"""
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
prog=__command__,
|
from archivebox.cli import ArchiveBoxGroup
|
||||||
description=help.__doc__,
|
from archivebox.config import CONSTANTS
|
||||||
add_help=True,
|
from archivebox.config.permissions import IN_DOCKER
|
||||||
formatter_class=SmartFormatter,
|
from archivebox.misc.logging_util import log_cli_command
|
||||||
|
|
||||||
|
log_cli_command('help', [], None, '.')
|
||||||
|
|
||||||
|
COMMANDS_HELP_TEXT = '\n '.join(
|
||||||
|
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||||
|
for cmd in ArchiveBoxGroup.meta_commands.keys()
|
||||||
|
) + '\n\n ' + '\n '.join(
|
||||||
|
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||||
|
for cmd in ArchiveBoxGroup.setup_commands.keys()
|
||||||
|
) + '\n\n ' + '\n '.join(
|
||||||
|
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||||
|
for cmd in ArchiveBoxGroup.archive_commands.keys()
|
||||||
)
|
)
|
||||||
parser.parse_args(args or ())
|
|
||||||
reject_stdin(__command__, stdin)
|
|
||||||
|
|
||||||
help(out_dir=Path(pwd) if pwd else DATA_DIR)
|
DOCKER_USAGE = '''
|
||||||
|
[dodger_blue3]Docker Usage:[/dodger_blue3]
|
||||||
|
[grey53]# using Docker Compose:[/grey53]
|
||||||
|
[blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||||
|
|
||||||
|
[grey53]# using Docker:[/grey53]
|
||||||
|
[blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||||
|
''' if IN_DOCKER else ''
|
||||||
|
DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
|
||||||
|
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
|
||||||
|
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''
|
||||||
|
|
||||||
|
print(f'''{DOCKER_USAGE}
|
||||||
|
[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
|
||||||
|
[dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||||
|
|
||||||
|
[deep_sky_blue4]Commands:[/deep_sky_blue4]
|
||||||
|
{COMMANDS_HELP_TEXT}
|
||||||
|
|
||||||
|
[deep_sky_blue4]Documentation:[/deep_sky_blue4]
|
||||||
|
[link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS}
|
||||||
|
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link]
|
||||||
|
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link]
|
||||||
|
''')
|
||||||
|
|
||||||
|
|
||||||
|
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir():
|
||||||
|
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path('~').expanduser()), '~')
|
||||||
|
EXAMPLE_USAGE = f'''
|
||||||
|
[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]
|
||||||
|
|
||||||
|
[violet]Hint:[/violet] [i]Common maintenance tasks:[/i]
|
||||||
|
[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# make sure database is up-to-date (safe to run multiple times)[/grey53]
|
||||||
|
[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# make sure plugins are up-to-date (wget, chrome, singlefile, etc.)[/grey53]
|
||||||
|
[dark_green]archivebox[/dark_green] [green]status[/green] [grey53]# get a health checkup report on your collection[/grey53]
|
||||||
|
[dark_green]archivebox[/dark_green] [green]update[/green] [grey53]# retry any previously failed or interrupted archiving tasks[/grey53]
|
||||||
|
|
||||||
|
[violet]Hint:[/violet] [i]More example usage:[/i]
|
||||||
|
[dark_green]archivebox[/dark_green] [green]add[/green] --depth=1 "https://example.com/some/page"
|
||||||
|
[dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title
|
||||||
|
[dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss"
|
||||||
|
[dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53]
|
||||||
|
'''
|
||||||
|
print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
|
||||||
|
else:
|
||||||
|
DATA_SETUP_HELP = '\n'
|
||||||
|
if IN_DOCKER:
|
||||||
|
DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
|
||||||
|
DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
|
||||||
|
DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
|
||||||
|
DATA_SETUP_HELP += ' 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
|
||||||
|
DATA_SETUP_HELP += f' 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n'
|
||||||
|
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
|
||||||
|
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n'
|
||||||
|
DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n'
|
||||||
|
DATA_SETUP_HELP += ' 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
|
||||||
|
DATA_SETUP_HELP += ' 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n'
|
||||||
|
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n'
|
||||||
|
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
|
||||||
|
DATA_SETUP_HELP += f' 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n'
|
||||||
|
print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
|
||||||
|
@click.option('--help', '-h', is_flag=True, help='Show help')
|
||||||
|
def main(**kwargs):
|
||||||
|
"""Print the ArchiveBox help message and usage"""
|
||||||
|
return help()
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
main()
|
||||||
|
|
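The help listing above is assembled at call time by lazily importing each subcommand and reading its __doc__, so it stays in sync with whatever modules exist under archivebox/cli/. The same docstring-harvesting idea in isolation (command names are the ones registered on ArchiveBoxGroup):

    from archivebox.cli import ArchiveBoxGroup

    for cmd in ArchiveBoxGroup.all_subcommands:
        func = ArchiveBoxGroup._lazy_load(cmd)    # imports archivebox.cli.archivebox_<cmd>.main
        print(f'{cmd:<20} {func.__doc__}')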
|
@ -5,13 +5,193 @@ __command__ = 'archivebox init'
|
||||||
|
|
||||||
import sys
|
import sys
|
import argparse
from pathlib import Path
from typing import Optional, List, IO

from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import init


def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Path=DATA_DIR) -> None:
    """Initialize a new ArchiveBox collection in the current directory"""

    from core.models import Snapshot
    from rich import print

    # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
    #     print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
    #     print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)

    is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_DATA_DIR)
    existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE)
    if is_empty and not existing_index:
        print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
        print('[green]----------------------------------------------------------------------[/green]')
    elif existing_index:
        # TODO: properly detect and print the existing version in current index as well
        print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]')
        print('[green]----------------------------------------------------------------------[/green]')
    else:
        if force:
            print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
            print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
        else:
            print(
                ("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
                 " You must run init in a completely empty directory, or an existing data folder.\n\n"
                 " [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n"
                 " then run 'archivebox init' to pick up where you left off.\n\n"
                 " (Always make sure your data folder is backed up first before updating ArchiveBox)"
                )
            )
            raise SystemExit(2)

    if existing_index:
        print('\n[green][*] Verifying archive folder structure...[/green]')
    else:
        print('\n[green][+] Building archive folder structure...[/green]')

    print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
    Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
    Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)

    print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')

    # create the .archivebox_id file with a unique ID for this collection
    from archivebox.config.paths import _get_collection_id
    _get_collection_id(CONSTANTS.DATA_DIR, force_create=True)

    # create the ArchiveBox.conf file
    write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})

    if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
        print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
    else:
        print('\n[green][+] Building main SQL index and running initial migrations...[/green]')

    for migration_line in apply_migrations(out_dir):
        sys.stdout.write(f' {migration_line}\n')

    assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
    print()
    print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')

    # from django.contrib.auth.models import User
    # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
    #     print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
    #     call_command("createsuperuser", interactive=True)

    print()
    print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')

    all_links = Snapshot.objects.none()
    pending_links: Dict[str, Link] = {}

    if existing_index:
        all_links = load_main_index(out_dir=out_dir, warn=False)
        print(f' √ Loaded {all_links.count()} links from existing main index.')

    if quick:
        print(' > Skipping full snapshot directory check (quick mode)')
    else:
        try:
            # Links in data folders that dont match their timestamp
            fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
            if fixed:
                print(f' [yellow]√ Fixed {len(fixed)} data directory locations that didn\'t match their link timestamps.[/yellow]')
            if cant_fix:
                print(f' [red]! Could not fix {len(cant_fix)} data directory locations due to conflicts with existing folders.[/red]')

            # Links in JSON index but not in main index
            orphaned_json_links = {
                link.url: link
                for link in parse_json_main_index(out_dir)
                if not all_links.filter(url=link.url).exists()
            }
            if orphaned_json_links:
                pending_links.update(orphaned_json_links)
                print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')

            # Links in data dir indexes but not in main index
            orphaned_data_dir_links = {
                link.url: link
                for link in parse_json_links_details(out_dir)
                if not all_links.filter(url=link.url).exists()
            }
            if orphaned_data_dir_links:
                pending_links.update(orphaned_data_dir_links)
                print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')

            # Links in invalid/duplicate data dirs
            invalid_folders = {
                folder: link
                for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
            }
            if invalid_folders:
                print(f' [red]! Skipped adding {len(invalid_folders)} invalid link data directories.[/red]')
                print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(DATA_DIR)} {link}' for folder, link in invalid_folders.items()))
                print()
                print(' [violet]Hint:[/violet] For more information about the link data directories that were skipped, run:')
                print(' archivebox status')
                print(' archivebox list --status=invalid')

        except (KeyboardInterrupt, SystemExit):
            print(file=sys.stderr)
            print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr)
            print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr)
            print(file=sys.stderr)
            print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
            print(' archivebox init --quick', file=sys.stderr)
            raise SystemExit(1)

    write_main_index(list(pending_links.values()), out_dir=out_dir)

    print('\n[green]----------------------------------------------------------------------[/green]')

    from django.contrib.auth.models import User

    if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
        print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]')
        User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)

    if existing_index:
        print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]')
    else:
        print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]')

    json_index = out_dir / CONSTANTS.JSON_INDEX_FILENAME
    html_index = out_dir / CONSTANTS.HTML_INDEX_FILENAME
    index_name = f"{date.today()}_index_old"
    if os.access(json_index, os.F_OK):
        json_index.rename(f"{index_name}.json")
    if os.access(html_index, os.F_OK):
        html_index.rename(f"{index_name}.html")

    CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
    CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
    CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)

    from archivebox.config.common import STORAGE_CONFIG
    STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
    STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)

    if install:
        run_subcommand('install', pwd=out_dir)

    if Snapshot.objects.count() < 25:     # hide the hints for experienced users
        print()
        print(' [violet]Hint:[/violet] To view your archive index, run:')
        print(' archivebox server  # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]')
        print()
        print(' To add new links, you can run:')
        print(" archivebox add < ~/some/path/to/list_of_links.txt")
        print()
        print(' For more usage and examples, run:')
        print(' archivebox help')


@docstring(init.__doc__)
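A quick usage sketch (not part of the commit itself): assuming this module ships as archivebox/cli/archivebox_init.py and the interpreter is started inside a data directory, the moved-in function can be driven directly from Python:

# hypothetical example; the module path is an assumption based on __package__/__command__
from pathlib import Path
from archivebox.cli.archivebox_init import init

# same effect as `archivebox init --quick`: skip the full snapshot directory check
init(quick=True, out_dir=Path.cwd())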
@ -3,6 +3,7 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox install'

import os
import sys
import argparse
from pathlib import Path
@ -11,11 +12,145 @@ from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import install


def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, binaries: Optional[List[str]]=None, dry_run: bool=False) -> None:
    """Automatically install all ArchiveBox dependencies and extras"""

    # if running as root:
    #     - run init to create index + lib dir
    #     - chown -R 911 DATA_DIR
    #     - install all binaries as root
    #     - chown -R 911 LIB_DIR
    # else:
    #     - run init to create index + lib dir as current user
    #     - install all binaries as current user
    #     - recommend user re-run with sudo if any deps need to be installed as root

    from rich import print

    from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
    from archivebox.config.paths import get_or_create_working_lib_dir

    if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
        run_subcommand('init', stdin=None, pwd=out_dir)  # must init full index because we need a db to store InstalledBinary entries in

    print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')

    # we never want the data dir to be owned by root, detect owner of existing owner of DATA_DIR to try and guess desired non-root UID
    if IS_ROOT:
        EUID = os.geteuid()

        # if we have sudo/root permissions, take advantage of them just while installing dependencies
        print()
        print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue] with [red]sudo[/red] only for dependencies that need it.[/yellow]')
        print(f' DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
        print()

    LIB_DIR = get_or_create_working_lib_dir()

    package_manager_names = ', '.join(
        f'[yellow]{binprovider.name}[/yellow]'
        for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
        if not binproviders or (binproviders and binprovider.name in binproviders)
    )
    print(f'[+] Setting up package managers {package_manager_names}...')
    for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
        if binproviders and binprovider.name not in binproviders:
            continue
        try:
            binprovider.setup()
        except Exception:
            # it's ok, installing binaries below will automatically set up package managers as needed
            # e.g. if user does not have npm available we cannot set it up here yet, but once npm Binary is installed
            # the next package that depends on npm will automatically call binprovider.setup() during its own install
            pass

    print()

    for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
        if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
            # obviously must already be installed if we are running
            continue

        if binaries and binary.name not in binaries:
            continue

        providers = ' [grey53]or[/grey53] '.join(
            provider.name for provider in binary.binproviders_supported
            if not binproviders or (binproviders and provider.name in binproviders)
        )
        if not providers:
            continue
        print(f'[+] Detecting / Installing [yellow]{binary.name.ljust(22)}[/yellow] using [red]{providers}[/red]...')
        try:
            with SudoPermission(uid=0, fallback=True):
                # print(binary.load_or_install(fresh=True).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}))
                if binproviders:
                    providers_supported_by_binary = [provider.name for provider in binary.binproviders_supported]
                    for binprovider_name in binproviders:
                        if binprovider_name not in providers_supported_by_binary:
                            continue
                        try:
                            if dry_run:
                                # always show install commands when doing a dry run
                                sys.stderr.write("\033[2;49;90m")  # grey53
                                result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
                                sys.stderr.write("\033[00m\n")  # reset
                            else:
                                loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
                                result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
                            if result and result['loaded_version']:
                                break
                        except Exception as e:
                            print(f'[red]:cross_mark: Failed to install {binary.name} using {binprovider_name} as user {ARCHIVEBOX_USER}: {e}[/red]')
                else:
                    if dry_run:
                        sys.stderr.write("\033[2;49;90m")  # grey53
                        binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
                        sys.stderr.write("\033[00m\n")  # reset
                    else:
                        loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
                        result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
            if IS_ROOT and LIB_DIR:
                with SudoPermission(uid=0):
                    if ARCHIVEBOX_USER == 0:
                        os.system(f'chmod -R 777 "{LIB_DIR.resolve()}"')
                    else:
                        os.system(f'chown -R {ARCHIVEBOX_USER} "{LIB_DIR.resolve()}"')
        except Exception as e:
            print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
            if binaries and len(binaries) == 1:
                # if we are only installing a single binary, raise the exception so the user can see what went wrong
                raise

    from django.contrib.auth import get_user_model
    User = get_user_model()

    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
        stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
        stderr(' archivebox manage createsuperuser')
        # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)

    print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)

    from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY

    extra_args = []
    if binproviders:
        extra_args.append(f'--binproviders={",".join(binproviders)}')
    if binaries:
        extra_args.append(f'--binaries={",".join(binaries)}')

    proc = run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version', *extra_args], capture_output=False, cwd=out_dir)
    raise SystemExit(proc.returncode)


@docstring(install.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=install.__doc__,
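For illustration only (module path assumed, not stated in the diff): a dry run that resolves just two binaries, mirroring `archivebox install --binaries=wget,git --dry-run`:

from archivebox.cli.archivebox_install import install  # assumed module path

# dry_run=True prints the would-be install commands instead of executing them
install(binaries=['wget', 'git'], dry_run=True)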
@ -1,139 +0,0 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox list'

import sys
import argparse
from pathlib import Path
from typing import Optional, List, IO

from archivebox.config import DATA_DIR
from archivebox.misc.util import docstring
from archivebox.misc.logging_util import SmartFormatter, reject_stdin, stderr
from ..main import list_all
from ..index import (
    LINK_FILTERS,
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,
    get_present_folders,
    get_valid_folders,
    get_invalid_folders,
    get_duplicate_folders,
    get_orphaned_folders,
    get_corrupted_folders,
    get_unrecognized_folders,
)


@docstring(list_all.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=list_all.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--csv', #'-c',
        type=str,
        help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
        default=None,
    )
    group.add_argument(
        '--json', #'-j',
        action='store_true',
        help="Print the output in JSON format with all columns included",
    )
    group.add_argument(
        '--html',
        action='store_true',
        help="Print the output in HTML format"
    )
    parser.add_argument(
        '--with-headers',
        action='store_true',
        help='Include the headers in the output document'
    )
    parser.add_argument(
        '--sort', #'-s',
        type=str,
        help="List the links sorted using the given key, e.g. timestamp or updated",
        default=None,
    )
    parser.add_argument(
        '--before', #'-b',
        type=float,
        help="List only links bookmarked before (less than) the given timestamp",
        default=None,
    )
    parser.add_argument(
        '--after', #'-a',
        type=float,
        help="List only links bookmarked after (greater than or equal to) the given timestamp",
        default=None,
    )
    parser.add_argument(
        '--status',
        type=str,
        choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
        default='indexed',
        help=(
            'List only links or data directories that have the given status\n'
            f' indexed {get_indexed_folders.__doc__} (the default)\n'
            f' archived {get_archived_folders.__doc__}\n'
            f' unarchived {get_unarchived_folders.__doc__}\n'
            '\n'
            f' present {get_present_folders.__doc__}\n'
            f' valid {get_valid_folders.__doc__}\n'
            f' invalid {get_invalid_folders.__doc__}\n'
            '\n'
            f' duplicate {get_duplicate_folders.__doc__}\n'
            f' orphaned {get_orphaned_folders.__doc__}\n'
            f' corrupted {get_corrupted_folders.__doc__}\n'
            f' unrecognized {get_unrecognized_folders.__doc__}\n'
        )
    )
    parser.add_argument(
        '--filter-type', '-t',
        type=str,
        choices=(*LINK_FILTERS.keys(), 'search'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )
    parser.add_argument(
        'filter_patterns',
        nargs='*',
        type=str,
        default=None,
        help='List only URLs matching these filter patterns'
    )
    command = parser.parse_args(args or ())
    reject_stdin(stdin)

    if command.with_headers and not (command.json or command.html or command.csv):
        stderr(
            '[X] --with-headers can only be used with --json, --html or --csv options\n',
            color='red',
        )
        raise SystemExit(2)

    matching_folders = list_all(
        filter_patterns=command.filter_patterns,
        filter_type=command.filter_type,
        status=command.status,
        after=command.after,
        before=command.before,
        sort=command.sort,
        csv=command.csv,
        json=command.json,
        html=command.html,
        with_headers=command.with_headers,
        out_dir=Path(pwd) if pwd else DATA_DIR,
    )
    raise SystemExit(not matching_folders)


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
@ -9,7 +9,27 @@ from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from ..main import manage


# @enforce_types
def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
    """Run an ArchiveBox Django management command"""

    check_data_folder()
    from django.core.management import execute_from_command_line

    if (args and "createsuperuser" in args) and (IN_DOCKER and not SHELL_CONFIG.IS_TTY):
        stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
        stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
        stderr('')

    # import ipdb; ipdb.set_trace()

    execute_from_command_line(['manage.py', *(args or ['help'])])


@docstring(manage.__doc__)
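A minimal sketch of the new entry point (module path assumed), equivalent to `archivebox manage check` run from inside a data directory:

from archivebox.cli.archivebox_manage import manage  # assumed module path

# args defaults to ['help'] when omitted; here we run Django's system check instead
manage(args=['check'])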
@ -1,73 +1,98 @@
# #!/usr/bin/env python3

################## DEPRECATED IN FAVOR OF abx-dl #####################
# https://github.com/ArchiveBox/abx-dl

# __package__ = 'archivebox.cli'
# __command__ = 'archivebox oneshot'

# import sys
# import argparse

# from pathlib import Path
# from typing import List, Optional, IO

# from archivebox.misc.util import docstring
# from archivebox.config import DATA_DIR
# from archivebox.misc.logging_util import SmartFormatter, accept_stdin, stderr

# @enforce_types
# def oneshot(url: str, extractors: str="", out_dir: Path=DATA_DIR, created_by_id: int | None=None) -> List[Link]:
#     """
#     Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
#     You can run this to archive single pages without needing to create a whole collection with archivebox init.
#     """
#     oneshot_link, _ = parse_links_memory([url])
#     if len(oneshot_link) > 1:
#         stderr(
#             '[X] You should pass a single url to the oneshot command',
#             color='red'
#         )
#         raise SystemExit(2)

#     methods = extractors.split(",") if extractors else ignore_methods(['title'])
#     archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, created_by_id=created_by_id)
#     return oneshot_link


# @docstring(oneshot.__doc__)
# def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
#     parser = argparse.ArgumentParser(
#         prog=__command__,
#         description=oneshot.__doc__,
#         add_help=True,
#         formatter_class=SmartFormatter,
#     )
#     parser.add_argument(
#         'url',
#         type=str,
#         default=None,
#         help=(
#             'URLs or paths to archive e.g.:\n'
#             ' https://getpocket.com/users/USERNAME/feed/all\n'
#             ' https://example.com/some/rss/feed.xml\n'
#             ' https://example.com\n'
#             ' ~/Downloads/firefox_bookmarks_export.html\n'
#             ' ~/Desktop/sites_list.csv\n'
#         )
#     )
#     parser.add_argument(
#         "--extract",
#         type=str,
#         help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
#              This does not take precedence over the configuration",
#         default=""
#     )
#     parser.add_argument(
#         '--out-dir',
#         type=str,
#         default=DATA_DIR,
#         help= "Path to save the single archive folder to, e.g. ./example.com_archive"
#     )
#     command = parser.parse_args(args or ())
#     stdin_url = None
#     url = command.url
#     if not url:
#         stdin_url = accept_stdin(stdin)

#     if (stdin_url and url) or (not stdin and not url):
#         stderr(
#             '[X] You must pass a URL/path to add via stdin or CLI arguments.\n',
#             color='red',
#         )
#         raise SystemExit(2)

#     oneshot(
#         url=stdin_url or url,
#         out_dir=Path(command.out_dir).resolve(),
#         extractors=command.extract,
#     )


# if __name__ == '__main__':
#     main(args=sys.argv[1:], stdin=sys.stdin)
@ -8,10 +8,93 @@ import argparse
from pathlib import Path
from typing import Optional, List, IO

from django.db.models import QuerySet

from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
from ..main import remove
from archivebox.index.schema import Link


def remove(filter_str: Optional[str]=None,
           filter_patterns: Optional[list[str]]=None,
           filter_type: str='exact',
           snapshots: Optional[QuerySet]=None,
           after: Optional[float]=None,
           before: Optional[float]=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: Path=DATA_DIR) -> list[Link]:
    """Remove the specified URLs from the archive"""

    check_data_folder()

    if snapshots is None:
        if filter_str and filter_patterns:
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin, but not both.\n',
                color='red',
            )
            raise SystemExit(2)
        elif not (filter_str or filter_patterns):
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin.',
                color='red',
            )
            stderr()
            hint(('To remove all urls you can run:',
                  'archivebox remove --filter-type=regex ".*"'))
            stderr()
            raise SystemExit(2)
        elif filter_str:
            filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]

    list_kwargs = {
        "filter_patterns": filter_patterns,
        "filter_type": filter_type,
        "after": after,
        "before": before,
    }
    if snapshots:
        list_kwargs["snapshots"] = snapshots

    log_list_started(filter_patterns, filter_type)
    timer = TimedProgress(360, prefix=' ')
    try:
        snapshots = list_links(**list_kwargs)
    finally:
        timer.end()

    if not snapshots.exists():
        log_removal_finished(0, 0)
        raise SystemExit(1)

    log_links = [link.as_link() for link in snapshots]
    log_list_finished(log_links)
    log_removal_started(log_links, yes=yes, delete=delete)

    timer = TimedProgress(360, prefix=' ')
    try:
        for snapshot in snapshots:
            if delete:
                shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
    finally:
        timer.end()

    to_remove = snapshots.count()

    from .search import flush_search_index

    flush_search_index(snapshots=snapshots)
    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
    all_snapshots = load_main_index(out_dir=out_dir)
    log_removal_finished(all_snapshots.count(), to_remove)

    return all_snapshots


@docstring(remove.__doc__)
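A hedged usage sketch (module path assumed): delete every snapshot whose URL contains a substring, including its data folder, without the interactive confirmation:

from archivebox.cli.archivebox_remove import remove  # assumed module path

# equivalent to: archivebox remove --filter-type=substring --delete --yes example.com
remove(filter_patterns=['example.com'], filter_type='substring', delete=True, yes=True)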
@ -11,7 +11,139 @@ from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import schedule
from archivebox.config.common import ARCHIVING_CONFIG


# @enforce_types
def schedule(add: bool=False,
             show: bool=False,
             clear: bool=False,
             foreground: bool=False,
             run_all: bool=False,
             quiet: bool=False,
             every: Optional[str]=None,
             tag: str='',
             depth: int=0,
             overwrite: bool=False,
             update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
             import_path: Optional[str]=None,
             out_dir: Path=DATA_DIR):
    """Set ArchiveBox to regularly import URLs at specific times using cron"""

    check_data_folder()
    from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
    from archivebox.config.permissions import USER

    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)

    if clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)

    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if every or add:
        every = every or 'day'
        quoted = lambda s: f'"{s}"' if (s and ' ' in str(s)) else str(s)
        cmd = [
            'cd',
            quoted(out_dir),
            '&&',
            quoted(ARCHIVEBOX_BINARY.load().abspath),
            *([
                'add',
                *(['--overwrite'] if overwrite else []),
                *(['--update'] if update else []),
                *([f'--tag={tag}'] if tag else []),
                f'--depth={depth}',
                f'"{import_path}"',
            ] if import_path else ['update']),
            '>>',
            quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
            '2>&1',
        ]
        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)

        if every in ('minute', 'hour', 'day', 'month', 'year'):
            set_every = getattr(new_job.every(), every)
            set_every()
        elif CronSlices.is_valid(every):
            new_job.setall(every)
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI))
            stderr(' It must be one of minute/hour/day/month/year')
            stderr(' or a quoted cron-format schedule like:')
            stderr(' archivebox schedule --every=day --depth=1 https://example.com/some/rss/feed.xml')
            stderr(' archivebox schedule --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_cron_jobs(cron)
        cron.write()

        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
        print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI))
            stderr(' Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr(' Make sure you have enough storage space available to hold all the data.')
            stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
            stderr('')
    elif show:
        if existing_jobs:
            print('\n'.join(str(cmd) for cmd in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI))
            stderr(' To schedule a new job, run:')
            stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
        raise SystemExit(0)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)
    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if foreground or run_all:
        if not existing_jobs:
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
            stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI))
        if run_all:
            try:
                for job in existing_jobs:
                    sys.stdout.write(f' > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n')
                    sys.stdout.write(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
                    sys.stdout.flush()
                    job.run()
                    sys.stdout.write(f'\r √ {job.command.split("/archivebox ")[-1]}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                raise SystemExit(1)

        if foreground:
            try:
                for job in existing_jobs:
                    print(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                raise SystemExit(1)

    # if CAN_UPGRADE:
    #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")


@docstring(schedule.__doc__)
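The `every` argument accepts either a named period or any 5-field cron expression; the validation path above relies on python-crontab's CronSlices. A small self-contained illustration:

from crontab import CronSlices

# a valid cron expression is passed straight to new_job.setall(every)
assert CronSlices.is_valid('0 */4 * * *')        # every 4 hours -> accepted
# anything else falls through to the error hint printed by schedule()
assert not CronSlices.is_valid('every tuesday')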
archivebox/cli/archivebox_search.py (new file, 164 lines)
@ -0,0 +1,164 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox search'

from pathlib import Path
from typing import Optional, List, Iterable

import rich_click as click
from rich import print

from django.db.models import QuerySet

from archivebox.config import DATA_DIR
from archivebox.index import LINK_FILTERS
from archivebox.index.schema import Link
from archivebox.misc.logging import stderr
from archivebox.misc.util import enforce_types, docstring

STATUS_CHOICES = [
    'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
    'duplicate', 'orphaned', 'corrupted', 'unrecognized'
]


def list_links(snapshots: Optional[QuerySet]=None,
               filter_patterns: Optional[List[str]]=None,
               filter_type: str='substring',
               after: Optional[float]=None,
               before: Optional[float]=None,
               out_dir: Path=DATA_DIR) -> Iterable[Link]:

    from archivebox.index import load_main_index
    from archivebox.index import snapshot_filter

    if snapshots:
        all_snapshots = snapshots
    else:
        all_snapshots = load_main_index(out_dir=out_dir)

    if after is not None:
        all_snapshots = all_snapshots.filter(timestamp__gte=after)
    if before is not None:
        all_snapshots = all_snapshots.filter(timestamp__lt=before)
    if filter_patterns:
        all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)

    if not all_snapshots:
        stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')

    return all_snapshots


def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict[str, Link | None]:

    from archivebox.misc.checks import check_data_folder
    from archivebox.index import (
        get_indexed_folders,
        get_archived_folders,
        get_unarchived_folders,
        get_present_folders,
        get_valid_folders,
        get_invalid_folders,
        get_duplicate_folders,
        get_orphaned_folders,
        get_corrupted_folders,
        get_unrecognized_folders,
    )

    check_data_folder()

    STATUS_FUNCTIONS = {
        "indexed": get_indexed_folders,
        "archived": get_archived_folders,
        "unarchived": get_unarchived_folders,
        "present": get_present_folders,
        "valid": get_valid_folders,
        "invalid": get_invalid_folders,
        "duplicate": get_duplicate_folders,
        "orphaned": get_orphaned_folders,
        "corrupted": get_corrupted_folders,
        "unrecognized": get_unrecognized_folders,
    }

    try:
        return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
    except KeyError:
        raise ValueError('Status not recognized.')


@enforce_types
def search(filter_patterns: list[str] | None=None,
           filter_type: str='substring',
           status: str='indexed',
           before: float | None=None,
           after: float | None=None,
           sort: str | None=None,
           json: bool=False,
           html: bool=False,
           csv: str | None=None,
           with_headers: bool=False):
    """List, filter, and export information about archive entries"""

    if with_headers and not (json or html or csv):
        stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
        raise SystemExit(2)

    snapshots = list_links(
        filter_patterns=list(filter_patterns) if filter_patterns else None,
        filter_type=filter_type,
        before=before,
        after=after,
    )

    if sort:
        snapshots = snapshots.order_by(sort)

    folders = list_folders(
        links=snapshots,
        status=status,
        out_dir=DATA_DIR,
    )

    if json:
        from archivebox.index.json import generate_json_index_from_links
        output = generate_json_index_from_links(folders.values(), with_headers)
    elif html:
        from archivebox.index.html import generate_index_from_links
        output = generate_index_from_links(folders.values(), with_headers)
    elif csv:
        from archivebox.index.csv import links_to_csv
        output = links_to_csv(folders.values(), csv.split(','), with_headers)
    else:
        from archivebox.misc.logging_util import printable_folders
        output = printable_folders(folders, with_headers)

    print(output)
    return output


@click.command()
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
@click.help_option('--help', '-h')
@click.argument('filter_patterns', nargs=-1)
@docstring(search.__doc__)
def main(**kwargs):
    return search(**kwargs)


if __name__ == '__main__':
    main()
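Because `main` above is an ordinary click command, it can be exercised programmatically with click's test runner; a short sketch (assuming the collection exists and Django is already set up by the package import):

from click.testing import CliRunner
from archivebox.cli.archivebox_search import main

# equivalent to: archivebox search --status=archived --csv=timestamp,url example.com
result = CliRunner().invoke(main, ['--status=archived', '--csv=timestamp,url', 'example.com'])
print(result.output)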
@ -12,7 +12,81 @@ from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import server


# @enforce_types
def server(runserver_args: Optional[List[str]]=None,
           reload: bool=False,
           debug: bool=False,
           init: bool=False,
           quick_init: bool=False,
           createsuperuser: bool=False,
           daemonize: bool=False,
           out_dir: Path=DATA_DIR) -> None:
    """Run the ArchiveBox HTTP server"""

    from rich import print

    runserver_args = runserver_args or []

    if init:
        run_subcommand('init', stdin=None, pwd=out_dir)
        print()
    elif quick_init:
        run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
        print()

    if createsuperuser:
        run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
        print()

    check_data_folder()

    from django.core.management import call_command
    from django.contrib.auth.models import User

    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
        print()
        # print('[yellow][!] No admin accounts exist, you must create one to be able to log in to the Admin UI![/yellow]')
        print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
        print(' [green]archivebox manage createsuperuser[/green]')
        print()

    host = '127.0.0.1'
    port = '8000'

    try:
        host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
        if ':' in host_and_port:
            host, port = host_and_port.split(':')
        else:
            if '.' in host_and_port:
                host = host_and_port
            else:
                port = host_and_port
    except IndexError:
        pass

    print('[green][+] Starting ArchiveBox webserver...[/green]')
    print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
    print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
    print(' > Writing ArchiveBox error log to ./logs/errors.log')

    if SHELL_CONFIG.DEBUG:
        if not reload:
            runserver_args.append('--noreload')  # '--insecure'
        call_command("runserver", *runserver_args)
    else:
        from workers.supervisord_util import start_server_workers

        print()
        start_server_workers(host=host, port=port, daemonize=False)
        print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")


@docstring(server.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
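A hedged sketch of driving the moved-in function directly (module path assumed): the single positional runserver argument is split into host and port by the parsing block above.

from archivebox.cli.archivebox_server import server  # assumed module path

# equivalent to: archivebox server 0.0.0.0:8000
server(runserver_args=['0.0.0.0:8000'])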
@ -11,7 +11,19 @@ from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import shell


#@enforce_types
def shell(out_dir: Path=DATA_DIR) -> None:
    """Enter an interactive ArchiveBox Django shell"""

    check_data_folder()

    from django.core.management import call_command
    call_command("shell_plus")


@docstring(shell.__doc__)
@ -8,10 +8,114 @@ import argparse
from pathlib import Path
from typing import Optional, List, IO

from rich import print

from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
from ..main import status


# @enforce_types
def status(out_dir: Path=DATA_DIR) -> None:
    """Print out some info and statistics about the archive collection"""

    check_data_folder()

    from core.models import Snapshot
    from django.contrib.auth import get_user_model
    User = get_user_model()

    print('{green}[*] Scanning archive main index...{reset}'.format(**SHELL_CONFIG.ANSI))
    print(SHELL_CONFIG.ANSI['lightyellow'], f' {out_dir}/*', SHELL_CONFIG.ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
    size = printable_filesize(num_bytes)
    print(f' Index size: {size} across {num_files} files')
    print()

    links = load_main_index(out_dir=out_dir)
    num_sql_links = links.count()
    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
    print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
    print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
    print()
    print('{green}[*] Scanning archive data directories...{reset}'.format(**SHELL_CONFIG.ANSI))
    print(SHELL_CONFIG.ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', SHELL_CONFIG.ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
    size = printable_filesize(num_bytes)
    print(f' Size: {size} across {num_files} files in {num_dirs} directories')
    print(SHELL_CONFIG.ANSI['black'])
    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
    num_archived = len(get_archived_folders(links, out_dir=out_dir))
    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
    print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
    print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
    print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

    num_present = len(get_present_folders(links, out_dir=out_dir))
    num_valid = len(get_valid_folders(links, out_dir=out_dir))
    print()
    print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
    print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
    print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
    print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
    print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
    print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
    print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')

    print(SHELL_CONFIG.ANSI['reset'])

    if num_indexed:
        print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**SHELL_CONFIG.ANSI))
        print(' archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)')

    if orphaned:
        print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**SHELL_CONFIG.ANSI))
        print(' archivebox init')

    if num_invalid:
        print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**SHELL_CONFIG.ANSI))
        print(' archivebox init')

    print()
    print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**SHELL_CONFIG.ANSI))
    print(SHELL_CONFIG.ANSI['lightyellow'], f' {CONSTANTS.LOGS_DIR}/*', SHELL_CONFIG.ANSI['reset'])
    users = get_admins().values_list('username', flat=True)
    print(f' UI users {len(users)}: {", ".join(users)}')
    last_login = User.objects.order_by('last_login').last()
    if last_login:
        print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}')
    last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
    if last_downloaded:
        print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')

    if not users:
        print()
        print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**SHELL_CONFIG.ANSI))
        print(' archivebox manage createsuperuser')

    print()
    for snapshot in links.order_by('-downloaded_at')[:10]:
        if not snapshot.downloaded_at:
            continue
        print(
            SHELL_CONFIG.ANSI['black'],
            (
                f' > {str(snapshot.downloaded_at)[:16]} '
                f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                f'"{snapshot.title}": {snapshot.url}'
            )[:SHELL_CONFIG.TERM_WIDTH],
            SHELL_CONFIG.ANSI['reset'],
        )
    print(SHELL_CONFIG.ANSI['black'], ' ...', SHELL_CONFIG.ANSI['reset'])


@docstring(status.__doc__)
@ -24,7 +24,92 @@ from archivebox.index import (
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
# from ..main import update


# LEGACY VERSION:
# @enforce_types
# def update(resume: Optional[float]=None,
#            only_new: bool=ARCHIVING_CONFIG.ONLY_NEW,
#            index_only: bool=False,
#            overwrite: bool=False,
#            filter_patterns_str: Optional[str]=None,
#            filter_patterns: Optional[List[str]]=None,
#            filter_type: Optional[str]=None,
#            status: Optional[str]=None,
#            after: Optional[str]=None,
#            before: Optional[str]=None,
#            extractors: str="",
#            out_dir: Path=DATA_DIR) -> List[Link]:
#     """Import any new links from subscriptions and retry any previously failed/skipped links"""

#     from core.models import ArchiveResult
#     from .search import index_links
#     # from workers.supervisord_util import start_cli_workers

#     check_data_folder()
#     # start_cli_workers()
#     new_links: List[Link] = []  # TODO: Remove input argument: only_new

#     extractors = extractors.split(",") if extractors else []

#     # Step 1: Filter for selected_links
#     print('[*] Finding matching Snapshots to update...')
#     print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
#     matching_snapshots = list_links(
#         filter_patterns=filter_patterns,
#         filter_type=filter_type,
#         before=before,
#         after=after,
#     )
#     print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
#     matching_folders = list_folders(
#         links=matching_snapshots,
#         status=status,
#         out_dir=out_dir,
#     )
#     all_links = (link for link in matching_folders.values() if link)
#     print(' - Sorting by most unfinished -> least unfinished + date archived...')
#     all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))

#     if index_only:
#         for link in all_links:
#             write_link_details(link, out_dir=out_dir, skip_sql_index=True)
#         index_links(all_links, out_dir=out_dir)
#         return all_links

#     # Step 2: Run the archive methods for each link
#     to_archive = new_links if only_new else all_links
#     if resume:
#         to_archive = [
#             link for link in to_archive
#             if link.timestamp >= str(resume)
#         ]
#         if not to_archive:
#             stderr('')
#             stderr(f'[√] Nothing found to resume after {resume}', color='green')
#             return all_links

#     archive_kwargs = {
#         "out_dir": out_dir,
#     }
#     if extractors:
#         archive_kwargs["methods"] = extractors

#     archive_links(to_archive, overwrite=overwrite, **archive_kwargs)

#     # Step 4: Re-write links index with updated titles, icons, and resources
#     all_links = load_main_index(out_dir=out_dir)
#     return all_links


def update():
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    from archivebox.config.django import setup_django
    setup_django()
@@ -1,61 +1,207 @@
 #!/usr/bin/env python3
 
 __package__ = 'archivebox.cli'
-__command__ = 'archivebox version'
 
 import sys
-import argparse
-
-from pathlib import Path
-from typing import Optional, List, IO
-
-# from archivebox.misc.util import docstring
-from archivebox.config import DATA_DIR, VERSION
-from archivebox.misc.logging_util import SmartFormatter, reject_stdin
-
-
-def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
-    """Print the ArchiveBox version and dependency information"""
-    parser = argparse.ArgumentParser(
-        prog=__command__,
-        description="Print the ArchiveBox version and dependency information", # version.__doc__,
-        add_help=True,
-        formatter_class=SmartFormatter,
-    )
-    parser.add_argument(
-        '--quiet', '-q',
-        action='store_true',
-        help='Only print ArchiveBox version number and nothing else.',
-    )
-    parser.add_argument(
-        '--binproviders', '-p',
-        type=str,
-        help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)',
-        default=None,
-    )
-    parser.add_argument(
-        '--binaries', '-b',
-        type=str,
-        help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)',
-        default=None,
-    )
-    command = parser.parse_args(args or ())
-    reject_stdin(__command__, stdin)
-
-    # for speed reasons, check if quiet flag was set and just return simple version immediately if so
-    if command.quiet:
-        print(VERSION)
-        return
-
-    # otherwise do big expensive import to get the full version
-    from ..main import version
-    version(
-        quiet=command.quiet,
-        out_dir=Path(pwd) if pwd else DATA_DIR,
-        binproviders=command.binproviders.split(',') if command.binproviders else None,
-        binaries=command.binaries.split(',') if command.binaries else None,
-    )
+from typing import Iterable
+
+import rich_click as click
+
+from archivebox.misc.util import docstring, enforce_types
+
+
+@enforce_types
+def version(quiet: bool=False,
+            binproviders: Iterable[str]=(),
+            binaries: Iterable[str]=()) -> list[str]:
+    """Print the ArchiveBox version, debug metadata, and installed dependency versions"""
+
+    # fast path for just getting the version and exiting, dont do any slower imports
+    from archivebox.config.version import VERSION
+    print(VERSION)
+    if quiet or '--version' in sys.argv:
+        return []
+
+    # Only do slower imports when getting full version info
+    import os
+    import platform
+    from pathlib import Path
+
+    from rich.panel import Panel
+    from rich.console import Console
+    from abx_pkg import Binary
+
+    import abx
+    import archivebox
+    from archivebox.config import CONSTANTS, DATA_DIR
+    from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
+    from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
+    from archivebox.config.paths import get_data_locations, get_code_locations
+    from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
+    from archivebox.misc.logging_util import printable_folder_status
+
+    from abx_plugin_default_binproviders import apt, brew, env
+
+    console = Console()
+    prnt = console.print
+
+    LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED
+
+    # 0.7.1
+    # ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
+    # IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
+    # FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
+    # DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
+
+    p = platform.uname()
+    COMMIT_HASH = get_COMMIT_HASH()
+    prnt(
+        '[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
+        f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
+        f'BUILD_TIME={get_BUILD_TIME()}',
+    )
+    prnt(
+        f'IN_DOCKER={IN_DOCKER}',
+        f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
+        f'ARCH={p.machine}',
+        f'OS={p.system}',
+        f'PLATFORM={platform.platform()}',
+        f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
+    )
+    OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
+    DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
+    prnt(
+        f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
+        f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
+        f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
+        f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
+        f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
+    )
+    prnt(
+        f'DEBUG={SHELL_CONFIG.DEBUG}',
+        f'IS_TTY={SHELL_CONFIG.IS_TTY}',
+        f'SUDO={CONSTANTS.IS_ROOT}',
+        f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
+        f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
+        f'LDAP={LDAP_ENABLED}',
+        #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
+    )
+    prnt()
+
+    if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
+        PANEL_TEXT = '\n'.join((
+            # '',
+            # f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
+            '',
+            '[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
+            ' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
+            '',
+            ' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
+            '',
+        ))
+        prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
+        prnt()
+        return []
+
+    prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
+    failures = []
+    BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
+    for name, binary in list(BINARIES.items()):
+        if binary.name == 'archivebox':
+            continue
+
+        # skip if the binary is not in the requested list of binaries
+        if binaries and binary.name not in binaries:
+            continue
+
+        # skip if the binary is not supported by any of the requested binproviders
+        if binproviders and binary.binproviders_supported and not any(provider.name in binproviders for provider in binary.binproviders_supported):
+            continue
+
+        err = None
+        try:
+            loaded_bin = binary.load()
+        except Exception as e:
+            err = e
+            loaded_bin = binary
+        provider_summary = f'[dark_sea_green3]{loaded_bin.binprovider.name.ljust(10)}[/dark_sea_green3]' if loaded_bin.binprovider else '[grey23]not found[/grey23] '
+        if loaded_bin.abspath:
+            abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
+            if ' ' in abspath:
+                abspath = abspath.replace(' ', r'\ ')
+        else:
+            abspath = f'[red]{err}[/red]'
+        prnt('', '[green]√[/green]' if loaded_bin.is_valid else '[red]X[/red]', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(12), provider_summary, abspath, overflow='ignore', crop=False)
+        if not loaded_bin.is_valid:
+            failures.append(loaded_bin.name)
+
+    prnt()
+    prnt('[gold3][i] Package Managers:[/gold3]')
+    BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())
+    for name, binprovider in list(BINPROVIDERS.items()):
+        err = None
+
+        if binproviders and binprovider.name not in binproviders:
+            continue
+
+        # TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN
+        loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
+
+        abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
+        abspath = None
+        if loaded_bin.abspath:
+            abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
+            if ' ' in abspath:
+                abspath = abspath.replace(' ', r'\ ')
+
+        PATH = str(binprovider.PATH).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
+        ownership_summary = f'UID=[blue]{str(binprovider.EUID).ljust(4)}[/blue]'
+        provider_summary = f'[dark_sea_green3]{str(abspath).ljust(52)}[/dark_sea_green3]' if abspath else f'[grey23]{"not available".ljust(52)}[/grey23]'
+        prnt('', '[green]√[/green]' if binprovider.is_valid else '[grey53]-[/grey53]', '', binprovider.name.ljust(11), provider_summary, ownership_summary, f'PATH={PATH}', overflow='ellipsis', soft_wrap=True)
+
+    if not (binaries or binproviders):
+        # dont show source code / data dir info if we just want to get version info for a binary or binprovider
+
+        prnt()
+        prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
+        for name, path in get_code_locations().items():
+            prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
+
+        prnt()
+        if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
+            prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
+            for name, path in get_data_locations().items():
+                prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
+
+            from archivebox.misc.checks import check_data_dir_permissions
+
+            check_data_dir_permissions()
+        else:
+            prnt()
+            prnt('[red][i] Data locations:[/red] (not in a data directory)')
+
+    prnt()
+
+    if failures:
+        prnt('[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]')
+        prnt(f' [red]{", ".join(failures)}[/red]')
+        prnt()
+        prnt('[violet]Hint:[/violet] To install missing binaries automatically, run:')
+        prnt(' [green]archivebox install[/green]')
+        prnt()
+    return failures
+
+
+@click.command()
+@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
+@click.option('--binproviders', '-p', help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)')
+@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
+@docstring(version.__doc__)
+def main(**kwargs):
+    failures = version(**kwargs)
+    if failures:
+        raise SystemExit(1)
+
+
 if __name__ == '__main__':
-    main(args=sys.argv[1:], stdin=sys.stdin)
+    main()
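For reference, a minimal usage sketch of the refactored command above (not part of this commit): it assumes the archivebox.cli.archivebox_version module layout shown in the diff, and uses click's bundled test runner only to illustrate that the same code now backs both programmatic and CLI invocation.

# Hedged usage sketch, assuming the module shown in the diff above.
from click.testing import CliRunner

from archivebox.cli.archivebox_version import main, version

# Programmatic call: version() returns a list of binary names that failed to load.
failures = version(quiet=True)          # fast path, only prints the version string
print('failed binaries:', failures)

# CLI-style call through click's test runner, equivalent to `archivebox version --quiet`.
runner = CliRunner()
result = runner.invoke(main, ['--quiet'])
print(result.output, result.exit_code)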
@@ -60,7 +60,7 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
         return
 
     with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
-        INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=False)
+        INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=True)
 
     from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
@@ -142,7 +142,7 @@ def create_and_chown_dir(dir_path: Path) -> None:
         os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &')
 
 @cache
-def get_or_create_working_tmp_dir(autofix=True, quiet=False):
+def get_or_create_working_tmp_dir(autofix=True, quiet=True):
     from archivebox import CONSTANTS
     from archivebox.config.common import STORAGE_CONFIG
     from archivebox.misc.checks import check_tmp_dir
@@ -165,7 +165,7 @@ def get_or_create_working_tmp_dir(autofix=True, quiet=False):
             pass
         if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
             if autofix and STORAGE_CONFIG.TMP_DIR != candidate:
-                STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet)
+                STORAGE_CONFIG.update_in_place(TMP_DIR=candidate)
             return candidate
 
     if not quiet:
@@ -193,7 +193,7 @@ def get_or_create_working_lib_dir(autofix=True, quiet=False):
             pass
         if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True):
             if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
-                STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet)
+                STORAGE_CONFIG.update_in_place(LIB_DIR=candidate)
             return candidate
 
     if not quiet:
@@ -36,6 +36,8 @@ HOSTNAME: str = max([socket.gethostname(), platform.node()], key=len)
 
 IS_ROOT = RUNNING_AS_UID == 0
 IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
+# IN_DOCKER_COMPOSE = # TODO: figure out a way to detect if running in docker compose
+
 
 FALLBACK_UID = RUNNING_AS_UID or SUDO_UID
 FALLBACK_GID = RUNNING_AS_GID or SUDO_GID
@@ -303,7 +303,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
         "Exit Status": [],
     }
 
-    from workers.supervisor_util import get_existing_supervisord_process
+    from workers.supervisord_util import get_existing_supervisord_process
 
     supervisor = get_existing_supervisord_process()
     if supervisor is None:
@@ -373,7 +373,7 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
 def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
     assert request.user.is_superuser, "Must be a superuser to view configuration settings."
 
-    from workers.supervisor_util import get_existing_supervisord_process, get_worker, get_sock_file, CONFIG_FILE_NAME
+    from workers.supervisord_util import get_existing_supervisord_process, get_worker, get_sock_file, CONFIG_FILE_NAME
 
     SOCK_FILE = get_sock_file()
     CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME
@@ -21,7 +21,6 @@ from archivebox.misc.logging_util import printable_filesize
 from archivebox.search.admin import SearchResultsAdminMixin
 from archivebox.index.html import snapshot_icons
 from archivebox.extractors import archive_links
-from archivebox.main import remove
 
 from archivebox.base_models.admin import ABIDModelAdmin
 from archivebox.workers.tasks import bg_archive_links, bg_add

@@ -321,7 +320,9 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
         description="☠️ Delete"
     )
     def delete_snapshots(self, request, queryset):
+        from archivebox.cli.archivebox_remove import remove
         remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR)
 
         messages.success(
             request,
             mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
1526 archivebox/main.py
File diff suppressed because it is too large
@@ -24,7 +24,7 @@ def check_data_folder() -> None:
     from archivebox.config import CONSTANTS
     from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir
 
-    archive_dir_exists = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()
+    archive_dir_exists = os.path.isdir(ARCHIVE_DIR)
     if not archive_dir_exists:
         print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
         print(f' {DATA_DIR}', file=sys.stderr)
@@ -12,7 +12,7 @@ from pathlib import Path
 
 from datetime import datetime, timezone
 from dataclasses import dataclass
-from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING
+from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING
 
 if TYPE_CHECKING:
     from ..index.schema import Link, ArchiveResult

@@ -228,7 +228,7 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None:
     print()
 
 
-def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str='.'):
+def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: str | IO | None=None, pwd: str='.'):
     args = ' '.join(subcommand_args)
     version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
         now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
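With the relaxed log_cli_command() signature above, everything after subcommand is now optional. A small sketch of calls that are valid under the new defaults (the argument values themselves are illustrative):

# Relies only on the defaults shown in the diff above.
from archivebox.misc.logging_util import log_cli_command

log_cli_command('version', ['--quiet'], stdin=None, pwd='.')   # explicit form
log_cli_command('status')   # subcommand_args defaults to (), stdin to None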
@@ -20,11 +20,9 @@ from datetime import datetime, timedelta # noqa
 from django.conf import settings # noqa
 
 from archivebox import CONSTANTS # noqa
-from ..main import * # noqa
-from ..cli import CLI_SUBCOMMANDS
+from archivebox.cli import * # noqa
 
 CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
-CLI_COMMAND_NAMES = ", ".join(CLI_SUBCOMMANDS.keys())
 
 if __name__ == '__main__':
     # load the rich extension for ipython for pretty printing

@@ -40,7 +38,7 @@ if __name__ == '__main__':
     prnt('[green]import re, os, sys, psutil, subprocess, reqiests, json, pydantic, benedict, django, abx[/]')
     prnt('[yellow4]# ArchiveBox Imports[/]')
     prnt('[yellow4]import archivebox[/]')
-    prnt('[yellow4]from archivebox.main import {}[/]'.format(CLI_COMMAND_NAMES))
+    prnt('[yellow4]from archivebox.cli import *[/]')
     prnt()
 
     if console.width >= 80:
@@ -459,8 +459,8 @@ def load_plugins(plugins: Iterable[PluginId | ModuleType | Type] | Dict[PluginId
     PLUGINS_TO_LOAD = sorted(PLUGINS_TO_LOAD, key=lambda x: x['order'])
 
     for plugin_info in PLUGINS_TO_LOAD:
-        if '--version' not in sys.argv and '--help' not in sys.argv:
-            print(f'🧩 Loading plugin: {plugin_info["id"]}...', end='\r', flush=True, file=sys.stderr)
+        # if '--version' not in sys.argv and '--help' not in sys.argv:
+        #     print(f'🧩 Loading plugin: {plugin_info["id"]}...', end='\r', flush=True, file=sys.stderr)
         pm.register(plugin_info['module'])
         LOADED_PLUGINS[plugin_info['id']] = plugin_info
     # print('\x1b[2K', end='\r', flush=True, file=sys.stderr)
@@ -1,103 +1,103 @@
+# import uuid
+# from functools import wraps
+# from django.db import connection, transaction
+# from django.utils import timezone
+# from huey.exceptions import TaskLockedException
+
+# from archivebox.config import CONSTANTS
+
+# class SqliteSemaphore:
+#     def __init__(self, db_path, table_name, name, value=1, timeout=None):
+#         self.db_path = db_path
+#         self.table_name = table_name
+#         self.name = name
+#         self.value = value
+#         self.timeout = timeout or 86400 # Set a max age for lock holders
+
+#         # Ensure the table exists
+#         with connection.cursor() as cursor:
+#             cursor.execute(f"""
+#                 CREATE TABLE IF NOT EXISTS {self.table_name} (
+#                     id TEXT PRIMARY KEY,
+#                     name TEXT,
+#                     timestamp DATETIME
+#                 )
+#             """)
+
+#     def acquire(self, name=None):
+#         name = name or str(uuid.uuid4())
+#         now = timezone.now()
+#         expiration = now - timezone.timedelta(seconds=self.timeout)
+
+#         with transaction.atomic():
+#             # Remove expired locks
+#             with connection.cursor() as cursor:
+#                 cursor.execute(f"""
+#                     DELETE FROM {self.table_name}
+#                     WHERE name = %s AND timestamp < %s
+#                 """, [self.name, expiration])
+
+#             # Try to acquire the lock
+#             with connection.cursor() as cursor:
+#                 cursor.execute(f"""
+#                     INSERT INTO {self.table_name} (id, name, timestamp)
+#                     SELECT %s, %s, %s
+#                     WHERE (
+#                         SELECT COUNT(*) FROM {self.table_name}
+#                         WHERE name = %s
+#                     ) < %s
+#                 """, [name, self.name, now, self.name, self.value])
+
+#             if cursor.rowcount > 0:
+#                 return name
+
+#             # If we couldn't acquire the lock, remove our attempted entry
+#             with connection.cursor() as cursor:
+#                 cursor.execute(f"""
+#                     DELETE FROM {self.table_name}
+#                     WHERE id = %s AND name = %s
+#                 """, [name, self.name])
+
+#             return None
+
+#     def release(self, name):
+#         with connection.cursor() as cursor:
+#             cursor.execute(f"""
+#                 DELETE FROM {self.table_name}
+#                 WHERE id = %s AND name = %s
+#             """, [name, self.name])
+#         return cursor.rowcount > 0
+
+
+# LOCKS_DB_PATH = CONSTANTS.DATABASE_FILE.parent / 'locks.sqlite3'
+
+
+# def lock_task_semaphore(db_path, table_name, lock_name, value=1, timeout=None):
+#     """
+#     Lock which can be acquired multiple times (default = 1).
+
+#     NOTE: no provisions are made for blocking, waiting, or notifying. This is
+#     just a lock which can be acquired a configurable number of times.
+
+#     Example:
+
+#     # Allow up to 3 workers to run this task concurrently. If the task is
+#     # locked, retry up to 2 times with a delay of 60s.
+#     @huey.task(retries=2, retry_delay=60)
+#     @lock_task_semaphore('path/to/db.sqlite3', 'semaphore_locks', 'my-lock', 3)
+#     def my_task():
+#         ...
+#     """
+#     sem = SqliteSemaphore(db_path, table_name, lock_name, value, timeout)
+#     def decorator(fn):
+#         @wraps(fn)
+#         def inner(*args, **kwargs):
+#             tid = sem.acquire()
+#             if tid is None:
+#                 raise TaskLockedException(f'unable to acquire lock {lock_name}')
+#             try:
+#                 return fn(*args, **kwargs)
+#             finally:
+#                 sem.release(tid)
+#         return inner
+#     return decorator
@@ -8,7 +8,7 @@ from django_huey import db_task, task
 from huey_monitor.models import TaskModel
 from huey_monitor.tqdm import ProcessInfo
 
-from .supervisor_util import get_or_create_supervisord_process
+from .supervisord_util import get_or_create_supervisord_process
 
 # @db_task(queue="commands", context=True, schedule=1)
 # def scheduler_tick():
@@ -115,6 +115,8 @@ dependencies = [
     "abx-plugin-mercury>=2024.10.28",
     "abx-plugin-htmltotext>=2024.10.28",
     "python-statemachine>=2.3.6",
+    "click>=8.1.7",
+    "rich-click>=1.8.4",
 ]
 
 [project.optional-dependencies]
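click and rich-click are the runtime dependencies that back the CLI rewrite in this commit. A minimal sketch of the subcommand pattern they enable, modeled on the archivebox_version.py file above (the hello command below is a made-up illustration, not part of ArchiveBox):

# Hypothetical rich-click subcommand skeleton following the pattern used above.
import rich_click as click

@click.command()
@click.option('--quiet', '-q', is_flag=True, help='Print less output.')
def hello(quiet: bool):
    """Example command: prints a greeting unless --quiet is passed."""
    if not quiet:
        click.echo('hello from a rich-click command')

if __name__ == '__main__':
    hello()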
18 uv.lock
@@ -658,6 +658,7 @@ dependencies = [
     { name = "atomicwrites", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "base32-crockford", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "channels", extra = ["daphne"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "croniter", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "dateparser", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "django", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },

@@ -688,6 +689,7 @@ dependencies = [
     { name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "rich-argparse", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "rich-click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "sonic-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "supervisor", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },

@@ -784,6 +786,7 @@ requires-dist = [
     { name = "atomicwrites", specifier = "==1.4.1" },
     { name = "base32-crockford", specifier = "==0.3.0" },
     { name = "channels", extras = ["daphne"], specifier = ">=4.1.0" },
+    { name = "click", specifier = ">=8.1.7" },
     { name = "croniter", specifier = ">=3.0.3" },
     { name = "dateparser", specifier = ">=1.2.0" },
     { name = "django", specifier = ">=5.1.1,<6.0" },

@@ -821,6 +824,7 @@ requires-dist = [
     { name = "requests-tracker", marker = "extra == 'debug'", specifier = ">=0.3.3" },
     { name = "rich", specifier = ">=13.8.0" },
     { name = "rich-argparse", specifier = ">=1.5.2" },
+    { name = "rich-click", specifier = ">=1.8.4" },
     { name = "setuptools", specifier = ">=74.1.0" },
     { name = "sonic-client", specifier = ">=1.0.0" },
     { name = "supervisor", specifier = ">=4.2.5" },

@@ -2806,6 +2810,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/25/45/54b95bb72bb17c27a7252bee5034955020b5869a33918b660ffc29cbf608/rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7", size = 20072 },
 ]
 
+[[package]]
+name = "rich-click"
+version = "1.8.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/fc/f4/e48dc2850662526a26fb0961aacb0162c6feab934312b109b748ae4efee2/rich_click-1.8.4.tar.gz", hash = "sha256:0f49471f04439269d0e66a6f43120f52d11d594869a2a0be600cfb12eb0616b9", size = 38247 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/84/f3/72f93d8494ee641bde76bfe1208cf4abc44c6f9448673762f6077bc162d6/rich_click-1.8.4-py3-none-any.whl", hash = "sha256:2d2841b3cebe610d5682baa1194beaf78ab00c4fa31931533261b5eba2ee80b7", size = 35071 },
+]
+
 [[package]]
 name = "ruff"
 version = "0.7.4"