working archivebox_schedule cmd

This commit is contained in:
Nick Sweeting 2024-11-19 03:54:47 -08:00
parent 3a64ced697
commit 292730ebad
No known key found for this signature in database

View file

@@ -1,38 +1,43 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
__package__ = 'archivebox.cli' __package__ = 'archivebox.cli'
__command__ = 'archivebox schedule'
import sys import sys
import argparse
from pathlib import Path from pathlib import Path
from typing import Optional, List, IO
from archivebox.misc.util import docstring import rich_click as click
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin from archivebox.misc.util import enforce_types, docstring
from archivebox.config.common import ARCHIVING_CONFIG from archivebox.config import DATA_DIR, CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
from archivebox.misc.logging_util import stderr
from archivebox.config.permissions import USER
# @enforce_types CRON_COMMENT = 'ArchiveBox'
@enforce_types
def schedule(add: bool=False, def schedule(add: bool=False,
show: bool=False, show: bool=False,
clear: bool=False, clear: bool=False,
foreground: bool=False, foreground: bool=False,
run_all: bool=False, run_all: bool=False,
quiet: bool=False, quiet: bool=False,
every: Optional[str]=None, every: str | None=None,
tag: str='', tag: str='',
depth: int=0, depth: int | str=0,
overwrite: bool=False, overwrite: bool=False,
update: bool=not ARCHIVING_CONFIG.ONLY_NEW, update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
import_path: Optional[str]=None, import_path: str | None=None,
out_dir: Path=DATA_DIR): out_dir: Path=DATA_DIR) -> None:
"""Set ArchiveBox to regularly import URLs at specific times using cron""" """Set ArchiveBox to regularly import URLs at specific times using cron"""
check_data_folder() depth = int(depth)
from crontab import CronTab, CronSlices
from archivebox.misc.system import dedupe_cron_jobs
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
from archivebox.config.permissions import USER
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
@@ -65,7 +70,6 @@ def schedule(add: bool=False,
'>>', '>>',
quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'), quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
'2>&1', '2>&1',
] ]
new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT) new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
@@ -83,10 +87,11 @@ def schedule(add: bool=False,
raise SystemExit(1) raise SystemExit(1)
cron = dedupe_cron_jobs(cron) cron = dedupe_cron_jobs(cron)
print(cron)
cron.write() cron.write()
total_runs = sum(j.frequency_per_year() for j in cron) total_runs = sum(j.frequency_per_year() for j in cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT)) existing_jobs = list(cron.find_command('archivebox'))
print() print()
print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI)) print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
@@ -108,10 +113,6 @@ def schedule(add: bool=False,
stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml') stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
raise SystemExit(0) raise SystemExit(0)
cron = CronTab(user=True)
cron = dedupe_cron_jobs(cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
if foreground or run_all: if foreground or run_all:
if not existing_jobs: if not existing_jobs:
stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI)) stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
@@ -141,108 +142,25 @@ def schedule(add: bool=False,
print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
raise SystemExit(1) raise SystemExit(1)
# if CAN_UPGRADE:
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
@click.command()
@click.option('--quiet', '-q', is_flag=True, help="Don't warn about storage space")
@click.option('--add', is_flag=True, help='Add a new scheduled ArchiveBox update job to cron')
@click.option('--every', type=str, help='Run ArchiveBox once every [timeperiod] (hour/day/month/year or cron format e.g. "0 0 * * *")')
@click.option('--tag', '-t', default='', help='Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3')
@click.option('--depth', type=click.Choice(['0', '1']), default='0', help='Depth to archive to [0] or 1')
@click.option('--overwrite', is_flag=True, help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots')
@click.option('--update', is_flag=True, help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults')
@click.option('--clear', is_flag=True, help='Stop all ArchiveBox scheduled runs (remove cron jobs)')
@click.option('--show', is_flag=True, help='Print a list of currently active ArchiveBox cron jobs')
@click.option('--foreground', '-f', is_flag=True, help='Launch ArchiveBox scheduler as a long-running foreground task instead of using cron')
@click.option('--run-all', is_flag=True, help='Run all the scheduled jobs once immediately, independent of their configured schedules')
@click.argument('import_path', required=False)
@docstring(schedule.__doc__) @docstring(schedule.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: def main(**kwargs):
parser = argparse.ArgumentParser( """Set ArchiveBox to regularly import URLs at specific times using cron"""
prog=__command__, schedule(**kwargs)
description=schedule.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'--quiet', '-q',
action='store_true',
help=("Don't warn about storage space."),
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
'--add', # '-a',
action='store_true',
help='Add a new scheduled ArchiveBox update job to cron',
)
parser.add_argument(
'--every', # '-e',
type=str,
default=None,
help='Run ArchiveBox once every [timeperiod] (hour/day/month/year or cron format e.g. "0 0 * * *")',
)
parser.add_argument(
'--tag', '-t',
type=str,
default='',
help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
)
parser.add_argument(
'--depth', # '-d',
type=int,
choices=[0, 1],
default=0,
help='Depth to archive to [0] or 1, see "add" command help for more info',
)
parser.add_argument(
'--overwrite',
action='store_true',
help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots',
)
parser.add_argument(
'--update',
action='store_true',
help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults',
)
group.add_argument(
'--clear', # '-c'
action='store_true',
help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"),
)
group.add_argument(
'--show', # '-s'
action='store_true',
help=("Print a list of currently active ArchiveBox cron jobs"),
)
group.add_argument(
'--foreground', '-f',
action='store_true',
help=("Launch ArchiveBox scheduler as a long-running foreground task "
"instead of using cron."),
)
group.add_argument(
'--run-all', # '-a',
action='store_true',
help=("Run all the scheduled jobs once immediately, independent of "
"their configured schedules, can be used together with --foreground"),
)
parser.add_argument(
'import_path',
nargs='?',
type=str,
default=None,
help=("Check this path and import any new links on every run "
"(can be either local file or remote URL)"),
)
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
schedule(
add=command.add,
show=command.show,
clear=command.clear,
foreground=command.foreground,
run_all=command.run_all,
quiet=command.quiet,
every=command.every,
tag=command.tag,
depth=command.depth,
overwrite=command.overwrite,
update=command.update,
import_path=command.import_path,
out_dir=Path(pwd) if pwd else DATA_DIR,
)
if __name__ == '__main__': if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin) main()