restore missing archivebox_update work

This commit is contained in:
Nick Sweeting 2024-11-19 05:09:19 -08:00
parent 52446b86ba
commit f8e2f7c753
No known key found for this signature in database

View file

@ -1,13 +1,13 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox update'
import sys
import argparse
from typing import List, Optional, IO
from archivebox.misc.util import docstring
import rich_click as click
from typing import Iterable
from archivebox.misc.util import enforce_types, docstring
from archivebox.index import (
LINK_FILTERS,
get_indexed_folders,
@ -21,8 +21,66 @@ from archivebox.index import (
get_corrupted_folders,
get_unrecognized_folders,
)
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
# from ..main import update
@enforce_types
def update(filter_patterns: Iterable[str]=(),
           only_new: bool=False,
           index_only: bool=False,
           resume: float | None=None,
           overwrite: bool=False,
           before: float | None=None,
           after: float | None=None,
           status: str='indexed',
           filter_type: str='exact',
           extract: str="") -> None:
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    # NOTE: the docstring above doubles as the CLI help text (main() is
    # decorated with docstring(update.__doc__)), so treat it as runtime text.

    # NOTE(review): none of the parameters in the signature are read below;
    # every call currently does the same thing regardless of the options given.

    # Django is set up before the orchestrator module is imported
    # (presumably the import needs a configured Django -- confirm).
    from archivebox.config.django import setup_django
    setup_django()

    from workers.orchestrator import Orchestrator

    # Build the orchestrator with exit_on_idle=False and start it right away.
    Orchestrator(exit_on_idle=False).start()
# Values accepted by --status; the help text below describes them in the same order.
_STATUS_CHOICES = [
    'indexed', 'archived', 'unarchived',
    'present', 'valid', 'invalid',
    'duplicate', 'orphaned', 'corrupted', 'unrecognized',
]

# Help text for --status: one line per status, each description taken from the
# docstring of the matching get_*_folders helper.
_STATUS_HELP = f'''
Update only links or data directories that have the given status:
indexed {get_indexed_folders.__doc__} (the default)
archived {get_archived_folders.__doc__}
unarchived {get_unarchived_folders.__doc__}
present {get_present_folders.__doc__}
valid {get_valid_folders.__doc__}
invalid {get_invalid_folders.__doc__}
duplicate {get_duplicate_folders.__doc__}
orphaned {get_orphaned_folders.__doc__}
corrupted {get_corrupted_folders.__doc__}
unrecognized {get_unrecognized_folders.__doc__}
'''

# Values accepted by --filter-type: every LINK_FILTERS key plus 'search'.
_FILTER_TYPE_CHOICES = [*LINK_FILTERS.keys(), 'search']


@click.command()
@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating")
@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content")
@click.option('--resume', type=float, help='Resume the update process from a given timestamp')
@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)')
@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp")
@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp")
@click.option('--status', type=click.Choice(_STATUS_CHOICES), default='indexed', help=_STATUS_HELP)
@click.option('--filter-type', '-t', type=click.Choice(_FILTER_TYPE_CHOICES), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.argument('filter_patterns', nargs=-1)
@docstring(update.__doc__)
def main(**options):
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    # Every option/argument declared above arrives as a keyword argument whose
    # name matches a parameter of update(); forward them all unchanged.
    update(**options)
# Script entry point: invoke the click command only when this module is run
# directly, not when it is imported.
if __name__ == '__main__':
    main()
@ -103,127 +161,3 @@ from archivebox.misc.logging_util import SmartFormatter, accept_stdin
# # Step 4: Re-write links index with updated titles, icons, and resources
# all_links = load_main_index(out_dir=out_dir)
# return all_links
def update():
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    # NOTE: the docstring above is also used as the argparse description and
    # as main()'s docstring, so treat it as runtime text.

    # Django is set up before the orchestrator module is imported
    # (presumably the import needs a configured Django -- confirm).
    from archivebox.config.django import setup_django
    setup_django()

    from workers.orchestrator import Orchestrator

    # Build the orchestrator with exit_on_idle=False and start it right away.
    Orchestrator(exit_on_idle=False).start()
@docstring(update.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # Parses the `archivebox update` command line and then calls update().
    #   args:  argument strings to parse; None/empty parses an empty argument list
    #   stdin: stream handed to accept_stdin() when no filter patterns are given
    #   pwd:   accepted but not read by the live code below

    # Help text for --status: one line per accepted value, each description
    # taken from the docstring of the matching get_*_folders helper.
    status_help = ''.join((
        'Update only links or data directories that have the given status\n',
        f' indexed {get_indexed_folders.__doc__} (the default)\n',
        f' archived {get_archived_folders.__doc__}\n',
        f' unarchived {get_unarchived_folders.__doc__}\n',
        '\n',
        f' present {get_present_folders.__doc__}\n',
        f' valid {get_valid_folders.__doc__}\n',
        f' invalid {get_invalid_folders.__doc__}\n',
        '\n',
        f' duplicate {get_duplicate_folders.__doc__}\n',
        f' orphaned {get_orphaned_folders.__doc__}\n',
        f' corrupted {get_corrupted_folders.__doc__}\n',
        f' unrecognized {get_unrecognized_folders.__doc__}\n',
    ))

    # (names, add_argument keyword settings), registered strictly in this order
    # so the generated --help listing keeps its ordering.
    # Only --filter-type has a short alias (-t).
    option_table = (
        (('--only-new',), {
            'action': 'store_true',
            'help': "Don't attempt to retry previously skipped/failed links when updating",
        }),
        (('--index-only',), {
            'action': 'store_true',
            'help': "Update the main index without archiving any content",
        }),
        (('--resume',), {
            'type': float,
            'help': 'Resume the update process from a given timestamp',
            'default': None,
        }),
        (('--overwrite',), {
            'action': 'store_true',
            'help': 'Ignore existing archived content and overwrite with new versions (DANGEROUS)',
        }),
        (('--before',), {
            'type': float,
            'help': "Update only links bookmarked before the given timestamp.",
            'default': None,
        }),
        (('--after',), {
            'type': float,
            'help': "Update only links bookmarked after the given timestamp.",
            'default': None,
        }),
        (('--status',), {
            'type': str,
            'choices': ('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
            'default': 'indexed',
            'help': status_help,
        }),
        (('--filter-type', '-t'), {
            'type': str,
            'choices': (*LINK_FILTERS.keys(), 'search'),
            'default': 'exact',
            'help': 'Type of pattern matching to use when filtering URLs',
        }),
        (('filter_patterns',), {
            'nargs': '*',
            'type': str,
            'default': None,
            'help': 'Update only URLs matching these filter patterns.',
        }),
        (('--extract',), {
            'type': str,
            'help': (
                "Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. "
                "This does not take precedence over the configuration"
            ),
            'default': "",
        }),
    )

    cli = argparse.ArgumentParser(
        prog=__command__,
        description=update.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    for names, settings in option_table:
        cli.add_argument(*names, **settings)

    parsed = cli.parse_args(args or ())

    # With no positional patterns, fall back to accept_stdin(stdin). The value
    # is not consumed yet: the live update() call below takes no arguments.
    filter_patterns_str = None if parsed.filter_patterns else accept_stdin(stdin)

    update()

    # Previous call that forwarded the parsed options, kept for reference:
    # update(
    #     resume=parsed.resume,
    #     only_new=parsed.only_new,
    #     index_only=parsed.index_only,
    #     overwrite=parsed.overwrite,
    #     filter_patterns_str=filter_patterns_str,
    #     filter_patterns=parsed.filter_patterns,
    #     filter_type=parsed.filter_type,
    #     status=parsed.status,
    #     after=parsed.after,
    #     before=parsed.before,
    #     out_dir=Path(pwd) if pwd else DATA_DIR,
    #     extractors=parsed.extract,
    # )
# Script entry point: when run directly, pass the command-line arguments
# (without the program name) and the process's stdin to main().
if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)