2019-04-03 04:27:37 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
__package__ = 'archivebox.cli'
|
|
|
|
__command__ = 'archivebox update'
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import argparse
|
|
|
|
|
2019-04-27 21:26:24 +00:00
|
|
|
from typing import List, Optional, IO
|
2019-04-03 04:27:37 +00:00
|
|
|
|
2020-07-24 17:25:25 +00:00
|
|
|
from ..main import update
|
|
|
|
from ..util import docstring
|
2019-04-27 21:26:24 +00:00
|
|
|
from ..config import OUTPUT_DIR
|
|
|
|
from ..index import (
|
|
|
|
get_indexed_folders,
|
|
|
|
get_archived_folders,
|
|
|
|
get_unarchived_folders,
|
|
|
|
get_present_folders,
|
|
|
|
get_valid_folders,
|
|
|
|
get_invalid_folders,
|
|
|
|
get_duplicate_folders,
|
|
|
|
get_orphaned_folders,
|
|
|
|
get_corrupted_folders,
|
|
|
|
get_unrecognized_folders,
|
|
|
|
)
|
2020-07-22 16:02:13 +00:00
|
|
|
from ..logging_util import SmartFormatter, accept_stdin
|
2019-04-03 04:27:37 +00:00
|
|
|
|
|
|
|
|
2019-05-01 03:10:48 +00:00
|
|
|
@docstring(update.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # CLI entry point for `archivebox update`.
    #
    #   args:  command-line tokens to parse (without the program name);
    #          None or an empty list is parsed as "no arguments".
    #   stdin: stream handed to accept_stdin(); its result is forwarded to
    #          update() as `filter_patterns_str`.
    #   pwd:   output directory to operate on; falls back to OUTPUT_DIR when
    #          falsy.
    #
    # Parses the options below and forwards them unchanged to update().
    # Returns nothing. argparse itself exits the process on invalid
    # arguments or when --help is requested.
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=update.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )

    # --- behaviour switches -------------------------------------------------
    parser.add_argument(
        '--only-new', #'-n',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    parser.add_argument(
        '--index-only', #'-o',
        action='store_true',
        help="Update the main index without archiving any content",
    )
    parser.add_argument(
        '--resume', #'-r',
        type=float,
        help='Resume the update process from a given timestamp',
        default=None,
    )
    parser.add_argument(
        '--overwrite', #'-x',
        action='store_true',
        help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
    )

    # --- link selection -----------------------------------------------------
    parser.add_argument(
        '--before', #'-b',
        type=float,
        help="Update only links bookmarked before the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--after', #'-a',
        type=float,
        help="Update only links bookmarked after the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--status',
        type=str,
        choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
        default='indexed',
        # One line per status; each description is pulled from the docstring
        # of the matching get_*_folders helper so the help text stays in sync.
        help=(
            'Update only links or data directories that have the given status\n'
            f' indexed {get_indexed_folders.__doc__} (the default)\n'
            f' archived {get_archived_folders.__doc__}\n'
            f' unarchived {get_unarchived_folders.__doc__}\n'
            '\n'
            f' present {get_present_folders.__doc__}\n'
            f' valid {get_valid_folders.__doc__}\n'
            f' invalid {get_invalid_folders.__doc__}\n'
            '\n'
            f' duplicate {get_duplicate_folders.__doc__}\n'
            f' orphaned {get_orphaned_folders.__doc__}\n'
            f' corrupted {get_corrupted_folders.__doc__}\n'
            f' unrecognized {get_unrecognized_folders.__doc__}\n'
        )
    )
    parser.add_argument(
        '--filter-type',
        type=str,
        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )
    parser.add_argument(
        'filter_patterns',
        nargs='*',
        type=str,
        default=None,
        help='Update only URLs matching these filter patterns.',
    )

    # --- extractor selection ------------------------------------------------
    parser.add_argument(
        "--extract",
        type=str,
        # Adjacent literals instead of a backslash continuation inside the
        # string: the continuation form bakes the source indentation of the
        # following line into the help text.
        help=(
            "Pass a list of the extractors to be used. "
            "If the method name is not correct, it will be ignored. "
            "This does not take precedence over the configuration"
        ),
        default="",
    )

    # `args or ()` makes argparse parse an empty sequence when no args were
    # supplied, instead of silently falling back to sys.argv.
    command = parser.parse_args(args or ())

    # NOTE(review): stdin is consumed even when positional filter patterns
    # were given, and both values are forwarded below -- confirm update()
    # is meant to receive both at once.
    filter_patterns_str = accept_stdin(stdin)

    update(
        resume=command.resume,
        only_new=command.only_new,
        index_only=command.index_only,
        overwrite=command.overwrite,
        filter_patterns_str=filter_patterns_str,
        filter_patterns=command.filter_patterns,
        filter_type=command.filter_type,
        status=command.status,
        after=command.after,
        before=command.before,
        out_dir=pwd or OUTPUT_DIR,
        extractors=command.extract,
    )
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Run as a script: pass everything after the program name, plus the
    # process's stdin, to main().
    main(args=sys.argv[1:], stdin=sys.stdin)
|