#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox add'
import sys
import argparse
from typing import List, Optional, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR, ARCHIVING_CONFIG
from ..main import add
from ..parsers import PARSERS
from ..logging_util import SmartFormatter, accept_stdin, stderr
@docstring(add.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # CLI wrapper for main.add(): builds the argparse parser, accepts URLs either as
    # positional arguments or piped on stdin (never both), then forwards everything to add().
    #   args:  argv-style list of CLI tokens (None -> treated as empty)
    #   stdin: stream to read URLs from when no positional URLs are given
    #   pwd:   data directory override (falls back to DATA_DIR)
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=add.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    parser.add_argument(
        '--tag', '-t',
        type=str,
        default='',
        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
    )
    parser.add_argument(
        '--update', #'-u',
        action='store_true',
        default=not ARCHIVING_CONFIG.ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
        help="Also retry previously skipped/failed links when adding new links",
    )
    parser.add_argument(
        '--update-all', #'-n',
        action='store_true',
        default=False,
        help="Also update ALL links in index when finished adding new links",
    )
    parser.add_argument(
        '--index-only', #'-o',
        action='store_true',
        help="Add the links to the main index without archiving them",
    )
    parser.add_argument(
        'urls',
        nargs='*',
        type=str,
        default=None,
        help=(
            'URLs or paths to archive e.g.:\n'
            '    https://getpocket.com/users/USERNAME/feed/all\n'
            '    https://example.com/some/rss/feed.xml\n'
            '    https://example.com\n'
            '    ~/Downloads/firefox_bookmarks_export.html\n'
            '    ~/Desktop/sites_list.csv\n'
        )
    )
    parser.add_argument(
        "--depth",
        action="store",
        default=0,
        choices=[0, 1],
        type=int,
        help="Recursively archive all linked pages up to this many hops away"
    )
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Re-archive URLs from scratch, overwriting any existing files"
    )
    parser.add_argument(
        "--init", #'-i',
        action='store_true',
        # fixed typo in help text: "curent" -> "current"
        help="Init/upgrade the current data directory before adding",
    )
    parser.add_argument(
        "--extract",
        type=str,
        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
This does not take precedence over the configuration",
        default=""
    )
    parser.add_argument(
        "--parser",
        type=str,
        help="Parser used to read inputted URLs.",
        default="auto",
        choices=["auto", *PARSERS.keys()],
    )
    command = parser.parse_args(args or ())

    urls = command.urls
    stdin_urls = ''
    if not urls:
        # only consult stdin when no positional URLs were given
        stdin_urls = accept_stdin(stdin)

    # Exactly one source of URLs is required: positional args XOR stdin content.
    # BUGFIX: previously this tested `not stdin` (the stream object itself), so an
    # attached-but-empty stdin with no positional URLs slipped through and add()
    # was called with no input at all; test the actual read content instead.
    if (stdin_urls and urls) or (not stdin_urls and not urls):
        stderr(
            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
            color='red',
        )
        raise SystemExit(2)

    add(
        urls=stdin_urls or urls,
        depth=command.depth,
        tag=command.tag,
        update=command.update,
        update_all=command.update_all,
        index_only=command.index_only,
        overwrite=command.overwrite,
        init=command.init,
        extractors=command.extract,
        parser=command.parser,
        out_dir=pwd or DATA_DIR,
    )
# Support direct invocation: `python -m archivebox.cli.archivebox_add [urls...]`
if __name__ == '__main__':
    main(sys.argv[1:], sys.stdin)
# TODO: Implement these
#
# parser.add_argument(
# '--mirror', #'-m',
# action='store_true',
# help='Archive an entire site (finding all linked pages below it on the same domain)',
# )
# parser.add_argument(
# '--crawler', #'-r',
# choices=('depth_first', 'breadth_first'),
# help='Controls which crawler to use in order to find outlinks in a given page',
# default=None,
# )