#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox add'
__description__ = 'Add a new URL or list of URLs to your archive'

import sys
import argparse

from ..legacy.util import (
    handle_stdin_import,
    handle_file_import,
)
from ..legacy.main import update_archive_data
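
# handle_stdin_import() and handle_file_import() both normalize their input
# into a local source file and return its path (see archivebox.legacy.util),
# which is passed along to update_archive_data() below as import_path.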


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    # parser.add_argument(
    #     '--depth', #'-d',
    #     type=int,
    #     help='Recursively archive all linked pages up to this many hops away',
    #     default=0,
    # )
    parser.add_argument(
        '--only-new', #'-n',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    parser.add_argument(
        '--mirror', #'-m',
        action='store_true',
        help='Archive an entire site (finding all linked pages below it on the same domain)',
    )
    parser.add_argument(
        '--crawler', #'-r',
        choices=('depth_first', 'breadth_first'),
        help='Controls which crawler to use in order to find outlinks in a given page',
        default=None,
    )
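    # Note: --mirror and --crawler are parsed above but are not yet passed to
    # update_archive_data() below, so they currently have no effect.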
    parser.add_argument(
        'url',
        nargs='?',
        type=str,
        default=None,
        help='URL of page to archive (or path to local file)',
    )
    command = parser.parse_args(args)

    ### Handle ingesting urls piped in through stdin
    # (e.g. if the user does: cat example_urls.txt | archivebox add)
    import_path = None
    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()
        if stdin_raw_text and command.url:
            print(
                '[X] You should pass either a path as an argument, '
                'or pass a list of links via stdin, but not both.\n'
            )
            raise SystemExit(1)

        import_path = handle_stdin_import(stdin_raw_text)

    ### Handle ingesting url from a remote file/feed
    # (e.g. if an RSS feed URL is used as the import path)
    elif command.url:
        import_path = handle_file_import(command.url)

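    # update_archive_data() (in archivebox.legacy.main) merges the links from
    # import_path into the existing index and (re)archives them; resume=None
    # means start a fresh run rather than resuming a partial one.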
    update_archive_data(
        import_path=import_path,
        resume=None,
        only_new=command.only_new,
    )


if __name__ == '__main__':
    main()
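
# Example usage (hypothetical invocations, assuming archivebox is on $PATH):
#   archivebox add 'https://example.com'                  # archive a single URL
#   archivebox add --only-new 'https://example.com/rss'   # skip previously archived links
#   cat example_urls.txt | archivebox add                 # import a list of URLs via stdin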