#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox add'

import sys
import argparse

from typing import IO, TYPE_CHECKING

from django.utils import timezone
from django.db.models import QuerySet

from archivebox import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.django import setup_django
from archivebox.config.permissions import USER, HOSTNAME
from archivebox.misc.checks import check_data_folder
from archivebox.parsers import PARSERS
from archivebox.logging_util import SmartFormatter, accept_stdin, stderr

from abid_utils.models import get_or_create_system_user_pk
if TYPE_CHECKING:
    from core.models import Snapshot


ORCHESTRATOR = None
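
# Example CLI usage (a sketch; the flags shown are defined in main() below and
# '--parser' defaults to 'auto'):
#
#   archivebox add --depth=1 --tag=docs,reference 'https://example.com'
#   echo 'https://example.com/feed.xml' | archivebox add --index-only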


def add(urls: str | list[str],
        tag: str='',
        depth: int=0,
        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
        update_all: bool=False,
        index_only: bool=False,
        overwrite: bool=False,
        extractors: str="",
        parser: str="auto",
        persona: str='Default',
        created_by_id: int | None=None) -> QuerySet['Snapshot']:
    """Add a new URL or list of URLs to your archive"""

    global ORCHESTRATOR

    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

    # 0. setup abx, django, check_data_folder
    setup_django()
    check_data_folder()
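
    # NOTE: the model imports below are deferred until after setup_django(), presumably
    # so that the Django app registry is ready before these ORM models are loaded.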
    from seeds.models import Seed
    from crawls.models import Crawl
    from actors.orchestrator import Orchestrator

    created_by_id = created_by_id or get_or_create_system_user_pk()

    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
    sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
    sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
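    #    (keeping a plain-text copy of the raw input under sources/ preserves exactly
    #    what was submitted, so it can be inspected or re-parsed later)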

    # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
    cmd = ' '.join(sys.argv)
    seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd}', parser=parser, tag=tag, created_by=created_by_id, config={
        'ONLY_NEW': not update,
        'INDEX_ONLY': index_only,
        'OVERWRITE': overwrite,
        'EXTRACTORS': extractors,
        'DEFAULT_PERSONA': persona or 'Default',
    })
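    #    (the config dict above mirrors the CLI flags; it appears to be scoped to this
    #    Seed and the Crawl created from it, rather than applied as global config)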

    # 3. create a new Crawl pointing to the Seed
    crawl = Crawl.from_seed(seed, max_depth=depth)

    # 4. start the Orchestrator & wait until it completes
    #    ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which get run by the ArchiveResultActors ...
    # from crawls.actors import CrawlActor
    # from core.actors import SnapshotActor, ArchiveResultActor
    orchestrator = Orchestrator(exit_on_idle=True)
    orchestrator.start()
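    #    (with exit_on_idle=True the orchestrator presumably blocks here and returns
    #    once all pending work has been processed)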

    # 5. return the list of new Snapshots created
    return crawl.snapshot_set.all()
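

# Example programmatic usage (a minimal sketch; assumes the current working directory
# is an initialized ArchiveBox data folder):
#
#   from archivebox.cli.archivebox_add import add
#   snapshots = add(['https://example.com'], tag='docs', depth=0)
#   print(snapshots.count())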


def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=None) -> None:
    """Add a new URL or list of URLs to your archive"""

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=add.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    parser.add_argument(
        '--tag', '-t',
        type=str,
        default='',
        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
    )
    parser.add_argument(
        '--update', #'-u',
        action='store_true',
        default=not ARCHIVING_CONFIG.ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
        help="Also retry previously skipped/failed links when adding new links",
    )
    parser.add_argument(
        '--update-all', #'-n',
        action='store_true',
        default=False,
        help="Also update ALL links in the index when finished adding new links",
    )
    parser.add_argument(
        '--index-only', #'-o',
        action='store_true',
        help="Add the links to the main index without archiving them",
    )
    parser.add_argument(
        'urls',
        nargs='*',
        type=str,
        default=None,
        help=(
            'URLs or paths to archive e.g.:\n'
            '    https://getpocket.com/users/USERNAME/feed/all\n'
            '    https://example.com/some/rss/feed.xml\n'
            '    https://example.com\n'
            '    ~/Downloads/firefox_bookmarks_export.html\n'
            '    ~/Desktop/sites_list.csv\n'
        )
    )
    parser.add_argument(
        "--depth",
        action="store",
        default=0,
        choices=[0, 1],
        type=int,
        help="Recursively archive all linked pages up to this many hops away"
    )
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Re-archive URLs from scratch, overwriting any existing files"
    )
    parser.add_argument(
        "--extract", '-e',
        type=str,
        help="Pass a list of the extractors to be used. If an extractor name is not recognized, it is ignored. "
             "This does not take precedence over the configuration.",
        default=""
    )
    parser.add_argument(
        "--parser",
        type=str,
        help="Parser to use to read the input URLs.",
        default="auto",
        choices=["auto", *PARSERS.keys()],
    )
    parser.add_argument(
        "--persona",
        type=str,
        help="Name of the accounts persona to use when archiving.",
        default="Default",
    )

    command = parser.parse_args(args or ())

    urls = command.urls

    stdin_urls = ''
    if not urls:
        stdin_urls = accept_stdin(stdin)
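    # stdin is only consulted when no positional URLs were given; supplying URLs via
    # both stdin and CLI arguments at once is rejected below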

    if (stdin_urls and urls) or (not stdin and not urls):
        stderr(
            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
            color='red',
        )
        raise SystemExit(2)

    add(
        urls=stdin_urls or urls,
        depth=command.depth,
        tag=command.tag,
        update=command.update,
        update_all=command.update_all,
        index_only=command.index_only,
        overwrite=command.overwrite,
        extractors=command.extract,
        parser=command.parser,
        persona=command.persona,
    )


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)