ArchiveBox/archivebox/main.py

1448 lines
57 KiB
Python
Raw Normal View History

2019-04-27 21:26:24 +00:00
__package__ = 'archivebox'
import os
import time
2019-04-27 21:26:24 +00:00
import sys
import shutil
import platform
from typing import Dict, List, Optional, Iterable, IO, Union
from pathlib import Path
from datetime import date, datetime
2019-04-27 21:26:24 +00:00
from crontab import CronTab, CronSlices
from django.db.models import QuerySet
from django.utils import timezone
2019-04-27 21:26:24 +00:00
from .cli import (
CLI_SUBCOMMANDS,
2019-04-27 21:26:24 +00:00
run_subcommand,
display_first,
meta_cmds,
main_cmds,
archive_cmds,
)
2019-05-01 03:13:04 +00:00
from .parsers import (
2020-07-13 15:26:30 +00:00
save_text_as_source,
save_file_as_source,
2020-07-29 16:19:06 +00:00
parse_links_memory,
2019-04-27 21:26:24 +00:00
)
2019-05-01 03:13:04 +00:00
from .index.schema import Link
from .util import enforce_types # type: ignore
2019-05-01 03:13:04 +00:00
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .system import run as run_shell
2019-04-27 21:26:24 +00:00
from .index import (
load_main_index,
2020-07-13 15:26:30 +00:00
parse_links_from_source,
dedupe_links,
2019-04-27 21:26:24 +00:00
write_main_index,
snapshot_filter,
2019-04-27 21:26:24 +00:00
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
fix_invalid_folder_locations,
write_link_details,
2019-04-27 21:26:24 +00:00
)
from .index.json import (
parse_json_main_index,
parse_json_links_details,
generate_json_index_from_links,
2019-04-27 21:26:24 +00:00
)
2019-05-01 03:13:04 +00:00
from .index.sql import (
get_admins,
apply_migrations,
remove_from_sql_main_index,
2019-05-01 03:13:04 +00:00
)
from .index.html import (
generate_index_from_links,
)
2020-11-28 18:11:15 +00:00
from .index.csv import links_to_csv
from .extractors import archive_links, archive_link, ignore_methods
from .misc.logging import stderr, hint
from .misc.checks import check_data_folder, check_dependencies
2019-04-27 21:26:24 +00:00
from .config import (
ConfigDict,
ANSI,
IS_TTY,
DEBUG,
IN_DOCKER,
IN_QEMU,
PUID,
PGID,
2019-04-27 21:26:24 +00:00
USER,
2022-06-09 01:41:22 +00:00
TIMEZONE,
ENFORCE_ATOMIC_WRITES,
OUTPUT_PERMISSIONS,
PYTHON_BINARY,
2019-04-27 21:26:24 +00:00
ARCHIVEBOX_BINARY,
ONLY_NEW,
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
LOGS_DIR,
PACKAGE_DIR,
2019-04-27 21:26:24 +00:00
CONFIG_FILE,
ARCHIVE_DIR_NAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
SQL_INDEX_FILENAME,
ALLOWED_IN_OUTPUT_DIR,
SEARCH_BACKEND_ENGINE,
LDAP,
2023-12-18 01:44:26 +00:00
get_version,
2019-04-27 21:26:24 +00:00
write_config_file,
VERSION,
VERSIONS_AVAILABLE,
2023-11-28 04:58:13 +00:00
CAN_UPGRADE,
COMMIT_HASH,
BUILD_TIME,
2019-04-27 21:26:24 +00:00
CODE_LOCATIONS,
DATA_LOCATIONS,
DEPENDENCIES,
CHROME_BINARY,
CHROME_VERSION,
YOUTUBEDL_BINARY,
YOUTUBEDL_VERSION,
SINGLEFILE_VERSION,
READABILITY_VERSION,
MERCURY_VERSION,
NODE_VERSION,
2019-04-27 21:26:24 +00:00
load_all_config,
CONFIG,
USER_CONFIG,
ADMIN_USERNAME,
ADMIN_PASSWORD,
2019-04-27 21:26:24 +00:00
get_real_name,
setup_django,
2019-04-27 21:26:24 +00:00
)
from .logging_util import (
TERM_WIDTH,
2019-05-01 03:13:04 +00:00
TimedProgress,
2020-07-13 15:26:30 +00:00
log_importing_started,
log_crawl_started,
2019-04-27 21:26:24 +00:00
log_removal_started,
log_removal_finished,
log_list_started,
log_list_finished,
2019-05-01 03:13:04 +00:00
printable_config,
printable_folders,
printable_filesize,
printable_folder_status,
printable_dependency_version,
2019-04-27 21:26:24 +00:00
)
@enforce_types
def help(out_dir: Path=OUTPUT_DIR) -> None:
    """Print the ArchiveBox help message and usage.

    When `out_dir` contains the SQL index file the full usage text (version,
    active data directory, command list and examples) is printed; otherwise a
    short welcome message explaining how to create or import an archive is
    printed instead.
    """

    def render_group(belongs) -> str:
        # One line per subcommand: the name padded to 20 columns, then its summary.
        return '\n '.join(
            f'{name.ljust(20)} {blurb}'
            for name, blurb in CLI_SUBCOMMANDS.items()
            if belongs(name)
        )

    # The command list is shown in four groups, in this order:
    # meta commands, main commands, archive commands, everything else.
    group_filters = (
        lambda name: name in meta_cmds,
        lambda name: name in main_cmds,
        lambda name: name in archive_cmds,
        lambda name: name not in display_first,
    )
    commands_help_text = '\n\n '.join(render_group(belongs) for belongs in group_filters)

    if not (Path(out_dir) / SQL_INDEX_FILENAME).exists():
        # Not inside an initialized data directory: show the getting-started text.
        welcome_lines = [
            '{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI),
            '',
        ]
        if IN_DOCKER:
            welcome_lines += [
                'When using Docker, you need to mount a volume to use as your data dir:',
                ' docker run -v /some/path:/data archivebox ...',
                '',
            ]
        welcome_lines += [
            'To import an existing archive (from a previous version of ArchiveBox):',
            ' 1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:',
            ' 2. archivebox init',
            '',
            'To start a new archive:',
            ' 1. Create an empty directory, then cd into it and run:',
            ' 2. archivebox init',
            '',
            'For more information, see the documentation here:',
            ' https://github.com/ArchiveBox/ArchiveBox/wiki',
        ]
        for line in welcome_lines:
            print(line)
        return

    usage_template = '''{green}ArchiveBox v{}: The self-hosted internet archive.{reset}
{lightred}Active data directory:{reset}
{}
{lightred}Usage:{reset}
archivebox [command] [--help] [--version] [...args]
{lightred}Commands:{reset}
{}
{lightred}Example Use:{reset}
mkdir my-archive; cd my-archive/
archivebox init
archivebox status
archivebox add https://example.com/some/page
archivebox add --depth=1 ~/Downloads/bookmarks_export.html
archivebox list --sort=timestamp --csv=timestamp,url,is_archived
archivebox schedule --every=day https://example.com/some/feed.rss
archivebox update --resume=15109948213.123
{lightred}Documentation:{reset}
https://github.com/ArchiveBox/ArchiveBox/wiki
'''
    print(usage_template.format(VERSION, out_dir, commands_help_text, **ANSI))
2019-04-27 21:26:24 +00:00
@enforce_types
def version(quiet: bool=False,
            out_dir: Path=OUTPUT_DIR) -> None:
    """Print the ArchiveBox version and dependency information.

    The bare version string is always printed first. Unless `quiet` is set,
    it is followed by build/platform/filesystem/runtime flags, the dependency
    versions, the code and data locations, and a final dependency check.
    """

    print(VERSION)
    if quiet:
        return

    # Example of the output produced below:
    # 0.7.1
    # ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
    # IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
    # FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
    # DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
    uname = platform.uname()

    # build info
    release = get_version(CONFIG)
    short_commit = COMMIT_HASH[:7] if COMMIT_HASH else "unknown"
    print(
        'ArchiveBox v{}'.format(release),
        f'COMMIT_HASH={short_commit}',
        f'BUILD_TIME={BUILD_TIME}',
    )

    # host / platform info
    print(
        f'IN_DOCKER={IN_DOCKER}',
        f'IN_QEMU={IN_QEMU}',
        f'ARCH={uname.machine}',
        f'OS={uname.system}',
        f'PLATFORM={platform.platform()}',
        f'PYTHON={sys.implementation.name.title()}',
    )

    # filesystem info: flagged as remote when either data location is a mount
    output_is_remote_fs = (
        DATA_LOCATIONS['OUTPUT_DIR']['is_mount']
        or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
    )
    print(
        f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
        f'FS_REMOTE={output_is_remote_fs}',
        f'FS_USER={PUID}:{PGID}',
        f'FS_PERMS={OUTPUT_PERMISSIONS}',
    )

    # runtime flags
    print(
        f'DEBUG={DEBUG}',
        f'IS_TTY={IS_TTY}',
        f'TZ={TIMEZONE}',
        f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
        f'LDAP={LDAP}',
        #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
    )
    print()

    print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
    for dep_name, dep_info in DEPENDENCIES.items():
        print(printable_dependency_version(dep_name, dep_info))
        # blank line after the ARCHIVEBOX_BINARY entry to visually separate
        # it from the dependencies listed after it
        if dep_name == 'ARCHIVEBOX_BINARY':
            print()
    print()

    print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
    for location_name, location in CODE_LOCATIONS.items():
        print(printable_folder_status(location_name, location))
    print()

    if not DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
        print()
        print('{white}[i] Data locations:{reset} (not in a data directory)'.format(**ANSI))
    else:
        print('{white}[i] Data locations:{reset}'.format(**ANSI))
        for location_name, location in DATA_LOCATIONS.items():
            print(printable_folder_status(location_name, location))

    print()
    check_dependencies(CONFIG)
2019-04-27 21:26:24 +00:00
@enforce_types
def run(subcommand: str,
        subcommand_args: Optional[List[str]],
        stdin: Optional[IO]=None,
        out_dir: Path=OUTPUT_DIR) -> None:
    """Run a given ArchiveBox subcommand with the given list of args.

    Thin wrapper that forwards everything to `run_subcommand`, passing
    `out_dir` as the working directory (`pwd`) for the subcommand.
    """
    dispatch_kwargs = {
        'subcommand': subcommand,
        'subcommand_args': subcommand_args,
        'stdin': stdin,
        'pwd': out_dir,
    }
    run_subcommand(**dispatch_kwargs)
@enforce_types
def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
    """Initialize a new ArchiveBox collection in the current directory.

    Creates (or verifies) the folder structure, config file and SQL index in
    `out_dir`, applies migrations, and - unless `quick` is set - scans the
    existing JSON indexes and archive folders for links that are missing from
    the main index and imports them.

    Args:
        force: proceed even if the folder contains unexpected files but no SQL index.
        quick: skip the full snapshot directory check.
        setup: run the `setup` subcommand once initialization is done.
        out_dir: the data directory to initialize.

    Raises:
        SystemExit(2): the folder has unexpected files, no SQL index, and `force` is not set.
        SystemExit(1): the directory check was interrupted (Ctrl-C / SIGTERM).
    """

    from core.models import Snapshot

    out_dir.mkdir(exist_ok=True)
    # "empty" means nothing is present besides the entries ArchiveBox itself allows
    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)

    if (out_dir / JSON_INDEX_FILENAME).exists():
        stderr("[!] This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.", color="lightyellow")
        stderr(" You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.", color="lightyellow")

    existing_index = (out_dir / SQL_INDEX_FILENAME).exists()

    if is_empty and not existing_index:
        print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
        print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
    elif existing_index:
        # TODO: properly detect and print the existing version in current index as well
        print('{green}[*] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI))
        print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
    else:
        if force:
            stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
            stderr(' Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
        else:
            stderr(
                ("{red}[X] This folder appears to already have files in it, but no index.sqlite3 present.{reset}\n\n"
                 " You must run init in a completely empty directory, or an existing data folder.\n\n"
                 " {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
                 " then run and run 'archivebox init' to pick up where you left off.\n\n"
                 " (Always make sure your data folder is backed up first before updating ArchiveBox)"
                ).format(**ANSI)
            )
            raise SystemExit(2)

    if existing_index:
        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))

    print(f' + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
    Path(SOURCES_DIR).mkdir(exist_ok=True)
    Path(ARCHIVE_DIR).mkdir(exist_ok=True)
    Path(LOGS_DIR).mkdir(exist_ok=True)
    print(f' + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
    write_config_file({}, out_dir=out_dir)

    if (out_dir / SQL_INDEX_FILENAME).exists():
        print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))

    DATABASE_FILE = out_dir / SQL_INDEX_FILENAME
    for migration_line in apply_migrations(out_dir):
        print(f' {migration_line}')

    assert DATABASE_FILE.exists()
    print()
    print(f' √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')

    # from django.contrib.auth.models import User
    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
    #     call_command("createsuperuser", interactive=True)

    print()
    print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI))

    all_links = Snapshot.objects.none()
    # links found on disk that are not in the main index yet, keyed by URL
    pending_links: Dict[str, Link] = {}

    if existing_index:
        all_links = load_main_index(out_dir=out_dir, warn=False)
        print(' √ Loaded {} links from existing main index.'.format(all_links.count()))

    if quick:
        print(' > Skipping full snapshot directory check (quick mode)')
    else:
        try:
            # Links in data folders that dont match their timestamp
            fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
            if fixed:
                print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
            if cant_fix:
                print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))

            # Links in JSON index but not in main index
            orphaned_json_links = {
                link.url: link
                for link in parse_json_main_index(out_dir)
                if not all_links.filter(url=link.url).exists()
            }
            if orphaned_json_links:
                pending_links.update(orphaned_json_links)
                print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

            # Links in data dir indexes but not in main index
            orphaned_data_dir_links = {
                link.url: link
                for link in parse_json_links_details(out_dir)
                if not all_links.filter(url=link.url).exists()
            }
            if orphaned_data_dir_links:
                pending_links.update(orphaned_data_dir_links)
                print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))

            # Links in invalid/duplicate data dirs
            invalid_folders = {
                folder: link
                for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
            }
            if invalid_folders:
                print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
                print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items()))
                print()
                print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
                print(' archivebox status')
                print(' archivebox list --status=invalid')

        except (KeyboardInterrupt, SystemExit):
            stderr()
            stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red')
            stderr(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.')
            stderr()
            stderr(' {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI))
            stderr(' archivebox init --quick')
            raise SystemExit(1)

        # persist everything discovered by the directory check
        write_main_index(list(pending_links.values()), out_dir=out_dir)

    print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))

    from django.contrib.auth.models import User

    # create the configured admin account, but only if that username is not taken yet
    if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
        print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
        User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)

    if existing_index:
        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
    else:
        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))

    # Move the deprecated JSON/HTML index files aside under a dated name.
    # The rename target is anchored to out_dir: Path.rename() resolves a
    # relative target against the current working directory, which would move
    # the files out of the data folder whenever out_dir is not the cwd.
    json_index = out_dir / JSON_INDEX_FILENAME
    html_index = out_dir / HTML_INDEX_FILENAME
    index_name = f"{date.today()}_index_old"
    if json_index.exists():
        json_index.rename(out_dir / f"{index_name}.json")
    if html_index.exists():
        html_index.rename(out_dir / f"{index_name}.html")

    if setup:
        run_subcommand('setup', pwd=out_dir)

    if Snapshot.objects.count() < 25:     # hide the hints for experienced users
        print()
        print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
        print(' archivebox server # then visit http://127.0.0.1:8000')
        print()
        print(' To add new links, you can run:')
        print(" archivebox add < ~/some/path/to/list_of_links.txt")
        print()
        print(' For more usage and examples, run:')
        print(' archivebox help')
2019-04-27 21:26:24 +00:00
@enforce_types
def status(out_dir: Path=OUTPUT_DIR) -> None:
    """Print out some info and statistics about the archive collection.

    Reports, in order: the size of the index files in `out_dir`, the number of
    links in the SQL index vs. the per-snapshot JSON indexes, the size of the
    archive directory, per-status folder counts (with hints on how to fix
    problems), the admin users and most recent login/changes, and the ten
    most recently downloaded snapshots.
    """

    check_data_folder(CONFIG)

    from core.models import Snapshot
    from django.contrib.auth import get_user_model
    User = get_user_model()

    print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f' {out_dir}/*', ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
    size = printable_filesize(num_bytes)
    print(f' Index size: {size} across {num_files} files')
    print()

    links = load_main_index(out_dir=out_dir)
    num_sql_links = links.count()
    # the parser yields one entry per link detail; only the count is needed
    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
    print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
    print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
    print()

    print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
    size = printable_filesize(num_bytes)
    print(f' Size: {size} across {num_files} files in {num_dirs} directories')
    print(ANSI['black'])

    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
    num_archived = len(get_archived_folders(links, out_dir=out_dir))
    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
    # each count is labelled with the docstring of the helper that produced it
    print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
    print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
    print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

    num_present = len(get_present_folders(links, out_dir=out_dir))
    num_valid = len(get_valid_folders(links, out_dir=out_dir))
    print()
    print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
    print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    # merge by key so a folder that falls in several categories is counted once
    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
    print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
    print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
    print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
    print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
    print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
    print(ANSI['reset'])

    if num_indexed:
        print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
        print(' archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)')

    if orphaned:
        print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
        print(' archivebox init')

    if num_invalid:
        print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
        print(' archivebox init')

    print()
    print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f' {LOGS_DIR}/*', ANSI['reset'])
    users = get_admins().values_list('username', flat=True)
    print(f' UI users {len(users)}: {", ".join(users)}')

    # .last() returns a row even when the ordering field is empty on every
    # row (e.g. no user has ever logged in), so check the field itself to
    # avoid printing "... @ None".
    last_login = User.objects.order_by('last_login').last()
    if last_login and last_login.last_login:
        print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}')
    last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
    if last_downloaded and last_downloaded.downloaded_at:
        print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')

    if not users:
        print()
        print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
        print(' archivebox manage createsuperuser')

    print()
    for snapshot in links.order_by('-downloaded_at')[:10]:
        # snapshots that were never downloaded have nothing to report
        if not snapshot.downloaded_at:
            continue
        print(
            ANSI['black'],
            (
                f' > {str(snapshot.downloaded_at)[:16]} '
                f'[{snapshot.num_outputs} {("X", "")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                f'"{snapshot.title}": {snapshot.url}'
            )[:TERM_WIDTH()],  # truncate each row to the terminal width
            ANSI['reset'],
        )
    print(ANSI['black'], ' ...', ANSI['reset'])
2019-04-27 21:26:24 +00:00
2020-07-29 16:19:06 +00:00
@enforce_types
def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR, created_by_id: int | None=None) -> List[Link]:
    """
    Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
    You can run this to archive single pages without needing to create a whole collection with archivebox init.

    Args:
        url: the text to parse; it must yield exactly one link.
        extractors: comma-separated extractor names; when empty, the methods
            returned by `ignore_methods(['title'])` are used.
        out_dir: directory passed through to `archive_link`.
        created_by_id: passed through to `archive_link`.

    Returns:
        The one-element list of parsed links.

    Raises:
        SystemExit(2): `url` did not parse to exactly one link.
    """
    oneshot_link, _ = parse_links_memory([url])
    # Reject zero links as well as several: indexing [0] on an empty result
    # would otherwise surface as an unhelpful IndexError.
    if len(oneshot_link) != 1:
        stderr(
            '[X] You should pass a single url to the oneshot command',
            color='red'
        )
        raise SystemExit(2)

    methods = extractors.split(",") if extractors else ignore_methods(['title'])
    archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, created_by_id=created_by_id)
    return oneshot_link
2020-07-29 16:19:06 +00:00
2019-04-27 21:26:24 +00:00
@enforce_types
def add(urls: Union[str, List[str]],
        tag: str='',
        depth: int=0,
        update: bool=not ONLY_NEW,
        update_all: bool=False,
        index_only: bool=False,
        overwrite: bool=False,
        # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
        init: bool=False,
        extractors: str="",
        parser: str="auto",
        created_by_id: int | None=None,
        out_dir: Path=OUTPUT_DIR) -> List[Link]:
    """Add a new URL or list of URLs to your archive.

    Args:
        urls: raw text (e.g. stdin contents) or a list of URL strings; it is
            saved verbatim to the sources dir before being parsed.
        tag: comma-separated tag names applied to every imported snapshot.
        depth: 0 to import only the given links, 1 to also download each link
            and import the links found inside it.
        update: archive all imported links, not only the newly added ones.
        update_all: archive every link in the index (checked only if `update` is false).
        index_only: add the links to the index without running the real extractors.
        overwrite: passed to `archive_links`.
        init: run the `init` subcommand before doing anything else.
        extractors: comma-separated extractor names to run instead of the defaults.
        parser: parser name passed to `parse_links_from_source`.
        created_by_id: passed through when writing the index, creating tags and archiving.
        out_dir: the data directory to operate on.

    Returns:
        The imported links that `dedupe_links` reported as new.
    """

    from core.models import Snapshot, Tag
    # Aliased so it cannot be confused with the module-level `timezone`
    # (django.utils.timezone), whose `.utc` alias was removed in Django 5.0.
    from datetime import timezone as dt_timezone
    # from queues.supervisor_util import start_cli_workers, tail_worker_logs
    # from queues.tasks import bg_archive_link

    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

    extractors = extractors.split(",") if extractors else []

    if init:
        run_subcommand('init', stdin=None, pwd=out_dir)

    # Load list of links from the existing index
    check_data_folder(CONFIG)
    check_dependencies(CONFIG)

    # worker = start_cli_workers()

    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)

    log_importing_started(urls=urls, depth=depth, index_only=index_only)
    if isinstance(urls, str):
        # save verbatim stdin to sources
        write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
    elif isinstance(urls, list):
        # save verbatim args to sources
        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)

    new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)

    # If we're going one level deeper, download each link and look for more links
    new_links_depth = []
    if new_links and depth == 1:
        log_crawl_started(new_links)
        for new_link in new_links:
            try:
                downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
                new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
            except Exception as err:
                # best-effort crawl: report the failing URL and keep going
                stderr(f'[!] Failed to get contents of URL {new_link.url}', err, color='red')

    # de-duplicate by URL (the last occurrence of a URL wins)
    imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())

    new_links = dedupe_links(all_links, imported_links)

    write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
    # reload so all_links includes what was just written
    all_links = load_main_index(out_dir=out_dir)

    tags = [
        Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
        for name in tag.split(',')
        if name.strip()
    ]
    if tags:
        for link in imported_links:
            snapshot = Snapshot.objects.get(url=link.url)
            snapshot.tags.add(*tags)
            snapshot.tags_str(nocache=True)
            snapshot.save()
        # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')

    if index_only:
        # mock archive all the links using the fake index_only extractor method in order to update their state
        if overwrite:
            archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
        else:
            archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
    else:
        # fully run the archive extractor methods for each link
        archive_kwargs = {
            "out_dir": out_dir,
            "created_by_id": created_by_id,
        }
        if extractors:
            archive_kwargs["methods"] = extractors

        stderr()

        ts = datetime.now(dt_timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

        if update:
            stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
            archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
        elif update_all:
            stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
            archive_links(all_links, overwrite=overwrite, **archive_kwargs)
        elif overwrite:
            stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
            archive_links(imported_links, overwrite=True, **archive_kwargs)
        elif new_links:
            stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
            archive_links(new_links, overwrite=False, **archive_kwargs)

    # tail_worker_logs(worker['stdout_logfile'])

    if CAN_UPGRADE:
        hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")

    return new_links
2019-04-27 21:26:24 +00:00
@enforce_types
def remove(filter_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: str='exact',
           snapshots: Optional[QuerySet]=None,
           after: Optional[float]=None,
           before: Optional[float]=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: Path=OUTPUT_DIR) -> List[Link]:
    """Remove the specified URLs from the archive.

    The snapshots to remove are selected either by an explicit `snapshots`
    queryset, or by patterns given via `filter_str` (newline-separated) or
    `filter_patterns` - but not both pattern sources at once.

    Args:
        filter_str: newline-separated patterns.
        filter_patterns: list of patterns.
        filter_type: how patterns are matched; passed to `list_links`.
        snapshots: pre-selected snapshots to remove (further narrowed by `list_links`).
        after, before: bounds passed to `list_links`.
        yes: passed to `log_removal_started`.
        delete: also delete each snapshot's data directory from disk.
        out_dir: the data directory to operate on.

    Returns:
        The main index as loaded after the removal.

    Raises:
        SystemExit(2): both or neither of `filter_str` / `filter_patterns`
            were given while `snapshots` is None.
        SystemExit(1): nothing matched, so there is nothing to remove.
    """

    check_data_folder(CONFIG)

    if snapshots is None:
        if filter_str and filter_patterns:
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin, but not both.\n',
                color='red',
            )
            raise SystemExit(2)
        elif not (filter_str or filter_patterns):
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin.',
                color='red',
            )
            stderr()
            hint(('To remove all urls you can run:',
                  'archivebox remove --filter-type=regex ".*"'))
            stderr()
            raise SystemExit(2)
        elif filter_str:
            filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]

    list_kwargs = {
        "filter_patterns": filter_patterns,
        "filter_type": filter_type,
        "after": after,
        "before": before,
    }
    if snapshots is not None:
        # An explicitly supplied but empty selection means "remove nothing".
        # A plain truthiness test would drop the empty queryset from the
        # kwargs and leave list_links() (defined outside this block) to fall
        # back to its own default selection, which could be far wider than
        # the caller asked for.
        if not snapshots.exists():
            log_removal_finished(0, 0)
            raise SystemExit(1)
        list_kwargs["snapshots"] = snapshots

    log_list_started(filter_patterns, filter_type)
    timer = TimedProgress(360, prefix=' ')
    try:
        snapshots = list_links(**list_kwargs)
    finally:
        timer.end()

    if not snapshots.exists():
        log_removal_finished(0, 0)
        raise SystemExit(1)

    log_links = [link.as_link() for link in snapshots]
    log_list_finished(log_links)
    log_removal_started(log_links, yes=yes, delete=delete)

    timer = TimedProgress(360, prefix=' ')
    try:
        # data directories are only touched when deletion was requested
        if delete:
            for snapshot in snapshots:
                shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
    finally:
        timer.end()

    # count before the rows are removed from the index
    to_remove = snapshots.count()

    from .search import flush_search_index

    flush_search_index(snapshots=snapshots)
    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
    all_snapshots = load_main_index(out_dir=out_dir)
    log_removal_finished(all_snapshots.count(), to_remove)

    return all_snapshots
2019-04-27 21:26:24 +00:00
@enforce_types
def update(resume: Optional[float]=None,
           only_new: bool=ONLY_NEW,
           index_only: bool=False,
           overwrite: bool=False,
           filter_patterns_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: Optional[str]=None,
           status: Optional[str]=None,
           after: Optional[str]=None,
           before: Optional[str]=None,
           extractors: str="",
           out_dir: Path=OUTPUT_DIR) -> List[Link]:
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    # Parameters:
    #   resume:               only archive links whose timestamp is >= str(resume)
    #   only_new:             archive `new_links` (currently always empty, see TODO) instead of all matches
    #   index_only:           rewrite link details + search index and return without archiving
    #   overwrite:            forwarded to archive_links()
    #   filter_patterns_str:  newline-separated patterns, used only when filter_patterns is empty
    #   filter_patterns / filter_type / after / before: forwarded to list_links()
    #   status:               forwarded to list_folders()
    #   extractors:           comma-separated extractor names, forwarded as `methods`
    # Returns the matched links (index_only / nothing-to-resume paths) or the
    # freshly reloaded main index after archiving.

    from core.models import ArchiveResult
    from .search import index_links
    # from .queues.supervisor_util import start_cli_workers

    check_data_folder(CONFIG)
    check_dependencies(CONFIG)
    # start_cli_workers()

    new_links: List[Link] = []  # TODO: Remove input argument: only_new

    extractors = extractors.split(",") if extractors else []

    # Patterns may arrive as a newline-separated string instead of a list;
    # previously that string was silently ignored.
    if not filter_patterns and filter_patterns_str:
        filter_patterns = [
            ptn.strip()
            for ptn in filter_patterns_str.split('\n')
            if ptn.strip()
        ]
    # Normalise None -> [] so the " ".join() below cannot raise TypeError.
    filter_patterns = filter_patterns or []

    # Step 1: Filter for selected_links
    print('[*] Finding matching Snapshots to update...')
    print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
    matching_snapshots = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )
    print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
    matching_folders = list_folders(
        links=matching_snapshots,
        status=status,
        out_dir=out_dir,
    )
    all_links = (link for link in matching_folders.values() if link)
    print(' - Sorting by most unfinished -> least unfinished + date archived...')
    # NOTE: the sort key issues one ArchiveResult count query per link.
    all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))

    if index_only:
        for link in all_links:
            write_link_details(link, out_dir=out_dir, skip_sql_index=True)
        index_links(all_links, out_dir=out_dir)
        return all_links

    # Step 2: Run the archive methods for each link
    to_archive = new_links if only_new else all_links
    if resume:
        # timestamps are compared as strings, matching how they are stored on the Link
        to_archive = [
            link for link in to_archive
            if link.timestamp >= str(resume)
        ]
        if not to_archive:
            stderr('')
            stderr(f'[√] Nothing found to resume after {resume}', color='green')
            return all_links

    archive_kwargs = {
        "out_dir": out_dir,
    }
    if extractors:
        archive_kwargs["methods"] = extractors

    archive_links(to_archive, overwrite=overwrite, **archive_kwargs)

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=out_dir)
    return all_links
@enforce_types
def list_all(filter_patterns_str: Optional[str]=None,
             filter_patterns: Optional[List[str]]=None,
             filter_type: str='exact',
             status: Optional[str]=None,
             after: Optional[float]=None,
             before: Optional[float]=None,
             sort: Optional[str]=None,
             csv: Optional[str]=None,
             json: bool=False,
             html: bool=False,
             with_headers: bool=False,
             out_dir: Path=OUTPUT_DIR) -> Iterable[Link]:
    """List, filter, and export information about archive entries"""
    # Selects snapshots via list_links(), optionally orders them, groups them
    # by `status` via list_folders(), prints them in the requested format
    # (json > html > csv > plain listing) and returns the folders mapping.

    check_data_folder(CONFIG)

    # Patterns come either as a list or as a newline-separated string, never both.
    if filter_patterns_str:
        if filter_patterns:
            stderr(
                '[X] You should either pass filter patterns as an arguments '
                'or via stdin, but not both.\n',
                color='red',
            )
            raise SystemExit(2)
        filter_patterns = filter_patterns_str.split('\n')

    matched = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )
    snapshots = matched.order_by(sort) if sort else matched

    folders = list_folders(
        links=snapshots,
        status=status,
        out_dir=out_dir,
    )

    # Pick the renderer; the first requested format wins.
    if not (json or html or csv):
        output = printable_folders(folders, with_headers=with_headers)
    elif json:
        output = generate_json_index_from_links(folders.values(), with_headers)
    elif html:
        output = generate_index_from_links(folders.values(), with_headers)
    else:
        columns = csv.split(',')
        output = links_to_csv(folders.values(), cols=columns, header=with_headers)

    print(output)
    return folders
2019-04-27 21:26:24 +00:00
@enforce_types
def list_links(snapshots: Optional[QuerySet]=None,
               filter_patterns: Optional[List[str]]=None,
               filter_type: str='exact',
               after: Optional[float]=None,
               before: Optional[float]=None,
               out_dir: Path=OUTPUT_DIR) -> Iterable[Link]:
    """Return the snapshots matching the given time window and filter patterns.

    Starts from `snapshots` when one is passed (even if it is empty), otherwise
    from the main index loaded from `out_dir`. Keeps rows with
    `after <= timestamp < before`, then applies `snapshot_filter` when
    `filter_patterns` is non-empty. A warning is written to stderr when nothing
    matches; the (possibly empty) result is returned either way.
    """
    check_data_folder(CONFIG)

    # Compare against None rather than truthiness: an explicitly passed empty
    # QuerySet must stay empty instead of silently widening to the whole index.
    if snapshots is not None:
        all_snapshots = snapshots
    else:
        all_snapshots = load_main_index(out_dir=out_dir)

    if after is not None:
        all_snapshots = all_snapshots.filter(timestamp__gte=after)
    if before is not None:
        all_snapshots = all_snapshots.filter(timestamp__lt=before)
    if filter_patterns:
        all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)

    # .exists() asks the database for a single row instead of loading every
    # matching snapshot just to test for emptiness.
    if not all_snapshots.exists():
        stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
    return all_snapshots
2019-04-27 21:26:24 +00:00
@enforce_types
def list_folders(links: List[Link],
                 status: str,
                 out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Group `links` into folders using the helper registered for `status`.

    Raises ValueError('Status not recognized.') when `status` is not one of the
    keys of STATUS_FUNCTIONS below.
    """
    check_data_folder(CONFIG)

    STATUS_FUNCTIONS = {
        "indexed": get_indexed_folders,
        "archived": get_archived_folders,
        "unarchived": get_unarchived_folders,
        "present": get_present_folders,
        "valid": get_valid_folders,
        "invalid": get_invalid_folders,
        "duplicate": get_duplicate_folders,
        "orphaned": get_orphaned_folders,
        "corrupted": get_corrupted_folders,
        "unrecognized": get_unrecognized_folders,
    }

    # Only the lookup is guarded: a KeyError raised *inside* the selected helper
    # is a real bug and must not be reported as an unknown status.
    try:
        get_folders = STATUS_FUNCTIONS[status]
    except KeyError:
        raise ValueError('Status not recognized.')

    return get_folders(links, out_dir=out_dir)
2019-04-27 21:26:24 +00:00
@enforce_types
def setup(out_dir: Path=OUTPUT_DIR) -> None:
    """Automatically install all ArchiveBox dependencies and extras"""
    # Steps, in order:
    #   1. run `init` if the archive dir is missing, then set up Django
    #   2. create a superuser if none exists
    #   3. install yt-dlp via pip and record its path as YOUTUBEDL_BINARY
    #   4. install chromium via playwright and record it as CHROME_BINARY (skipped on armv7l)
    #   5. install the node-based extractors via npm
    # Any installation failure is reported on stderr and ends with SystemExit(1).

    if not (out_dir / ARCHIVE_DIR_NAME).exists():
        run_subcommand('init', stdin=None, pwd=out_dir)

    setup_django(out_dir=out_dir, check_db=True)
    from django.contrib.auth import get_user_model
    User = get_user_model()

    if not User.objects.filter(is_superuser=True).exists():
        stderr('\n[+] Creating new admin user for the Web UI...', color='green')
        run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)

    stderr('\n[+] Installing enabled ArchiveBox dependencies automatically...', color='green')

    stderr('\n Installing YOUTUBEDL_BINARY automatically using pip...')
    if YOUTUBEDL_VERSION:
        print(f'{YOUTUBEDL_VERSION} is already installed', YOUTUBEDL_BINARY)
    else:
        try:
            run_shell([
                PYTHON_BINARY, '-m', 'pip',
                'install',
                '--upgrade',
                '--no-cache-dir',
                '--no-warn-script-location',
                'yt-dlp',
            ], capture_output=False, cwd=out_dir)
            pip_show = run_shell([
                PYTHON_BINARY, '-m', 'pip',
                'show',
                'yt-dlp',
            ], capture_output=True, text=True, cwd=out_dir)
            # stdout may be bytes or str depending on how run_shell treats text=True;
            # handle both, the same way the chromium branch below does.
            pip_show_output = pip_show.stdout.decode() if isinstance(pip_show.stdout, bytes) else pip_show.stdout
            pkg_path = pip_show_output.split('Location: ')[-1].split('\n', 1)[0].strip()
            # The pip distribution is named "yt-dlp" but its importable package
            # directory is "yt_dlp".
            NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt_dlp' / '__main__.py'
            # check existence before chmod so a missing file gives the descriptive message
            assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
            os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
            config(f'YOUTUBEDL_BINARY={NEW_YOUTUBEDL_BINARY}', set=True, out_dir=out_dir)
        except BaseException as e:                                              # lgtm [py/catch-base-exception]
            stderr(f'[X] Failed to install python packages: {e}', color='red')
            raise SystemExit(1)

    if platform.machine() == 'armv7l':
        stderr('\n Skip the automatic installation of CHROME_BINARY because playwright is not available on armv7.')
    else:
        stderr('\n Installing CHROME_BINARY automatically using playwright...')
        if CHROME_VERSION:
            print(f'{CHROME_VERSION} is already installed', CHROME_BINARY)
        else:
            try:
                run_shell([
                    PYTHON_BINARY, '-m', 'pip',
                    'install',
                    '--upgrade',
                    '--no-cache-dir',
                    '--no-warn-script-location',
                    'playwright',
                ], capture_output=False, cwd=out_dir)
                run_shell([PYTHON_BINARY, '-m', 'playwright', 'install', 'chromium'], capture_output=False, cwd=out_dir)
                # ask playwright (in a subprocess) where it put the chromium executable
                proc = run_shell([PYTHON_BINARY, '-c', 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)'], capture_output=True, text=True, cwd=out_dir)
                NEW_CHROME_BINARY = proc.stdout.decode().strip() if isinstance(proc.stdout, bytes) else proc.stdout.strip()
                assert NEW_CHROME_BINARY and len(NEW_CHROME_BINARY), 'CHROME_BINARY must contain a path'
                config(f'CHROME_BINARY={NEW_CHROME_BINARY}', set=True, out_dir=out_dir)
            except BaseException as e:                                              # lgtm [py/catch-base-exception]
                stderr(f'[X] Failed to install chromium using playwright: {e.__class__.__name__} {e}', color='red')
                raise SystemExit(1)

    stderr('\n Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
    if not NODE_VERSION:
        stderr('[X] You must first install node & npm using your system package manager', color='red')
        hint([
            'https://github.com/nodesource/distributions#table-of-contents',
            'or to disable all node-based modules run: archivebox config --set USE_NODE=False',
        ])
        raise SystemExit(1)

    if all((SINGLEFILE_VERSION, READABILITY_VERSION, MERCURY_VERSION)):
        print('SINGLEFILE_BINARY, READABILITY_BINARY, and MERCURY_BINARY are already installed')
    else:
        try:
            # clear out old npm package locations
            paths = (
                out_dir / 'package.json',
                out_dir / 'package-lock.json',  # npm names its lock file with a hyphen
                out_dir / 'node_modules',
            )
            for path in paths:
                if path.is_dir():
                    shutil.rmtree(path, ignore_errors=True)
                elif path.is_file():
                    os.remove(path)

            shutil.copyfile(PACKAGE_DIR / 'package.json', out_dir / 'package.json')   # copy the js requirements list from the source install into the data dir
            # lets blindly assume that calling out to npm via shell works reliably cross-platform 🤡 (until proven otherwise via support tickets)
            run_shell([
                'npm',
                'install',
                '--prefix', str(out_dir),        # force it to put the node_modules dir in this folder
                '--force',                       # overwrite any existing node_modules
                '--no-save',                     # don't bother saving updating the package.json or package-lock.json file
                '--no-audit',                    # don't bother checking for newer versions with security vuln fixes
                '--no-fund',                     # hide "please fund our project" messages
                '--loglevel', 'error',           # only show errors (hide warn/info/debug) during installation
                # these args are written in blood, change with caution
            ], capture_output=False, cwd=out_dir)
            os.remove(out_dir / 'package.json')
        except BaseException as e:                                              # lgtm [py/catch-base-exception]
            stderr(f'[X] Failed to install npm packages: {e}', color='red')
            hint(f'Try deleting {out_dir}/node_modules and running it again')
            raise SystemExit(1)

    stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')

    run_shell([PYTHON_BINARY, ARCHIVEBOX_BINARY, '--version'], capture_output=False, cwd=out_dir)
2019-04-27 21:26:24 +00:00
@enforce_types
def config(config_options_str: Optional[str]=None,
           config_options: Optional[List[str]]=None,
           get: bool=False,
           set: bool=False,
           reset: bool=False,
           out_dir: Path=OUTPUT_DIR) -> None:
    """Get and set your ArchiveBox project configuration values"""
    # Options arrive either as a list (`config_options`) or as a
    # newline-separated string (`config_options_str`), never both.
    #   get / no arguments: print the requested (or all) values, then exit via
    #                       SystemExit (status 0 when something was printed)
    #   set:                write KEY=VALUE pairs to the config file in `out_dir`
    #   reset:              not implemented, exits with status 2

    check_data_folder(CONFIG)

    if config_options and config_options_str:
        stderr(
            '[X] You should either pass config values as an arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif config_options_str:
        config_options = config_options_str.split('\n')

    config_options = config_options or []

    no_args = not (get or set or reset or config_options)

    matching_config: ConfigDict = {}
    if get or no_args:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
            failed_config = [key for key in config_options if key not in CONFIG]
            if failed_config:
                stderr()
                stderr('[X] These options failed to get', color='red')
                # list only the keys that could not be found, not every requested key
                stderr(' {}'.format('\n '.join(failed_config)))
                raise SystemExit(1)
        else:
            matching_config = CONFIG

        print(printable_config(matching_config))
        raise SystemExit(not matching_config)
    elif set:
        new_config = {}
        failed_options = []
        for line in config_options:
            # skip comments and blank lines
            if line.startswith('#') or not line.strip():
                continue
            if '=' not in line:
                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
                stderr(f' {line}')
                raise SystemExit(2)

            raw_key, val = line.split('=', 1)
            raw_key = raw_key.upper().strip()
            key = get_real_name(raw_key)
            if key != raw_key:
                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')

            if key in CONFIG:
                new_config[key] = val.strip()
            else:
                failed_options.append(line)

        if new_config:
            before = CONFIG
            # honour the caller's out_dir instead of always writing to the global OUTPUT_DIR
            matching_config = write_config_file(new_config, out_dir=out_dir)
            after = load_all_config()
            print(printable_config(matching_config))

            # report user-facing options whose value changed as a side effect of this write
            side_effect_changes: ConfigDict = {}
            for key, val in after.items():
                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
                    side_effect_changes[key] = after[key]

            if side_effect_changes:
                stderr()
                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
                print(' {}'.format(printable_config(side_effect_changes, prefix=' ')))
        if failed_options:
            stderr()
            stderr('[X] These options failed to set (check for typos):', color='red')
            stderr(' {}'.format('\n '.join(failed_options)))
            raise SystemExit(1)
    elif reset:
        stderr('[X] This command is not implemented yet.', color='red')
        stderr(' Please manually remove the relevant lines from your config file:')
        stderr(f' {CONFIG_FILE}')
        raise SystemExit(2)
    else:
        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
        stderr(' archivebox config')
        stderr(' archivebox config --get SOME_KEY')
        stderr(' archivebox config --set SOME_KEY=SOME_VALUE')
        raise SystemExit(2)
@enforce_types
def schedule(add: bool=False,
             show: bool=False,
             clear: bool=False,
             foreground: bool=False,
             run_all: bool=False,
             quiet: bool=False,
             every: Optional[str]=None,
             tag: str='',
             depth: int=0,
             overwrite: bool=False,
             update: bool=not ONLY_NEW,
             import_path: Optional[str]=None,
             out_dir: Path=OUTPUT_DIR):
    """Set ArchiveBox to regularly import URLs at specific times using cron"""
    #   clear:              remove all ArchiveBox cron jobs, then exit 0
    #   every / add:        create a new cron job (`every` defaults to 'day')
    #   show:               print the existing jobs, then exit 0
    #   run_all:            run every existing job once, right now
    #   foreground:         run the cron scheduler in this process
    #   import_path:        when given the job runs `add <import_path>`, otherwise `update`
    #   tag/depth/overwrite/update: forwarded as flags to the scheduled `add` command

    check_data_folder(CONFIG)

    Path(LOGS_DIR).mkdir(exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)

    if clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)

    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if every or add:
        every = every or 'day'
        # wrap values containing spaces in double quotes for the shell command line
        quoted = lambda s: f'"{s}"' if (s and ' ' in str(s)) else str(s)
        cmd = [
            'cd',
            quoted(out_dir),
            '&&',
            quoted(ARCHIVEBOX_BINARY),
            *([
                'add',
                *(['--overwrite'] if overwrite else []),
                *(['--update'] if update else []),
                *([f'--tag={tag}'] if tag else []),
                f'--depth={depth}',
                f'"{import_path}"',
            ] if import_path else ['update']),
            '>>',
            quoted(Path(LOGS_DIR) / 'schedule.log'),
            '2>&1',
        ]
        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)

        if every in ('minute', 'hour', 'day', 'month', 'year'):
            # e.g. new_job.every().day()
            set_every = getattr(new_job.every(), every)
            set_every()
        elif CronSlices.is_valid(every):
            new_job.setall(every)
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
            stderr(' It must be one of minute/hour/day/month/year')
            stderr(' or a quoted cron-format schedule like:')
            stderr(' archivebox schedule --every=day --depth=1 https://example.com/some/rss/feed.xml')
            stderr(' archivebox schedule --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_cron_jobs(cron)
        cron.write()

        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
        # mark the job that was just added with a leading ">"
        print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
            stderr(' Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr(' Make sure you have enough storage space available to hold all the data.')
            stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
            stderr('')
    elif show:
        if existing_jobs:
            print('\n'.join(str(cmd) for cmd in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
            stderr(' To schedule a new job, run:')
            stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
        raise SystemExit(0)

    # re-read the crontab so the foreground modes see any job written above
    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)
    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if foreground or run_all:
        if not existing_jobs:
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
            stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
        if run_all:
            try:
                for job in existing_jobs:
                    sys.stdout.write(f' > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n')
                    sys.stdout.write(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
                    sys.stdout.flush()
                    job.run()
                    sys.stdout.write(f'\r{job.command.split("/archivebox ")[-1]}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)

        if foreground:
            try:
                for job in existing_jobs:
                    print(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)

    if CAN_UPGRADE:
        hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
2023-11-28 04:58:13 +00:00
2020-08-18 05:59:04 +00:00
@enforce_types
def server(runserver_args: Optional[List[str]]=None,
           reload: bool=False,
           debug: bool=False,
           init: bool=False,
           quick_init: bool=False,
           createsuperuser: bool=False,
           out_dir: Path=OUTPUT_DIR) -> None:
    """Run the ArchiveBox HTTP server"""
    #   runserver_args:   extra arguments; in debug mode they are passed to Django's
    #                     runserver, otherwise only a host/port argument is read from them
    #   reload:           debug mode only; when False '--noreload' is appended
    #   debug:            use Django's runserver instead of the supervised daphne worker
    #   init/quick_init:  run `init` (or `init --quick`) first
    #   createsuperuser:  run `manage createsuperuser` first

    # work on a copy so appending '--noreload' below never mutates the caller's list
    runserver_args = list(runserver_args or [])

    if init:
        run_subcommand('init', stdin=None, pwd=out_dir)
        print()
    elif quick_init:
        run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
        print()

    if createsuperuser:
        run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
        print()

    # setup config for django runserver
    from . import config
    config.SHOW_PROGRESS = False
    config.DEBUG = config.DEBUG or debug

    check_data_folder(CONFIG)

    from django.core.management import call_command
    # resolve the active user model the same way setup() does, so a swapped
    # AUTH_USER_MODEL keeps working
    from django.contrib.auth import get_user_model
    User = get_user_model()

    print('{green}[+] Starting ArchiveBox webserver... {reset}'.format(**ANSI))
    print(' > Logging errors to ./logs/errors.log')
    if not User.objects.filter(is_superuser=True).exists():
        print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
        print()
        print(' To create an admin user, run:')
        print(' archivebox manage createsuperuser')
        print()

    # toggle autoreloading when archivebox code changes
    config.SHOW_PROGRESS = False
    config.DEBUG = config.DEBUG or debug

    if debug:
        if not reload:
            runserver_args.append('--noreload')  # '--insecure'
        call_command("runserver", *runserver_args)
    else:
        host = '127.0.0.1'
        port = '8000'

        # use the first argument made up only of digits, dots and colons as the bind address
        try:
            host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
            if ':' in host_and_port:
                # split on the last colon only, so extra colons cannot raise ValueError
                host, port = host_and_port.rsplit(':', 1)
            else:
                if '.' in host_and_port:
                    host = host_and_port
                else:
                    port = host_and_port
        except IndexError:
            pass

        print(f' > Starting ArchiveBox webserver on http://{host}:{port}/')

        from queues.supervisor_util import get_or_create_supervisord_process, start_worker, stop_worker, watch_worker

        print()
        supervisor = get_or_create_supervisord_process(daemonize=False)

        bg_workers = [
            {
                "name": "worker_system_tasks",
                "command": "archivebox manage djangohuey --queue system_tasks",
                "autostart": "true",
                "autorestart": "true",
                "stdout_logfile": "logs/worker_system_tasks.log",
                "redirect_stderr": "true",
            },
        ]
        fg_worker = {
            "name": "worker_daphne",
            "command": f"daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application",
            "autostart": "false",
            "autorestart": "true",
            "stdout_logfile": "logs/worker_daphne.log",
            "redirect_stderr": "true",
        }

        print()
        for worker in bg_workers:
            start_worker(supervisor, worker)

        print()
        start_worker(supervisor, fg_worker)
        print()

        try:
            watch_worker(supervisor, "worker_daphne")
        except KeyboardInterrupt:
            print("\n[🛑] Got Ctrl+C, stopping gracefully...")
        except SystemExit:
            pass
        except BaseException as e:
            print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
            raise
        finally:
            # always stop the web worker, whatever ended the watch loop
            stop_worker(supervisor, "worker_daphne")
            time.sleep(0.5)

        print("\n[🟩] ArchiveBox server shut down gracefully.")
2019-04-27 21:26:24 +00:00
@enforce_types
def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
    """Run an ArchiveBox Django management command"""
    # Hands `args` (or ['help'] when empty) to Django's command-line runner,
    # after warning when createsuperuser is requested inside docker without a TTY.

    check_data_folder(CONFIG)
    from django.core.management import execute_from_command_line

    wants_superuser = args and "createsuperuser" in args
    docker_without_tty = IN_DOCKER and not IS_TTY
    if wants_superuser and docker_without_tty:
        example_args = ' '.join(args or ['...'])
        stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
        stderr(' docker run -it archivebox manage {}'.format(example_args), color='lightyellow')
        stderr('')

    argv = [f'{ARCHIVEBOX_BINARY} manage']
    argv.extend(args or ['help'])
    execute_from_command_line(argv)
2019-05-01 03:13:04 +00:00
@enforce_types
def shell(out_dir: Path=OUTPUT_DIR) -> None:
    """Enter an interactive ArchiveBox Django shell"""
    # Verifies the data folder, then runs the "shell_plus" management command.

    check_data_folder(CONFIG)

    from django.core import management
    management.call_command("shell_plus")
2020-07-29 16:19:06 +00:00