2019-04-27 21:26:24 +00:00
__package__ = ' archivebox '
import os
2024-09-10 07:04:39 +00:00
import time
2019-04-27 21:26:24 +00:00
import sys
import shutil
2020-11-28 03:59:18 +00:00
import platform
2024-09-06 09:55:06 +00:00
from typing import Dict , List , Optional , Iterable , IO , Union
2020-08-18 16:45:27 +00:00
from pathlib import Path
2022-05-10 03:15:55 +00:00
from datetime import date , datetime
2019-04-27 21:26:24 +00:00
from crontab import CronTab , CronSlices
2024-09-06 09:55:06 +00:00
2020-08-21 17:42:08 +00:00
from django . db . models import QuerySet
2024-09-06 09:55:06 +00:00
from django . utils import timezone
2019-04-27 21:26:24 +00:00
from . cli import (
2024-09-25 02:04:38 +00:00
CLI_SUBCOMMANDS ,
2019-04-27 21:26:24 +00:00
run_subcommand ,
display_first ,
meta_cmds ,
main_cmds ,
archive_cmds ,
)
2019-05-01 03:13:04 +00:00
from . parsers import (
2020-07-13 15:26:30 +00:00
save_text_as_source ,
save_file_as_source ,
2020-07-29 16:19:06 +00:00
parse_links_memory ,
2019-04-27 21:26:24 +00:00
)
2019-05-01 03:13:04 +00:00
from . index . schema import Link
2020-07-24 17:25:25 +00:00
from . util import enforce_types # type: ignore
2019-05-01 03:13:04 +00:00
from . system import get_dir_size , dedupe_cron_jobs , CRON_COMMENT
2021-04-05 23:51:57 +00:00
from . system import run as run_shell
2019-04-27 21:26:24 +00:00
from . index import (
load_main_index ,
2020-07-13 15:26:30 +00:00
parse_links_from_source ,
dedupe_links ,
2019-04-27 21:26:24 +00:00
write_main_index ,
2020-08-21 17:42:08 +00:00
snapshot_filter ,
2019-04-27 21:26:24 +00:00
get_indexed_folders ,
get_archived_folders ,
get_unarchived_folders ,
get_present_folders ,
get_valid_folders ,
get_invalid_folders ,
get_duplicate_folders ,
get_orphaned_folders ,
get_corrupted_folders ,
get_unrecognized_folders ,
fix_invalid_folder_locations ,
2020-12-05 17:10:17 +00:00
write_link_details ,
2019-04-27 21:26:24 +00:00
)
from . index . json import (
parse_json_main_index ,
parse_json_links_details ,
2020-11-28 17:28:39 +00:00
generate_json_index_from_links ,
2019-04-27 21:26:24 +00:00
)
2019-05-01 03:13:04 +00:00
from . index . sql import (
get_admins ,
apply_migrations ,
2020-07-23 20:07:00 +00:00
remove_from_sql_main_index ,
2019-05-01 03:13:04 +00:00
)
2020-11-28 17:28:39 +00:00
from . index . html import (
generate_index_from_links ,
)
2020-11-28 18:11:15 +00:00
from . index . csv import links_to_csv
2020-07-31 15:24:58 +00:00
from . extractors import archive_links , archive_link , ignore_methods
2024-09-25 02:04:38 +00:00
from . misc . logging import stderr , hint
from . misc . checks import check_data_folder , check_dependencies
2019-04-27 21:26:24 +00:00
from . config import (
ConfigDict ,
ANSI ,
2020-08-10 18:15:53 +00:00
IS_TTY ,
2021-02-16 06:23:58 +00:00
DEBUG ,
2020-08-10 18:15:53 +00:00
IN_DOCKER ,
2023-10-31 07:23:19 +00:00
IN_QEMU ,
2022-06-09 01:24:58 +00:00
PUID ,
PGID ,
2019-04-27 21:26:24 +00:00
USER ,
2022-06-09 01:41:22 +00:00
TIMEZONE ,
2022-06-09 01:24:58 +00:00
ENFORCE_ATOMIC_WRITES ,
OUTPUT_PERMISSIONS ,
2021-04-05 23:51:57 +00:00
PYTHON_BINARY ,
2019-04-27 21:26:24 +00:00
ARCHIVEBOX_BINARY ,
ONLY_NEW ,
OUTPUT_DIR ,
SOURCES_DIR ,
ARCHIVE_DIR ,
LOGS_DIR ,
2021-04-05 23:51:57 +00:00
PACKAGE_DIR ,
2019-04-27 21:26:24 +00:00
CONFIG_FILE ,
ARCHIVE_DIR_NAME ,
JSON_INDEX_FILENAME ,
HTML_INDEX_FILENAME ,
SQL_INDEX_FILENAME ,
2021-04-12 21:06:32 +00:00
ALLOWED_IN_OUTPUT_DIR ,
2021-02-16 06:23:58 +00:00
SEARCH_BACKEND_ENGINE ,
2023-12-18 00:57:02 +00:00
LDAP ,
2023-12-18 01:44:26 +00:00
get_version ,
2019-04-27 21:26:24 +00:00
write_config_file ,
VERSION ,
2023-12-19 18:01:08 +00:00
VERSIONS_AVAILABLE ,
2023-11-28 04:58:13 +00:00
CAN_UPGRADE ,
2022-06-09 03:13:22 +00:00
COMMIT_HASH ,
2023-12-18 00:57:02 +00:00
BUILD_TIME ,
2019-04-27 21:26:24 +00:00
CODE_LOCATIONS ,
DATA_LOCATIONS ,
DEPENDENCIES ,
2021-04-05 23:51:57 +00:00
CHROME_BINARY ,
CHROME_VERSION ,
2021-04-06 01:15:32 +00:00
YOUTUBEDL_BINARY ,
YOUTUBEDL_VERSION ,
SINGLEFILE_VERSION ,
READABILITY_VERSION ,
MERCURY_VERSION ,
2021-04-05 23:51:57 +00:00
NODE_VERSION ,
2019-04-27 21:26:24 +00:00
load_all_config ,
CONFIG ,
USER_CONFIG ,
2023-10-18 08:07:54 +00:00
ADMIN_USERNAME ,
ADMIN_PASSWORD ,
2019-04-27 21:26:24 +00:00
get_real_name ,
2021-04-06 01:15:32 +00:00
setup_django ,
2019-04-27 21:26:24 +00:00
)
2020-07-22 16:02:13 +00:00
from . logging_util import (
2020-06-26 03:32:01 +00:00
TERM_WIDTH ,
2019-05-01 03:13:04 +00:00
TimedProgress ,
2020-07-13 15:26:30 +00:00
log_importing_started ,
log_crawl_started ,
2019-04-27 21:26:24 +00:00
log_removal_started ,
log_removal_finished ,
log_list_started ,
log_list_finished ,
2019-05-01 03:13:04 +00:00
printable_config ,
printable_folders ,
printable_filesize ,
printable_folder_status ,
printable_dependency_version ,
2019-04-27 21:26:24 +00:00
)
2021-04-12 21:06:32 +00:00
2019-05-01 03:10:48 +00:00
@enforce_types
def help(out_dir: Path = OUTPUT_DIR) -> None:
    """Print the ArchiveBox help message and usage"""
    # The docstring above is left short on purpose: docstrings in this package
    # are shown to users as help text, so extra notes live in comments.
    #
    # out_dir: data directory to inspect. If it contains the SQL index the full
    #          usage screen is printed, otherwise a getting-started message.

    def _command_rows(selected) -> str:
        # One "<name padded to 20 cols> <summary>" row per subcommand accepted
        # by the `selected` predicate, in CLI_SUBCOMMANDS iteration order.
        return '\n    '.join(
            f'{cmd.ljust(20)} {summary}'
            for cmd, summary in CLI_SUBCOMMANDS.items()
            if selected(cmd)
        )

    # Groups are listed as: meta, main, archive, then everything that is not
    # part of display_first; a blank line separates the groups.
    COMMANDS_HELP_TEXT = '\n\n    '.join((
        _command_rows(lambda cmd: cmd in meta_cmds),
        _command_rows(lambda cmd: cmd in main_cmds),
        _command_rows(lambda cmd: cmd in archive_cmds),
        _command_rows(lambda cmd: cmd not in display_first),
    ))

    if (Path(out_dir) / SQL_INDEX_FILENAME).exists():
        print('''{green}ArchiveBox v{}: The self-hosted internet archive.{reset}

{lightred}Active data directory:{reset}
    {}

{lightred}Usage:{reset}
    archivebox [command] [--help] [--version] [...args]

{lightred}Commands:{reset}
    {}

{lightred}Example Use:{reset}
    mkdir my-archive; cd my-archive/
    archivebox init
    archivebox status

    archivebox add https://example.com/some/page
    archivebox add --depth=1 ~/Downloads/bookmarks_export.html

    archivebox list --sort=timestamp --csv=timestamp,url,is_archived
    archivebox schedule --every=day https://example.com/some/feed.rss
    archivebox update --resume=15109948213.123

{lightred}Documentation:{reset}
    https://github.com/ArchiveBox/ArchiveBox/wiki
'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
    else:
        print('{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI))
        print()
        if IN_DOCKER:
            print('When using Docker, you need to mount a volume to use as your data dir:')
            print('    docker run -v /some/path:/data archivebox ...')
            print()
        print('To import an existing archive (from a previous version of ArchiveBox):')
        print('    1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
        print('    2. archivebox init')
        print()
        print('To start a new archive:')
        print('    1. Create an empty directory, then cd into it and run:')
        print('    2. archivebox init')
        print()
        print('For more information, see the documentation here:')
        print('    https://github.com/ArchiveBox/ArchiveBox/wiki')
2019-04-27 21:26:24 +00:00
2019-05-01 03:10:48 +00:00
@enforce_types
def version(quiet: bool = False,
            out_dir: Path = OUTPUT_DIR) -> None:
    """Print the ArchiveBox version and dependency information"""
    # quiet:   when True only the bare version string is printed.
    # out_dir: accepted for signature parity with the other commands; this
    #          function does not read it.

    print(VERSION)
    if quiet:
        return

    # Example of the verbose header printed below:
    #   0.7.1
    #   ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
    #   IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
    #   FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
    #   DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False

    p = platform.uname()
    # Computed outside the f-string so the line does not depend on how quotes
    # may be nested inside f-string replacement fields (that only became
    # unrestricted in Python 3.12).
    short_commit_hash = COMMIT_HASH[:7] if COMMIT_HASH else 'unknown'
    print(
        'ArchiveBox v{}'.format(get_version(CONFIG)),
        f'COMMIT_HASH={short_commit_hash}',
        f'BUILD_TIME={BUILD_TIME}',
    )
    print(
        f'IN_DOCKER={IN_DOCKER}',
        f'IN_QEMU={IN_QEMU}',
        f'ARCH={p.machine}',
        f'OS={p.system}',
        f'PLATFORM={platform.platform()}',
        f'PYTHON={sys.implementation.name.title()}',
    )
    OUTPUT_IS_REMOTE_FS = (
        DATA_LOCATIONS['OUTPUT_DIR']['is_mount']
        or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
    )
    print(
        f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
        f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
        f'FS_USER={PUID}:{PGID}',
        f'FS_PERMS={OUTPUT_PERMISSIONS}',
    )
    print(
        f'DEBUG={DEBUG}',
        f'IS_TTY={IS_TTY}',
        f'TZ={TIMEZONE}',
        f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
        f'LDAP={LDAP}',
        # f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
    )
    print()

    print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
    for name, dependency in DEPENDENCIES.items():
        print(printable_dependency_version(name, dependency))

        # add a newline between core dependencies and extractor dependencies for easier reading
        if name == 'ARCHIVEBOX_BINARY':
            print()

    print()
    print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
    for name, path in CODE_LOCATIONS.items():
        print(printable_folder_status(name, path))

    print()
    if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
        print('{white}[i] Data locations:{reset}'.format(**ANSI))
        for name, path in DATA_LOCATIONS.items():
            print(printable_folder_status(name, path))
    else:
        print()
        print('{white}[i] Data locations:{reset} (not in a data directory)'.format(**ANSI))

    print()
    check_dependencies(CONFIG)
2019-04-27 21:26:24 +00:00
2019-05-01 03:10:48 +00:00
@enforce_types
def run(subcommand: str,
        subcommand_args: Optional[List[str]],
        stdin: Optional[IO] = None,
        out_dir: Path = OUTPUT_DIR) -> None:
    """Run a given ArchiveBox subcommand with the given list of args"""
    # Thin pass-through to run_subcommand(); out_dir is forwarded as `pwd`.
    run_subcommand(
        subcommand=subcommand,
        subcommand_args=subcommand_args,
        stdin=stdin,
        pwd=out_dir,
    )
2019-05-01 03:10:48 +00:00
@enforce_types
def init(force: bool = False, quick: bool = False, setup: bool = False, out_dir: Path = OUTPUT_DIR) -> None:
    """Initialize a new ArchiveBox collection in the current directory"""
    # force:   initialize even if out_dir already holds files but no SQL index.
    # quick:   skip the scan of the archive directories for misplaced, orphaned
    #          and invalid snapshot folders.
    # setup:   run the `setup` subcommand once initialization has finished.
    # out_dir: data directory to create/verify.
    #
    # Raises SystemExit(2) when out_dir is non-empty, has no SQL index and
    # force is not set; SystemExit(1) when the directory scan is interrupted.

    from core.models import Snapshot

    out_dir.mkdir(exist_ok=True)
    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)

    if (out_dir / JSON_INDEX_FILENAME).exists():
        stderr("[!] This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.", color="lightyellow")
        stderr("    You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.", color="lightyellow")

    existing_index = (out_dir / SQL_INDEX_FILENAME).exists()

    if is_empty and not existing_index:
        print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
        print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
    elif existing_index:
        # TODO: properly detect and print the existing version in current index as well
        print('{green}[*] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI))
        print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
    else:
        if force:
            stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
            stderr('    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
        else:
            stderr(
                ("{red}[X] This folder appears to already have files in it, but no index.sqlite3 present.{reset}\n\n"
                 "    You must run init in a completely empty directory, or an existing data folder.\n\n"
                 "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first,\n"
                 "    then run 'archivebox init' to pick up where you left off.\n\n"
                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
                 ).format(**ANSI)
            )
            raise SystemExit(2)

    if existing_index:
        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))

    print(f'    + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
    Path(SOURCES_DIR).mkdir(exist_ok=True)
    Path(ARCHIVE_DIR).mkdir(exist_ok=True)
    Path(LOGS_DIR).mkdir(exist_ok=True)
    print(f'    + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
    write_config_file({}, out_dir=out_dir)

    if (out_dir / SQL_INDEX_FILENAME).exists():
        print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))

    DATABASE_FILE = out_dir / SQL_INDEX_FILENAME
    for migration_line in apply_migrations(out_dir):
        print(f'    {migration_line}')

    assert DATABASE_FILE.exists()
    print()
    print(f'    √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')

    # from django.contrib.auth.models import User
    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
    #     call_command("createsuperuser", interactive=True)

    print()
    print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI))

    all_links = Snapshot.objects.none()
    pending_links: Dict[str, Link] = {}

    if existing_index:
        all_links = load_main_index(out_dir=out_dir, warn=False)
        print('    √ Loaded {} links from existing main index.'.format(all_links.count()))

    if quick:
        print('    > Skipping full snapshot directory check (quick mode)')
    else:
        try:
            # Links in data folders that dont match their timestamp
            fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
            if fixed:
                print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
            if cant_fix:
                print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))

            # Links in JSON index but not in main index
            orphaned_json_links = {
                link.url: link
                for link in parse_json_main_index(out_dir)
                if not all_links.filter(url=link.url).exists()
            }
            if orphaned_json_links:
                pending_links.update(orphaned_json_links)
                print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

            # Links in data dir indexes but not in main index
            orphaned_data_dir_links = {
                link.url: link
                for link in parse_json_links_details(out_dir)
                if not all_links.filter(url=link.url).exists()
            }
            if orphaned_data_dir_links:
                pending_links.update(orphaned_data_dir_links)
                print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))

            # Links in invalid/duplicate data dirs
            invalid_folders = dict(get_invalid_folders(all_links, out_dir=out_dir).items())
            if invalid_folders:
                print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
                print('        X ' + '\n        X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items()))
                print()
                print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
                print('        archivebox status')
                print('        archivebox list --status=invalid')

        except (KeyboardInterrupt, SystemExit):
            stderr()
            stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red')
            stderr('    Your archive data is safe, but you should re-run `archivebox init` to finish the process later.')
            stderr()
            stderr('    {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI))
            stderr('        archivebox init --quick')
            raise SystemExit(1)

        # NOTE(review): assumed to belong to the non-quick branch (only that
        # branch can populate pending_links) -- confirm against upstream.
        write_main_index(list(pending_links.values()), out_dir=out_dir)

    print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))

    from django.contrib.auth.models import User

    if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
        print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
        User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)

    if existing_index:
        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
    else:
        # .count() avoids materializing the queryset just to measure it.
        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(all_links.count() + len(pending_links), **ANSI))

    # Move the deprecated static indexes out of the way. The rename target is
    # anchored to out_dir: Path.rename() resolves a relative target against the
    # current working directory, which would move the file out of the data dir
    # whenever out_dir is not the cwd.
    json_index = out_dir / JSON_INDEX_FILENAME
    html_index = out_dir / HTML_INDEX_FILENAME
    index_name = f"{date.today()}_index_old"
    if json_index.exists():
        json_index.rename(out_dir / f"{index_name}.json")
    if html_index.exists():
        html_index.rename(out_dir / f"{index_name}.html")

    if setup:
        run_subcommand('setup', pwd=out_dir)

    if Snapshot.objects.count() < 25:     # hide the hints for experienced users
        print()
        print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
        print('        archivebox server  # then visit http://127.0.0.1:8000')
        print()
        print('    To add new links, you can run:')
        print("        archivebox add < ~/some/path/to/list_of_links.txt")
        print()
        print('    For more usage and examples, run:')
        print('        archivebox help')
2019-04-27 21:26:24 +00:00
2019-05-01 03:10:48 +00:00
@enforce_types
def status(out_dir: Path = OUTPUT_DIR) -> None:
    """Print out some info and statistics about the archive collection"""
    # out_dir: data directory whose main index and archive folders are scanned.
    # Output is written to stdout only; nothing is returned.

    check_data_folder(CONFIG)

    from core.models import Snapshot
    from django.contrib.auth import get_user_model
    User = get_user_model()

    print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f'   {out_dir}/*', ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
    size = printable_filesize(num_bytes)
    print(f'    Index size: {size} across {num_files} files')
    print()

    links = load_main_index(out_dir=out_dir)
    num_sql_links = links.count()
    num_link_details = sum(1 for _ in parse_json_links_details(out_dir=out_dir))
    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
    print()

    print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
    size = printable_filesize(num_bytes)
    print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
    print(ANSI['black'])

    # Each count is labelled with the docstring of the function that produced it.
    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
    num_archived = len(get_archived_folders(links, out_dir=out_dir))
    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
    print(f'      > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
    print(f'      > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

    num_present = len(get_present_folders(links, out_dir=out_dir))
    num_valid = len(get_valid_folders(links, out_dir=out_dir))
    print()
    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
    print(f'      > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    # Merged into one dict so a folder reported by several checks counts once.
    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
    print(f'      > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
    print(f'        > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
    print(f'        > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
    print(f'        > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
    print(f'        > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')

    print(ANSI['reset'])

    if num_indexed:
        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
        print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')

    if orphaned:
        print('    {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
        print('        archivebox init')

    if num_invalid:
        print('    {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
        print('        archivebox init')

    print()
    print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f'   {LOGS_DIR}/*', ANSI['reset'])
    users = get_admins().values_list('username', flat=True)
    print(f'    UI users {len(users)}: {", ".join(users)}')

    # The row picked by .last() can still carry an empty timestamp (e.g. nobody
    # has logged in yet); skip the line rather than print the text "None".
    last_login = User.objects.order_by('last_login').last()
    if last_login and last_login.last_login:
        print(f'    Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}')
    last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
    if last_downloaded and last_downloaded.downloaded_at:
        print(f'    Last changes: {str(last_downloaded.downloaded_at)[:16]}')

    if not users:
        print()
        print('    {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
        print('        archivebox manage createsuperuser')

    print()
    # Show the most recently downloaded snapshots (at most 10 rows fetched),
    # each line truncated to the terminal width.
    for snapshot in links.order_by('-downloaded_at')[:10]:
        if not snapshot.downloaded_at:
            continue
        print(
            ANSI['black'],
            (
                f'   > {str(snapshot.downloaded_at)[:16]} '
                f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                f'"{snapshot.title}": {snapshot.url}'
            )[:TERM_WIDTH()],
            ANSI['reset'],
        )
    print(ANSI['black'], '   ...', ANSI['reset'])
2019-04-27 21:26:24 +00:00
2020-07-29 16:19:06 +00:00
@enforce_types
def oneshot(url: str, extractors: str = "", out_dir: Path = OUTPUT_DIR, created_by_id: int | None = None) -> List[Link]:
    """
    Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
    You can run this to archive single pages without needing to create a whole collection with archivebox init.
    """
    # url:           text to parse; it must yield exactly one link.
    # extractors:    comma-separated extractor names; when empty, the result of
    #                ignore_methods(['title']) is used instead.
    # out_dir:       directory passed to archive_link() for the output.
    # created_by_id: forwarded unchanged to archive_link().
    #
    # Returns the single-element list of parsed links.
    # Raises SystemExit(2) when the input does not parse to exactly one link.
    oneshot_link, _ = parse_links_memory([url])

    # Reject zero links as well as several: indexing [0] on an empty result
    # would otherwise surface as a bare IndexError.
    if len(oneshot_link) != 1:
        stderr(
            '[X] You should pass a single url to the oneshot command',
            color='red'
        )
        raise SystemExit(2)

    methods = extractors.split(",") if extractors else ignore_methods(['title'])
    archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, created_by_id=created_by_id)
    return oneshot_link
2020-07-29 16:19:06 +00:00
2019-04-27 21:26:24 +00:00
@enforce_types
def add(urls: Union[str, List[str]],
        tag: str='',
        depth: int=0,
        update: bool=not ONLY_NEW,
        update_all: bool=False,
        index_only: bool=False,
        overwrite: bool=False,
        # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
        init: bool=False,
        extractors: str="",
        parser: str="auto",
        created_by_id: int | None=None,
        out_dir: Path=OUTPUT_DIR) -> List[Link]:
    """Add a new URL or list of URLs to your archive

    The input is first saved verbatim as a source file, parsed into links,
    optionally crawled one level deep (depth=1), deduplicated against the
    existing index, written to the main index, tagged, and finally archived.

    `tag` and `extractors` are comma-separated strings.
    Which set of links gets archived depends on the flags, checked in this order:
    `index_only`, `update`, `update_all`, `overwrite`, otherwise only new links.

    Returns the links that were not already present in the index.
    """

    from core.models import Snapshot, Tag
    # from queues.supervisor_util import start_cli_workers, tail_worker_logs
    # from queues.tasks import bg_archive_link

    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

    extractors = extractors.split(",") if extractors else []

    if init:
        run_subcommand('init', stdin=None, pwd=out_dir)

    # Load list of links from the existing index
    check_data_folder(CONFIG)
    check_dependencies(CONFIG)

    # worker = start_cli_workers()

    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)

    log_importing_started(urls=urls, depth=depth, index_only=index_only)
    if isinstance(urls, str):
        # save verbatim stdin to sources
        write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
    elif isinstance(urls, list):
        # save verbatim args to sources
        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)

    new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)

    # If we're going one level deeper, download each link and look for more links
    new_links_depth = []
    if new_links and depth == 1:
        log_crawl_started(new_links)
        for new_link in new_links:
            try:
                downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
                new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
            except Exception as err:
                # best-effort crawl: report the failing URL and continue with the rest
                stderr(f'[!] Failed to get contents of URL {new_link.url}', err, color='red')

    # keep one link per URL (later entries win)
    imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())

    new_links = dedupe_links(all_links, imported_links)

    write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
    all_links = load_main_index(out_dir=out_dir)

    tags = [
        Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
        for name in tag.split(',')
        if name.strip()
    ]
    if tags:
        for link in imported_links:
            snapshot = Snapshot.objects.get(url=link.url)
            snapshot.tags.add(*tags)
            snapshot.tags_str(nocache=True)
            snapshot.save()
        # print(f'    √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')

    if index_only:
        # mock archive all the links using the fake index_only extractor method in order to update their state
        if overwrite:
            archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
        else:
            archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
    else:
        # fully run the archive extractor methods for each link
        archive_kwargs = {
            "out_dir": out_dir,
            "created_by_id": created_by_id,
        }
        if extractors:
            archive_kwargs["methods"] = extractors

        stderr()

        ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

        if update:
            stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
            archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
        elif update_all:
            stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
            archive_links(all_links, overwrite=overwrite, **archive_kwargs)
        elif overwrite:
            stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
            archive_links(imported_links, overwrite=True, **archive_kwargs)
        elif new_links:
            stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
            archive_links(new_links, overwrite=False, **archive_kwargs)

    # tail_worker_logs(worker['stdout_logfile'])

    if CAN_UPGRADE:
        hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")

    return new_links
2019-04-27 21:26:24 +00:00
@enforce_types
def remove(filter_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: str='exact',
           snapshots: Optional[QuerySet]=None,
           after: Optional[float]=None,
           before: Optional[float]=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: Path=OUTPUT_DIR) -> List[Link]:
    """Remove the specified URLs from the archive

    The snapshots to remove are either passed directly via `snapshots`, or
    selected with `filter_str` (newline-separated patterns) or
    `filter_patterns`, but not both. With `delete=True` each matching
    snapshot's folder is also removed from disk.

    Returns the main index as loaded after the removal.
    Exits with status 2 on invalid filter arguments and status 1 when nothing matches.
    """

    check_data_folder(CONFIG)

    if snapshots is None:
        if filter_str and filter_patterns:
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin, but not both.\n',
                color='red',
            )
            raise SystemExit(2)
        elif not (filter_str or filter_patterns):
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin.',
                color='red',
            )
            stderr()
            hint(('To remove all urls you can run:',
                  'archivebox remove --filter-type=regex ".*"'))
            stderr()
            raise SystemExit(2)
        elif filter_str:
            filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
    elif not snapshots.exists():
        # An explicitly provided but empty queryset means "nothing to remove".
        # Without this guard the empty queryset was dropped below and the
        # removal silently ran against the whole index instead.
        log_removal_finished(0, 0)
        raise SystemExit(1)

    list_kwargs = {
        "filter_patterns": filter_patterns,
        "filter_type": filter_type,
        "after": after,
        "before": before,
    }
    if snapshots is not None:
        list_kwargs["snapshots"] = snapshots

    log_list_started(filter_patterns, filter_type)
    timer = TimedProgress(360, prefix='      ')
    try:
        snapshots = list_links(**list_kwargs)
    finally:
        timer.end()

    if not snapshots.exists():
        log_removal_finished(0, 0)
        raise SystemExit(1)

    log_links = [link.as_link() for link in snapshots]
    log_list_finished(log_links)
    log_removal_started(log_links, yes=yes, delete=delete)

    timer = TimedProgress(360, prefix='      ')
    try:
        if delete:
            for snapshot in snapshots:
                shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
    finally:
        timer.end()

    # count before the rows are removed from the SQL index below
    to_remove = snapshots.count()

    from .search import flush_search_index

    flush_search_index(snapshots=snapshots)
    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
    all_snapshots = load_main_index(out_dir=out_dir)
    log_removal_finished(all_snapshots.count(), to_remove)

    return all_snapshots
2019-04-27 21:26:24 +00:00
@enforce_types
def update(resume: Optional[float]=None,
           only_new: bool=ONLY_NEW,
           index_only: bool=False,
           overwrite: bool=False,
           filter_patterns_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: Optional[str]=None,
           status: Optional[str]=None,
           after: Optional[str]=None,
           before: Optional[str]=None,
           extractors: str="",
           out_dir: Path=OUTPUT_DIR) -> List[Link]:
    """Import any new links from subscriptions and retry any previously failed/skipped links

    Selects snapshots with list_links() / list_folders(), sorts them by their
    number of ArchiveResults and timestamp, then either rewrites their index
    files only (`index_only`) or re-runs the archive methods on them.

    `extractors` is a comma-separated string of archive method names.
    `resume` keeps only links whose timestamp is >= str(resume).
    NOTE: `filter_patterns_str` is accepted but not read by this function, and
    `new_links` is never populated here, so `only_new=True` archives nothing.

    Returns the selected links when `index_only` is set or nothing is left to
    resume, otherwise the main index as reloaded after archiving.
    """

    from core.models import ArchiveResult
    from .search import index_links
    # from .queues.supervisor_util import start_cli_workers

    check_data_folder(CONFIG)
    check_dependencies(CONFIG)

    # start_cli_workers()
    new_links: List[Link] = []  # TODO: Remove input argument: only_new

    extractors = extractors.split(",") if extractors else []

    # Step 1: Filter for selected_links
    print('[*] Finding matching Snapshots to update...')
    # filter_patterns defaults to None, which str.join() cannot iterate
    print(f'    - Filtering by {" ".join(filter_patterns or [])} ({filter_type}) {before=} {after=} {status=}...')
    matching_snapshots = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )
    print(f'    - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
    matching_folders = list_folders(
        links=matching_snapshots,
        status=status,
        out_dir=out_dir,
    )
    all_links = (link for link in matching_folders.values() if link)
    print('    - Sorting by most unfinished -> least unfinished + date archived...')
    all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))

    if index_only:
        for link in all_links:
            write_link_details(link, out_dir=out_dir, skip_sql_index=True)
        index_links(all_links, out_dir=out_dir)
        return all_links

    # Step 2: Run the archive methods for each link
    to_archive = new_links if only_new else all_links
    if resume:
        to_archive = [
            link for link in to_archive
            if link.timestamp >= str(resume)
        ]
        if not to_archive:
            stderr('')
            stderr(f'[√] Nothing found to resume after {resume}', color='green')
            return all_links

    archive_kwargs = {
        "out_dir": out_dir,
    }
    if extractors:
        archive_kwargs["methods"] = extractors

    archive_links(to_archive, overwrite=overwrite, **archive_kwargs)

    # Step 3: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=out_dir)
    return all_links
@enforce_types
def list_all(filter_patterns_str: Optional[str]=None,
             filter_patterns: Optional[List[str]]=None,
             filter_type: str='exact',
             status: Optional[str]=None,
             after: Optional[float]=None,
             before: Optional[float]=None,
             sort: Optional[str]=None,
             csv: Optional[str]=None,
             json: bool=False,
             html: bool=False,
             with_headers: bool=False,
             out_dir: Path=OUTPUT_DIR) -> Iterable[Link]:
    """List, filter, and export information about archive entries

    Patterns come either from `filter_patterns` or from the newline-separated
    `filter_patterns_str`, never both (exits with status 2 in that case).
    The matching folders are printed as JSON, HTML, CSV (`csv` is a
    comma-separated column list) or a plain listing, and also returned.
    """

    check_data_folder(CONFIG)

    if filter_patterns_str:
        if filter_patterns:
            stderr(
                '[X] You should either pass filter patterns as an arguments '
                'or via stdin, but not both.\n',
                color='red',
            )
            raise SystemExit(2)
        filter_patterns = filter_patterns_str.split('\n')

    matched = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )
    matched = matched.order_by(sort) if sort else matched

    matched_folders = list_folders(
        links=matched,
        status=status,
        out_dir=out_dir,
    )

    # output format precedence: json, then html, then csv, then plain listing
    if not (json or html or csv):
        rendered = printable_folders(matched_folders, with_headers=with_headers)
    elif json:
        rendered = generate_json_index_from_links(matched_folders.values(), with_headers)
    elif html:
        rendered = generate_index_from_links(matched_folders.values(), with_headers)
    else:
        rendered = links_to_csv(matched_folders.values(), cols=csv.split(','), header=with_headers)

    print(rendered)
    return matched_folders
2019-04-27 21:26:24 +00:00
@enforce_types
def list_links(snapshots: Optional[QuerySet]=None,
               filter_patterns: Optional[List[str]]=None,
               filter_type: str='exact',
               after: Optional[float]=None,
               before: Optional[float]=None,
               out_dir: Path=OUTPUT_DIR) -> Iterable[Link]:
    """Return the snapshots matching the given time bounds and filter patterns

    Starts from `snapshots` when one is provided (even if it is empty),
    otherwise from the main index loaded from `out_dir`. Keeps entries with
    `after <= timestamp < before`, then applies snapshot_filter() when
    `filter_patterns` is given. Prints a warning when nothing is left.
    """

    check_data_folder(CONFIG)

    # Compare against None rather than truthiness: an empty queryset passed by
    # the caller must stay empty instead of falling back to the whole index.
    if snapshots is not None:
        all_snapshots = snapshots
    else:
        all_snapshots = load_main_index(out_dir=out_dir)

    if after is not None:
        all_snapshots = all_snapshots.filter(timestamp__gte=after)
    if before is not None:
        all_snapshots = all_snapshots.filter(timestamp__lt=before)
    if filter_patterns:
        all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)

    if not all_snapshots:
        stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')

    return all_snapshots
2019-04-27 21:26:24 +00:00
@enforce_types
def list_folders(links: List[Link],
                 status: str,
                 out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Return the folders for `links` that match the given `status`

    `status` selects one of the get_*_folders helpers (indexed, archived,
    unarchived, present, valid, invalid, duplicate, orphaned, corrupted,
    unrecognized), which is called with `links` and `out_dir`.

    Raises ValueError when `status` is not one of those names.
    """

    check_data_folder(CONFIG)

    STATUS_FUNCTIONS = {
        "indexed": get_indexed_folders,
        "archived": get_archived_folders,
        "unarchived": get_unarchived_folders,
        "present": get_present_folders,
        "valid": get_valid_folders,
        "invalid": get_invalid_folders,
        "duplicate": get_duplicate_folders,
        "orphaned": get_orphaned_folders,
        "corrupted": get_corrupted_folders,
        "unrecognized": get_unrecognized_folders,
    }

    # Only the lookup is guarded, so that a KeyError raised inside one of the
    # helpers is not misreported as an unrecognized status.
    try:
        get_folders = STATUS_FUNCTIONS[status]
    except KeyError:
        raise ValueError('Status not recognized.')

    return get_folders(links, out_dir=out_dir)
2019-04-27 21:26:24 +00:00
2021-04-05 23:51:57 +00:00
@enforce_types
def setup(out_dir: Path=OUTPUT_DIR) -> None:
    """Automatically install all ArchiveBox dependencies and extras

    Runs `init` if the archive folder is missing, creates an admin user if no
    superuser exists, then installs yt-dlp (pip), chromium (playwright, skipped
    on armv7l) and the node-based extractors (npm) when their *_VERSION values
    are not already set. The new binary paths are saved through config().

    Exits with status 1 as soon as any installation step fails.
    """

    if not (out_dir / ARCHIVE_DIR_NAME).exists():
        run_subcommand('init', stdin=None, pwd=out_dir)

    setup_django(out_dir=out_dir, check_db=True)
    from django.contrib.auth import get_user_model
    User = get_user_model()

    if not User.objects.filter(is_superuser=True).exists():
        stderr('\n[+] Creating new admin user for the Web UI...', color='green')
        run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)

    stderr('\n[+] Installing enabled ArchiveBox dependencies automatically...', color='green')

    stderr('\n    Installing YOUTUBEDL_BINARY automatically using pip...')
    if YOUTUBEDL_VERSION:
        print(f'{YOUTUBEDL_VERSION} is already installed', YOUTUBEDL_BINARY)
    else:
        try:
            run_shell([
                PYTHON_BINARY, '-m', 'pip',
                'install',
                '--upgrade',
                '--no-cache-dir',
                '--no-warn-script-location',
                'yt-dlp',
            ], capture_output=False, cwd=out_dir)
            pip_show = run_shell([
                PYTHON_BINARY, '-m', 'pip',
                'show',
                'yt-dlp',
            ], capture_output=True, text=True, cwd=out_dir)
            # stdout may be str or bytes depending on how run_shell honors text=True
            pip_show_output = pip_show.stdout.decode() if isinstance(pip_show.stdout, bytes) else pip_show.stdout
            pkg_path = pip_show_output.split('Location: ')[-1].split('\n', 1)[0].strip()
            # the pip distribution is named "yt-dlp" but its importable package dir is "yt_dlp"
            NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt_dlp' / '__main__.py'
            # check before chmod, otherwise chmod fails first with a less helpful error
            if not NEW_YOUTUBEDL_BINARY.exists():
                raise FileNotFoundError(f'yt-dlp must exist inside {pkg_path}')
            os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
            config(f'YOUTUBEDL_BINARY={NEW_YOUTUBEDL_BINARY}', set=True, out_dir=out_dir)
        except BaseException as e:                                              # lgtm [py/catch-base-exception]
            stderr(f'[X] Failed to install python packages: {e}', color='red')
            raise SystemExit(1)

    if platform.machine() == 'armv7l':
        stderr('\n    Skip the automatic installation of CHROME_BINARY because playwright is not available on armv7.')
    else:
        stderr('\n    Installing CHROME_BINARY automatically using playwright...')
        if CHROME_VERSION:
            print(f'{CHROME_VERSION} is already installed', CHROME_BINARY)
        else:
            try:
                run_shell([
                    PYTHON_BINARY, '-m', 'pip',
                    'install',
                    '--upgrade',
                    '--no-cache-dir',
                    '--no-warn-script-location',
                    'playwright',
                ], capture_output=False, cwd=out_dir)
                run_shell([PYTHON_BINARY, '-m', 'playwright', 'install', 'chromium'], capture_output=False, cwd=out_dir)
                proc = run_shell([PYTHON_BINARY, '-c', 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)'], capture_output=True, text=True, cwd=out_dir)
                NEW_CHROME_BINARY = proc.stdout.decode().strip() if isinstance(proc.stdout, bytes) else proc.stdout.strip()
                assert NEW_CHROME_BINARY and len(NEW_CHROME_BINARY), 'CHROME_BINARY must contain a path'
                config(f'CHROME_BINARY={NEW_CHROME_BINARY}', set=True, out_dir=out_dir)
            except BaseException as e:                                              # lgtm [py/catch-base-exception]
                stderr(f'[X] Failed to install chromium using playwright: {e.__class__.__name__} {e}', color='red')
                raise SystemExit(1)

    stderr('\n    Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
    if not NODE_VERSION:
        stderr('[X] You must first install node & npm using your system package manager', color='red')
        hint([
            'https://github.com/nodesource/distributions#table-of-contents',
            'or to disable all node-based modules run: archivebox config --set USE_NODE=False',
        ])
        raise SystemExit(1)

    if all((SINGLEFILE_VERSION, READABILITY_VERSION, MERCURY_VERSION)):
        print('SINGLEFILE_BINARY, READABILITY_BINARY, and MERCURURY_BINARY are already installed')
    else:
        try:
            # clear out old npm package locations
            paths = (
                out_dir / 'package.json',
                out_dir / 'package-lock.json',   # the lockfile name npm actually writes
                out_dir / 'package_lock.json',   # legacy misspelling, still cleaned up if present
                out_dir / 'node_modules',
            )
            for path in paths:
                if path.is_dir():
                    shutil.rmtree(path, ignore_errors=True)
                elif path.is_file():
                    os.remove(path)

            shutil.copyfile(PACKAGE_DIR / 'package.json', out_dir / 'package.json')   # copy the js requirements list from the source install into the data dir
            # lets blindly assume that calling out to npm via shell works reliably cross-platform 🤡 (until proven otherwise via support tickets)
            run_shell([
                'npm',
                'install',
                '--prefix', str(out_dir),        # force it to put the node_modules dir in this folder
                '--force',                       # overwrite any existing node_modules
                '--no-save',                     # don't bother saving updating the package.json or package-lock.json file
                '--no-audit',                    # don't bother checking for newer versions with security vuln fixes
                '--no-fund',                     # hide "please fund our project" messages
                '--loglevel', 'error',           # only show errors (hide warn/info/debug) during installation
                # these args are written in blood, change with caution
            ], capture_output=False, cwd=out_dir)
            os.remove(out_dir / 'package.json')
        except BaseException as e:                                              # lgtm [py/catch-base-exception]
            stderr(f'[X] Failed to install npm packages: {e}', color='red')
            hint(f'Try deleting {out_dir}/node_modules and running it again')
            raise SystemExit(1)

    stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')

    run_shell([PYTHON_BINARY, ARCHIVEBOX_BINARY, '--version'], capture_output=False, cwd=out_dir)
2019-04-27 21:26:24 +00:00
2019-05-01 03:10:48 +00:00
@enforce_types
def config(config_options_str: Optional[str]=None,
           config_options: Optional[List[str]]=None,
           get: bool=False,
           set: bool=False,
           reset: bool=False,
           out_dir: Path=OUTPUT_DIR) -> None:
    """Get and set your ArchiveBox project configuration values

    Options come either from `config_options` or from the newline-separated
    `config_options_str`, never both.

    - `get` (or no arguments at all): print the requested keys, or the whole
      config, then exit (status 0 when something was printed, else 1).
    - `set`: parse KEY=VALUE lines, write the known keys to the config file in
      `out_dir`, and report other options that changed as a side effect.
      Returns normally unless some options failed to set (exit status 1).
    - `reset`: not implemented, exits with status 2.
    """

    check_data_folder(CONFIG)

    if config_options and config_options_str:
        stderr(
            '[X] You should either pass config values as an arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif config_options_str:
        config_options = config_options_str.split('\n')

    config_options = config_options or []

    no_args = not (get or set or reset or config_options)

    matching_config: ConfigDict = {}
    if get or no_args:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
            failed_config = [key for key in config_options if key not in CONFIG]
            if failed_config:
                stderr()
                stderr('[X] These options failed to get', color='red')
                # list only the keys that are actually unknown, not every requested key
                stderr('    {}'.format('\n    '.join(failed_config)))
                raise SystemExit(1)
        else:
            matching_config = CONFIG

        print(printable_config(matching_config))
        raise SystemExit(not matching_config)
    elif set:
        new_config = {}
        failed_options = []
        for line in config_options:
            if line.startswith('#') or not line.strip():
                continue
            if '=' not in line:
                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
                stderr(f'    {line}')
                raise SystemExit(2)

            raw_key, val = line.split('=', 1)
            raw_key = raw_key.upper().strip()
            key = get_real_name(raw_key)
            if key != raw_key:
                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')

            if key in CONFIG:
                new_config[key] = val.strip()
            else:
                failed_options.append(line)

        if new_config:
            before = CONFIG
            # write into the data dir the caller asked for, not the global default
            matching_config = write_config_file(new_config, out_dir=out_dir)
            after = load_all_config()
            print(printable_config(matching_config))

            # user-config options whose value changed without being set explicitly
            side_effect_changes: ConfigDict = {}
            for key, val in after.items():
                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
                    side_effect_changes[key] = after[key]

            if side_effect_changes:
                stderr()
                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
                print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))
        if failed_options:
            stderr()
            stderr('[X] These options failed to set (check for typos):', color='red')
            stderr('    {}'.format('\n    '.join(failed_options)))
            raise SystemExit(1)
    elif reset:
        stderr('[X] This command is not implemented yet.', color='red')
        stderr('    Please manually remove the relevant lines from your config file:')
        stderr(f'        {CONFIG_FILE}')
        raise SystemExit(2)
    else:
        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
        stderr('    archivebox config')
        stderr('    archivebox config --get SOME_KEY')
        stderr('    archivebox config --set SOME_KEY=SOME_VALUE')
        raise SystemExit(2)
@enforce_types
def schedule(add: bool=False,
             show: bool=False,
             clear: bool=False,
             foreground: bool=False,
             run_all: bool=False,
             quiet: bool=False,
             every: Optional[str]=None,
             tag: str='',
             depth: int=0,
             overwrite: bool=False,
             update: bool=not ONLY_NEW,
             import_path: Optional[str]=None,
             out_dir: Path=OUTPUT_DIR):
    """Set ArchiveBox to regularly import URLs at specific times using cron

    - `clear`: remove all ArchiveBox cron jobs and exit.
    - `every` / `add`: schedule a new job (`archivebox add ...` when
      `import_path` is given, otherwise `archivebox update`). `every` is one
      of minute/hour/day/month/year or a cron-format schedule, default 'day'.
    - `show`: print the existing ArchiveBox cron jobs and exit.
    - `run_all`: run every scheduled job once, right now.
    - `foreground`: run the cron scheduler in the foreground.
    """

    check_data_folder(CONFIG)

    Path(LOGS_DIR).mkdir(exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)

    if clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)

    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if every or add:
        every = every or 'day'

        def quoted(s):
            # wrap in double quotes only when the value contains a space
            return f'"{s}"' if (s and ' ' in str(s)) else str(s)

        cmd = [
            'cd',
            quoted(out_dir),
            '&&',
            quoted(ARCHIVEBOX_BINARY),
            *([
                'add',
                *(['--overwrite'] if overwrite else []),
                *(['--update'] if update else []),
                # quoted so that a tag list containing spaces stays a single shell argument
                *([quoted(f'--tag={tag}')] if tag else []),
                f'--depth={depth}',
                f'"{import_path}"',
            ] if import_path else ['update']),
            '>>',
            quoted(Path(LOGS_DIR) / 'schedule.log'),
            '2>&1',
        ]
        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)

        if every in ('minute', 'hour', 'day', 'month', 'year'):
            set_every = getattr(new_job.every(), every)
            set_every()
        elif CronSlices.is_valid(every):
            new_job.setall(every)
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
            stderr('    It must be one of minute/hour/day/month/year')
            stderr('    or a quoted cron-format schedule like:')
            stderr('        archivebox schedule --every=day --depth=1 https://example.com/some/rss/feed.xml')
            stderr('        archivebox schedule --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_cron_jobs(cron)
        cron.write()

        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
        # mark the job that was just added with a leading ">"
        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
            stderr('    Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr('    Make sure you have enough storage space available to hold all the data.')
            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
            stderr('')
    elif show:
        if existing_jobs:
            print('\n'.join(str(cmd) for cmd in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
            stderr('    To schedule a new job, run:')
            stderr('        archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
        raise SystemExit(0)

    # re-read the crontab so the jobs below reflect what was just written
    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)
    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if foreground or run_all:
        if not existing_jobs:
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
            stderr('    archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
        if run_all:
            try:
                for job in existing_jobs:
                    sys.stdout.write(f'  > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n')
                    sys.stdout.write(f'    > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
                    sys.stdout.flush()
                    job.run()
                    sys.stdout.write(f'\r    √ {job.command.split("/archivebox ")[-1]}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)

        if foreground:
            try:
                for job in existing_jobs:
                    print(f'  > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)

    if CAN_UPGRADE:
        hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
2023-11-28 04:58:13 +00:00
2020-08-18 05:59:04 +00:00
2019-05-01 03:10:48 +00:00
def _parse_bind_address(runserver_args: List[str],
                        default_host: str = '127.0.0.1',
                        default_port: str = '8000'):
    """Pick the (host, port) pair to bind to out of runserver-style args.

    The first argument made up solely of digits, dots and colons is used:
    ``HOST:PORT`` sets both, a dotted value sets only the host, and a bare
    number sets only the port. Missing or empty parts fall back to the
    defaults. Returns a ``(host, port)`` tuple of strings.
    """
    for arg in runserver_args:
        if not arg.replace('.', '').replace(':', '').isdigit():
            continue
        if ':' in arg:
            # rpartition never raises on extra colons (unlike unpacking
            # the result of split(':')) and lets ':8000' keep the default host
            host, _, port = arg.rpartition(':')
            return (host or default_host), (port or default_port)
        if '.' in arg:
            return arg, default_port
        return default_host, arg
    return default_host, default_port


@enforce_types
def server(runserver_args: Optional[List[str]] = None,
           reload: bool = False,
           debug: bool = False,
           init: bool = False,
           quick_init: bool = False,
           createsuperuser: bool = False,
           out_dir: Path = OUTPUT_DIR) -> None:
    """Run the ArchiveBox HTTP server"""
    # NOTE(review): the docstring is left as the original one-liner in case it
    # is surfaced elsewhere (e.g. as CLI help text) -- confirm before expanding.
    #
    # runserver_args:  extra args; forwarded to Django's runserver in debug
    #                  mode, otherwise only scanned for a HOST:PORT / HOST / PORT
    # reload:          debug mode only; when False, '--noreload' is appended
    # debug:           run Django's runserver instead of the supervisord workers
    # init/quick_init: run `init` (or `init --quick`) in out_dir first
    # createsuperuser: run `manage createsuperuser` in out_dir first
    # out_dir:         passed as pwd to the subcommands above

    # copy so that appending '--noreload' below never mutates the caller's list
    runserver_args = list(runserver_args or [])

    if init:
        run_subcommand('init', stdin=None, pwd=out_dir)
        print()
    elif quick_init:
        run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
        print()

    if createsuperuser:
        run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
        print()

    # setup config for django runserver
    from . import config
    config.SHOW_PROGRESS = False
    config.DEBUG = config.DEBUG or debug

    check_data_folder(CONFIG)

    from django.core.management import call_command
    from django.contrib.auth.models import User

    print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
    print('    > Logging errors to ./logs/errors.log')
    if not User.objects.filter(is_superuser=True).exists():
        print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
        print()
        print('    To create an admin user, run:')
        print('        archivebox manage createsuperuser')
        print()

    # The original applies these settings a second time right before launching;
    # kept as-is in case the setup steps above touched them.
    config.SHOW_PROGRESS = False
    config.DEBUG = config.DEBUG or debug

    if debug:
        # toggle autoreloading when archivebox code changes
        if not reload:
            runserver_args.append('--noreload')  # '--insecure'
        call_command("runserver", *runserver_args)
        return

    host, port = _parse_bind_address(runserver_args)
    print(f'    > Starting ArchiveBox webserver on http://{host}:{port}/')

    from queues.supervisor_util import get_or_create_supervisord_process, start_worker, stop_worker, watch_worker

    print()
    supervisor = get_or_create_supervisord_process(daemonize=False)

    bg_workers = [
        {
            "name": "worker_system_tasks",
            "command": "archivebox manage djangohuey --queue system_tasks",
            "autostart": "true",
            "autorestart": "true",
            "stdout_logfile": "logs/worker_system_tasks.log",
            "redirect_stderr": "true",
        },
    ]
    fg_worker = {
        "name": "worker_daphne",
        "command": f"daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application",
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": "logs/worker_daphne.log",
        "redirect_stderr": "true",
    }

    print()
    for worker in bg_workers:
        start_worker(supervisor, worker)

    print()
    start_worker(supervisor, fg_worker)
    print()

    try:
        watch_worker(supervisor, "worker_daphne")
    except KeyboardInterrupt:
        print("\n[🛑] Got Ctrl+C, stopping gracefully...")
    except SystemExit:
        # deliberate: fall through to the cleanup below instead of propagating
        pass
    except BaseException as e:
        print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
        raise
    finally:
        # always stop the foreground worker, whatever ended the watch loop
        stop_worker(supervisor, "worker_daphne")
        time.sleep(0.5)

    print("\n[🟩] ArchiveBox server shut down gracefully.")
2019-04-27 21:26:24 +00:00
2019-05-01 03:10:48 +00:00
@enforce_types
def manage(args: Optional[List[str]] = None, out_dir: Path = OUTPUT_DIR) -> None:
    """Run an ArchiveBox Django management command"""
    # args:    management command and its arguments; defaults to ['help']
    # out_dir: accepted for interface parity; not read by this function body

    check_data_folder(CONFIG)
    from django.core.management import execute_from_command_line

    # createsuperuser prompts interactively, so warn when running inside
    # docker without a TTY attached
    if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
        stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
        # args is guaranteed non-empty inside this branch
        stderr('    docker run -it archivebox manage {}'.format(' '.join(args)), color='lightyellow')
        stderr('')

    execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
2019-05-01 03:13:04 +00:00
@enforce_types
def shell(out_dir: Path = OUTPUT_DIR) -> None:
    """Enter an interactive ArchiveBox Django shell"""
    # out_dir is accepted for interface parity; this function body does not read it

    check_data_folder(CONFIG)

    # imported lazily, after the data folder check has run
    from django.core.management import call_command
    call_command("shell_plus")
2020-07-29 16:19:06 +00:00