2019-04-27 21:26:24 +00:00
__package__ = ' archivebox '
import os
import sys
import shutil
2020-11-28 03:59:18 +00:00
import platform
2024-09-06 09:55:06 +00:00
from typing import Dict , List , Optional , Iterable , IO , Union
2020-08-18 16:45:27 +00:00
from pathlib import Path
2022-05-10 03:15:55 +00:00
from datetime import date , datetime
2019-04-27 21:26:24 +00:00
from crontab import CronTab , CronSlices
2024-09-06 09:55:06 +00:00
2020-08-21 17:42:08 +00:00
from django . db . models import QuerySet
2024-09-06 09:55:06 +00:00
from django . utils import timezone
2019-04-27 21:26:24 +00:00
2024-09-30 22:59:05 +00:00
from archivebox . config import CONSTANTS , VERSION , DATA_DIR , ARCHIVE_DIR , SHELL_CONFIG , SEARCH_BACKEND_CONFIG , STORAGE_CONFIG , SERVER_CONFIG , ARCHIVING_CONFIG
2019-04-27 21:26:24 +00:00
from . cli import (
2024-09-25 02:04:38 +00:00
CLI_SUBCOMMANDS ,
2019-04-27 21:26:24 +00:00
run_subcommand ,
display_first ,
meta_cmds ,
main_cmds ,
archive_cmds ,
)
2019-05-01 03:13:04 +00:00
from . parsers import (
2020-07-13 15:26:30 +00:00
save_text_as_source ,
save_file_as_source ,
2020-07-29 16:19:06 +00:00
parse_links_memory ,
2019-04-27 21:26:24 +00:00
)
2024-10-01 00:25:15 +00:00
from archivebox . misc . util import enforce_types # type: ignore
2024-10-01 00:13:55 +00:00
from archivebox . misc . system import get_dir_size , dedupe_cron_jobs , CRON_COMMENT
from archivebox . misc . system import run as run_shell
2024-10-01 00:25:15 +00:00
from . index . schema import Link
2019-04-27 21:26:24 +00:00
from . index import (
load_main_index ,
2020-07-13 15:26:30 +00:00
parse_links_from_source ,
dedupe_links ,
2019-04-27 21:26:24 +00:00
write_main_index ,
2020-08-21 17:42:08 +00:00
snapshot_filter ,
2019-04-27 21:26:24 +00:00
get_indexed_folders ,
get_archived_folders ,
get_unarchived_folders ,
get_present_folders ,
get_valid_folders ,
get_invalid_folders ,
get_duplicate_folders ,
get_orphaned_folders ,
get_corrupted_folders ,
get_unrecognized_folders ,
fix_invalid_folder_locations ,
2020-12-05 17:10:17 +00:00
write_link_details ,
2019-04-27 21:26:24 +00:00
)
from . index . json import (
parse_json_main_index ,
parse_json_links_details ,
2020-11-28 17:28:39 +00:00
generate_json_index_from_links ,
2019-04-27 21:26:24 +00:00
)
2019-05-01 03:13:04 +00:00
from . index . sql import (
get_admins ,
apply_migrations ,
2020-07-23 20:07:00 +00:00
remove_from_sql_main_index ,
2019-05-01 03:13:04 +00:00
)
2024-10-01 00:25:15 +00:00
from . index . html import generate_index_from_links
2020-11-28 18:11:15 +00:00
from . index . csv import links_to_csv
2020-07-31 15:24:58 +00:00
from . extractors import archive_links , archive_link , ignore_methods
2024-10-01 00:25:15 +00:00
from archivebox . misc . logging import stderr , hint
from archivebox . misc . checks import check_data_folder
from archivebox . config . legacy import (
2019-04-27 21:26:24 +00:00
write_config_file ,
load_all_config ,
get_real_name ,
)
2020-07-22 16:02:13 +00:00
from . logging_util import (
2019-05-01 03:13:04 +00:00
TimedProgress ,
2020-07-13 15:26:30 +00:00
log_importing_started ,
log_crawl_started ,
2019-04-27 21:26:24 +00:00
log_removal_started ,
log_removal_finished ,
log_list_started ,
log_list_finished ,
2019-05-01 03:13:04 +00:00
printable_config ,
printable_folders ,
printable_filesize ,
printable_folder_status ,
2019-04-27 21:26:24 +00:00
)
2021-04-12 21:06:32 +00:00
2019-05-01 03:10:48 +00:00
# help(): print the ArchiveBox CLI usage screen: Docker usage (when IN_DOCKER), the
# command list, documentation links, and a panel that depends on whether a collection is active.
#   out_dir: only used for display (the DATA DIR path shown in the "collection is active" panel).
# NOTE(review): this file looks like a blame-style dump (timestamp lines interleaved, indentation
# flattened), so the nesting described in the comments below is inferred from syntax only.
# The docstring is deliberately left as-is: subcommand docstrings are rendered at runtime via
# func.__doc__ in the command list below, so this text may be user-visible.
@enforce_types
2024-09-30 22:59:05 +00:00
def help ( out_dir : Path = DATA_DIR ) - > None :
2019-05-01 03:10:48 +00:00
""" Print the ArchiveBox help message and usage """
2024-10-03 02:46:31 +00:00
# rich's print is imported under the builtin's name for this function; the strings below use rich [markup] tags.
from rich import print
from rich . panel import Panel
2024-09-25 02:04:38 +00:00
all_subcommands = CLI_SUBCOMMANDS
2019-04-27 21:26:24 +00:00
# Build the command list as four groups separated by blank lines: meta_cmds, main_cmds,
# archive_cmds, then every command not listed in display_first.
# Each row is the command name padded to 20 columns followed by that function's docstring.
COMMANDS_HELP_TEXT = ' \n ' . join (
2024-10-03 02:46:31 +00:00
f ' [green] { cmd . ljust ( 20 ) } [/green] { func . __doc__ } '
2024-10-01 01:29:17 +00:00
for cmd , func in all_subcommands . items ( )
2019-04-27 21:26:24 +00:00
if cmd in meta_cmds
) + ' \n \n ' + ' \n ' . join (
2024-10-03 02:46:31 +00:00
f ' [green] { cmd . ljust ( 20 ) } [/green] { func . __doc__ } '
2024-10-01 01:29:17 +00:00
for cmd , func in all_subcommands . items ( )
2019-04-27 21:26:24 +00:00
if cmd in main_cmds
) + ' \n \n ' + ' \n ' . join (
2024-10-03 02:46:31 +00:00
f ' [green] { cmd . ljust ( 20 ) } [/green] { func . __doc__ } '
2024-10-01 01:29:17 +00:00
for cmd , func in all_subcommands . items ( )
2019-04-27 21:26:24 +00:00
if cmd in archive_cmds
) + ' \n \n ' + ' \n ' . join (
2024-10-03 02:46:31 +00:00
f ' [green] { cmd . ljust ( 20 ) } [/green] { func . __doc__ } '
2024-10-01 01:29:17 +00:00
for cmd , func in all_subcommands . items ( )
2019-04-27 21:26:24 +00:00
if cmd not in display_first
)
2024-10-03 02:46:31 +00:00
# Docker-specific snippets: each one is only filled in when SHELL_CONFIG.IN_DOCKER is truthy,
# otherwise it falls back to the short literal after `else`.
DOCKER_USAGE = '''
[ dodger_blue3 ] Docker Usage : [ / dodger_blue3 ]
[ grey53 ] # using Docker Compose:[/grey53]
[ blue ] docker compose run [ / blue ] [ dark_green ] archivebox [ / dark_green ] [ green ] \\[ command ] [ / green ] [ green3 ] [ . . . args ] [ / green3 ] [ violet ] [ - - help ] [ / violet ] [ grey53 ] [ - - version ] [ / grey53 ]
[ grey53 ] # using Docker:[/grey53]
[ blue ] docker run [ / blue ] - v [ light_slate_blue ] $ PWD : / data [ / light_slate_blue ] [ grey53 ] - p 8000 : 8000 [ / grey53 ] - it [ dark_green ] archivebox / archivebox [ / dark_green ] [ green ] \\[ command ] [ / green ] [ green3 ] [ . . . args ] [ / green3 ] [ violet ] [ - - help ] [ / violet ] [ grey53 ] [ - - version ] [ / grey53 ]
''' if SHELL_CONFIG.IN_DOCKER else ' '
DOCKER_DOCS = ' \n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link] ' if SHELL_CONFIG . IN_DOCKER else ' '
DOCKER_OUTSIDE_HINT = " \n [grey53]# outside of Docker:[/grey53] " if SHELL_CONFIG . IN_DOCKER else ' '
DOCKER_CMD_PREFIX = " [blue]docker ... [/blue] " if SHELL_CONFIG . IN_DOCKER else ' '
# Main screen: usage line, the grouped command list built above, and documentation links.
print ( f ''' { DOCKER_USAGE }
[ deep_sky_blue4 ] Usage : [ / deep_sky_blue4 ] { DOCKER_OUTSIDE_HINT }
[ dark_green ] archivebox [ / dark_green ] [ green ] \\[ command ] [ / green ] [ green3 ] [ . . . args ] [ / green3 ] [ violet ] [ - - help ] [ / violet ] [ grey53 ] [ - - version ] [ / grey53 ]
[ deep_sky_blue4 ] Commands : [ / deep_sky_blue4 ]
{ COMMANDS_HELP_TEXT }
[ deep_sky_blue4 ] Documentation : [ / deep_sky_blue4 ]
[ link = https : / / github . com / ArchiveBox / ArchiveBox / wiki ] https : / / github . com / ArchiveBox / ArchiveBox / wiki [ / link ] { DOCKER_DOCS }
[ link = https : / / github . com / ArchiveBox / ArchiveBox / wiki / Usage #cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link]
[ link = https : / / github . com / ArchiveBox / ArchiveBox / wiki / Configuration ] https : / / github . com / ArchiveBox / ArchiveBox / wiki / Configuration [ / link ]
''' )
2019-04-27 21:26:24 +00:00
2024-10-04 08:40:41 +00:00
# An existing CONSTANTS.ARCHIVE_DIR is treated as "a collection is active": show maintenance
# and example commands for it.
if CONSTANTS . ARCHIVE_DIR . exists ( ) :
2024-10-03 02:46:31 +00:00
# Abbreviate the user's home directory to ~ in the displayed path.
pretty_out_dir = str ( out_dir ) . replace ( str ( Path ( ' ~ ' ) . expanduser ( ) ) , ' ~ ' )
EXAMPLE_USAGE = f '''
[ light_slate_blue ] DATA DIR [ / light_slate_blue ] : [ yellow ] { pretty_out_dir } [ / yellow ]
[ violet ] Hint : [ / violet ] [ i ] Common maintenance tasks : [ / i ]
[ dark_green ] archivebox [ / dark_green ] [ green ] init [ / green ] [ grey53 ] # make sure database is up-to-date (safe to run multiple times)[/grey53]
[ dark_green ] archivebox [ / dark_green ] [ green ] install [ / green ] [ grey53 ] # make sure plugins are up-to-date (wget, chrome, singlefile, etc.)[/grey53]
[ dark_green ] archivebox [ / dark_green ] [ green ] status [ / green ] [ grey53 ] # get a health checkup report on your collection[/grey53]
[ dark_green ] archivebox [ / dark_green ] [ green ] update [ / green ] [ grey53 ] # retry any previously failed or interrupted archiving tasks[/grey53]
[ violet ] Hint : [ / violet ] [ i ] More example usage : [ / i ]
[ dark_green ] archivebox [ / dark_green ] [ green ] add [ / green ] - - depth = 1 " https://example.com/some/page "
[ dark_green ] archivebox [ / dark_green ] [ green ] list [ / green ] - - sort = timestamp - - csv = timestamp , downloaded_at , url , title
[ dark_green ] archivebox [ / dark_green ] [ green ] schedule [ / green ] - - every = day - - depth = 1 " https://example.com/some/feed.rss "
[ dark_green ] archivebox [ / dark_green ] [ green ] server [ / green ] [ blue ] 0.0 .0 .0 : 8000 [ / blue ] [ grey53 ] # Start the Web UI / API server[/grey53]
'''
print ( Panel ( EXAMPLE_USAGE , expand = False , border_style = ' grey53 ' , title = ' [green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3] ' , subtitle = ' Commands run inside this dir will only apply to this collection. ' ) )
2019-04-27 21:26:24 +00:00
# No archive dir: show setup steps for loading an existing collection or starting a new one.
else :
2024-10-03 02:46:31 +00:00
DATA_SETUP_HELP = ' \n '
2024-09-30 22:59:05 +00:00
# NOTE(review): presumably only the two Docker volume-mount hint lines are conditional on
# IN_DOCKER and the remaining steps are always appended; indentation is lost here, confirm.
if SHELL_CONFIG . IN_DOCKER :
2024-10-03 02:46:31 +00:00
DATA_SETUP_HELP + = ' [violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir: \n '
DATA_SETUP_HELP + = ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ... \n \n '
DATA_SETUP_HELP + = ' To load an [dark_blue]existing[/dark_blue] collection: \n '
DATA_SETUP_HELP + = ' 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53] \n '
DATA_SETUP_HELP + = f ' 2. { DOCKER_CMD_PREFIX } [dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53] \n '
DATA_SETUP_HELP + = f ' 3. { DOCKER_CMD_PREFIX } [dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53] \n '
DATA_SETUP_HELP + = f ' 4. { DOCKER_CMD_PREFIX } [dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53] \n \n '
DATA_SETUP_HELP + = ' To start a [sea_green1]new[/sea_green1] collection: \n '
DATA_SETUP_HELP + = ' 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53] \n '
DATA_SETUP_HELP + = ' 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53] \n '
DATA_SETUP_HELP + = f ' 3. { DOCKER_CMD_PREFIX } [dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53] \n '
DATA_SETUP_HELP + = f ' 4. { DOCKER_CMD_PREFIX } [dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53] \n '
DATA_SETUP_HELP + = f ' 5. { DOCKER_CMD_PREFIX } [dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53] \n '
print ( Panel ( DATA_SETUP_HELP , expand = False , border_style = ' grey53 ' , title = ' [red]:cross_mark: No collection is currently active[/red] ' , subtitle = ' All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue] ' ) )
2019-04-27 21:26:24 +00:00
2019-05-01 03:10:48 +00:00
# version(): print the ArchiveBox version, then (unless quiet) a report covering build info,
# platform, filesystem/config flags, dependency binaries, and code/data locations.
#   quiet:   when True only the bare VERSION string is printed.
#   out_dir: accepted for signature parity with the other commands; not referenced in this body.
# The docstring is left unchanged because docstrings in this module are read at runtime (func.__doc__).
@enforce_types
def version ( quiet : bool = False ,
2024-09-30 22:59:05 +00:00
out_dir : Path = DATA_DIR ) - > None :
2019-05-01 03:10:48 +00:00
""" Print the ArchiveBox version and dependency information """
2022-06-09 01:41:22 +00:00
2024-10-01 01:33:43 +00:00
print ( VERSION )
2024-10-03 10:11:23 +00:00
# A literal --version on the command line is handled like quiet=True: stop after the bare version.
if quiet or ' --version ' in sys . argv :
2024-10-01 01:33:43 +00:00
return
2024-10-03 10:11:23 +00:00
# Everything below goes through a rich Console so the [markup] tags in the strings are styled.
from rich . console import Console
console = Console ( )
prnt = console . print
2024-10-01 01:13:05 +00:00
2024-09-25 07:42:26 +00:00
# These imports are deferred until after the quiet early-return above.
from plugins_auth . ldap . apps import LDAP_CONFIG
from django . conf import settings
2024-10-01 01:33:43 +00:00
# 0.7.1
# ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
# IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
# FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
# DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
2024-09-25 07:42:26 +00:00
2024-10-01 01:33:43 +00:00
# platform.uname() supplies the machine/system fields shown as ARCH and OS below.
p = platform . uname ( )
2024-10-03 10:11:23 +00:00
# Line 1: version, first 7 characters of the commit hash (or a placeholder), build time.
prnt (
' [dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v {} [/dark_goldenrod] ' . format ( CONSTANTS . VERSION ) ,
2024-10-01 01:33:43 +00:00
f ' COMMIT_HASH= { SHELL_CONFIG . COMMIT_HASH [ : 7 ] if SHELL_CONFIG . COMMIT_HASH else " unknown " } ' ,
f ' BUILD_TIME= { SHELL_CONFIG . BUILD_TIME } ' ,
)
2024-10-03 10:11:23 +00:00
# Line 2: runtime environment (container/emulation flags, architecture, OS, Python implementation).
prnt (
2024-10-01 01:33:43 +00:00
f ' IN_DOCKER= { SHELL_CONFIG . IN_DOCKER } ' ,
f ' IN_QEMU= { SHELL_CONFIG . IN_QEMU } ' ,
f ' ARCH= { p . machine } ' ,
f ' OS= { p . system } ' ,
f ' PLATFORM= { platform . platform ( ) } ' ,
f ' PYTHON= { sys . implementation . name . title ( ) } ' ,
)
# NOTE(review): is_mount is read as an attribute, not called. That is fine if DATA_LOCATIONS
# entries carry a precomputed is_mount flag; if they were plain pathlib.Path objects this would
# be a bound method and always truthy. TODO confirm against the CONSTANTS definition.
OUTPUT_IS_REMOTE_FS = CONSTANTS . DATA_LOCATIONS . DATA_DIR . is_mount or CONSTANTS . DATA_LOCATIONS . ARCHIVE_DIR . is_mount
2024-10-03 10:11:23 +00:00
# Line 3: filesystem settings.
prnt (
2024-10-01 01:33:43 +00:00
f ' FS_ATOMIC= { STORAGE_CONFIG . ENFORCE_ATOMIC_WRITES } ' ,
f ' FS_REMOTE= { OUTPUT_IS_REMOTE_FS } ' ,
f ' FS_USER= { SHELL_CONFIG . PUID } : { SHELL_CONFIG . PGID } ' ,
f ' FS_PERMS= { STORAGE_CONFIG . OUTPUT_PERMISSIONS } ' ,
)
2024-10-03 10:11:23 +00:00
# Line 4: misc config flags.
prnt (
2024-10-01 01:33:43 +00:00
f ' DEBUG= { SHELL_CONFIG . DEBUG } ' ,
f ' IS_TTY= { SHELL_CONFIG . IS_TTY } ' ,
f ' TZ= { CONSTANTS . TIMEZONE } ' ,
f ' SEARCH_BACKEND= { SEARCH_BACKEND_CONFIG . SEARCH_BACKEND_ENGINE } ' ,
f ' LDAP= { LDAP_CONFIG . LDAP_ENABLED } ' ,
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
)
2024-10-03 10:11:23 +00:00
prnt ( )
2019-04-27 21:26:24 +00:00
2024-10-03 10:11:23 +00:00
prnt ( ' [pale_green1][i] Dependency versions:[/pale_green1] ' )
2024-10-01 01:33:43 +00:00
# One row per entry of settings.BINARIES, walked in reverse order, skipping archivebox itself.
for name , binary in reversed ( list ( settings . BINARIES . items ( ) ) ) :
if binary . name == ' archivebox ' :
continue
err = None
# If load() raises, keep the unloaded binary object and remember the error; the error text
# is shown in place of the path further down when no abspath is available.
try :
loaded_bin = binary . load ( )
except Exception as e :
err = e
loaded_bin = binary
provider_summary = f ' [dark_sea_green3] { loaded_bin . binprovider . name . ljust ( 10 ) } [/dark_sea_green3] ' if loaded_bin . binprovider else ' [grey23]not found[/grey23] '
2024-10-01 06:52:00 +00:00
if loaded_bin . abspath :
2024-10-03 10:11:23 +00:00
# Shorten the path for display: DATA_DIR becomes a styled "." and the home directory becomes ~.
abspath = str ( loaded_bin . abspath ) . replace ( str ( DATA_DIR ) , ' [light_slate_blue].[/light_slate_blue] ' ) . replace ( str ( Path ( ' ~ ' ) . expanduser ( ) ) , ' ~ ' )
2024-10-01 06:52:00 +00:00
# Backslash-escape the matched separator in the displayed path.
if ' ' in abspath :
abspath = abspath . replace ( ' ' , r ' \ ' )
else :
abspath = f ' [red] { err } [/red] '
2024-10-03 10:11:23 +00:00
# Row layout: valid marker, name (21 cols), version (12 cols), provider, path; never cropped or wrapped.
prnt ( ' ' , ' [green]√[/green] ' if loaded_bin . is_valid else ' [red]X[/red] ' , ' ' , loaded_bin . name . ljust ( 21 ) , str ( loaded_bin . version ) . ljust ( 12 ) , provider_summary , abspath , overflow = ' ignore ' , crop = False )
2019-04-27 21:26:24 +00:00
2024-10-03 10:11:23 +00:00
prnt ( )
prnt ( ' [deep_sky_blue3][i] Source-code locations:[/deep_sky_blue3] ' )
2024-10-01 01:33:43 +00:00
# One status line per entry of CONSTANTS.CODE_LOCATIONS.
for name , path in CONSTANTS . CODE_LOCATIONS . items ( ) :
2024-10-03 10:11:23 +00:00
prnt ( printable_folder_status ( name , path ) , overflow = ' ignore ' , crop = False )
2019-04-27 21:26:24 +00:00
2024-10-03 10:11:23 +00:00
prnt ( )
2024-10-04 08:40:41 +00:00
# Data locations are only listed when an archive dir or a config file exists; otherwise a
# "not in a data directory" notice is printed instead.
if CONSTANTS . ARCHIVE_DIR . exists ( ) or CONSTANTS . CONFIG_FILE . exists ( ) :
2024-10-03 10:11:23 +00:00
prnt ( ' [bright_yellow][i] Data locations:[/bright_yellow] ' )
2024-10-01 01:33:43 +00:00
for name , path in CONSTANTS . DATA_LOCATIONS . items ( ) :
2024-10-03 10:11:23 +00:00
prnt ( printable_folder_status ( name , path ) , overflow = ' ignore ' , crop = False )
2024-10-01 01:33:43 +00:00
else :
2024-10-03 10:11:23 +00:00
prnt ( )
prnt ( ' [red][i] Data locations:[/red] (not in a data directory) ' )
2024-10-01 01:33:43 +00:00
2024-10-03 10:11:23 +00:00
prnt ( )
2019-04-27 21:26:24 +00:00
2019-05-01 03:10:48 +00:00
@enforce_types
def run(subcommand: str,
        subcommand_args: Optional[List[str]],
        stdin: Optional[IO] = None,
        out_dir: Path = DATA_DIR) -> None:
    """ Run a given ArchiveBox subcommand with the given list of args """
    # Thin pass-through to the CLI dispatcher. The only translation is the
    # keyword name: this function's `out_dir` is handed over as `pwd`.
    # (Docstring kept verbatim: docstrings in this module are read at runtime.)
    forwarded = {
        'subcommand': subcommand,
        'subcommand_args': subcommand_args,
        'stdin': stdin,
        'pwd': out_dir,
    }
    run_subcommand(**forwarded)
2019-05-01 03:10:48 +00:00
# init(): create a new collection in out_dir, or verify/migrate an existing one.
#   force:   proceed even if out_dir already has files but no database file.
#   quick:   skip the full scan of snapshot directories for orphaned/invalid links.
#   install: run the `install` subcommand after initialization.
#   out_dir: target data directory (created if missing).
# Raises SystemExit(2) when out_dir is non-empty, has no database file, and force is False;
# raises SystemExit(1) when the directory scan is interrupted (KeyboardInterrupt/SystemExit).
# NOTE(review): indentation is flattened in this dump, so branch extents are inferred from syntax.
# NOTE(review): several steps use CONSTANTS.* / DATA_DIR paths while others use out_dir; these
# only agree when out_dir == DATA_DIR. Confirm that callers never pass a different out_dir.
@enforce_types
2024-10-01 06:19:11 +00:00
def init ( force : bool = False , quick : bool = False , install : bool = False , out_dir : Path = DATA_DIR ) - > None :
2019-05-01 03:10:48 +00:00
""" Initialize a new ArchiveBox collection in the current directory """
2021-02-01 10:13:46 +00:00
2020-12-11 21:27:15 +00:00
from core . models import Snapshot
2024-10-05 04:09:29 +00:00
# rich's print replaces the builtin inside this function; messages below use rich [markup].
from rich import print
2021-02-01 10:13:46 +00:00
2021-04-06 01:15:32 +00:00
out_dir . mkdir ( exist_ok = True )
2024-09-30 22:59:05 +00:00
# "Empty" means: nothing in out_dir beyond the names listed in CONSTANTS.ALLOWED_IN_DATA_DIR.
is_empty = not len ( set ( os . listdir ( out_dir ) ) - CONSTANTS . ALLOWED_IN_DATA_DIR )
2020-08-18 16:45:27 +00:00
2024-09-30 22:59:05 +00:00
# A legacy JSON index file triggers a deprecation warning on stderr.
if ( out_dir / CONSTANTS . JSON_INDEX_FILENAME ) . exists ( ) :
2024-10-05 04:34:19 +00:00
print ( " [red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red] " , file = sys . stderr )
print ( " [red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red] " , file = sys . stderr )
2020-09-08 15:53:11 +00:00
2024-09-30 22:59:05 +00:00
# Three cases follow: brand-new empty dir, existing collection (database file present),
# or a non-empty dir without a database (allowed only with force).
existing_index = CONSTANTS . DATABASE_FILE . exists ( )
2019-04-27 21:26:24 +00:00
if is_empty and not existing_index :
2024-10-05 04:34:19 +00:00
print ( f ' [turquoise4][+] Initializing a new ArchiveBox v { VERSION } collection...[/turquoise4] ' )
print ( ' [green]----------------------------------------------------------------------[/green] ' )
2019-04-27 21:26:24 +00:00
elif existing_index :
2021-02-18 07:34:42 +00:00
# TODO: properly detect and print the existing version in current index as well
2024-10-05 04:34:19 +00:00
print ( f ' [green][*] Verifying and updating existing ArchiveBox collection to v { VERSION } ...[/green] ' )
print ( ' [green]----------------------------------------------------------------------[/green] ' )
2019-04-27 21:26:24 +00:00
else :
2019-05-01 06:27:50 +00:00
if force :
2024-10-05 04:34:19 +00:00
print ( ' [red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red] ' )
print ( ' [red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red] ' )
2019-05-01 06:27:50 +00:00
else :
2024-10-05 04:34:19 +00:00
# NOTE(review): "then run and run" in the message below reads like a typo; left unchanged
# here because it is user-facing runtime text.
print (
( " [red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red] \n \n "
2019-05-01 06:27:50 +00:00
" You must run init in a completely empty directory, or an existing data folder. \n \n "
2024-10-05 04:34:19 +00:00
" [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n "
2019-05-01 06:27:50 +00:00
" then run and run ' archivebox init ' to pick up where you left off. \n \n "
" (Always make sure your data folder is backed up first before updating ArchiveBox) "
2024-10-05 04:34:19 +00:00
)
2019-05-01 06:27:50 +00:00
)
raise SystemExit ( 2 )
2019-04-27 21:26:24 +00:00
if existing_index :
2024-10-05 04:34:19 +00:00
print ( ' \n [green][*] Verifying archive folder structure...[/green] ' )
2019-04-27 21:26:24 +00:00
else :
2024-10-05 04:34:19 +00:00
print ( ' \n [green][+] Building archive folder structure...[/green] ' )
2019-04-27 21:26:24 +00:00
2024-09-30 22:59:05 +00:00
# Ensure the sources/archive/logs directories exist (no error if they already do).
print ( f ' + ./ { CONSTANTS . ARCHIVE_DIR . relative_to ( DATA_DIR ) } , ./ { CONSTANTS . SOURCES_DIR . relative_to ( DATA_DIR ) } , ./ { CONSTANTS . LOGS_DIR . relative_to ( DATA_DIR ) } ... ' )
2024-09-25 12:10:09 +00:00
Path ( CONSTANTS . SOURCES_DIR ) . mkdir ( exist_ok = True )
Path ( CONSTANTS . ARCHIVE_DIR ) . mkdir ( exist_ok = True )
Path ( CONSTANTS . LOGS_DIR ) . mkdir ( exist_ok = True )
2024-09-30 22:59:05 +00:00
print ( f ' + ./ { CONSTANTS . CONFIG_FILE . relative_to ( DATA_DIR ) } ... ' )
2024-10-05 04:34:19 +00:00
# Called with an empty config dict; presumably this creates/refreshes the config file without
# changing any values. Confirm in write_config_file.
write_config_file ( { } , out_dir = str ( out_dir ) )
2021-02-16 07:49:31 +00:00
2024-09-25 12:10:09 +00:00
if CONSTANTS . DATABASE_FILE . exists ( ) :
2024-10-05 04:34:19 +00:00
print ( ' \n [green][*] Verifying main SQL index and running any migrations needed...[/green] ' )
2019-04-27 21:26:24 +00:00
else :
2024-10-05 04:34:19 +00:00
print ( ' \n [green][+] Building main SQL index and running initial migrations...[/green] ' )
2019-04-27 21:26:24 +00:00
# apply_migrations is iterated and every item it yields is written to stdout as one line.
for migration_line in apply_migrations ( out_dir ) :
2024-10-05 04:34:19 +00:00
sys . stdout . write ( f ' { migration_line } \n ' )
2019-04-27 21:26:24 +00:00
2024-09-25 12:10:09 +00:00
# NOTE(review): assert statements are stripped under `python -O`; if this is meant as a real
# post-condition check it should raise explicitly.
assert CONSTANTS . DATABASE_FILE . exists ( )
2021-02-18 07:34:42 +00:00
print ( )
2024-09-30 22:59:05 +00:00
print ( f ' √ ./ { CONSTANTS . DATABASE_FILE . relative_to ( DATA_DIR ) } ' )
2019-04-27 21:26:24 +00:00
# from django.contrib.auth.models import User
2024-09-30 22:59:05 +00:00
# if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exists():
# print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
2019-04-27 21:26:24 +00:00
# call_command("createsuperuser", interactive=True)
print ( )
2024-10-05 04:34:19 +00:00
print ( ' [dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3] ' )
2019-04-27 21:26:24 +00:00
2020-12-11 21:27:15 +00:00
# all_links starts as an empty queryset and is only replaced when an index already existed.
# pending_links collects links found on disk that are missing from the main index, keyed by URL.
all_links = Snapshot . objects . none ( )
2020-08-20 17:59:50 +00:00
pending_links : Dict [ str , Link ] = { }
2019-04-27 21:26:24 +00:00
if existing_index :
2020-08-20 17:59:50 +00:00
all_links = load_main_index ( out_dir = out_dir , warn = False )
2024-10-05 04:34:19 +00:00
print ( f ' √ Loaded { all_links . count ( ) } links from existing main index. ' )
2019-04-27 21:26:24 +00:00
2021-02-16 07:49:31 +00:00
# quick mode skips the whole directory scan below.
if quick :
print ( ' > Skipping full snapshot directory check (quick mode) ' )
else :
2021-02-18 07:34:42 +00:00
try :
# Links in data folders that dont match their timestamp
fixed , cant_fix = fix_invalid_folder_locations ( out_dir = out_dir )
if fixed :
2024-10-05 04:34:19 +00:00
print ( f ' [yellow]√ Fixed { len ( fixed ) } data directory locations that didn \' t match their link timestamps.[/yellow] ' )
2021-02-18 07:34:42 +00:00
if cant_fix :
2024-10-05 04:34:19 +00:00
print ( f ' [red]! Could not fix { len ( cant_fix ) } data directory locations due to conflicts with existing folders.[/red] ' )
2021-02-18 07:34:42 +00:00
# Links in JSON index but not in main index
# NOTE(review): this and the next comprehension issue one .exists() query per parsed link.
orphaned_json_links = {
link . url : link
for link in parse_json_main_index ( out_dir )
if not all_links . filter ( url = link . url ) . exists ( )
}
if orphaned_json_links :
pending_links . update ( orphaned_json_links )
2024-10-05 04:34:19 +00:00
print ( f ' [yellow]√ Added { len ( orphaned_json_links ) } orphaned links from existing JSON index...[/yellow] ' )
2021-02-18 07:34:42 +00:00
# Links in data dir indexes but not in main index
orphaned_data_dir_links = {
link . url : link
for link in parse_json_links_details ( out_dir )
if not all_links . filter ( url = link . url ) . exists ( )
}
if orphaned_data_dir_links :
pending_links . update ( orphaned_data_dir_links )
2024-10-05 04:34:19 +00:00
print ( f ' [yellow]√ Added { len ( orphaned_data_dir_links ) } orphaned links from existing archive directories.[/yellow] ' )
2021-02-18 07:34:42 +00:00
# Links in invalid/duplicate data dirs
invalid_folders = {
folder : link
for folder , link in get_invalid_folders ( all_links , out_dir = out_dir ) . items ( )
}
if invalid_folders :
2024-10-05 04:34:19 +00:00
print ( f ' [red]! Skipped adding { len ( invalid_folders ) } invalid link data directories.[/red] ' )
2024-09-30 22:59:05 +00:00
print ( ' X ' + ' \n X ' . join ( f ' ./ { Path ( folder ) . relative_to ( DATA_DIR ) } { link } ' for folder , link in invalid_folders . items ( ) ) )
2021-02-18 07:34:42 +00:00
print ( )
2024-10-05 04:34:19 +00:00
print ( ' [violet]Hint:[/violet] For more information about the link data directories that were skipped, run: ' )
2021-02-18 07:34:42 +00:00
print ( ' archivebox status ' )
print ( ' archivebox list --status=invalid ' )
# An interrupt during the scan is reported on stderr and converted to exit code 1.
except ( KeyboardInterrupt , SystemExit ) :
2024-10-05 04:34:19 +00:00
print ( file = sys . stderr )
print ( ' [yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow] ' , file = sys . stderr )
print ( ' Your archive data is safe, but you should re-run `archivebox init` to finish the process later. ' , file = sys . stderr )
print ( file = sys . stderr )
print ( ' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so: ' , file = sys . stderr )
print ( ' archivebox init --quick ' , file = sys . stderr )
2021-02-18 07:34:42 +00:00
raise SystemExit ( 1 )
2021-02-15 19:52:10 +00:00
# Hand the links collected in pending_links to write_main_index.
write_main_index ( list ( pending_links . values ( ) ) , out_dir = out_dir )
2019-04-27 21:26:24 +00:00
2024-10-05 04:34:19 +00:00
print ( ' \n [green]----------------------------------------------------------------------[/green] ' )
2023-10-18 08:07:54 +00:00
# Create a superuser from ADMIN_USERNAME/ADMIN_PASSWORD when both are set and no user with
# that username exists yet.
from django . contrib . auth . models import User
2024-09-25 07:42:26 +00:00
if ( SERVER_CONFIG . ADMIN_USERNAME and SERVER_CONFIG . ADMIN_PASSWORD ) and not User . objects . filter ( username = SERVER_CONFIG . ADMIN_USERNAME ) . exists ( ) :
2024-10-05 04:34:19 +00:00
print ( ' [green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green] ' )
2024-09-25 07:42:26 +00:00
User . objects . create_superuser ( username = SERVER_CONFIG . ADMIN_USERNAME , password = SERVER_CONFIG . ADMIN_PASSWORD )
2023-10-18 08:07:54 +00:00
2019-04-27 21:26:24 +00:00
if existing_index :
2024-10-05 04:34:19 +00:00
print ( ' [green][√] Done. Verified and updated the existing ArchiveBox collection.[/green] ' )
2019-04-27 21:26:24 +00:00
else :
2024-10-05 04:34:19 +00:00
print ( f ' [green][√] Done. A new ArchiveBox collection was initialized ( { len ( all_links ) + len ( pending_links ) } links).[/green] ' )
2019-04-27 21:26:24 +00:00
2024-09-30 22:59:05 +00:00
# Legacy JSON/HTML index files are renamed to "<today>_index_old.<ext>".
# NOTE(review): the rename target is a bare relative filename. pathlib resolves relative rename
# targets against the current working directory, not against out_dir, so if out_dir is not the
# CWD the file is moved out of the collection. Likely should be out_dir / f"...". TODO confirm.
json_index = out_dir / CONSTANTS . JSON_INDEX_FILENAME
html_index = out_dir / CONSTANTS . HTML_INDEX_FILENAME
2020-10-19 15:18:11 +00:00
index_name = f " { date . today ( ) } _index_old "
if json_index . exists ( ) :
json_index . rename ( f " { index_name } .json " )
if html_index . exists ( ) :
html_index . rename ( f " { index_name } .html " )
2024-10-01 06:19:11 +00:00
# Optionally chain straight into the `install` subcommand.
if install :
run_subcommand ( ' install ' , pwd = out_dir )
2021-04-06 03:21:07 +00:00
2021-04-06 03:17:07 +00:00
if Snapshot . objects . count ( ) < 25 : # hide the hints for experienced users
print ( )
2024-10-05 04:34:19 +00:00
print ( ' [violet]Hint:[/violet] To view your archive index, run: ' )
2024-10-05 04:09:29 +00:00
print ( ' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4] ' )
2021-04-06 03:17:07 +00:00
print ( )
print ( ' To add new links, you can run: ' )
2022-04-19 21:25:49 +00:00
print ( " archivebox add < ~/some/path/to/list_of_links.txt " )
2021-04-06 03:17:07 +00:00
print ( )
print ( ' For more usage and examples, run: ' )
print ( ' archivebox help ' )
2019-04-27 21:26:24 +00:00
2019-05-01 03:10:48 +00:00
# status(): print a report about the collection in out_dir: index size and link counts,
# archive directory size, per-status folder counts, admin users / last login, and the ten
# most recently downloaded snapshots.
#   out_dir: data directory whose index files are inspected.
# Unlike the functions above that use rich markup, output here is coloured by formatting
# SHELL_CONFIG.ANSI escape codes into the strings and printing with the builtin print.
# The docstring is left unchanged because docstrings in this module are read at runtime (func.__doc__).
@enforce_types
2024-09-30 22:59:05 +00:00
def status ( out_dir : Path = DATA_DIR ) - > None :
2019-05-01 03:10:48 +00:00
""" Print out some info and statistics about the archive collection """
2024-10-01 06:21:34 +00:00
# Presumably aborts when not inside a valid data folder; see archivebox.misc.checks to confirm.
check_data_folder ( )
2019-04-27 21:26:24 +00:00
2020-06-26 03:32:01 +00:00
from core . models import Snapshot
2024-09-27 07:41:21 +00:00
from django . contrib . auth import get_user_model
2020-06-26 03:32:01 +00:00
User = get_user_model ( )
2024-09-30 22:59:05 +00:00
# --- Section 1: main index files directly inside out_dir ---
print ( ' {green} [*] Scanning archive main index... {reset} ' . format ( * * SHELL_CONFIG . ANSI ) )
print ( SHELL_CONFIG . ANSI [ ' lightyellow ' ] , f ' { out_dir } /* ' , SHELL_CONFIG . ANSI [ ' reset ' ] )
2019-04-27 21:26:24 +00:00
# Non-recursive size scan restricted by the 'index.' pattern argument.
num_bytes , num_dirs , num_files = get_dir_size ( out_dir , recursive = False , pattern = ' index. ' )
2019-05-01 03:13:04 +00:00
size = printable_filesize ( num_bytes )
2020-06-26 03:32:01 +00:00
print ( f ' Index size: { size } across { num_files } files ' )
2019-04-27 21:26:24 +00:00
print ( )
2020-08-20 14:18:25 +00:00
links = load_main_index ( out_dir = out_dir )
num_sql_links = links . count ( )
2019-04-27 21:26:24 +00:00
# Counts the items yielded by parse_json_links_details without building a list.
num_link_details = sum ( 1 for link in parse_json_links_details ( out_dir = out_dir ) )
2024-09-30 22:59:05 +00:00
print ( f ' > SQL Main Index: { num_sql_links } links ' . ljust ( 36 ) , f ' (found in { CONSTANTS . SQL_INDEX_FILENAME } ) ' )
2024-09-25 12:10:09 +00:00
print ( f ' > JSON Link Details: { num_link_details } links ' . ljust ( 36 ) , f ' (found in { ARCHIVE_DIR . name } /*/index.json) ' )
2019-04-27 21:26:24 +00:00
print ( )
2024-09-30 22:59:05 +00:00
# --- Section 2: archive data directories ---
print ( ' {green} [*] Scanning archive data directories... {reset} ' . format ( * * SHELL_CONFIG . ANSI ) )
print ( SHELL_CONFIG . ANSI [ ' lightyellow ' ] , f ' { ARCHIVE_DIR } /* ' , SHELL_CONFIG . ANSI [ ' reset ' ] )
2019-04-27 21:26:24 +00:00
num_bytes , num_dirs , num_files = get_dir_size ( ARCHIVE_DIR )
2019-05-01 03:13:04 +00:00
size = printable_filesize ( num_bytes )
2019-04-27 21:26:24 +00:00
print ( f ' Size: { size } across { num_files } files in { num_dirs } directories ' )
2024-09-30 22:59:05 +00:00
# Emits the 'black' ANSI code; it stays in effect for the counts below until the 'reset' print.
print ( SHELL_CONFIG . ANSI [ ' black ' ] )
2019-04-27 21:26:24 +00:00
# Each count line is padded to 36 columns and followed by the docstring of the function
# that produced it, which serves as the explanation of that status.
num_indexed = len ( get_indexed_folders ( links , out_dir = out_dir ) )
num_archived = len ( get_archived_folders ( links , out_dir = out_dir ) )
num_unarchived = len ( get_unarchived_folders ( links , out_dir = out_dir ) )
print ( f ' > indexed: { num_indexed } ' . ljust ( 36 ) , f ' ( { get_indexed_folders . __doc__ } ) ' )
print ( f ' > archived: { num_archived } ' . ljust ( 36 ) , f ' ( { get_archived_folders . __doc__ } ) ' )
print ( f ' > unarchived: { num_unarchived } ' . ljust ( 36 ) , f ' ( { get_unarchived_folders . __doc__ } ) ' )
num_present = len ( get_present_folders ( links , out_dir = out_dir ) )
num_valid = len ( get_valid_folders ( links , out_dir = out_dir ) )
print ( )
print ( f ' > present: { num_present } ' . ljust ( 36 ) , f ' ( { get_present_folders . __doc__ } ) ' )
print ( f ' > valid: { num_valid } ' . ljust ( 36 ) , f ' ( { get_valid_folders . __doc__ } ) ' )
duplicate = get_duplicate_folders ( links , out_dir = out_dir )
orphaned = get_orphaned_folders ( links , out_dir = out_dir )
corrupted = get_corrupted_folders ( links , out_dir = out_dir )
unrecognized = get_unrecognized_folders ( links , out_dir = out_dir )
# Merging the four dicts means a folder key present in several categories is counted once.
num_invalid = len ( { * * duplicate , * * orphaned , * * corrupted , * * unrecognized } )
print ( f ' > invalid: { num_invalid } ' . ljust ( 36 ) , f ' ( { get_invalid_folders . __doc__ } ) ' )
print ( f ' > duplicate: { len ( duplicate ) } ' . ljust ( 36 ) , f ' ( { get_duplicate_folders . __doc__ } ) ' )
print ( f ' > orphaned: { len ( orphaned ) } ' . ljust ( 36 ) , f ' ( { get_orphaned_folders . __doc__ } ) ' )
print ( f ' > corrupted: { len ( corrupted ) } ' . ljust ( 36 ) , f ' ( { get_corrupted_folders . __doc__ } ) ' )
print ( f ' > unrecognized: { len ( unrecognized ) } ' . ljust ( 36 ) , f ' ( { get_unrecognized_folders . __doc__ } ) ' )
2020-06-26 03:32:01 +00:00
2024-09-30 22:59:05 +00:00
print ( SHELL_CONFIG . ANSI [ ' reset ' ] )
2020-06-26 03:32:01 +00:00
2019-04-27 21:26:24 +00:00
# Follow-up hints, each shown only when the corresponding count/dict is non-empty.
if num_indexed :
2024-09-30 22:59:05 +00:00
print ( ' {lightred} Hint: {reset} You can list link data directories by status like so: ' . format ( * * SHELL_CONFIG . ANSI ) )
2019-04-27 21:26:24 +00:00
print ( ' archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.) ' )
if orphaned :
2024-09-30 22:59:05 +00:00
print ( ' {lightred} Hint: {reset} To automatically import orphaned data directories into the main index, run: ' . format ( * * SHELL_CONFIG . ANSI ) )
2019-04-27 21:26:24 +00:00
print ( ' archivebox init ' )
if num_invalid :
2024-09-30 22:59:05 +00:00
print ( ' {lightred} Hint: {reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run: ' . format ( * * SHELL_CONFIG . ANSI ) )
2019-04-27 21:26:24 +00:00
print ( ' archivebox init ' )
print ( )
2024-09-30 22:59:05 +00:00
# --- Section 3: users and recent activity ---
print ( ' {green} [*] Scanning recent archive changes and user logins: {reset} ' . format ( * * SHELL_CONFIG . ANSI ) )
print ( SHELL_CONFIG . ANSI [ ' lightyellow ' ] , f ' { CONSTANTS . LOGS_DIR } /* ' , SHELL_CONFIG . ANSI [ ' reset ' ] )
2020-06-26 03:32:01 +00:00
users = get_admins ( ) . values_list ( ' username ' , flat = True )
print ( f ' UI users { len ( users ) } : { " , " . join ( users ) } ' )
# NOTE(review): the user picked by order_by(...).last() may have a null last_login (never
# logged in), in which case the line below would print "None". TODO confirm null ordering.
last_login = User . objects . order_by ( ' last_login ' ) . last ( )
2020-07-24 17:25:25 +00:00
if last_login :
print ( f ' Last UI login: { last_login . username } @ { str ( last_login . last_login ) [ : 16 ] } ' )
2024-09-05 06:42:36 +00:00
last_downloaded = Snapshot . objects . order_by ( ' downloaded_at ' ) . last ( )
if last_downloaded :
print ( f ' Last changes: { str ( last_downloaded . downloaded_at ) [ : 16 ] } ' )
2020-06-26 03:32:01 +00:00
if not users :
print ( )
2024-09-30 22:59:05 +00:00
print ( ' {lightred} Hint: {reset} You can create an admin user by running: ' . format ( * * SHELL_CONFIG . ANSI ) )
2020-06-26 03:32:01 +00:00
print ( ' archivebox manage createsuperuser ' )
print ( )
2024-09-05 06:42:36 +00:00
# Up to ten snapshots, newest downloaded_at first; entries without a downloaded_at are skipped.
for snapshot in links . order_by ( ' -downloaded_at ' ) [ : 10 ] :
if not snapshot . downloaded_at :
2020-06-26 03:32:01 +00:00
continue
# The tuple indexed by snapshot.is_archived picks the second marker when truthy, the first
# otherwise; the assembled line is cut to SHELL_CONFIG.TERM_WIDTH characters.
print (
2024-09-30 22:59:05 +00:00
SHELL_CONFIG . ANSI [ ' black ' ] ,
2020-06-26 03:32:01 +00:00
(
2024-09-05 06:42:36 +00:00
f ' > { str ( snapshot . downloaded_at ) [ : 16 ] } '
2020-06-26 03:32:01 +00:00
f ' [ { snapshot . num_outputs } { ( " X " , " √ " ) [ snapshot . is_archived ] } { printable_filesize ( snapshot . archive_size ) } ] '
f ' " { snapshot . title } " : { snapshot . url } '
2024-09-25 12:10:09 +00:00
) [ : SHELL_CONFIG . TERM_WIDTH ] ,
2024-09-30 22:59:05 +00:00
SHELL_CONFIG . ANSI [ ' reset ' ] ,
2020-06-26 03:32:01 +00:00
)
2024-09-30 22:59:05 +00:00
print ( SHELL_CONFIG . ANSI [ ' black ' ] , ' ... ' , SHELL_CONFIG . ANSI [ ' reset ' ] )
2019-04-27 21:26:24 +00:00
2020-07-29 16:19:06 +00:00
@enforce_types
def oneshot(url: str, extractors: str = "", out_dir: Path = DATA_DIR, created_by_id: int | None = None) -> List[Link]:
    """
    Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
    You can run this to archive single pages without needing to create a whole collection with archivebox init.

    `extractors` is a comma-separated list of archive methods; when empty,
    the result of ignore_methods(['title']) is used instead.

    Returns the list of parsed links (exactly one element).
    Raises SystemExit(2) when the input does not parse to exactly one link.
    """
    oneshot_link, _ = parse_links_memory([url])

    # Reject both "several URLs" and "nothing parseable": indexing [0] below
    # would otherwise fail with an unhelpful IndexError on empty results.
    if len(oneshot_link) != 1:
        stderr(
            '[X] You should pass a single url to the oneshot command',
            color='red'
        )
        raise SystemExit(2)

    methods = extractors.split(",") if extractors else ignore_methods(['title'])
    archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, created_by_id=created_by_id)
    return oneshot_link
2020-07-29 16:19:06 +00:00
2019-04-27 21:26:24 +00:00
@enforce_types
def add(urls: Union[str, List[str]],
        tag: str = '',
        depth: int = 0,
        update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
        update_all: bool = False,
        index_only: bool = False,
        overwrite: bool = False,
        # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
        init: bool = False,
        extractors: str = "",
        parser: str = "auto",
        created_by_id: int | None = None,
        out_dir: Path = DATA_DIR) -> List[Link]:
    """Add a new URL or list of URLs to your archive

    The raw input is first saved verbatim as a source file, parsed into links,
    optionally crawled one level deep (depth=1), de-duplicated against the
    existing index, written to the main index, tagged, and finally archived.

    Args:
        urls: text blob (e.g. stdin) or list of URL strings to import.
        tag: comma-separated tag names applied to every imported snapshot.
        depth: 0 to import only the given URLs, 1 to also import links found
            on each of those pages.
        update: archive every imported link, including already-indexed ones.
        update_all: archive every link in the whole index.
        index_only: only run the 'index_only' method instead of real extractors.
        overwrite: passed through to archive_links.
        init: run the `init` subcommand in out_dir first.
        extractors: comma-separated archive methods to run (default: all).
        parser: parser name passed to parse_links_from_source.
        created_by_id: user id recorded on created index rows and tags.
        out_dir: data directory to operate on.

    Returns:
        The links that were not already present in the index.
    """
    # Local alias so the stdlib tzinfo cannot be confused with the
    # module-level `django.utils.timezone` import, whose `.utc` alias was
    # removed in Django 5.0.
    from datetime import timezone as dt_timezone

    from core.models import Snapshot, Tag
    # from queues.supervisor_util import start_cli_workers, tail_worker_logs
    # from queues.tasks import bg_archive_link

    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

    methods = extractors.split(",") if extractors else []

    if init:
        run_subcommand('init', stdin=None, pwd=out_dir)

    # Load list of links from the existing index
    check_data_folder()

    # worker = start_cli_workers()

    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)

    log_importing_started(urls=urls, depth=depth, index_only=index_only)

    # save the verbatim stdin text / CLI args to sources before parsing anything
    raw_text = urls if isinstance(urls, str) else '\n'.join(urls)
    write_ahead_log = save_text_as_source(raw_text, filename='{ts}-import.txt', out_dir=out_dir)

    new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)

    # If we're going one level deeper, download each link and look for more links
    new_links_depth = []
    if new_links and depth == 1:
        log_crawl_started(new_links)
        for new_link in new_links:
            try:
                downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
                new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
            except Exception as err:
                # best-effort crawl: report the failing URL and keep going
                stderr(f'[!] Failed to get contents of URL {new_link.url}', err, color='red')

    # de-duplicate by URL (the last occurrence of each URL wins)
    imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())

    new_links = dedupe_links(all_links, imported_links)

    write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
    all_links = load_main_index(out_dir=out_dir)

    tags = [
        Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
        for name in tag.split(',')
        if name.strip()
    ]
    if tags:
        for link in imported_links:
            snapshot = Snapshot.objects.get(url=link.url)
            snapshot.tags.add(*tags)
            snapshot.tags_str(nocache=True)
            snapshot.save()
        # print(f'    √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')

    if index_only:
        # mock archive all the links using the fake index_only extractor method in order to update their state
        if overwrite:
            archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
        else:
            archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
    else:
        # fully run the archive extractor methods for each link
        archive_kwargs = {
            "out_dir": out_dir,
            "created_by_id": created_by_id,
        }
        if methods:
            archive_kwargs["methods"] = methods

        stderr()

        ts = datetime.now(dt_timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

        if update:
            stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
            archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
        elif update_all:
            stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
            archive_links(all_links, overwrite=overwrite, **archive_kwargs)
        elif overwrite:
            stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
            archive_links(imported_links, overwrite=True, **archive_kwargs)
        elif new_links:
            stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
            archive_links(new_links, overwrite=False, **archive_kwargs)

    # tail_worker_logs(worker['stdout_logfile'])

    # if CAN_UPGRADE:
    #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")

    return new_links
2019-04-27 21:26:24 +00:00
@enforce_types
def remove(filter_str: Optional[str] = None,
           filter_patterns: Optional[List[str]] = None,
           filter_type: str = 'exact',
           snapshots: Optional[QuerySet] = None,
           after: Optional[float] = None,
           before: Optional[float] = None,
           yes: bool = False,
           delete: bool = False,
           out_dir: Path = DATA_DIR) -> List[Link]:
    """Remove the specified URLs from the archive

    Either pass `snapshots` (an already-selected QuerySet) or describe the
    selection with `filter_str` (newline-separated patterns) / `filter_patterns`.
    With delete=True the snapshot folders are removed from disk as well.

    Returns whatever load_main_index() yields after the removal.
    Raises SystemExit(2) on invalid arguments and SystemExit(1) when nothing
    matches.
    """

    check_data_folder()

    if snapshots is None:
        if filter_str and filter_patterns:
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin, but not both.\n',
                color='red',
            )
            raise SystemExit(2)
        elif not (filter_str or filter_patterns):
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin.',
                color='red',
            )
            stderr()
            hint(('To remove all urls you can run:',
                  'archivebox remove --filter-type=regex ".*"'))
            stderr()
            raise SystemExit(2)
        elif filter_str:
            filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
    elif not snapshots.exists():
        # An explicitly empty selection must never widen to the whole index:
        # an empty QuerySet is falsy, so it used to be dropped below and
        # list_links() would then fall back to every snapshot.
        log_removal_finished(0, 0)
        raise SystemExit(1)

    list_kwargs = {
        "filter_patterns": filter_patterns,
        "filter_type": filter_type,
        "after": after,
        "before": before,
    }
    if snapshots is not None:
        list_kwargs["snapshots"] = snapshots

    log_list_started(filter_patterns, filter_type)
    timer = TimedProgress(360, prefix='      ')
    try:
        snapshots = list_links(**list_kwargs)
    finally:
        timer.end()

    if not snapshots.exists():
        log_removal_finished(0, 0)
        raise SystemExit(1)

    log_links = [link.as_link() for link in snapshots]
    log_list_finished(log_links)
    log_removal_started(log_links, yes=yes, delete=delete)

    timer = TimedProgress(360, prefix='      ')
    try:
        if delete:
            for snapshot in snapshots:
                shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
    finally:
        timer.end()

    # count before the rows are deleted from the index
    to_remove = snapshots.count()

    from .search import flush_search_index

    flush_search_index(snapshots=snapshots)
    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
    all_snapshots = load_main_index(out_dir=out_dir)
    log_removal_finished(all_snapshots.count(), to_remove)

    return all_snapshots
2019-04-27 21:26:24 +00:00
@enforce_types
def update(resume: Optional[float] = None,
           only_new: bool = ARCHIVING_CONFIG.ONLY_NEW,
           index_only: bool = False,
           overwrite: bool = False,
           filter_patterns_str: Optional[str] = None,
           filter_patterns: Optional[List[str]] = None,
           filter_type: Optional[str] = None,
           status: Optional[str] = None,
           after: Optional[str] = None,
           before: Optional[str] = None,
           extractors: str = "",
           out_dir: Path = DATA_DIR) -> List[Link]:
    """Import any new links from subscriptions and retry any previously failed/skipped links

    Selects snapshots with list_links()/list_folders(), orders them by the
    number of existing ArchiveResults (fewest first) and then timestamp, and
    re-runs the archive methods on them.

    With index_only=True only the per-link details and the search index are
    rewritten. `resume` limits archiving to links whose timestamp is >= it.
    `extractors` is a comma-separated list of archive methods (default: all).

    NOTE(review): `filter_patterns_str` is accepted but never read here.
    """

    from core.models import ArchiveResult
    from .search import index_links
    # from .queues.supervisor_util import start_cli_workers

    check_data_folder()
    # start_cli_workers()
    new_links: List[Link] = []  # TODO: Remove input argument: only_new

    methods = extractors.split(",") if extractors else []

    # Step 1: Filter for selected_links
    print('[*] Finding matching Snapshots to update...')
    # filter_patterns defaults to None, which str.join() cannot iterate
    patterns_text = " ".join(filter_patterns or ())
    print(f'    - Filtering by {patterns_text} ({filter_type}) {before=} {after=} {status=}...')
    matching_snapshots = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )
    print(f'    - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
    matching_folders = list_folders(
        links=matching_snapshots,
        status=status,
        out_dir=out_dir,
    )
    all_links = (link for link in matching_folders.values() if link)
    print('    - Sorting by most unfinished -> least unfinished + date archived...')
    all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))

    if index_only:
        for link in all_links:
            write_link_details(link, out_dir=out_dir, skip_sql_index=True)
        index_links(all_links, out_dir=out_dir)
        return all_links

    # Step 2: Run the archive methods for each link
    to_archive = new_links if only_new else all_links
    if resume:
        to_archive = [
            link for link in to_archive
            if link.timestamp >= str(resume)
        ]
        if not to_archive:
            stderr('')
            stderr(f'[√] Nothing found to resume after {resume}', color='green')
            return all_links

    archive_kwargs = {
        "out_dir": out_dir,
    }
    if methods:
        archive_kwargs["methods"] = methods

    archive_links(to_archive, overwrite=overwrite, **archive_kwargs)

    # Step 3: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=out_dir)
    return all_links
@enforce_types
def list_all(filter_patterns_str: Optional[str] = None,
             filter_patterns: Optional[List[str]] = None,
             filter_type: str = 'exact',
             status: Optional[str] = None,
             after: Optional[float] = None,
             before: Optional[float] = None,
             sort: Optional[str] = None,
             csv: Optional[str] = None,
             json: bool = False,
             html: bool = False,
             with_headers: bool = False,
             out_dir: Path = DATA_DIR) -> Iterable[Link]:
    """List, filter, and export information about archive entries

    Patterns come either from `filter_patterns` or from the newline-separated
    `filter_patterns_str`, never both (SystemExit(2) otherwise). The matching
    folders are printed as JSON, HTML, CSV (`csv` is the comma-separated
    column list) or a plain listing, and then returned.
    """

    check_data_folder()

    if filter_patterns_str:
        if filter_patterns:
            stderr(
                '[X] You should either pass filter patterns as an arguments '
                'or via stdin, but not both.\n',
                color='red',
            )
            raise SystemExit(2)
        filter_patterns = filter_patterns_str.split('\n')

    matched = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )
    if sort:
        matched = matched.order_by(sort)

    folders = list_folders(links=matched, status=status, out_dir=out_dir)

    # first requested export format wins, in the order json > html > csv
    if json:
        rendered = generate_json_index_from_links(folders.values(), with_headers)
    elif html:
        rendered = generate_index_from_links(folders.values(), with_headers)
    elif csv:
        columns = csv.split(',')
        rendered = links_to_csv(folders.values(), cols=columns, header=with_headers)
    else:
        rendered = printable_folders(folders, with_headers=with_headers)

    print(rendered)
    return folders
2019-04-27 21:26:24 +00:00
@enforce_types
def list_links(snapshots: Optional[QuerySet] = None,
               filter_patterns: Optional[List[str]] = None,
               filter_type: str = 'exact',
               after: Optional[float] = None,
               before: Optional[float] = None,
               out_dir: Path = DATA_DIR) -> Iterable[Link]:
    """Return the snapshots that match the given filters.

    Starts from `snapshots` when one is passed, otherwise from the main index
    loaded from `out_dir`. `after` / `before` bound the snapshot timestamp
    (after is inclusive, before is exclusive) and `filter_patterns` are
    applied through snapshot_filter() using `filter_type`.

    A warning is written to stderr when nothing matches.
    """

    check_data_folder()

    # Compare against None instead of truthiness: an explicitly passed but
    # empty QuerySet must stay empty rather than silently expanding to the
    # entire index (and truthiness would also evaluate the query early).
    if snapshots is not None:
        all_snapshots = snapshots
    else:
        all_snapshots = load_main_index(out_dir=out_dir)

    if after is not None:
        all_snapshots = all_snapshots.filter(timestamp__gte=after)
    if before is not None:
        all_snapshots = all_snapshots.filter(timestamp__lt=before)
    if filter_patterns:
        all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)

    if not all_snapshots:
        stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')

    return all_snapshots
2019-04-27 21:26:24 +00:00
@enforce_types
def list_folders(links: List[Link],
                 status: str,
                 out_dir: Path = DATA_DIR) -> Dict[str, Optional[Link]]:
    """Return the folders for `links` that are in the given `status`.

    `status` selects one of the get_*_folders helpers: indexed, archived,
    unarchived, present, valid, invalid, duplicate, orphaned, corrupted or
    unrecognized.

    Raises ValueError when `status` is not one of those names.
    """

    check_data_folder()

    STATUS_FUNCTIONS = {
        "indexed": get_indexed_folders,
        "archived": get_archived_folders,
        "unarchived": get_unarchived_folders,
        "present": get_present_folders,
        "valid": get_valid_folders,
        "invalid": get_invalid_folders,
        "duplicate": get_duplicate_folders,
        "orphaned": get_orphaned_folders,
        "corrupted": get_corrupted_folders,
        "unrecognized": get_unrecognized_folders,
    }

    # Only the lookup is guarded: a KeyError raised *inside* the selected
    # helper is a real failure and must not be reported as a bad status.
    try:
        get_folders = STATUS_FUNCTIONS[status]
    except KeyError:
        raise ValueError('Status not recognized.') from None

    return get_folders(links, out_dir=out_dir)
2019-04-27 21:26:24 +00:00
2021-04-05 23:51:57 +00:00
@enforce_types
def install(out_dir: Path = DATA_DIR) -> None:
    """Automatically install all ArchiveBox dependencies and extras"""

    from rich import print
    from django.conf import settings

    # a missing archive dir means the collection has not been initialized yet
    archive_dir_missing = not ARCHIVE_DIR.exists()
    if archive_dir_missing:
        run_subcommand('init', stdin=None, pwd=out_dir)

    stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')

    # fields left out of the per-binary summary that gets printed
    hidden_fields = {
        'binproviders_supported',
        'loaded_binprovider',
        'provider_overrides',
        'loaded_abspaths',
        'bin_dir',
        'loaded_respath',
        'hook_type',
    }

    # walk the configured binaries in reverse order of settings.BINARIES
    for binary in reversed(list(settings.BINARIES.values())):
        provider_names = (provider.name for provider in binary.binproviders_supported)
        providers = ' [grey53]or[/grey53] '.join(provider_names)
        print(f'[+] Locating / Installing [yellow]{binary.name}[/yellow] using [red]{providers}[/red]...')
        try:
            print(binary.load_or_install().model_dump(exclude=hidden_fields))
        except Exception as e:
            # best-effort: report the failure and continue with the next binary
            print(f'[X] Failed to install {binary.name}: {e}')

    from django.contrib.auth import get_user_model
    User = get_user_model()

    has_superuser = User.objects.filter(is_superuser=True).exists()
    if not has_superuser:
        stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
        stderr('    archivebox manage createsuperuser')
        # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)

    stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')

    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY

    # finish by running `archivebox version` with its output shown directly
    version_cmd = [ARCHIVEBOX_BINARY.load().abspath, 'version']
    run_shell(version_cmd, capture_output=False, cwd=out_dir)
2019-04-27 21:26:24 +00:00
2024-10-01 06:21:34 +00:00
# backwards-compatibility: `setup` is kept as an alias of install()
setup = install
2019-05-01 03:10:48 +00:00
@enforce_types
def config(config_options_str: Optional[str] = None,
           config_options: Optional[List[str]] = None,
           get: bool = False,
           set: bool = False,
           reset: bool = False,
           out_dir: Path = DATA_DIR) -> None:
    """Get and set your ArchiveBox project configuration values

    Options come either from `config_options` or from the newline-separated
    `config_options_str`, never both. With `get` (or no arguments at all) the
    matching values are printed; with `set` each `KEY=VALUE` line is written
    to the config file. `reset` is not implemented.

    Always ends by raising SystemExit, except after a `set` where no option
    failed.
    """

    from rich import print

    check_data_folder()
    if config_options and config_options_str:
        stderr(
            '[X] You should either pass config values as an arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif config_options_str:
        config_options = config_options_str.split('\n')

    from django.conf import settings

    config_options = config_options or []

    no_args = not (get or set or reset or config_options)

    matching_config = {}
    if get or no_args:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG}
            failed_config = [key for key in config_options if key not in settings.FLAT_CONFIG]
            if failed_config:
                stderr()
                stderr('[X] These options failed to get', color='red')
                # list only the keys that could not be found, not every requested key
                stderr('    {}'.format('\n    '.join(failed_config)))
                raise SystemExit(1)
        else:
            matching_config = settings.FLAT_CONFIG

        print(printable_config(matching_config))
        # exit status 0 when something was printed, 1 when the result is empty
        raise SystemExit(not matching_config)
    elif set:
        new_config = {}
        failed_options = []
        for line in config_options:
            if line.startswith('#') or not line.strip():
                continue
            if '=' not in line:
                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
                stderr(f'    {line}')
                raise SystemExit(2)

            # split on the first '=' only so values may contain '=' themselves
            raw_key, val = line.split('=', 1)
            raw_key = raw_key.upper().strip()
            key = get_real_name(raw_key)
            if key != raw_key:
                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')

            if key in settings.FLAT_CONFIG:
                new_config[key] = val.strip()
            else:
                failed_options.append(line)

        if new_config:
            before = settings.FLAT_CONFIG
            matching_config = write_config_file(new_config, out_dir=DATA_DIR)
            after = load_all_config()
            print(printable_config(matching_config))

            # options whose value changed although they were not set explicitly
            side_effect_changes = {
                key: value
                for key, value in after.items()
                if key in settings.FLAT_CONFIG and (before[key] != value) and (key not in matching_config)
            }

            if side_effect_changes:
                stderr()
                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
                print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))
        if failed_options:
            stderr()
            stderr('[X] These options failed to set (check for typos):', color='red')
            stderr('    {}'.format('\n    '.join(failed_options)))
            raise SystemExit(1)
    elif reset:
        stderr('[X] This command is not implemented yet.', color='red')
        stderr('    Please manually remove the relevant lines from your config file:')
        raise SystemExit(2)
    else:
        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
        stderr('    archivebox config')
        stderr('    archivebox config --get SOME_KEY')
        stderr('    archivebox config --set SOME_KEY=SOME_VALUE')
        raise SystemExit(2)
@enforce_types
def schedule(add: bool = False,
             show: bool = False,
             clear: bool = False,
             foreground: bool = False,
             run_all: bool = False,
             quiet: bool = False,
             every: Optional[str] = None,
             tag: str = '',
             depth: int = 0,
             overwrite: bool = False,
             update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
             import_path: Optional[str] = None,
             out_dir: Path = DATA_DIR):
    """Set ArchiveBox to regularly import URLs at specific times using cron

    Modes:
        clear: remove every ArchiveBox cron job and exit.
        every / add: add a job running `archivebox add <import_path>` (or
            `archivebox update` when no import_path is given). `every` is one
            of minute/hour/day/month/year or a cron-format schedule and
            defaults to 'day'.
        show: print the existing ArchiveBox jobs and exit.
        run_all: run each existing job once, right now.
        foreground: run the cron scheduler in the foreground.

    Raises SystemExit for `clear`, `show`, an invalid `every`, a foreground
    run without jobs, and on KeyboardInterrupt while running jobs.
    """

    check_data_folder()
    from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY

    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)

    if clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)

    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if every or add:
        every = every or 'day'

        def quoted(value) -> str:
            """Wrap value in double quotes when its text contains a space."""
            text = str(value)
            return f'"{text}"' if (value and ' ' in text) else text

        if import_path:
            subcommand = [
                'add',
                *(['--overwrite'] if overwrite else []),
                *(['--update'] if update else []),
                *([f'--tag={tag}'] if tag else []),
                f'--depth={depth}',
                f'"{import_path}"',
            ]
        else:
            subcommand = ['update']

        cmd = [
            'cd',
            quoted(out_dir),
            '&&',
            quoted(ARCHIVEBOX_BINARY.load().abspath),
            *subcommand,
            '>>',
            quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
            '2>&1',
        ]
        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)

        if every in ('minute', 'hour', 'day', 'month', 'year'):
            # e.g. every='day' -> new_job.every().day()
            set_every = getattr(new_job.every(), every)
            set_every()
        elif CronSlices.is_valid(every):
            new_job.setall(every)
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI))
            stderr('    It must be one of minute/hour/day/month/year')
            stderr('    or a quoted cron-format schedule like:')
            stderr('        archivebox schedule --every=day --depth=1 https://example.com/some/rss/feed.xml')
            stderr('        archivebox schedule --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_cron_jobs(cron)
        cron.write()

        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(SHELL_CONFIG.USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
        # mark the job that was just added with a leading '>'
        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI))
            stderr('    Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr('    Make sure you have enough storage space available to hold all the data.')
            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
            stderr('')
    elif show:
        if existing_jobs:
            print('\n'.join(str(cmd) for cmd in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(SHELL_CONFIG.USER, **SHELL_CONFIG.ANSI))
            stderr('    To schedule a new job, run:')
            stderr('        archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
        raise SystemExit(0)

    # re-read the crontab so jobs written above are included
    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)
    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if foreground or run_all:
        if not existing_jobs:
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
            stderr('    archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI))
        if run_all:
            try:
                for job in existing_jobs:
                    sys.stdout.write(f'  > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n')
                    sys.stdout.write(f'    > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
                    sys.stdout.flush()
                    job.run()
                    # '\r' rewrites the pending '>' line as finished
                    sys.stdout.write(f'\r    √ {job.command.split("/archivebox ")[-1]}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                raise SystemExit(1)

        if foreground:
            try:
                for job in existing_jobs:
                    print(f'  > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                raise SystemExit(1)
2024-09-25 07:42:26 +00:00
# if CAN_UPGRADE:
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
2023-11-28 04:58:13 +00:00
2020-08-18 05:59:04 +00:00
2019-05-01 03:10:48 +00:00
@enforce_types
def server(runserver_args: Optional[List[str]] = None,
           reload: bool = False,
           debug: bool = False,
           init: bool = False,
           quick_init: bool = False,
           createsuperuser: bool = False,
           daemonize: bool = False,
           out_dir: Path = DATA_DIR) -> None:
    """Run the ArchiveBox HTTP server.

    Optionally runs the `init` and/or `manage createsuperuser` subcommands
    first, then starts either Django's `runserver` management command (when
    SHELL_CONFIG.DEBUG is truthy) or the supervisor-managed server workers.

    Args:
        runserver_args: Extra arguments. In the debug branch they are passed
            through to `runserver`; otherwise they are only scanned for a
            host and/or port override.
        reload: Debug branch only. When false, the no-reload flag is appended
            to the `runserver` arguments.
        debug: Accepted for interface compatibility but not read here; the
            branch is selected by SHELL_CONFIG.DEBUG.
        init: Run the `init` subcommand in `out_dir` before starting.
        quick_init: Run `init` with the quick flag instead. Ignored when
            `init` is also set.
        createsuperuser: Run `manage createsuperuser` before starting.
        daemonize: Forwarded to `start_server_workers`.
        out_dir: Working directory handed to the subcommands above.
    """

    # Deliberately shadows the builtin inside this function so the
    # [green]...[/green] style markup in the messages below goes through rich.
    from rich import print

    # Work on a copy: a flag may be appended below, and the caller's list
    # must not be modified as a side effect of starting the server.
    runserver_args = list(runserver_args) if runserver_args else []

    if init:
        run_subcommand(' init ', stdin=None, pwd=out_dir)
        print()
    elif quick_init:
        run_subcommand(' init ', subcommand_args=[' --quick '], stdin=None, pwd=out_dir)
        print()

    if createsuperuser:
        run_subcommand(' manage ', subcommand_args=[' createsuperuser '], pwd=out_dir)
        print()

    check_data_folder()

    # Django is imported only after the data folder check has run.
    from django.core.management import call_command
    from django.contrib.auth.models import User

    print(' [green][+] Starting ArchiveBox webserver...[/green] ')
    print(' > Logging errors to ./logs/errors.log ')
    if not User.objects.filter(is_superuser=True).exists():
        print(' [yellow][!] No admin users exist yet, you will not be able to edit links in the UI.[/yellow] ')
        print()
        print(' [violet]Hint:[/violet] To create an admin user, run: ')
        print(' archivebox manage createsuperuser ')
        print()

    if SHELL_CONFIG.DEBUG:
        if not reload:
            runserver_args.append(' --noreload ')  # '--insecure'
        call_command(" runserver ", *runserver_args)
    else:
        host = ' 127.0.0.1 '
        port = ' 8000 '

        # The first argument that is purely numeric once the two separator
        # substrings are replaced is treated as the bind address override.
        # No match leaves the defaults above untouched.
        host_and_port = next(
            (
                arg for arg in runserver_args
                if arg.replace(' . ', ' ').replace(' : ', ' ').isdigit()
            ),
            None,
        )
        if host_and_port is not None:
            if ' : ' in host_and_port:
                # Both parts supplied.
                host, port = host_and_port.split(' : ')
            elif ' . ' in host_and_port:
                # Host only.
                host = host_and_port
            else:
                # Port only.
                port = host_and_port

        print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http:// { host } : { port } ]http:// { host } : { port } [/link][/deep_sky_blue4] ')

        from queues.supervisor_util import start_server_workers

        print()

        # Honour the caller's daemonize choice; this was previously
        # hard-coded to False, which made the parameter a no-op.
        start_server_workers(host=host, port=port, daemonize=daemonize)

        # NOTE(review): if start_server_workers returns immediately when
        # daemonize=True, this message is printed while the server is still
        # running in the background — confirm against supervisor_util.
        print(" \n [i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i] ")
2019-04-27 21:26:24 +00:00
2019-05-01 03:10:48 +00:00
@enforce_types
def manage(args: Optional[List[str]] = None, out_dir: Path = DATA_DIR) -> None:
    """Run an ArchiveBox Django management command.

    Args:
        args: Arguments for the management command. When empty or None, the
            help command is run instead.
        out_dir: Accepted for interface compatibility; not read here.
    """
    check_data_folder()

    from django.core import management

    # Interactive commands need a TTY; inside docker without one, print a
    # warning with the suggested invocation before running the command anyway.
    if args and " createsuperuser " in args:
        if SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY:
            shown_args = ' '.join(args or [' ... '])
            stderr(' [!] Warning: you need to pass -it to use interactive commands in docker ', color=' lightyellow ')
            stderr(' docker run -it archivebox manage {} '.format(shown_args), color=' lightyellow ')
            stderr(' ')

    django_argv = [' manage.py '] + list(args or [' help '])
    management.execute_from_command_line(django_argv)
2019-04-27 21:26:24 +00:00
2019-05-01 03:13:04 +00:00
@enforce_types
def shell(out_dir: Path = DATA_DIR) -> None:
    """Enter an interactive ArchiveBox Django shell.

    Runs the data folder check and then hands control to the Django
    management command named below.

    Args:
        out_dir: Accepted for interface compatibility; not read here.
    """
    check_data_folder()

    # Imported after the data folder check, matching the order the other
    # commands in this module use.
    from django.core import management

    management.call_command(" shell_plus ")
2020-07-29 16:19:06 +00:00