2020-12-20 01:11:19 +00:00
"""
ArchiveBox config definitions (including defaults and dynamic config options).

Config Usage Example:

    archivebox config --set MEDIA_TIMEOUT=600
    env MEDIA_TIMEOUT=600 USE_COLOR=False ... archivebox [subcommand] ...

Config Precedence Order:

  1. cli args                 (--update-all / --index-only / etc.)
  2. shell environment vars   (env USE_COLOR=False archivebox add '...')
  3. config file              (echo "SAVE_FAVICON=False" >> ArchiveBox.conf)
  4. defaults                 (defined below in Python)

Documentation:

  https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
"""
2024-09-30 22:59:05 +00:00
# Explicit package name so the relative imports below resolve against archivebox.config
__package__ = 'archivebox.config'
2019-04-17 07:49:18 +00:00
2017-07-04 10:38:07 +00:00
import os
2019-03-23 02:05:45 +00:00
import re
2017-07-04 10:48:12 +00:00
import sys
2020-08-18 19:00:00 +00:00
import json
2017-07-05 21:33:51 +00:00
import shutil
2017-07-04 10:48:12 +00:00
2020-07-28 11:20:57 +00:00
from pathlib import Path
2021-04-10 08:19:30 +00:00
from datetime import datetime , timezone
2024-10-08 06:45:11 +00:00
from typing import Optional , Type , Tuple , Dict , Any
from subprocess import run , DEVNULL
2019-04-25 23:03:38 +00:00
from configparser import ConfigParser
2017-07-04 10:38:07 +00:00
2024-09-25 02:04:38 +00:00
from rich . progress import Progress
2024-09-25 02:37:29 +00:00
from rich . console import Console
2024-09-26 09:43:12 +00:00
from benedict import benedict
2024-08-23 09:01:24 +00:00
import django
from django . db . backends . sqlite3 . base import Database as sqlite3
2024-09-26 09:43:12 +00:00
2024-10-08 06:45:11 +00:00
from . constants import CONSTANTS
2024-09-30 22:59:05 +00:00
from . constants import *
2024-10-08 06:45:11 +00:00
2024-09-30 22:59:05 +00:00
from . . misc . logging import (
2024-09-25 02:04:38 +00:00
stderr ,
2024-09-25 12:10:09 +00:00
hint , # noqa
2024-09-25 02:04:38 +00:00
)
2024-10-08 06:45:11 +00:00
from . common import SHELL_CONFIG , GENERAL_CONFIG , ARCHIVING_CONFIG , SERVER_CONFIG , SEARCH_BACKEND_CONFIG , STORAGE_CONFIG
2024-10-01 00:44:18 +00:00
from archivebox . plugins_auth . ldap . apps import LDAP_CONFIG
from archivebox . plugins_extractor . favicon . apps import FAVICON_CONFIG
2024-10-01 04:43:45 +00:00
from archivebox . plugins_extractor . wget . apps import WGET_CONFIG
2024-10-01 06:21:34 +00:00
from archivebox . plugins_extractor . curl . apps import CURL_CONFIG
2024-10-01 00:44:18 +00:00
2024-09-26 09:43:12 +00:00
# Module-level shortcuts to values read off the imported config objects
ANSI = SHELL_CONFIG.ANSI
LDAP = LDAP_CONFIG.LDAP_ENABLED
2022-06-09 03:04:01 +00:00
2020-12-20 01:11:19 +00:00
############################### Config Schema ##################################
2019-04-24 08:09:25 +00:00
2024-10-08 06:45:11 +00:00
# Maps section name -> {OPTION_NAME: {'type': ..., 'default': ..., 'aliases': (...)}}.
# A 'default' may be a plain value or a callable that receives the config loaded so far.
CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
    'SHELL_CONFIG': SHELL_CONFIG.as_legacy_config_schema(),

    'SERVER_CONFIG': SERVER_CONFIG.as_legacy_config_schema(),
    'GENERAL_CONFIG': GENERAL_CONFIG.as_legacy_config_schema(),

    'ARCHIVING_CONFIG': ARCHIVING_CONFIG.as_legacy_config_schema(),

    'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG.as_legacy_config_schema(),

    'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(),

    'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),

    # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),

    # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
    # 'CURL_CONFIG': CURL_CONFIG.as_legacy_config_schema(),

    'ARCHIVE_METHOD_TOGGLES': {
        'SAVE_TITLE':               {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
        'SAVE_FAVICON':             {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
        'SAVE_WGET':                {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
        'SAVE_WGET_REQUISITES':     {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
        'SAVE_SINGLEFILE':          {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
        'SAVE_READABILITY':         {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
        'SAVE_MERCURY':             {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
        'SAVE_HTMLTOTEXT':          {'type': bool, 'default': True, 'aliases': ('FETCH_HTMLTOTEXT',)},
        'SAVE_PDF':                 {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
        'SAVE_SCREENSHOT':          {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
        'SAVE_DOM':                 {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
        'SAVE_HEADERS':             {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
        'SAVE_WARC':                {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
        'SAVE_GIT':                 {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
        'SAVE_MEDIA':               {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
        'SAVE_ARCHIVE_DOT_ORG':     {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
        'SAVE_ALLOWLIST':           {'type': dict, 'default': {}},
        'SAVE_DENYLIST':            {'type': dict, 'default': {}},
    },

    'ARCHIVE_METHOD_OPTIONS': {
        'RESOLUTION':               {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION', 'WINDOW_SIZE')},
        # 'GIT_DOMAINS':            {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
        'CHECK_SSL_VALIDITY':       {'type': bool, 'default': True},
        'MEDIA_MAX_SIZE':           {'type': str, 'default': '750m'},

        'USER_AGENT':               {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
        'CURL_USER_AGENT':          {'type': str, 'default': lambda c: c['USER_AGENT']},  # + ' curl/{CURL_VERSION}'},

        'COOKIES_FILE':             {'type': str, 'default': None},

        # computed lazily because the --format filter embeds MEDIA_MAX_SIZE
        'YOUTUBEDL_ARGS':           {'type': list, 'default': lambda c: [
                                        '--restrict-filenames',
                                        '--trim-filenames', '128',
                                        '--write-description',
                                        '--write-info-json',
                                        '--write-annotations',
                                        '--write-thumbnail',
                                        '--no-call-home',
                                        '--write-sub',
                                        '--write-auto-subs',
                                        '--convert-subs=srt',
                                        '--yes-playlist',
                                        '--continue',
                                        # This flag doesn't exist in youtube-dl
                                        # only in yt-dlp
                                        '--no-abort-on-error',
                                        # --ignore-errors must come AFTER
                                        # --no-abort-on-error
                                        # https://github.com/yt-dlp/yt-dlp/issues/4914
                                        '--ignore-errors',
                                        '--geo-bypass',
                                        '--add-metadata',
                                        '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
                                    ]},
        'YOUTUBEDL_EXTRA_ARGS':     {'type': list, 'default': None},
    },

    'DEPENDENCY_CONFIG': {
        'USE_CURL':                 {'type': bool, 'default': True},
        'USE_SINGLEFILE':           {'type': bool, 'default': True},
        'USE_READABILITY':          {'type': bool, 'default': True},
        'USE_GIT':                  {'type': bool, 'default': True},
        'USE_CHROME':               {'type': bool, 'default': True},
        'USE_YOUTUBEDL':            {'type': bool, 'default': True},
        'USE_RIPGREP':              {'type': bool, 'default': True},

        # 'GIT_BINARY':             {'type': str, 'default': 'git'},
        # 'CURL_BINARY':            {'type': str, 'default': 'curl'},
        # 'NODE_BINARY':            {'type': str, 'default': 'node'},
        # 'YOUTUBEDL_BINARY':       {'type': str, 'default': 'yt-dlp'},   # also can accept youtube-dl
        # 'SINGLEFILE_BINARY':      {'type': str, 'default': lambda c: bin_path('single-file')},
        # 'READABILITY_BINARY':     {'type': str, 'default': lambda c: bin_path('readability-extractor')},
        # 'RIPGREP_BINARY':         {'type': str, 'default': 'rg'},

        'POCKET_CONSUMER_KEY':      {'type': str, 'default': None},
        'POCKET_ACCESS_TOKENS':     {'type': dict, 'default': {}},

        'READWISE_READER_TOKENS':   {'type': dict, 'default': {}},
    },
}
2020-12-20 01:11:19 +00:00
########################## Backwards-Compatibility #############################
2020-10-31 07:08:03 +00:00
# for backwards compatibility with old config files, check old/deprecated names for each key
# CONFIG_ALIASES maps each deprecated alias -> the canonical option name it now refers to
CONFIG_ALIASES = {
    alias: key
    for section in CONFIG_SCHEMA.values()
    for key, option in section.items()
    for alias in option.get('aliases', ())
}

# flat view of every option's schema entry across all sections, keyed by option name
USER_CONFIG = {
    key: option
    for section in CONFIG_SCHEMA.values()
    for key, option in section.items()
}
2020-10-31 07:08:03 +00:00
2019-04-26 18:43:13 +00:00
def get_real_name(key: str) -> str:
    """Resolve a possibly-deprecated config key to its current canonical name.

    The key is upper-cased and stripped before the lookup; names that are not
    registered aliases are returned in that normalized form.
    """
    normalized = key.upper().strip()
    return CONFIG_ALIASES.get(normalized, normalized)
2019-04-24 08:09:25 +00:00
2020-12-20 01:11:19 +00:00
2023-12-19 06:04:11 +00:00
# These are derived/computed values calculated *after* all user-provided config values are ingested
# they appear in `archivebox config` output and are intended to be read-only for the user
# (entries have no 'type', which load_config_val treats as read-only: only 'default' is evaluated).
# Each default short-circuits on a falsy source value, returning that value unchanged.
DYNAMIC_CONFIG_SCHEMA: Dict[str, Any] = {
    'URL_DENYLIST_PTN':     {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'], CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
    'URL_ALLOWLIST_PTN':    {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'], CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},

    'SAVE_ALLOWLIST_PTN':   {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
    'SAVE_DENYLIST_PTN':    {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
2019-03-27 20:44:00 +00:00
2019-04-24 08:09:25 +00:00
2024-09-21 08:53:59 +00:00
# print("FINISHED DEFINING SCHEMAS")
2019-04-24 08:09:25 +00:00
################################### Helpers ####################################
2020-12-20 01:11:19 +00:00
2019-04-24 15:36:14 +00:00
def load_config_val(key: str,
                    default: Any = None,
                    type: Optional[Type] = None,
                    aliases: Optional[Tuple[str, ...]] = None,
                    config: Optional[Dict[str, Any]] = None,
                    env_vars: Optional[os._Environ] = None,
                    config_file_vars: Optional[Dict[str, str]] = None) -> Any:
    """Load one config option from env vars / config file and coerce it to `type`.

    Args:
        key: canonical option name.
        default: fallback value, or a callable taking `config` and returning it.
        type: expected type (bool, str, int, list or dict). None marks the option
            as read-only/computed: only `default` is evaluated.
        aliases: deprecated names that are checked after `key`.
        config: the config loaded so far (passed to callable defaults); must be a dict.
        env_vars: mapping of environment variables to read from.
        config_file_vars: flattened key -> raw string mapping from the config file.

    Returns:
        The parsed value, or the (evaluated) default when nothing is set.

    Raises:
        AssertionError: if `config` is not a dict.
        ValueError: if the raw value cannot be interpreted as `type`.
        Exception: if `type` is not one of the supported types.
    """
    assert isinstance(config, dict)

    def resolve_default() -> Any:
        return default(config) if callable(default) else default

    # computed/read-only options never consult env vars or the config file
    if type is None:
        return resolve_default()

    # For each name (canonical first, then aliases) the environment wins over the
    # config file; the first non-empty value found is used.
    val = None
    name = key
    for name in (key, *(aliases or ())):
        if env_vars:
            val = env_vars.get(name)
            if val:
                break

        if config_file_vars:
            val = config_file_vars.get(name)
            if val:
                break

    if val is None:
        return resolve_default()

    # `name` is the name the value was actually found under, so error messages
    # point at what the user really wrote (possibly a deprecated alias).
    BOOL_TRUEIES = ('true', 'yes', '1')
    BOOL_FALSEIES = ('false', 'no', '0')
    text = val.strip()

    if type is bool:
        if text.lower() in BOOL_TRUEIES:
            return True
        if text.lower() in BOOL_FALSEIES:
            return False
        raise ValueError(f'Invalid configuration option {name}={val} (expected a boolean: True/False)')

    if type is str:
        if text.lower() in (*BOOL_TRUEIES, *BOOL_FALSEIES):
            raise ValueError(f'Invalid configuration option {name}={val} (expected a string, but value looks like a boolean)')
        return text

    if type is int:
        try:
            return int(text)
        except ValueError:
            raise ValueError(f'Invalid configuration option {name}={val} (expected an integer)') from None

    if type is list or type is dict:
        return json.loads(val)

    raise Exception('Config values can only be str, bool, int, or json')
2019-04-24 08:09:25 +00:00
2020-06-30 05:08:14 +00:00
2024-09-30 22:59:05 +00:00
def load_config_file(out_dir: str | None = CONSTANTS.DATA_DIR) -> Optional[benedict]:
    """Read the INI-formatted config file at CONSTANTS.CONFIG_FILE.

    Returns every option from every section flattened into one benedict with
    upper-cased keys, or None when the file does not exist.

    NOTE(review): `out_dir` is accepted but never used; the path always comes
    from CONSTANTS.CONFIG_FILE.
    """
    config_path = CONSTANTS.CONFIG_FILE
    if not config_path.exists():
        return None

    parser = ConfigParser()
    parser.optionxform = str          # keep option names case-sensitive while parsing
    parser.read(config_path)

    # flatten into one namespace
    flattened = {}
    for options in parser.values():
        for name, raw_value in options.items():
            flattened[name.upper()] = raw_value
    return benedict(flattened)
2020-06-30 05:12:06 +00:00
2024-09-30 22:59:05 +00:00
def write_config_file(config: Dict[str, str], out_dir: str | None = CONSTANTS.DATA_DIR) -> benedict:
    """Write the given options into the INI-formatted config file at CONSTANTS.CONFIG_FILE.

    Each key is stored under the CONFIG_SCHEMA section that declares it. A random
    Django SECRET_KEY is generated if the file does not hold a usable one. The
    previous file contents are kept in `<config>.bak` while the new file is
    validated by re-loading all config; on any failure the backup is restored
    and the error re-raised.

    Args:
        config: mapping of option name -> raw string value to persist.
        out_dir: accepted for backwards compatibility; the path always comes
            from CONSTANTS.CONFIG_FILE.

    Returns:
        A benedict of the upper-cased keys that were written, mapped to their
        values as re-loaded from the new config.

    Raises:
        IndexError: if a key is not declared in any CONFIG_SCHEMA section
            (raised before anything is written to disk).
    """

    from archivebox.misc.system import atomic_write

    CONFIG_HEADER = (
        '# This is the config file for your ArchiveBox collection.\n'
        '#\n'
        '# You can add options here manually in INI format, or automatically by running:\n'
        '#    archivebox config --set KEY=VALUE\n'
        '#\n'
        '# If you modify this file manually, make sure to update your archive after by running:\n'
        '#    archivebox init\n'
        '#\n'
        '# A list of all possible config with documentation and examples can be found here:\n'
        '#    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration\n'
        '\n'
    )

    def find_section(key: str) -> str:
        for name, opts in CONFIG_SCHEMA.items():
            if key in opts:
                return name
        # IndexError is kept (instead of KeyError) for backwards compatibility with callers
        raise IndexError(f'Unknown config key: {key} (not declared in any CONFIG_SCHEMA section)')

    # resolve every section up front so an unknown key fails before the disk is touched
    sections = {key: find_section(key) for key in config}

    config_path = CONSTANTS.CONFIG_FILE
    backup_path = f'{config_path}.bak'

    if not config_path.exists():
        atomic_write(config_path, CONFIG_HEADER)

    config_file = ConfigParser()
    config_file.optionxform = str          # keep option names case-sensitive
    config_file.read(config_path)

    # snapshot the current file so it can be restored if validation fails
    with open(config_path, 'r', encoding='utf-8') as old:
        atomic_write(backup_path, old.read())

    # merge each new value into its section, creating the section if needed
    for key, val in config.items():
        section = sections[key]
        existing_config = dict(config_file[section]) if section in config_file else {}
        config_file[section] = benedict({**existing_config, key: val})

    # always make sure there's a SECRET_KEY defined for Django
    existing_secret_key = None
    if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']:
        existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY']

    if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
        from django.utils.crypto import get_random_string
        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
        random_secret_key = get_random_string(50, chars)
        if 'SERVER_CONFIG' in config_file:
            config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
        else:
            config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}

    with open(config_path, 'w+', encoding='utf-8') as new:
        config_file.write(new)

    try:
        # validate the config by attempting to re-parse it
        CONFIG = load_all_config()
    except BaseException:                                                       # lgtm [py/catch-base-exception]
        # something went horribly wrong, revert to the previous version
        with open(backup_path, 'r', encoding='utf-8') as old:
            atomic_write(config_path, old.read())

        raise

    if Path(backup_path).exists():
        os.remove(backup_path)

    return benedict({
        key.upper(): CONFIG.get(key.upper())
        for key in config
    })
2019-04-25 23:03:38 +00:00
2021-09-30 15:40:13 +00:00
2019-04-25 23:03:38 +00:00
2024-10-08 06:45:11 +00:00
def load_config(defaults: Dict[str, Any],
                config: Optional[benedict] = None,
                out_dir: Optional[str] = None,
                env_vars: Optional[os._Environ] = None,
                config_file_vars: Optional[Dict[str, str]] = None) -> benedict:
    """Load every option described by `defaults` on top of an existing config.

    Args:
        defaults: one schema section, mapping option name -> {'default', 'type', 'aliases'}.
        config: previously loaded values; copied, never mutated.
        out_dir: forwarded to load_config_file() when `config_file_vars` is not given.
        env_vars: environment mapping to read from (os.environ when omitted/empty).
        config_file_vars: flattened config-file values (loaded from disk when omitted/empty).

    Returns:
        A new benedict holding `config` plus every option that loaded successfully.
        Options are loaded in order, so callable defaults can read earlier options.

    Raises:
        SystemExit: with code 0 if interrupted by Ctrl-C while loading.
    """

    env_vars = env_vars or os.environ
    config_file_vars = config_file_vars or load_config_file(out_dir=out_dir)

    extended_config = benedict(config.copy() if config else {})
    for key, option in defaults.items():
        try:
            # print('LOADING CONFIG KEY:', key, 'DEFAULT=', option)
            extended_config[key] = load_config_val(
                key,
                default=option['default'],
                type=option.get('type'),
                aliases=option.get('aliases'),
                config=extended_config,
                env_vars=env_vars,
                config_file_vars=config_file_vars,
            )
        except KeyboardInterrupt:
            raise SystemExit(0)
        except Exception as e:
            # Deliberately best-effort: report the broken option and keep loading
            # the rest (the failed key is simply left unset).
            stderr()
            stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
            stderr(f'    {e.__class__.__name__}: {e}')
            stderr()
            stderr('    Check your config for mistakes and try again (your archive data is unaffected).')
            stderr()
            stderr('    For config documentation and examples see:')
            stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
            stderr()
            # raise
            # raise SystemExit(2)

    return benedict(extended_config)
2023-11-28 04:56:30 +00:00
2020-12-20 01:11:19 +00:00
2019-03-26 09:31:27 +00:00
2019-04-11 07:42:35 +00:00
def find_chrome_binary() -> Optional[str]:
    """Find any installed chrome binary in the default locations.

    Returns the first candidate (as written in the list below, not the resolved
    path) that `shutil.which` can locate, or None when none is found.
    """
    # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # make sure data dir finding precedence order always matches binary finding order
    default_executable_paths = (
        # '~/Library/Caches/ms-playwright/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
        'chromium-browser',
        'chromium',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        'chrome',
        'google-chrome',
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        'google-chrome-stable',
        'google-chrome-beta',
        'google-chrome-canary',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        'google-chrome-unstable',
        'google-chrome-dev',
    )
    return next((name for name in default_executable_paths if shutil.which(name)), None)
2019-03-26 09:31:27 +00:00
def find_chrome_data_dir() -> Optional[str]:
    """Always return None: auto-detection of Chrome user data dirs is intentionally disabled.

    Kept as a stub so existing callers keep working.
    """
    # The old behavior (probing the default Chromium/Chrome profile locations) was
    # removed because it is DANGEROUS; do not re-implement it.
    # Going forward we want to discourage people from using their main chrome profile for archiving.
    # Session tokens, personal data, and cookies are often returned in server responses,
    # when they get archived, they are essentially burned as anyone who can view the archive
    # can use that data to masquerade as the logged-in user that did the archiving.
    # For this reason users should always create dedicated burner profiles for archiving and not use
    # their daily driver main accounts.
    return None
2019-04-24 08:09:25 +00:00
def wget_supports_compression(config):
    """Return True if the configured wget binary accepts the --compression flag.

    Runs `<WGET_BINARY> --compression=auto --help` and checks for a zero exit
    code. Returns False when no binary is configured or it cannot be executed.
    """
    binary = config.get('WGET_BINARY')
    if not binary:
        # no wget configured: report "unsupported" instead of crashing
        return False

    cmd = [
        binary,
        "--compression=auto",
        "--help",
    ]
    try:
        return run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode == 0
    except OSError:
        # covers FileNotFoundError / PermissionError for a missing or non-executable binary
        return False
2019-03-26 23:21:34 +00:00
2019-04-11 07:42:35 +00:00
2020-12-20 01:11:19 +00:00
# ******************************************************************************
# ******************************************************************************
# ******************************** Load Config *********************************
# ******* (compile the defaults, configs, and metadata all into CONFIG) ********
# ******************************************************************************
# ******************************************************************************
2019-04-24 08:09:25 +00:00
2019-04-25 23:00:25 +00:00
def load_all_config():
    """Build the full config by loading every CONFIG_SCHEMA section in order,
    then the computed DYNAMIC_CONFIG_SCHEMA values on top of the result.

    Each section is loaded on top of the previous result, so later sections
    (and the dynamic values) can read options loaded earlier.
    """
    merged = benedict()
    for schema_section in CONFIG_SCHEMA.values():
        merged = load_config(schema_section, merged)

    return load_config(DYNAMIC_CONFIG_SCHEMA, merged)
2019-04-25 23:00:25 +00:00
2020-12-20 01:11:19 +00:00
# add all final config values in CONFIG to globals in this file
# (load_all_config() is called at module level, so importing this module
# loads every schema section from the environment and the config file)
CONFIG: benedict = load_all_config()
# also expose each loaded key as a module-level name
globals().update(CONFIG)
2024-10-08 06:45:11 +00:00
2019-04-11 07:42:35 +00:00
2024-09-21 08:53:59 +00:00
# print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")
2020-12-20 01:11:19 +00:00
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
########################### System Environment Setup ###########################
# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
2024-10-08 06:45:11 +00:00
assert CONSTANTS.TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {CONSTANTS.TIMEZONE})'  # noqa: F821
os.environ["TZ"] = CONSTANTS.TIMEZONE                                      # noqa: F821
# umask is the complement of the configured directory permissions (parsed as octal)
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))       # noqa: F821

########################### Config Validity Checkers ###########################

# these env vars are set before the rich consoles below are created
if not SHELL_CONFIG.USE_COLOR:
    os.environ['NO_COLOR'] = '1'
if not SHELL_CONFIG.SHOW_PROGRESS:
    os.environ['TERM'] = 'dumb'

# recreate rich console obj based on new config values
STDOUT = CONSOLE = Console()
STDERR = Console(stderr=True)
from ..misc import logging
logging.CONSOLE = CONSOLE

# startup progress bar state; populated only while setup_django() is running
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0
2019-04-11 07:42:35 +00:00
2024-10-08 06:45:11 +00:00
def bump_startup_progress_bar(advance=1):
    """Advance the startup progress bar by `advance` steps, if one is active.

    Does nothing when INITIAL_STARTUP_PROGRESS is unset.
    """
    progress = INITIAL_STARTUP_PROGRESS
    if not progress:
        return
    progress.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance)   # type: ignore
2021-09-30 15:40:13 +00:00
2024-09-25 07:42:26 +00:00
def setup_django_minimal():
    """Deprecated entry point: always raises.

    Raises:
        Exception: unconditionally.
    """
    # previous implementation, kept for reference:
    # sys.path.append(str(CONSTANTS.PACKAGE_DIR))
    # os.environ.setdefault('ARCHIVEBOX_DATA_DIR', str(CONSTANTS.DATA_DIR))
    # os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
    # django.setup()
    raise Exception('dont use this anymore')
# set to True once setup_django() has completed successfully in this process
DJANGO_SET_UP = False


def setup_django(out_dir: Path | None = None, check_db=False, config: benedict = CONFIG, in_memory_db=False) -> None:
    """Initialize Django once per process, showing a transient startup progress bar.

    Args:
        out_dir: data directory to validate (CONSTANTS.DATA_DIR when omitted).
        check_db: when True, also verify the data dir is not owned by root,
            create the cache table if it is missing, close stale DB connections
            and assert that the database file exists.
        config: accepted for backwards compatibility; not read by this function.
        in_memory_db: no longer supported; passing True raises.

    Raises:
        Exception: if `in_memory_db` is True.
        SystemExit: code 9 if the data dir is owned by root (check_db only),
            code 2 if interrupted by Ctrl-C.
        AssertionError: if the database file is missing (check_db only).

    If django.setup() itself fails, the error is printed and the function
    returns without marking Django as set up.
    """
    from rich.panel import Panel

    global INITIAL_STARTUP_PROGRESS
    global INITIAL_STARTUP_PROGRESS_TASK
    global DJANGO_SET_UP

    if DJANGO_SET_UP:
        # raise Exception('django is already set up!')
        # TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
        return

    try:
        with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
            INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)

            output_dir = out_dir or CONSTANTS.DATA_DIR

            assert isinstance(output_dir, Path) and isinstance(CONSTANTS.PACKAGE_DIR, Path)

            bump_startup_progress_bar()
            try:
                from django.core.management import call_command

                bump_startup_progress_bar()

                if in_memory_db:
                    # The temporary in-memory database mode (ARCHIVEBOX_DATABASE_NAME=":memory:"
                    # plus running migrations at startup) has been retired.
                    raise Exception('dont use this anymore')

                # Use the default sqlite3 file-based database and initialize django
                # without running migrations automatically (user runs them manually by calling init)
                try:
                    django.setup()
                except Exception as e:
                    bump_startup_progress_bar(advance=1000)
                    STDERR.print()
                    STDERR.print(Panel(
                        f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
                        title='\n\n[red][X] Error while trying to load database!',
                        subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
                        expand=False,
                        style='bold red',
                    ))
                    STDERR.print()
                    return

                bump_startup_progress_bar()

                from django.conf import settings

                # log startup message to the error log
                with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
                    command = ' '.join(sys.argv)
                    ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
                    f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")

                if check_db:
                    # make sure the data dir is owned by a non-root user
                    if CONSTANTS.DATA_DIR.stat().st_uid == 0:
                        STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
                        STDERR.print(f'    {CONSTANTS.DATA_DIR}')
                        STDERR.print()
                        STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
                        STDERR.print('    cd path/to/your/archive/data')
                        STDERR.print('    archivebox [command]')
                        STDERR.print()
                        raise SystemExit(9)

                    # Create cache table in DB if needed
                    try:
                        from django.core.cache import cache
                        cache.get('test', None)
                    except django.db.utils.OperationalError:
                        call_command("createcachetable", verbosity=0)

                    bump_startup_progress_bar()

                    # if archivebox gets imported multiple times, we have to close
                    # the sqlite3 whenever we init from scratch to avoid multiple threads
                    # sharing the same connection by accident
                    from django.db import connections
                    for conn in connections.all():
                        conn.close_if_unusable_or_obsolete()

                    sql_index_path = CONSTANTS.DATABASE_FILE
                    assert sql_index_path.exists(), (
                        f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')

                    bump_startup_progress_bar()

                    # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
                    # if settings.DEBUG_LOGFIRE:
                    #     from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
                    #     SQLite3Instrumentor().instrument()

                    #     import logfire

                    #     logfire.configure()
                    #     logfire.instrument_django(is_sql_commentor_enabled=True)
                    #     logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)

            except KeyboardInterrupt:
                raise SystemExit(2)

        # only reached when setup completed without an early return or exception
        DJANGO_SET_UP = True
    finally:
        # always drop the reference to the (now stopped) progress bar, including on
        # the early-return and error paths, so later bump_startup_progress_bar()
        # calls become no-ops instead of updating a finished Progress
        INITIAL_STARTUP_PROGRESS = None
        INITIAL_STARTUP_PROGRESS_TASK = None