ArchiveBox/archivebox/config.py

1003 lines
45 KiB
Python
Raw Normal View History

2020-12-20 01:11:19 +00:00
"""
ArchiveBox config definitions (including defaults and dynamic config options).
Config Usage Example:
archivebox config --set MEDIA_TIMEOUT=600
env MEDIA_TIMEOUT=600 USE_COLOR=False ... archivebox [subcommand] ...
Config Precedence Order:
1. cli args (--update-all / --index-only / etc.)
2. shell environment vars (env USE_COLOR=False archivebox add '...')
3. config file (echo "SAVE_FAVICON=False" >> ArchiveBox.conf)
4. defaults (defined below in Python)
Documentation:
https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
"""
__package__ = 'archivebox'

import io
import json
import os
import re
import shutil
import sys

from configparser import ConfigParser
from datetime import datetime, timezone
from hashlib import md5
from pathlib import Path
from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
from typing import Any, Mapping, Optional, Type, Tuple, Dict

import django
from django.db.backends.sqlite3.base import Database as sqlite3
from pydantic_pkgr import SemVer
from rich.console import Console
from rich.progress import Progress

import archivebox

from .config_stubs import (
    AttrDict,
    ConfigValue,
    ConfigDict,
    ConfigDefaultValue,
    ConfigDefaultDict,
)
from .misc.logging import (
    DEFAULT_CLI_COLORS,
    ANSI,
    COLOR_DICT,
    stderr,
    hint,    # noqa
)
2024-09-21 08:53:59 +00:00
# print('STARTING CONFIG LOADING')
2024-08-23 09:01:02 +00:00
# load fallback libraries from vendor dir
from .vendor import load_vendored_libs
load_vendored_libs()
2024-09-21 08:53:59 +00:00
# print("LOADED VENDOR LIBS")
2020-12-20 01:11:19 +00:00
############################### Config Schema ##################################
2020-12-20 01:11:19 +00:00
# Master schema of all user-configurable options, grouped by config-file section.
# Each entry is {'type': <python type>, 'default': <value or callable>} plus an
# optional 'aliases' tuple of deprecated env-var names that map to the key.
# Callable defaults receive the partially-loaded config dict `c`, so later keys
# can derive their defaults from earlier ones (see load_config_val below).
CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
    'SHELL_CONFIG': {
        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']},  # progress bars are buggy on mac, disable for now
        'IN_DOCKER': {'type': bool, 'default': False},
        'IN_QEMU': {'type': bool, 'default': False},
        'PUID': {'type': int, 'default': os.getuid()},
        'PGID': {'type': int, 'default': os.getgid()},
    },

    'GENERAL_CONFIG': {
        'OUTPUT_DIR': {'type': str, 'default': None},
        'CONFIG_FILE': {'type': str, 'default': None},
        'ONLY_NEW': {'type': bool, 'default': True},
        'TIMEOUT': {'type': int, 'default': 60},
        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
        'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
        'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},  # TODO: move this to be a default WGET_ARGS
        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
        'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
        'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
    },

    'SERVER_CONFIG': {
        'ADMIN_USERNAME': {'type': str, 'default': None},
        'ADMIN_PASSWORD': {'type': str, 'default': None},
        'SECRET_KEY': {'type': str, 'default': None},
        'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
        'ALLOWED_HOSTS': {'type': str, 'default': '*'},  # e.g. archivebox.example.com,archivebox2.example.com
        'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c['BIND_ADDR'])},  # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
        'DEBUG': {'type': bool, 'default': False},
        'PUBLIC_INDEX': {'type': bool, 'default': True},
        'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
        'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
        'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
        # NOTE(review): both TIME_ZONE and TIMEZONE exist; unclear which is canonical -- confirm before consolidating
        'TIME_ZONE': {'type': str, 'default': 'UTC'},
        'TIMEZONE': {'type': str, 'default': 'UTC'},
        'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
        'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
        'PREVIEW_ORIGINALS': {'type': bool, 'default': True},

        # LDAP authentication options (all unset by default)
        'LDAP': {'type': bool, 'default': False},
        'LDAP_SERVER_URI': {'type': str, 'default': None},
        'LDAP_BIND_DN': {'type': str, 'default': None},
        'LDAP_BIND_PASSWORD': {'type': str, 'default': None},
        'LDAP_USER_BASE': {'type': str, 'default': None},
        'LDAP_USER_FILTER': {'type': str, 'default': None},
        'LDAP_USERNAME_ATTR': {'type': str, 'default': None},
        'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
        'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
    },

    'ARCHIVE_METHOD_TOGGLES': {
        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
        'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
        'SAVE_HTMLTOTEXT': {'type': bool, 'default': True, 'aliases': ('FETCH_HTMLTOTEXT',)},
        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
        'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
        'SAVE_DENYLIST': {'type': dict, 'default': {},},
    },

    'ARCHIVE_METHOD_OPTIONS': {
        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

        'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
        'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},  # + ' curl/{CURL_VERSION}'},
        'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},  # + ' wget/{WGET_VERSION}'},

        'COOKIES_FILE': {'type': str, 'default': None},

        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
            '--restrict-filenames',
            '--trim-filenames', '128',
            '--write-description',
            '--write-info-json',
            '--write-annotations',
            '--write-thumbnail',
            '--no-call-home',
            '--write-sub',
            '--write-auto-subs',
            '--convert-subs=srt',
            '--yes-playlist',
            '--continue',
            # This flag doesn't exist in youtube-dl
            # only in yt-dlp
            '--no-abort-on-error',
            # --ignore-errors must come AFTER
            # --no-abort-on-error
            # https://github.com/yt-dlp/yt-dlp/issues/4914
            '--ignore-errors',
            '--geo-bypass',
            '--add-metadata',
            '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
        ]},
        'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},

        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
                                                '--adjust-extension',
                                                '--convert-links',
                                                '--force-directories',
                                                '--backup-converted',
                                                '--span-hosts',
                                                '--no-parent',
                                                '-e', 'robots=off',
                                                ]},
        'WGET_EXTRA_ARGS': {'type': list, 'default': None},
        'CURL_ARGS': {'type': list, 'default': ['--silent',
                                                '--location',
                                                '--compressed'
                                                ]},
        'CURL_EXTRA_ARGS': {'type': list, 'default': None},
        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
        'SINGLEFILE_ARGS': {'type': list, 'default': None},
        'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
        'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
        'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
        'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
    },

    'SEARCH_BACKEND_CONFIG' : {
        'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
        'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
        'SEARCH_PROCESS_HTML': {'type': bool, 'default': True},
        # SONIC
        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
        'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
        # SQLite3 FTS5
        'FTS_SEPARATE_DATABASE': {'type': bool, 'default': True},
        'FTS_TOKENIZERS': {'type': str, 'default': 'porter unicode61 remove_diacritics 2'},
        # Default from https://www.sqlite.org/limits.html#max_length
        'FTS_SQLITE_MAX_LENGTH': {'type': int, 'default': int(1e9)},
    },

    'DEPENDENCY_CONFIG': {
        'USE_CURL': {'type': bool, 'default': True},
        'USE_WGET': {'type': bool, 'default': True},
        'USE_SINGLEFILE': {'type': bool, 'default': True},
        'USE_READABILITY': {'type': bool, 'default': True},
        'USE_MERCURY': {'type': bool, 'default': True},
        'USE_GIT': {'type': bool, 'default': True},
        'USE_CHROME': {'type': bool, 'default': True},
        'USE_NODE': {'type': bool, 'default': True},
        'USE_YOUTUBEDL': {'type': bool, 'default': True},
        'USE_RIPGREP': {'type': bool, 'default': True},

        'CURL_BINARY': {'type': str, 'default': 'curl'},
        'GIT_BINARY': {'type': str, 'default': 'git'},
        'WGET_BINARY': {'type': str, 'default': 'wget'},  # also can accept wget2
        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
        'NODE_BINARY': {'type': str, 'default': 'node'},
        # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},   # also can accept youtube-dl
        # 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
        # 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
        # 'RIPGREP_BINARY': {'type': str, 'default': 'rg'},

        'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
        'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
        'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
    },
}
2020-12-20 01:11:19 +00:00
########################## Backwards-Compatibility #############################

# for backwards compatibility with old config files, check old/deprecated names for each key
# maps deprecated alias -> current canonical key, collected from the 'aliases'
# tuples declared in CONFIG_SCHEMA above
CONFIG_ALIASES = {
    alias: key
    for section in CONFIG_SCHEMA.values()
    for key, default in section.items()
    for alias in default.get('aliases', ())
}
# flattened view of CONFIG_SCHEMA: {KEY: schema-entry} across all sections
USER_CONFIG = {key: section[key] for section in CONFIG_SCHEMA.values() for key in section.keys()}
def get_real_name(key: str) -> str:
    """get the current canonical name for a given deprecated config key"""
    normalized = key.upper().strip()
    return CONFIG_ALIASES.get(normalized, normalized)
2020-12-20 01:11:19 +00:00
################################ Constants #####################################

# well-known directory and file names used throughout the collection layout
PACKAGE_DIR_NAME = 'archivebox'
TEMPLATES_DIR_NAME = 'templates'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
CACHE_DIR_NAME = 'cache'
LIB_DIR_NAME = 'lib'
PERSONAS_DIR_NAME = 'personas'
CRONTABS_DIR_NAME = 'crontabs'

SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
ROBOTS_TXT_FILENAME = 'robots.txt'
FAVICON_FILENAME = 'favicon.ico'
CONFIG_FILENAME = 'ArchiveBox.conf'

# flags used when compiling user-supplied allowlist/denylist regexes
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE

# plain-dict copy of the package-level constants (paths, version info, etc.)
CONSTANTS = archivebox.CONSTANTS._asdict()
############################## Version Config ##################################

############################## Derived Config ##################################

# These are derived/computed values calculated *after* all user-provided config values are ingested
# they appear in `archivebox config` output and are intended to be read-only for the user
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    **{
        # BUGFIX: bind `val` at comprehension time via a default argument.
        # A bare `lambda c: val` late-binds the loop variable, so every key's
        # default would evaluate to the *last* constant when called later.
        key: {'default': lambda c, val=val: val}
        for key, val in archivebox.CONSTANTS.items()
    },

    'PACKAGE_DIR': {'default': lambda c: archivebox.PACKAGE_DIR.resolve()},
    'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
    'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},

    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},  # exec is always needed to list directories

    'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
    'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
    # 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
    'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
    'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
    'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
    'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},

    'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
    'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
    'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
    # 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
    'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
    'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
    'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
    'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},

    'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
    'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
    'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
    'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},

    'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
    'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
    'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},

    'USE_NODE': {'default': lambda c: True},
    'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},

    'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
    # 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
    # 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},

    'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
    'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
2019-03-27 20:44:00 +00:00
2024-09-21 08:53:59 +00:00
# print("FINISHED DEFINING SCHEMAS")
################################### Helpers ####################################
2020-12-20 01:11:19 +00:00
def load_config_val(key: str,
                    default: Any=None,
                    type: Optional[Type]=None,
                    aliases: Optional[Tuple[str, ...]]=None,
                    config: Optional[Dict[str, Any]]=None,
                    env_vars: Optional[Mapping[str, str]]=None,
                    config_file_vars: Optional[Dict[str, str]]=None) -> Any:
    """parse bool, int, str, list, and dict key=value pairs from env/config file

    Precedence per key (and then per alias): env_vars first, then
    config_file_vars, then the schema default. A `type` of None marks the key
    as read-only/derived: the default is computed and user overrides ignored.
    Raises ValueError when a user-supplied value doesn't match `type`.
    """
    assert isinstance(config, dict)

    # read-only/derived keys have no 'type' declared and never consult env/file
    is_read_only = type is None
    if is_read_only:
        if callable(default):
            return default(config)
        return default

    # get value from environment variables or config files,
    # checking the canonical key first, then each deprecated alias
    config_keys_to_check = (key, *(aliases or ()))
    val = None
    for key in config_keys_to_check:     # NOTE: deliberately rebinds `key` so error messages below show the name that matched
        if env_vars:
            val = env_vars.get(key)
            if val:
                break
        if config_file_vars:
            val = config_file_vars.get(key)
            if val:
                break

    is_unset = val is None
    if is_unset:
        if callable(default):
            return default(config)
        return default

    # coerce the raw string value based on the expected type
    BOOL_TRUEIES = ('true', 'yes', '1')
    BOOL_FALSEIES = ('false', 'no', '0')
    if type is bool:
        if val.lower() in BOOL_TRUEIES:
            return True
        elif val.lower() in BOOL_FALSEIES:
            return False
        else:
            raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)')
    elif type is str:
        if val.lower() in (*BOOL_TRUEIES, *BOOL_FALSEIES):
            raise ValueError(f'Invalid configuration option {key}={val} (expected a string, but value looks like a boolean)')
        return val.strip()
    elif type is int:
        # BUGFIX: use int() instead of str.isdigit() so negative values like
        # TIMEOUT=-1 parse instead of being rejected as "not an integer"
        try:
            return int(val.strip())
        except ValueError:
            raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)') from None
    elif type is list or type is dict:
        return json.loads(val)

    raise Exception('Config values can only be str, bool, int, or json')
def load_config_file(out_dir: str | None=archivebox.DATA_DIR) -> Optional[ConfigDict]:
    """load the ini-formatted config file from OUTPUT_DIR/ArchiveBox.conf

    Returns a flat {UPPERCASE_KEY: raw_str_value} dict, or None if no config
    file exists yet.
    NOTE(review): `out_dir` is no longer used -- the path is always taken from
    archivebox.CONSTANTS.CONFIG_FILE; the param is kept for backwards compat.
    """
    config_path = archivebox.CONSTANTS.CONFIG_FILE
    if config_path.exists():
        config_file = ConfigParser()
        # preserve key case instead of ConfigParser's default lowercasing
        config_file.optionxform = str
        config_file.read(config_path)
        # flatten into one namespace (section headers are ignored on read)
        config_file_vars = ConfigDict({
            key.upper(): val
            for section, options in config_file.items()
            for key, val in options.items()
        })
        # print('[i] Loaded config file', os.path.abspath(config_path))
        # print(config_file_vars)
        return config_file_vars
    return None
def write_config_file(config: Dict[str, str], out_dir: str | None=archivebox.DATA_DIR) -> ConfigDict:
    """write the given key=value updates into OUTPUT_DIR/ArchiveBox.conf

    Creates the file (with a commented header) if missing, backs up the old
    version, applies the updates into their proper INI sections, ensures a
    Django SECRET_KEY exists, then re-validates by re-loading all config.
    On a validation failure the backup is restored and the error re-raised.
    Returns the final loaded values for just the keys that were updated.
    NOTE(review): `out_dir` appears unused -- the path always comes from
    archivebox.CONSTANTS.CONFIG_FILE; confirm before removing the param.
    """
    from .system import atomic_write

    CONFIG_HEADER = (
    """# This is the config file for your ArchiveBox collection.
#
# You can add options here manually in INI format, or automatically by running:
#    archivebox config --set KEY=VALUE
#
# If you modify this file manually, make sure to update your archive after by running:
#    archivebox init
#
# A list of all possible config with documentation and examples can be found here:
#    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
""")

    config_path = archivebox.CONSTANTS.CONFIG_FILE

    if not config_path.exists():
        atomic_write(config_path, CONFIG_HEADER)

    config_file = ConfigParser()
    # preserve key case instead of ConfigParser's default lowercasing
    config_file.optionxform = str
    config_file.read(config_path)

    # back up the current file so we can roll back if validation fails below
    with open(config_path, 'r', encoding='utf-8') as old:
        atomic_write(f'{config_path}.bak', old.read())

    # locate which schema section a given key belongs in
    find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0]

    # Set up sections in empty config file
    for key, val in config.items():
        section = find_section(key)
        if section in config_file:
            existing_config = dict(config_file[section])
        else:
            existing_config = {}
        config_file[section] = ConfigDict({**existing_config, key: val})

    # always make sure there's a SECRET_KEY defined for Django
    existing_secret_key = None
    if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']:
        existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY']

    if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
        from django.utils.crypto import get_random_string
        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
        random_secret_key = get_random_string(50, chars)
        if 'SERVER_CONFIG' in config_file:
            config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
        else:
            config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}

    with open(config_path, 'w+', encoding='utf-8') as new:
        config_file.write(new)

    try:
        # validate the config by attempting to re-parse it
        CONFIG = load_all_config()
    except BaseException:  # lgtm [py/catch-base-exception]
        # something went horribly wrong, revert to the previous version
        with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
            atomic_write(config_path, old.read())
        raise

    # validation succeeded, the backup is no longer needed
    if Path(f'{config_path}.bak').exists():
        os.remove(f'{config_path}.bak')

    return {
        key.upper(): CONFIG.get(key.upper())
        for key in config.keys()
    }
def load_config(defaults: ConfigDefaultDict,
                config: Optional[ConfigDict]=None,
                out_dir: Optional[str]=None,
                env_vars: Optional[os._Environ]=None,
                config_file_vars: Optional[Dict[str, str]]=None) -> ConfigDict:
    """resolve every key in `defaults` via load_config_val, merged onto `config`

    Keys are resolved in declaration order so later defaults can reference
    earlier values through the partially-built `extended_config`. An error on
    one key is printed to stderr but does not abort loading the rest (note the
    commented-out raise at the bottom).
    """
    env_vars = env_vars or os.environ
    config_file_vars = config_file_vars or load_config_file(out_dir=out_dir)

    extended_config: ConfigDict = config.copy() if config else {}
    for key, default in defaults.items():
        try:
            # print('LOADING CONFIG KEY:', key, 'DEFAULT=', default)
            extended_config[key] = load_config_val(
                key,
                default=default['default'],
                type=default.get('type'),
                aliases=default.get('aliases'),
                config=extended_config,
                env_vars=env_vars,
                config_file_vars=config_file_vars,
            )
        except KeyboardInterrupt:
            # exit cleanly on Ctrl-C instead of dumping a traceback
            raise SystemExit(0)
        except Exception as e:
            stderr()
            stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
            stderr(' {}: {}'.format(e.__class__.__name__, e))
            stderr()
            stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
            stderr()
            stderr(' For config documentation and examples see:')
            stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
            stderr()
            # raise
            # raise SystemExit(2)

    return AttrDict(extended_config)
2024-01-03 01:17:35 +00:00
def parse_version_string(version: str) -> Tuple[int, int, int]:
    """parses a version tag string formatted like 'vx.x.x' into (major, minor, patch) ints"""
    base = version.split('+')[0].split('v')[-1]  # remove 'v' prefix and '+editable' suffix
    parts = tuple(int(part) for part in base.split('.'))[:3]
    # BUGFIX: pad short version strings (e.g. 'v1.2') with zeros so the result
    # always matches the declared 3-tuple shape and can be safely unpacked
    return (parts + (0, 0, 0))[:3]
2020-12-20 01:11:19 +00:00
2019-03-26 09:31:27 +00:00
2020-12-20 01:11:19 +00:00
# Dependency Metadata Helpers
def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3) -> Optional[str]:
    """check the presence of a specified binary and return a version string

    DEPRECATED: real version detection is now handled by the BinProvider
    plugin system. This stub only verifies the binary resolves on $PATH and
    returns the placeholder '999.999.999' instead of shelling out to
    `<binary> --version` (the old subprocess-probing body was unreachable
    dead code behind an early return and has been removed).
    `cmd` and `timeout` are kept for backwards compatibility but unused.
    """
    abspath = bin_path(binary)
    if not binary or not abspath:
        return None
    return '999.999.999'
def bin_path(binary: Optional[str]) -> Optional[str]:
    """Resolve a binary name to a usable path, preferring ./node_modules/.bin.

    Falls back to $PATH lookup (with ~ expansion), and finally returns the
    name unchanged if nothing resolves.
    """
    if binary is None:
        return None

    local_node_bin = Path('.') / 'node_modules' / '.bin' / binary
    if local_node_bin.exists():
        return str(local_node_bin.resolve())

    expanded = str(Path(binary).expanduser())
    return shutil.which(expanded) or shutil.which(str(binary)) or binary
def bin_hash(binary: Optional[str]) -> Optional[str]:
    """DEPRECATED: binary fingerprinting is now handled by the BinProvider
    plugin system, so this always returns the placeholder 'UNUSED'.
    (The old md5-of-file body was unreachable dead code behind the early
    return and has been removed.)
    """
    return 'UNUSED'
def find_chrome_binary() -> Optional[str]:
    """find any installed chrome binaries in the default locations"""
    # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # make sure data dir finding precedence order always matches binary finding order
    candidate_names = (
        # '~/Library/Caches/ms-playwright/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
        'chromium-browser',
        'chromium',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        'chrome',
        'google-chrome',
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        'google-chrome-stable',
        'google-chrome-beta',
        'google-chrome-canary',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        'google-chrome-unstable',
        'google-chrome-dev',
    )
    # return the first candidate that resolves on $PATH (the name as given,
    # not the resolved absolute path), or None if none are installed
    return next((name for name in candidate_names if shutil.which(name)), None)
2019-03-26 09:31:27 +00:00
def find_chrome_data_dir() -> Optional[str]:
    """find any installed chrome user data directories in the default locations"""
    # Deliberately disabled -- this is DANGEROUS, do not re-implement the old
    # profile-directory search. Going forward we want to discourage people from
    # using their main chrome profile for archiving: session tokens, personal
    # data, and cookies are often returned in server responses, and once
    # archived they are essentially burned -- anyone who can view the archive
    # can use that data to masquerade as the logged-in user that did the
    # archiving. Users should always create dedicated burner profiles for
    # archiving instead of their daily-driver accounts.
    return None
def wget_supports_compression(config) -> bool:
    """Return True if the configured wget binary accepts --compression=auto.

    Probes by running `wget --compression=auto --help` and checking the exit
    code; returns False if the binary is missing or cannot be executed.
    """
    try:
        cmd = [
            config['WGET_BINARY'],
            "--compression=auto",
            "--help",
        ]
        return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
    except OSError:
        # covers FileNotFoundError too (it's an OSError subclass); the old
        # `except (FileNotFoundError, OSError)` was redundant
        return False
def get_dependency_info(config: ConfigDict) -> ConfigValue:
    """Build a {BINARY_KEY: {path, version, hash, enabled, is_valid}} report
    for each external dependency, from values already computed in `config`.
    Most entries are commented out pending migration to the BinProvider
    plugin system; only the ones still resolved by this module remain active.
    """
    return {
        # 'PYTHON_BINARY': {
        #     'path': bin_path(config['PYTHON_BINARY']),
        #     'version': config['PYTHON_VERSION'],
        #     'hash': bin_hash(config['PYTHON_BINARY']),
        #     'enabled': True,
        #     'is_valid': bool(config['PYTHON_VERSION']),
        # },
        # 'SQLITE_BINARY': {
        #     'path': bin_path(config['SQLITE_BINARY']),
        #     'version': config['SQLITE_VERSION'],
        #     'hash': bin_hash(config['SQLITE_BINARY']),
        #     'enabled': True,
        #     'is_valid': bool(config['SQLITE_VERSION']),
        # },
        # 'DJANGO_BINARY': {
        #     'path': bin_path(config['DJANGO_BINARY']),
        #     'version': config['DJANGO_VERSION'],
        #     'hash': bin_hash(config['DJANGO_BINARY']),
        #     'enabled': True,
        #     'is_valid': bool(config['DJANGO_VERSION']),
        # },
        # 'ARCHIVEBOX_BINARY': {
        #     'path': bin_path(config['ARCHIVEBOX_BINARY']),
        #     'version': config['VERSION'],
        #     'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
        #     'enabled': True,
        #     'is_valid': True,
        # },
        'CURL_BINARY': {
            'path': bin_path(config['CURL_BINARY']),
            'version': config['CURL_VERSION'],
            'hash': bin_hash(config['CURL_BINARY']),
            'enabled': config['USE_CURL'],
            'is_valid': bool(config['CURL_VERSION']),
        },
        'WGET_BINARY': {
            'path': bin_path(config['WGET_BINARY']),
            'version': config['WGET_VERSION'],
            'hash': bin_hash(config['WGET_BINARY']),
            'enabled': config['USE_WGET'],
            'is_valid': bool(config['WGET_VERSION']),
        },
        'NODE_BINARY': {
            'path': bin_path(config['NODE_BINARY']),
            'version': config['NODE_VERSION'],
            'hash': bin_hash(config['NODE_BINARY']),
            'enabled': config['USE_NODE'],
            'is_valid': bool(config['NODE_VERSION']),
        },
        'MERCURY_BINARY': {
            'path': bin_path(config['MERCURY_BINARY']),
            'version': config['MERCURY_VERSION'],
            'hash': bin_hash(config['MERCURY_BINARY']),
            'enabled': config['USE_MERCURY'],
            'is_valid': bool(config['MERCURY_VERSION']),
        },
        'GIT_BINARY': {
            'path': bin_path(config['GIT_BINARY']),
            'version': config['GIT_VERSION'],
            'hash': bin_hash(config['GIT_BINARY']),
            'enabled': config['USE_GIT'],
            'is_valid': bool(config['GIT_VERSION']),
        },
        # 'SINGLEFILE_BINARY': {
        #     'path': bin_path(config['SINGLEFILE_BINARY']),
        #     'version': config['SINGLEFILE_VERSION'],
        #     'hash': bin_hash(config['SINGLEFILE_BINARY']),
        #     'enabled': config['USE_SINGLEFILE'],
        #     'is_valid': bool(config['SINGLEFILE_VERSION']),
        # },
        # 'READABILITY_BINARY': {
        #     'path': bin_path(config['READABILITY_BINARY']),
        #     'version': config['READABILITY_VERSION'],
        #     'hash': bin_hash(config['READABILITY_BINARY']),
        #     'enabled': config['USE_READABILITY'],
        #     'is_valid': bool(config['READABILITY_VERSION']),
        # },
        # 'YOUTUBEDL_BINARY': {
        #     'path': bin_path(config['YOUTUBEDL_BINARY']),
        #     'version': config['YOUTUBEDL_VERSION'],
        #     'hash': bin_hash(config['YOUTUBEDL_BINARY']),
        #     'enabled': config['USE_YOUTUBEDL'],
        #     'is_valid': bool(config['YOUTUBEDL_VERSION']),
        # },
        # 'CHROME_BINARY': {
        #     'path': bin_path(config['CHROME_BINARY']),
        #     'version': config['CHROME_VERSION'],
        #     'hash': bin_hash(config['CHROME_BINARY']),
        #     'enabled': config['USE_CHROME'],
        #     'is_valid': bool(config['CHROME_VERSION']),
        # },
        # 'RIPGREP_BINARY': {
        #     'path': bin_path(config['RIPGREP_BINARY']),
        #     'version': config['RIPGREP_VERSION'],
        #     'hash': bin_hash(config['RIPGREP_BINARY']),
        #     'enabled': config['USE_RIPGREP'],
        #     'is_valid': bool(config['RIPGREP_VERSION']),
        # },
        # 'SONIC_BINARY': {
        #     'path': bin_path(config['SONIC_BINARY']),
        #     'version': config['SONIC_VERSION'],
        #     'hash': bin_hash(config['SONIC_BINARY']),
        #     'enabled': config['USE_SONIC'],
        #     'is_valid': bool(config['SONIC_VERSION']),
        # },
    }
2020-12-20 01:11:19 +00:00
# ******************************************************************************
# ******************************************************************************
# ******************************** Load Config *********************************
# ******* (compile the defaults, configs, and metadata all into CONFIG) ********
# ******************************************************************************
# ******************************************************************************
def load_all_config():
2024-08-21 01:31:21 +00:00
CONFIG: ConfigDict = ConfigDict()
2020-12-20 01:11:19 +00:00
for section_name, section_config in CONFIG_SCHEMA.items():
2024-09-21 08:53:59 +00:00
# print('LOADING CONFIG SECTION:', section_name)
CONFIG = load_config(section_config, CONFIG)
2024-09-21 08:53:59 +00:00
# print("LOADING CONFIG SECTION:", 'DYNAMIC')
2020-12-20 01:11:19 +00:00
return load_config(DYNAMIC_CONFIG_SCHEMA, CONFIG)
2020-12-20 01:11:19 +00:00
# add all final config values in CONFIG to globals in this file
2024-08-21 01:31:21 +00:00
CONFIG: ConfigDict = load_all_config()
globals().update(CONFIG)
2020-12-20 01:11:19 +00:00
# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ...
2024-09-21 08:53:59 +00:00
# print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")
2020-12-20 01:11:19 +00:00
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
########################### System Environment Setup ###########################
# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
2024-01-04 19:41:12 +00:00
assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC' # noqa: F821
os.environ["TZ"] = TIMEZONE # noqa: F821
os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
2020-08-11 03:21:02 +00:00
2020-12-20 01:11:19 +00:00
########################### Config Validity Checkers ###########################
if not CONFIG.USE_COLOR:
os.environ['NO_COLOR'] = '1'
if not CONFIG.SHOW_PROGRESS:
os.environ['TERM'] = 'dumb'
# recreate rich console obj based on new config values
CONSOLE = Console()
from .misc import logging
logging.CONSOLE = CONSOLE
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0
def bump_startup_progress_bar():
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
if INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore
def setup_django_minimal():
sys.path.append(str(archivebox.PACKAGE_DIR))
os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
django.setup()
def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
output_dir = out_dir or archivebox.DATA_DIR
assert isinstance(output_dir, Path) and isinstance(archivebox.PACKAGE_DIR, Path)
bump_startup_progress_bar()
try:
from django.core.management import call_command
sys.path.append(str(archivebox.PACKAGE_DIR))
os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
bump_startup_progress_bar()
if in_memory_db:
# some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
# immediately to get a usable in-memory-database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
django.setup()
bump_startup_progress_bar()
call_command("migrate", interactive=False, verbosity=0)
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
django.setup()
bump_startup_progress_bar()
from django.conf import settings
from plugins_sys.config.apps import SHELL_CONFIG
2021-02-16 09:15:09 +00:00
# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={archivebox.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
if check_db:
# Create cache table in DB if needed
try:
from django.core.cache import cache
cache.get('test', None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
2021-03-01 03:53:34 +00:00
bump_startup_progress_bar()
2021-03-01 03:53:34 +00:00
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
sql_index_path = archivebox.CONSTANTS.DATABASE_FILE
assert sql_index_path.exists(), (
f'No database file {sql_index_path} found in: {archivebox.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
bump_startup_progress_bar()
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
if settings.DEBUG_LOGFIRE:
from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
SQLite3Instrumentor().instrument()
import logfire
logfire.configure()
logfire.instrument_django(is_sql_commentor_enabled=True)
logfire.info(f'Started ArchiveBox v{archivebox.VERSION}', argv=sys.argv)
except KeyboardInterrupt:
raise SystemExit(2)
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = None