ArchiveBox/archivebox/config.py

1412 lines
66 KiB
Python
Raw Normal View History

2020-12-20 01:11:19 +00:00
"""
ArchiveBox config definitions (including defaults and dynamic config options).
Config Usage Example:
archivebox config --set MEDIA_TIMEOUT=600
env MEDIA_TIMEOUT=600 USE_COLOR=False ... archivebox [subcommand] ...
Config Precedence Order:
1. cli args (--update-all / --index-only / etc.)
2. shell environment vars (env USE_COLOR=False archivebox add '...')
3. config file (echo "SAVE_FAVICON=False" >> ArchiveBox.conf)
4. defaults (defined below in Python)
Documentation:
https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
"""
__package__ = 'archivebox'
import os
2019-04-22 18:34:12 +00:00
import io
import re
2017-07-04 10:48:12 +00:00
import sys
import json
2022-06-09 01:35:31 +00:00
import inspect
import getpass
2017-07-05 21:33:51 +00:00
import shutil
import requests
2017-07-04 10:48:12 +00:00
2019-04-22 18:34:12 +00:00
from hashlib import md5
2020-07-28 11:20:57 +00:00
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Type, Tuple, Dict
2024-09-21 08:53:59 +00:00
from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
from configparser import ConfigParser
import importlib.metadata
from pydantic_pkgr import SemVer
from rich.progress import Progress
import django
from django.db.backends.sqlite3.base import Database as sqlite3
from .config_stubs import (
2024-08-21 01:31:21 +00:00
AttrDict,
SimpleConfigValueDict,
ConfigValue,
ConfigDict,
ConfigDefaultValue,
ConfigDefaultDict,
)
from .misc.logging import (
CONSOLE,
SHOW_PROGRESS,
DEFAULT_CLI_COLORS,
ANSI,
COLOR_DICT,
stderr,
hint,
)
from .misc.checks import check_system_config
2024-09-21 08:53:59 +00:00
# print('STARTING CONFIG LOADING')
2024-08-23 09:01:02 +00:00
# load fallback libraries from vendor dir
from .vendor import load_vendored_libs
load_vendored_libs()
2024-09-21 08:53:59 +00:00
# print("LOADED VENDOR LIBS")
2020-12-20 01:11:19 +00:00
############################### Config Schema ##################################
2020-12-20 01:11:19 +00:00
# Schema of every user-settable config option, grouped into named sections.
# Each entry maps OPTION_NAME -> {'type': ..., 'default': ..., 'aliases': (...)}:
#   - 'type' is the Python type the raw env/config-file string gets parsed into
#   - 'default' is either a literal value or a callable that is passed the
#     config dict and returns the value
#   - 'aliases' (optional) lists old/deprecated names still accepted for the option
CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
    # options describing the shell/terminal/container the process runs in
    'SHELL_CONFIG': {
        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']},  # progress bars are buggy on mac, disable for now
        'IN_DOCKER': {'type': bool, 'default': False},
        'IN_QEMU': {'type': bool, 'default': False},
        # NOTE: os.getuid()/os.getgid() are evaluated once, when this module is imported
        'PUID': {'type': int, 'default': os.getuid()},
        'PGID': {'type': int, 'default': os.getgid()},
    },

    'GENERAL_CONFIG': {
        'OUTPUT_DIR': {'type': str, 'default': None},
        'CONFIG_FILE': {'type': str, 'default': None},
        'ONLY_NEW': {'type': bool, 'default': True},
        'TIMEOUT': {'type': int, 'default': 60},
        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
        'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
        'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},  # TODO: move this to be a default WGET_ARGS
        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
        'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
        'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
    },

    'SERVER_CONFIG': {
        'ADMIN_USERNAME': {'type': str, 'default': None},
        'ADMIN_PASSWORD': {'type': str, 'default': None},
        'SECRET_KEY': {'type': str, 'default': None},
        # the bool IN_DOCKER value is used as a list index: False -> loopback only, True -> all interfaces
        'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
        'ALLOWED_HOSTS': {'type': str, 'default': '*'},  # e.g. archivebox.example.com,archivebox2.example.com
        'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c['BIND_ADDR'])},  # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
        'DEBUG': {'type': bool, 'default': False},
        'PUBLIC_INDEX': {'type': bool, 'default': True},
        'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
        'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.'},
        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
        'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
        # NOTE(review): both TIME_ZONE and TIMEZONE are declared as separate options -- confirm which one consumers read
        'TIME_ZONE': {'type': str, 'default': 'UTC'},
        'TIMEZONE': {'type': str, 'default': 'UTC'},
        'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
        'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
        'PREVIEW_ORIGINALS': {'type': bool, 'default': True},

        'LDAP': {'type': bool, 'default': False},
        'LDAP_SERVER_URI': {'type': str, 'default': None},
        'LDAP_BIND_DN': {'type': str, 'default': None},
        'LDAP_BIND_PASSWORD': {'type': str, 'default': None},
        'LDAP_USER_BASE': {'type': str, 'default': None},
        'LDAP_USER_FILTER': {'type': str, 'default': None},
        'LDAP_USERNAME_ATTR': {'type': str, 'default': None},
        'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
        'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
    },

    # on/off switches for each archive method; the FETCH_*/SUBMIT_* aliases are older names
    'ARCHIVE_METHOD_TOGGLES': {
        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
        'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
        'SAVE_HTMLTOTEXT': {'type': bool, 'default': True, 'aliases': ('FETCH_HTMLTOTEXT',)},
        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
        'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
        'SAVE_DENYLIST': {'type': dict, 'default': {},},
    },

    'ARCHIVE_METHOD_OPTIONS': {
        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

        # the {VERSION}/{CURL_VERSION}/{WGET_VERSION} placeholders are filled in later by
        # the .format(**c) calls in DYNAMIC_CONFIG_SCHEMA
        'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
        'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
        'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
        'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},

        'COOKIES_FILE': {'type': str, 'default': None},
        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},

        'CHROME_TIMEOUT': {'type': int, 'default': 0},
        'CHROME_HEADLESS': {'type': bool, 'default': True},
        'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
        'CHROME_EXTRA_ARGS': {'type': list, 'default': None},

        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
            '--restrict-filenames',
            '--trim-filenames', '128',
            '--write-description',
            '--write-info-json',
            '--write-annotations',
            '--write-thumbnail',
            '--no-call-home',
            '--write-sub',
            '--write-auto-subs',
            '--convert-subs=srt',
            '--yes-playlist',
            '--continue',
            # This flag doesn't exist in youtube-dl
            # only in yt-dlp
            '--no-abort-on-error',
            # --ignore-errors must come AFTER
            # --no-abort-on-error
            # https://github.com/yt-dlp/yt-dlp/issues/4914
            '--ignore-errors',
            '--geo-bypass',
            '--add-metadata',
            '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
        ]},
        'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},

        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
                                                '--adjust-extension',
                                                '--convert-links',
                                                '--force-directories',
                                                '--backup-converted',
                                                '--span-hosts',
                                                '--no-parent',
                                                '-e', 'robots=off',
                                                ]},
        'WGET_EXTRA_ARGS': {'type': list, 'default': None},
        'CURL_ARGS': {'type': list, 'default': ['--silent',
                                                '--location',
                                                '--compressed'
                                                ]},
        'CURL_EXTRA_ARGS': {'type': list, 'default': None},
        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
        'SINGLEFILE_ARGS': {'type': list, 'default': None},
        'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
        'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
        'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
        'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
    },

    'SEARCH_BACKEND_CONFIG' : {
        'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
        'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
        'SEARCH_PROCESS_HTML': {'type': bool, 'default': True},
        # SONIC
        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
        'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
        # SQLite3 FTS5
        'FTS_SEPARATE_DATABASE': {'type': bool, 'default': True},
        'FTS_TOKENIZERS': {'type': str, 'default': 'porter unicode61 remove_diacritics 2'},
        # Default from https://www.sqlite.org/limits.html#max_length
        'FTS_SQLITE_MAX_LENGTH': {'type': int, 'default': int(1e9)},
    },

    # which external tools may be used, and the binary name/path for each
    'DEPENDENCY_CONFIG': {
        'USE_CURL': {'type': bool, 'default': True},
        'USE_WGET': {'type': bool, 'default': True},
        'USE_SINGLEFILE': {'type': bool, 'default': True},
        'USE_READABILITY': {'type': bool, 'default': True},
        'USE_MERCURY': {'type': bool, 'default': True},
        'USE_GIT': {'type': bool, 'default': True},
        'USE_CHROME': {'type': bool, 'default': True},
        'USE_NODE': {'type': bool, 'default': True},
        'USE_YOUTUBEDL': {'type': bool, 'default': True},
        'USE_RIPGREP': {'type': bool, 'default': True},

        'CURL_BINARY': {'type': str, 'default': 'curl'},
        'GIT_BINARY': {'type': str, 'default': 'git'},
        'WGET_BINARY': {'type': str, 'default': 'wget'},  # also can accept wget2
        # bin_path is not defined in this part of the file; it is only looked up
        # when these lambdas are called
        'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
        'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
        'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},  # also can accept youtube-dl
        'NODE_BINARY': {'type': str, 'default': 'node'},
        'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
        'CHROME_BINARY': {'type': str, 'default': None},

        'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
        'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
        'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
    },
}
2020-12-20 01:11:19 +00:00
########################## Backwards-Compatibility #############################
# for backwards compatibility with old config files, check old/deprecated names for each key
# Lookup table of deprecated option name -> current canonical option name,
# collected from every 'aliases' tuple declared in CONFIG_SCHEMA.
CONFIG_ALIASES = {
    old_name: canonical_name
    for options in CONFIG_SCHEMA.values()
    for canonical_name, spec in options.items()
    for old_name in spec.get('aliases', ())
}
# Flattened view of CONFIG_SCHEMA: option name -> its schema entry, with the
# section level removed.
USER_CONFIG = {
    name: spec
    for options in CONFIG_SCHEMA.values()
    for name, spec in options.items()
}
def get_real_name(key: str) -> str:
    """Return the canonical name for a config key, resolving deprecated aliases.

    The key is upper-cased and stripped of surrounding whitespace first; if the
    normalized key is not a known alias it is returned as-is.
    """
    normalized = key.upper().strip()
    return CONFIG_ALIASES.get(normalized, normalized)
2020-12-20 01:11:19 +00:00
################################ Constants #####################################
# Fixed directory names (joined onto PACKAGE_DIR / OUTPUT_DIR in DYNAMIC_CONFIG_SCHEMA)
PACKAGE_DIR_NAME = 'archivebox'
TEMPLATES_DIR_NAME = 'templates'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
CACHE_DIR_NAME = 'cache'
LIB_DIR_NAME = 'lib'
PERSONAS_DIR_NAME = 'personas'
CRONTABS_DIR_NAME = 'crontabs'

# Fixed file names
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
ROBOTS_TXT_FILENAME = 'robots.txt'
FAVICON_FILENAME = 'favicon.ico'
CONFIG_FILENAME = 'ArchiveBox.conf'
2019-03-27 20:44:00 +00:00
2019-05-01 03:13:04 +00:00
# Set of lower-case file extensions (without the leading dot) that are treated
# as static files rather than as pages.
STATICFILE_EXTENSIONS = {
    # 99.999% of the time, URLs ending in these extensions are static files
    # that can be downloaded as-is, not html pages that need to be rendered
    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
    'atom', 'rss', 'css', 'js', 'json',
    'dmg', 'iso', 'img',
    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

    # Less common extensions to consider adding later
    # jar, swf, bin, com, exe, dll, deb
    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # These are always treated as pages, not as static files, never add them:
    # html, htm, shtml, xhtml, xml, aspx, php, cgi
}
2019-04-17 03:18:42 +00:00
# When initializing archivebox in a new directory, we check to make sure the dir is
# actually empty so that we don't clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
# Entry names that may already exist in a directory without it counting as
# "non-empty" for the is_empty check described above.
ALLOWED_IN_OUTPUT_DIR = {
    # system artifacts and tooling/dependency directories
    ".gitignore",
    "lost+found",
    ".DS_Store",
    ".venv",
    "venv",
    "virtualenv",
    ".virtualenv",
    "node_modules",
    "package.json",
    "package-lock.json",
    "yarn.lock",
    "static",
    "sonic",
    "search.sqlite3",
    # directories and files named by the constants defined earlier in this module
    CRONTABS_DIR_NAME,
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    CACHE_DIR_NAME,
    LIB_DIR_NAME,
    PERSONAS_DIR_NAME,
    SQL_INDEX_FILENAME,
    # the -wal / -shm suffixed names sit next to each sqlite3 database file
    f"{SQL_INDEX_FILENAME}-wal",
    f"{SQL_INDEX_FILENAME}-shm",
    "queue.sqlite3",
    "queue.sqlite3-wal",
    "queue.sqlite3-shm",
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
    CONFIG_FILENAME,
    f"{CONFIG_FILENAME}.bak",
    "static_index.json",
}
2019-04-26 22:31:50 +00:00
# Flags used when compiling the URL_* and SAVE_* allowlist/denylist patterns
ALLOWDENYLIST_REGEX_FLAGS: int = (
    re.IGNORECASE     # match regardless of letter case
    | re.UNICODE      # unicode-aware character classes
    | re.MULTILINE    # ^ and $ also match at line boundaries
)
2024-08-21 01:31:21 +00:00
# The module-level constants wrapped in the same {'default': callable} shape used
# by the config schemas, so they can be merged into DYNAMIC_CONFIG_SCHEMA.
# Every lambda ignores its config argument and returns the constant unchanged.
CONSTANTS = {
    "PACKAGE_DIR_NAME": {'default': lambda c: PACKAGE_DIR_NAME},
    "LIB_DIR_NAME": {'default': lambda c: LIB_DIR_NAME},
    "TEMPLATES_DIR_NAME": {'default': lambda c: TEMPLATES_DIR_NAME},
    "ARCHIVE_DIR_NAME": {'default': lambda c: ARCHIVE_DIR_NAME},
    "SOURCES_DIR_NAME": {'default': lambda c: SOURCES_DIR_NAME},
    "LOGS_DIR_NAME": {'default': lambda c: LOGS_DIR_NAME},
    "CACHE_DIR_NAME": {'default': lambda c: CACHE_DIR_NAME},
    "PERSONAS_DIR_NAME": {'default': lambda c: PERSONAS_DIR_NAME},
    "CRONTABS_DIR_NAME": {'default': lambda c: CRONTABS_DIR_NAME},
    "SQL_INDEX_FILENAME": {'default': lambda c: SQL_INDEX_FILENAME},
    "JSON_INDEX_FILENAME": {'default': lambda c: JSON_INDEX_FILENAME},
    "HTML_INDEX_FILENAME": {'default': lambda c: HTML_INDEX_FILENAME},
    "ROBOTS_TXT_FILENAME": {'default': lambda c: ROBOTS_TXT_FILENAME},
    "FAVICON_FILENAME": {'default': lambda c: FAVICON_FILENAME},
    "CONFIG_FILENAME": {'default': lambda c: CONFIG_FILENAME},
    "DEFAULT_CLI_COLORS": {'default': lambda c: DEFAULT_CLI_COLORS},
    "ANSI": {'default': lambda c: ANSI},
    "COLOR_DICT": {'default': lambda c: COLOR_DICT},
    "STATICFILE_EXTENSIONS": {'default': lambda c: STATICFILE_EXTENSIONS},
    "ALLOWED_IN_OUTPUT_DIR": {'default': lambda c: ALLOWED_IN_OUTPUT_DIR},
    # "ALLOWDENYLIST_REGEX_FLAGS": {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS},
}
############################## Version Config ##################################
def get_system_user() -> str:
    """Return the name of the user running this process, best-effort.

    Some host OS's are unable to provide a username (k3s, Windows), and uid 999
    is especially problematic, so several lookups are tried in order and any
    failure simply moves on to the next one. If none of them yields a name, the
    placeholder ``user_<uid>`` is returned instead.
    """
    placeholder = f'user_{os.getuid()}'

    def _from_passwd_db():
        # imported lazily: the pwd module does not exist on every platform
        import pwd
        return pwd.getpwuid(os.geteuid()).pw_name

    lookups = (
        _from_passwd_db,              # Option 1: passwd database entry for the effective uid
        lambda: getpass.getuser(),    # Option 2: getpass
        lambda: os.getlogin(),        # Option 3: login name from the OS
    )
    for lookup in lookups:
        try:
            username = lookup()
        except Exception:
            continue
        if username:
            return username

    return placeholder
def get_version(config):
    """Return the installed ArchiveBox version string.

    The version is taken from the installed package metadata when available.
    If the package is not installed, it is read from the ``version = "..."``
    line of ``config['PACKAGE_DIR'] / 'pyproject.toml'``.

    Returns:
        The version string, or ``'dev'`` when the package is not installed and
        pyproject.toml does not exist (e.g. when building docs).

    Raises:
        Exception: if pyproject.toml exists but contains no ``version = `` line.
    """
    try:
        return importlib.metadata.version(__package__ or 'archivebox')
    except importlib.metadata.PackageNotFoundError:
        pass

    try:
        pyproject_config = (config['PACKAGE_DIR'] / 'pyproject.toml').read_text()
    except FileNotFoundError:
        # building docs, pyproject.toml is not available
        return 'dev'

    # iterate over *lines*: looping over the string itself yields single
    # characters, which can never start with 'version = '
    for line in pyproject_config.splitlines():
        if line.startswith('version = '):
            return line.split(' = ', 1)[-1].strip().strip('"')

    raise Exception('Failed to detect installed archivebox version!')
2022-06-09 08:04:55 +00:00
def get_commit_hash(config) -> Optional[str]:
    """Return the git commit hash of the checkout containing the package, if any.

    Looks for a ``.git`` directory next to ``config['PACKAGE_DIR']`` and reads
    the commit that ``HEAD`` points at. This is best-effort: any failure (no
    git checkout, unreadable files, ...) is swallowed.

    Returns:
        The commit hash as stored in the git files, or None if it cannot be
        determined.
    """
    try:
        git_dir = config['PACKAGE_DIR'] / '../.git'
        head = (git_dir / 'HEAD').read_text().strip()
        if re.fullmatch(r'[0-9a-fA-F]{7,64}', head):
            # detached HEAD: the file holds the commit hash itself, not a ref path
            return head
        # symbolic ref, e.g. "ref: refs/heads/main" -> read the ref file it names
        ref = head.split(' ')[-1]
        commit_hash = git_dir.joinpath(ref).read_text().strip()
        return commit_hash
    except Exception:
        # deliberately best-effort: fall through to the next strategy
        pass

    try:
        # fallback: use whichever local branch ref file is listed first
        return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
    except Exception:
        pass

    return None
2022-06-09 08:04:55 +00:00
def get_build_time(config) -> str:
    """Return a string describing when this build of the code was produced.

    Inside docker (``config['IN_DOCKER']``) this is the value following the last
    ``BUILD_END_TIME=`` in ``/VERSION.txt``, up to the end of that line.
    Otherwise it is the formatted last-modified time of
    ``config['PACKAGE_DIR'] / 'config.py'``.
    """
    if not config['IN_DOCKER']:
        source_mtime = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
        modified_at = datetime.fromtimestamp(source_mtime)
        return modified_at.strftime('%Y-%m-%d %H:%M:%S %s')

    version_txt = Path('/VERSION.txt').read_text()
    # keep only what follows the last BUILD_END_TIME= marker, then cut at the first newline
    after_marker = version_txt.rpartition('BUILD_END_TIME=')[2]
    return after_marker.partition('\n')[0]
2022-06-09 08:04:55 +00:00
2023-12-19 17:57:08 +00:00
def get_versions_available_on_github(config):
    """
    returns a dictionary containing the ArchiveBox GitHub release info for
    the recommended upgrade version and the currently installed version

    Returns None (without raising) when the check is skipped for the current
    subcommand, when the GitHub API cannot be reached or answers with a
    non-200 status, or when it reports no releases.
    """

    # we only want to perform the (relatively expensive) check for new versions
    # when it's most relevant, e.g. when the user runs a long-running command
    # NOTE(review): the subcommand is read from argv[3], not argv[1] -- confirm this matches how the CLI is invoked
    subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
    long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
    if subcommand_run_by_user not in long_running_commands:
        return None

    github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
    try:
        # always bound the wait: without a timeout an unresponsive network hangs forever
        response = requests.get(github_releases_api, timeout=10)
    except requests.RequestException as err:
        # being offline must not be fatal for an optional upgrade check
        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! ({err.__class__.__name__})', color='lightyellow', config=config)
        return None
    if response.status_code != 200:
        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
        return None

    all_releases = response.json()
    if not all_releases:
        # nothing to compare against (and the indexing below needs at least one release)
        return None

    installed_version = parse_version_string(config['VERSION'])

    # find current version or nearest older version (to link to)
    current_version = None
    for idx, release in enumerate(all_releases):
        release_version = parse_version_string(release['tag_name'])
        if release_version <= installed_version:
            current_version = release
            break

    current_version = current_version or all_releases[-1]

    # recommended version is whatever comes after current_version in the release list
    # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
    # NOTE(review): idx+1 assumes later list entries are newer releases -- verify against the API's ordering
    try:
        recommended_version = all_releases[idx+1]
    except IndexError:
        recommended_version = None

    return {'recommended_version': recommended_version, 'current_version': current_version}
def can_upgrade(config):
    """Return True if the recommended release is newer than the current one.

    Reads ``config['VERSIONS_AVAILABLE']`` and compares the ``tag_name`` of its
    ``recommended_version`` and ``current_version`` entries. Returns False when
    no version info or no recommended version is available.
    """
    available = config['VERSIONS_AVAILABLE']
    if not (available and available['recommended_version']):
        return False

    recommended = parse_version_string(available['recommended_version']['tag_name'])
    current = parse_version_string(available['current_version']['tag_name'])
    return recommended > current
2020-12-20 01:11:19 +00:00
############################## Derived Config ##################################
2024-08-21 01:31:21 +00:00
2023-12-19 06:04:11 +00:00
# These are derived/computed values calculated *after* all user-provided config values are ingested
# they appear in `archivebox config` output and are intended to be read-only for the user
2020-12-20 01:11:19 +00:00
# Every entry is {'default': callable}; the callable receives the config dict `c`
# and returns the derived value. Entries have no 'type' key.
# Many entries read other keys of `c` (including keys defined earlier in this
# dict), so the order of entries is significant -- do not reorder casually.
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    **CONSTANTS,

    # the value is itself a zero-argument callable, so the width is looked up when it is called
    'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
    'USER': {'default': lambda c: get_system_user()},
    # replaces the "ANSI" entry merged in from CONSTANTS above: all codes become '' when color is off
    'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})},

    # filesystem locations
    'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
    'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
    'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},

    'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
    'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
    'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
    'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
    'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
    'LIB_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME},
    'BIN_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME / 'bin'},
    'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
    'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
    'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
    'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},

    # compiled forms of the URL allow/deny regexes (left as the falsy original value when unset)
    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},  # exec is always needed to list directories

    'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
    'NODE_BIN_PATH': {'default': lambda c: str((Path(c["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))},

    # version / build metadata
    'VERSION': {'default': lambda c: get_version(c).split('+', 1)[0]},  # remove +editable from user-displayed version string
    'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)},  # short git commit hash of codebase HEAD commit
    'BUILD_TIME': {'default': lambda c: get_build_time(c)},  # docker build completed time or python src last modified time

    # the GitHub upgrade check is currently disabled: both entries are hard-coded to False
    'VERSIONS_AVAILABLE': {'default': lambda c: False},  # get_versions_available_on_github(c)},
    'CAN_UPGRADE': {'default': lambda c: False},  # can_upgrade(c)},

    'PYTHON_BINARY': {'default': lambda c: sys.executable},
    'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
    'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},

    'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
    'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},

    'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
    # NOTE(review): if `sqlite3` here is the stdlib sqlite3 module, `.version` is the Python
    # module's own version attribute (deprecated since Python 3.12), not the SQLite library
    # version -- confirm whether `sqlite3.sqlite_version` was intended
    'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
    #'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting if changed later but unused for now because its always expected to be wal
    #'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below

    # per-dependency settings: each USE_X stays on only if the user enabled it AND at least
    # one archive method that needs it is enabled; each X_VERSION is None when USE_X is off
    'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
    'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
    'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
    'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
    'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
    'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
    'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},

    'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
    'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
    'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
    'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
    'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
    'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
    'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
    'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},

    'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},

    'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
    'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
    'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
    'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},

    'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
    'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},

    'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
    'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
    'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},

    'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
    'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
    'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},

    'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
    'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
    'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
    'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
    'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},

    'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
    'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
    'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
    'CHROME_USER_AGENT': {'default': lambda c: c['CHROME_USER_AGENT'].format(**c)},

    'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
    'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
    'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
    'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
    'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
    'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},

    'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
    'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},

    # aggregate info computed by helpers that are not defined in this part of the file
    'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
    'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
    'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
    'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
    'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
    # compile each key of the SAVE_* dicts as a regex, keeping its value as-is
    'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
    'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
2019-03-27 20:44:00 +00:00
2024-09-21 08:53:59 +00:00
# print("FINISHED DEFINING SCHEMAS")
################################### Helpers ####################################
2020-12-20 01:11:19 +00:00
def load_config_val(
    key: str,
    default: ConfigDefaultValue=None,
    type: Optional[Type]=None,
    aliases: Optional[Tuple[str, ...]]=None,
    config: Optional[ConfigDict]=None,
    env_vars: Optional[os._Environ]=None,
    config_file_vars: Optional[Dict[str, str]]=None,
) -> ConfigValue:
    """Resolve one config option from env vars / config file, falling back to its default.

    Options declared without a `type` are read-only: only `default` is used.
    Otherwise `key` and then each alias is looked up, environment first and
    config file second, and the raw string found is parsed as `type`
    (bool, str, int, or JSON for list/dict).

    Raises ValueError when the raw string does not fit the expected type.
    """
    assert isinstance(config, dict)

    def fallback():
        # defaults may be plain values or callables computing a value from the config so far
        return default(config) if callable(default) else default

    if type is None:
        return fallback()

    # get value from environment variables or config files
    raw = None
    name = key
    for name in (key, *(aliases or ())):
        if env_vars:
            raw = env_vars.get(name)
            if raw:
                break

        if config_file_vars:
            raw = config_file_vars.get(name)
            if raw:
                break

    if raw is None:
        return fallback()

    # calculate value based on expected type
    truthy_words = ('true', 'yes', '1')
    falsy_words = ('false', 'no', '0')

    if type is bool:
        lowered = raw.lower()
        if lowered in truthy_words:
            return True
        if lowered in falsy_words:
            return False
        raise ValueError(f'Invalid configuration option {name}={raw} (expected a boolean: True/False)')

    if type is str:
        if raw.lower() in (*truthy_words, *falsy_words):
            raise ValueError(f'Invalid configuration option {name}={raw} (expected a string, but value looks like a boolean)')
        return raw.strip()

    if type is int:
        stripped = raw.strip()
        if not stripped.isdigit():
            raise ValueError(f'Invalid configuration option {name}={raw} (expected an integer)')
        return int(stripped)

    if type is list or type is dict:
        return json.loads(raw)

    raise Exception('Config values can only be str, bool, int, or json')
2024-08-21 01:31:21 +00:00
def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
    """Load the ini-formatted config file from OUTPUT_DIR/CONFIG_FILENAME.

    `out_dir` may be a str or a Path; when omitted, the OUTPUT_DIR environment
    variable (or the current directory) is used. All sections are flattened
    into one namespace with upper-cased keys.

    Returns None when the config file does not exist.
    """
    # Normalise to a Path: the signature allows str, which has no .is_dir()
    out_dir = Path(out_dir) if out_dir else Path(os.getenv('OUTPUT_DIR', '.')).resolve()
    assert out_dir.is_dir()

    config_path = out_dir / CONFIG_FILENAME
    if not config_path.exists():
        return None

    config_file = ConfigParser()
    config_file.optionxform = str  # keep option names case-sensitive while parsing
    config_file.read(config_path)

    # flatten into one namespace
    config_file_vars = ConfigDict({
        key.upper(): val
        for section, options in config_file.items()
        for key, val in options.items()
    })
    # print('[i] Loaded config file', os.path.abspath(config_path))
    # print(config_file_vars)
    return config_file_vars
2024-08-21 01:31:21 +00:00
def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict:
    """Write the given options into the ini-formatted config file at OUTPUT_DIR/CONFIG_FILENAME.

    Steps, in order:
      1. create the file with a header comment if it does not exist yet
      2. save a `.bak` copy of the current file
      3. merge each key of `config` into the section that CONFIG_SCHEMA lists it under
      4. make sure SERVER_CONFIG.SECRET_KEY holds a real random secret
      5. write the file, then validate it by re-running load_all_config()
      6. on any failure restore the `.bak` copy and re-raise; on success delete the `.bak`

    Returns a dict mapping each upper-cased key of `config` to its value in the
    freshly reloaded configuration.
    """
    from .system import atomic_write

    CONFIG_HEADER = (
    """# This is the config file for your ArchiveBox collection.
#
# You can add options here manually in INI format, or automatically by running:
# archivebox config --set KEY=VALUE
#
# If you modify this file manually, make sure to update your archive after by running:
# archivebox init
#
# A list of all possible config with documentation and examples can be found here:
# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
""")

    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
    config_path = Path(out_dir) / CONFIG_FILENAME

    if not config_path.exists():
        atomic_write(config_path, CONFIG_HEADER)

    config_file = ConfigParser()
    config_file.optionxform = str  # keep option names case-sensitive
    config_file.read(config_path)

    # keep a backup of the current file so a bad write can be rolled back below
    with open(config_path, 'r', encoding='utf-8') as old:
        atomic_write(f'{config_path}.bak', old.read())

    # name of the first CONFIG_SCHEMA section that declares the key (IndexError if none does)
    find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0]

    # Set up sections in empty config file
    for key, val in config.items():
        section = find_section(key)
        if section in config_file:
            existing_config = dict(config_file[section])
        else:
            existing_config = {}
        config_file[section] = ConfigDict({**existing_config, key: val})

    # always make sure there's a SECRET_KEY defined for Django
    existing_secret_key = None
    if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']:
        existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY']

    # a missing key, or one containing the text 'not a valid secret', is replaced
    if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
        from django.utils.crypto import get_random_string
        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
        random_secret_key = get_random_string(50, chars)
        if 'SERVER_CONFIG' in config_file:
            config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
        else:
            config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}

    with open(config_path, 'w+', encoding='utf-8') as new:
        config_file.write(new)

    try:
        # validate the config by attempting to re-parse it
        CONFIG = load_all_config()
    except BaseException:                                                       # lgtm [py/catch-base-exception]
        # something went horribly wrong, revert to the previous version
        with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
            atomic_write(config_path, old.read())

        raise

    if Path(f'{config_path}.bak').exists():
        os.remove(f'{config_path}.bak')

    return {
        key.upper(): CONFIG.get(key.upper())
        for key in config.keys()
    }
def load_config(
    defaults: ConfigDefaultDict,
    config: Optional[ConfigDict]=None,
    out_dir: Optional[str]=None,
    env_vars: Optional[os._Environ]=None,
    config_file_vars: Optional[Dict[str, str]]=None,
) -> ConfigDict:
    """Resolve every option declared in `defaults` on top of a copy of `config`.

    Values come from `env_vars` (os.environ when not given) and
    `config_file_vars` (loaded from the config file when not given). Options
    are resolved in declaration order, and each resolver receives the config
    accumulated so far.

    Exits the process with status 2 after printing a report when an option
    fails to load, and with status 0 on KeyboardInterrupt.
    """
    if not env_vars:
        env_vars = os.environ
    if not config_file_vars:
        config_file_vars = load_config_file(out_dir=out_dir)

    resolved: ConfigDict = config.copy() if config else {}

    def report_failure(name, err):
        stderr()
        stderr(f'[X] Error while loading configuration value: {name}', color='red', config=resolved)
        stderr(' {}: {}'.format(err.__class__.__name__, err))
        stderr()
        stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
        stderr()
        stderr(' For config documentation and examples see:')
        stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
        stderr()

    for name, spec in defaults.items():
        try:
            # print('LOADING CONFIG KEY:', name, 'DEFAULT=', spec)
            resolved[name] = load_config_val(
                name,
                default=spec['default'],
                type=spec.get('type'),
                aliases=spec.get('aliases'),
                config=resolved,
                env_vars=env_vars,
                config_file_vars=config_file_vars,
            )
        except KeyboardInterrupt:
            raise SystemExit(0)
        except Exception as err:
            report_failure(name, err)
            # raise
            raise SystemExit(2)

    return AttrDict(resolved)
2024-01-03 01:17:35 +00:00
def parse_version_string(version: str) -> Tuple[int, int, int]:
    """Parse a version tag formatted like 'vx.x.x' into (major, minor, patch) ints.

    A '+local' suffix (e.g. '+editable') and anything up to the last 'v' are
    dropped. Only the first three dot-separated parts are converted, so extra
    parts such as '.post1' are ignored, and a pre-release suffix on a part
    (e.g. '5rc50') is reduced to its leading digits.

    Raises ValueError when one of the first three parts has no leading number.
    """
    base = version.split('+')[0].split('v')[-1]       # remove 'v' prefix and '+editable' suffix

    def leading_int(part: str) -> int:
        try:
            return int(part)
        except ValueError:
            # tolerate suffixes like 'rc50' by keeping only the leading digits
            digits = re.match(r'\d+', part)
            if digits is None:
                raise
            return int(digits.group())

    # slice before converting so a non-numeric 4th part cannot break parsing
    return tuple(leading_int(part) for part in base.split('.')[:3])
2020-12-20 01:11:19 +00:00
2019-03-26 09:31:27 +00:00
2020-12-20 01:11:19 +00:00
# Dependency Metadata Helpers
def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3) -> Optional[str]:
    """Return a placeholder version for a binary, or None when no binary is given/resolved.

    Real version detection is now handled by the BinProvider plugin system, so
    this always reports '999.999.999' for any binary that bin_path() resolves.
    `cmd` and `timeout` are accepted for backwards compatibility and ignored.
    """
    # The old subprocess-based `--version` probing sat after the return below
    # and could never run, so it has been removed.
    if not binary or not bin_path(binary):
        return None

    return '999.999.999'
def bin_path(binary: Optional[str]) -> Optional[str]:
    """Resolve a binary name or path to an executable location.

    Lookup order: ./node_modules/.bin/<binary> (returned fully resolved), then
    shutil.which() on the user-expanded path, then shutil.which() on the name
    as given. When nothing matches, the input is returned unchanged.
    """
    if binary is None:
        return None

    local_candidate = Path('.', 'node_modules', '.bin', binary)
    if local_candidate.exists():
        return str(local_candidate.resolve())

    for lookup in (str(Path(binary).expanduser()), str(binary)):
        located = shutil.which(lookup)
        if located:
            return located

    return binary
def bin_hash(binary: Optional[str]) -> Optional[str]:
    """Return the placeholder 'UNUSED' for any input.

    DEPRECATED: binary hashing is now handled by the BinProvider plugin system.
    The md5-of-file implementation that used to follow the return statement was
    unreachable and has been removed.
    """
    return 'UNUSED'
def find_chrome_binary() -> Optional[str]:
    """Return the first chrome/chromium candidate that shutil.which() can locate, else None.

    The candidate string itself is returned (not the resolved path).
    """
    # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # make sure data dir finding precedence order always matches binary finding order
    candidates = (
        # '~/Library/Caches/ms-playwright/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
        'chromium-browser',
        'chromium',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        'chrome',
        'google-chrome',
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        'google-chrome-stable',
        'google-chrome-beta',
        'google-chrome-canary',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        'google-chrome-unstable',
        'google-chrome-dev',
    )
    return next((candidate for candidate in candidates if shutil.which(candidate)), None)
2019-03-26 09:31:27 +00:00
def find_chrome_data_dir() -> Optional[str]:
    """find any installed chrome user data directories in the default locations

    Auto-detection is intentionally disabled: this function always returns None.
    """
    # deprecated because this is DANGEROUS, do not re-implement/uncomment this behavior.

    # Going forward we want to discourage people from using their main chrome profile for archiving.
    # Session tokens, personal data, and cookies are often returned in server responses,
    # when they get archived, they are essentially burned as anyone who can view the archive
    # can use that data to masquerade as the logged-in user that did the archiving.
    # For this reason users should always create dedicated burner profiles for archiving and not use
    # their daily driver main accounts.

    # # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # # make sure data dir finding precedence order always matches binary finding order
    # default_profile_paths = (
    #     '~/.config/chromium',
    #     '~/Library/Application Support/Chromium',
    #     '~/AppData/Local/Chromium/User Data',
    #     '~/.config/chrome',
    #     '~/.config/google-chrome',
    #     '~/Library/Application Support/Google/Chrome',
    #     '~/AppData/Local/Google/Chrome/User Data',
    #     '~/.config/google-chrome-stable',
    #     '~/.config/google-chrome-beta',
    #     '~/Library/Application Support/Google/Chrome Canary',
    #     '~/AppData/Local/Google/Chrome SxS/User Data',
    #     '~/.config/google-chrome-unstable',
    #     '~/.config/google-chrome-dev',
    # )
    # for path in default_profile_paths:
    #     full_path = Path(path).resolve()
    #     if full_path.exists():
    #         return full_path
    return None
def wget_supports_compression(config):
    """Report whether config['WGET_BINARY'] accepts the `--compression=auto` flag.

    Runs `<wget> --compression=auto --help` with output discarded and returns
    True when it exits with status 0. Returns False when the binary cannot be
    executed.
    """
    probe_cmd = [config['WGET_BINARY'], "--compression=auto", "--help"]
    try:
        completed = run(probe_cmd, stdout=DEVNULL, stderr=DEVNULL)
    except (FileNotFoundError, OSError):
        return False
    return not completed.returncode
def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
    """Describe the code directories (package, templates, lib) and whether each looks valid.

    Each entry carries the resolved `path`, `enabled` (always True) and
    `is_valid`: the package dir must contain `__main__.py`, the templates dir
    must contain `static`, and the lib dir must be a directory.
    """
    package_dir = config['PACKAGE_DIR']
    templates_dir = config['TEMPLATES_DIR']
    lib_dir = config['LIB_DIR']

    locations = {}
    locations['PACKAGE_DIR'] = {
        'path': package_dir.resolve(),
        'enabled': True,
        'is_valid': (package_dir / '__main__.py').exists(),
    }
    locations['TEMPLATES_DIR'] = {
        'path': templates_dir.resolve(),
        'enabled': True,
        'is_valid': (templates_dir / 'static').exists(),
    }
    locations['LIB_DIR'] = {
        'path': lib_dir.resolve(),
        'enabled': True,
        'is_valid': lib_dir.is_dir(),
    }
    # 'NODE_MODULES_DIR' is intentionally not reported here (it was already commented out).
    return locations
def get_data_locations(config: ConfigDict) -> ConfigValue:
    """Describe the collection's data paths: resolved path, enabled flag and validity.

    OUTPUT_DIR, SQL_INDEX and ARCHIVE_DIR additionally report `is_mount`.
    OUTPUT_DIR counts as valid only when it contains the SQL index file.
    CUSTOM_TEMPLATES_DIR is optional, so its fields pass through a falsy
    config value instead of a path.
    """
    def describe(path, validity_probe=None, report_mount=False):
        # `validity_probe` lets an entry be judged by a different path than the one reported
        entry = {
            "path": path.resolve(),
            "enabled": True,
            "is_valid": (path if validity_probe is None else validity_probe).exists(),
        }
        if report_mount:
            entry["is_mount"] = os.path.ismount(path.resolve())
        return entry

    output_dir = config["OUTPUT_DIR"]
    sql_index = output_dir / SQL_INDEX_FILENAME
    custom_templates_dir = config["CUSTOM_TEMPLATES_DIR"]

    # CHROME_USER_DATA_DIR and COOKIES_FILE used to be listed here (migrating to personas);
    # CRONTABS_DIR is managed by bin/docker_entrypoint.sh and python-crontab.
    return {
        "OUTPUT_DIR": describe(output_dir, validity_probe=sql_index, report_mount=True),
        "CONFIG_FILE": describe(config["CONFIG_FILE"]),
        "SQL_INDEX": describe(sql_index, report_mount=True),
        "ARCHIVE_DIR": describe(config["ARCHIVE_DIR"], report_mount=True),
        "SOURCES_DIR": describe(config["SOURCES_DIR"]),
        "PERSONAS_DIR": describe(config["PERSONAS_DIR"]),
        "LOGS_DIR": describe(config["LOGS_DIR"]),
        "CACHE_DIR": describe(config["CACHE_DIR"]),
        "CUSTOM_TEMPLATES_DIR": {
            "path": custom_templates_dir and Path(custom_templates_dir).resolve(),
            "enabled": bool(custom_templates_dir),
            "is_valid": custom_templates_dir and Path(custom_templates_dir).exists(),
        },
    }
def get_dependency_info(config: ConfigDict) -> ConfigValue:
    """Describe every binary dependency: path, version, hash, enabled flag and validity.

    Python, SQLite, Django and ArchiveBox itself are always enabled; the
    remaining tools are enabled according to their USE_<NAME> option. A
    dependency is valid when its version is truthy, except ArchiveBox, which
    is always valid and takes its version from VERSION.
    """
    def describe(name, enabled, version_key=None, is_valid=None):
        binary = config[f'{name}_BINARY']
        version = config[version_key or f'{name}_VERSION']
        return {
            'path': bin_path(binary),
            'version': version,
            'hash': bin_hash(binary),
            'enabled': enabled,
            'is_valid': bool(version) if is_valid is None else is_valid,
        }

    dependencies = {
        'PYTHON_BINARY': describe('PYTHON', True),
        'SQLITE_BINARY': describe('SQLITE', True),
        'DJANGO_BINARY': describe('DJANGO', True),
        'ARCHIVEBOX_BINARY': describe('ARCHIVEBOX', True, version_key='VERSION', is_valid=True),
    }

    optional_tools = (
        'CURL',
        'WGET',
        'NODE',
        'SINGLEFILE',
        'READABILITY',
        'MERCURY',
        'GIT',
        'YOUTUBEDL',
        'CHROME',
        'RIPGREP',
    )
    for name in optional_tools:
        dependencies[f'{name}_BINARY'] = describe(name, config[f'USE_{name}'])

    # TODO: add an entry for the sonic search backend?
    # dependencies['SONIC_BINARY'] = describe('SONIC', config['USE_SONIC'])
    return dependencies
def get_chrome_info(config: ConfigDict) -> ConfigValue:
    """Collect the chrome-related options into one dict.

    Every value is copied straight from `config`, except CHROME_BINARY, which
    is passed through bin_path().
    """
    option_names = (
        'TIMEOUT',
        'RESOLUTION',
        'CHECK_SSL_VALIDITY',
        'CHROME_BINARY',
        'CHROME_TIMEOUT',
        'CHROME_HEADLESS',
        'CHROME_SANDBOX',
        'CHROME_USER_AGENT',
        'CHROME_USER_DATA_DIR',
    )
    chrome_options = {name: config[name] for name in option_names}
    # overwriting an existing key keeps its position in the dict
    chrome_options['CHROME_BINARY'] = bin_path(chrome_options['CHROME_BINARY'])
    return chrome_options
2019-03-23 03:00:53 +00:00
2020-12-20 01:11:19 +00:00
# ******************************************************************************
# ******************************************************************************
# ******************************** Load Config *********************************
# ******* (compile the defaults, configs, and metadata all into CONFIG) ********
# ******************************************************************************
# ******************************************************************************
def load_all_config():
    """Build the full config by resolving every CONFIG_SCHEMA section, then DYNAMIC_CONFIG_SCHEMA.

    Sections are applied in order, each on top of the values resolved so far.
    The config file is parsed once here and handed to every load_config() call
    instead of being re-read for each section. load_config() still falls back
    to loading it itself when this result is empty or None.
    """
    config_file_vars = load_config_file()

    config: ConfigDict = ConfigDict()
    for section_config in CONFIG_SCHEMA.values():
        # print('LOADING CONFIG SECTION:', section_name)
        config = load_config(section_config, config, config_file_vars=config_file_vars)

    # print("LOADING CONFIG SECTION:", 'DYNAMIC')
    return load_config(DYNAMIC_CONFIG_SCHEMA, config, config_file_vars=config_file_vars)
2020-12-20 01:11:19 +00:00
# add all final config values in CONFIG to globals in this file
# (runs at import time: resolves every schema section via load_all_config)
CONFIG: ConfigDict = load_all_config()
globals().update(CONFIG)
# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ...
2024-09-21 08:53:59 +00:00
# print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")
2020-12-20 01:11:19 +00:00
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
########################### System Environment Setup ###########################
# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC'  # noqa: F821
os.environ["TZ"] = TIMEZONE                                                  # noqa: F821
os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))                        # noqa: F821

# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
# NOTE: sys.path is the Python import path, not $PATH, so appending to it alone never
# made node scripts discoverable. The append is kept for backwards compatibility and
# the directory is now also added to the real executable search path.
sys.path.append(CONFIG.NODE_BIN_PATH)
_node_bin_path = str(CONFIG.NODE_BIN_PATH)
_current_path = os.environ.get("PATH", "")
if _node_bin_path not in _current_path.split(os.pathsep):
    # appended last so it never shadows binaries already found on the host
    os.environ["PATH"] = f"{_current_path}{os.pathsep}{_node_bin_path}" if _current_path else _node_bin_path
2020-08-18 22:14:56 +00:00
2022-06-09 03:12:55 +00:00
# OPTIONAL: also look around the host system for node modules to use
# avoid enabling this unless absolutely needed,
# having overlapping potential sources of libs is a big source of bugs/confusing to users
# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
# sys.path.append(DEV_NODE_BIN_PATH)
# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
# sys.path.append(USER_NODE_BIN_PATH)
2021-02-01 07:27:24 +00:00
# disable stderr "you really shouldnt disable ssl" warnings with library config
# (only when the user has turned CHECK_SSL_VALIDITY off)
if not CONFIG['CHECK_SSL_VALIDITY']:
    import urllib3
    # silence both the urllib3 exposed through requests.packages and the top-level urllib3 import
    requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
2020-12-20 01:11:19 +00:00
2022-06-09 01:58:15 +00:00
# get SQLite database version, compile options, and runtime options
2022-06-09 03:12:55 +00:00
# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
#cursor = sqlite3.connect(':memory:').cursor()
2022-06-09 02:09:11 +00:00
#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
2022-06-09 03:12:55 +00:00
#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
#cursor.close()
2020-12-20 01:11:19 +00:00
########################### Config Validity Checkers ###########################
# Progress bar shown while setup_django() runs; None whenever no startup is in progress.
INITIAL_STARTUP_PROGRESS = None
# Task id within that progress bar, as returned by Progress.add_task() in setup_django().
INITIAL_STARTUP_PROGRESS_TASK = 0
def bump_startup_progress_bar():
    """Advance the startup progress bar by one step; a no-op when no bar is active."""
    global INITIAL_STARTUP_PROGRESS, INITIAL_STARTUP_PROGRESS_TASK

    progress = INITIAL_STARTUP_PROGRESS
    if not progress:
        return
    progress.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1)   # type: ignore
def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
    """Initialise Django for this collection while showing a transient startup progress bar.

    out_dir:      collection directory; defaults to config['OUTPUT_DIR']
    check_db:     when True, tune the sqlite connection (WAL journal mode, busy timeout,
                  synchronous mode), create the cache table if needed, close stale
                  connections and assert that the SQL index file exists
    config:       the loaded configuration to use
    in_memory_db: when True, use an in-memory database and run migrations immediately

    Raises SystemExit(2) on KeyboardInterrupt and AssertionError when a required
    path is missing.
    """
    global INITIAL_STARTUP_PROGRESS
    global INITIAL_STARTUP_PROGRESS_TASK

    with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
        INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
        check_system_config(config)

        output_dir = out_dir or Path(config['OUTPUT_DIR'])

        assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)

        bump_startup_progress_bar()
        try:
            from django.core.management import call_command

            sys.path.append(str(config['PACKAGE_DIR']))
            os.environ.setdefault('OUTPUT_DIR', str(output_dir))
            assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
            os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')

            # Check to make sure JSON extension is available in our Sqlite3 instance
            json_probe_db = sqlite3.connect(':memory:')
            try:
                json_probe_db.execute('SELECT JSON(\'{"a": "b"}\')')
            except sqlite3.OperationalError as exc:
                stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
                hint([
                    'Upgrade your Python version or install the extension manually:',
                    'https://code.djangoproject.com/wiki/JSON1Extension'
                ])
            finally:
                # the throwaway probe connection used to be left open
                json_probe_db.close()

            bump_startup_progress_bar()

            if in_memory_db:
                # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
                # in those cases we create a temporary in-memory db and run the migrations
                # immediately to get a usable in-memory-database at startup
                os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
                django.setup()

                bump_startup_progress_bar()

                call_command("migrate", interactive=False, verbosity=0)
            else:
                # Otherwise use default sqlite3 file-based database and initialize django
                # without running migrations automatically (user runs them manually by calling init)
                django.setup()

            bump_startup_progress_bar()

            from django.conf import settings

            # log startup message to the error log
            with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
                command = ' '.join(sys.argv)
                ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
                f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")

            if check_db:
                # Enable WAL mode in sqlite3
                from django.db import connection
                with connection.cursor() as cursor:

                    # Set Journal mode to WAL to allow for multiple writers
                    # (execute() does not return the mode, so the result row must be fetched)
                    cursor.execute("PRAGMA journal_mode")
                    mode_row = cursor.fetchone()
                    current_mode = mode_row[0] if mode_row else None
                    if current_mode != 'wal':
                        cursor.execute("PRAGMA journal_mode=wal;")

                    # Set max blocking delay for concurrent writes and write sync mode
                    # https://litestream.io/tips/#busy-timeout
                    cursor.execute("PRAGMA busy_timeout = 5000;")
                    cursor.execute("PRAGMA synchronous = NORMAL;")

                # Create cache table in DB if needed
                try:
                    from django.core.cache import cache
                    cache.get('test', None)
                except django.db.utils.OperationalError:
                    call_command("createcachetable", verbosity=0)

                bump_startup_progress_bar()

                # if archivebox gets imported multiple times, we have to close
                # the sqlite3 whenever we init from scratch to avoid multiple threads
                # sharing the same connection by accident
                from django.db import connections
                for conn in connections.all():
                    conn.close_if_unusable_or_obsolete()

                sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
                assert sql_index_path.exists(), (
                    f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')

            bump_startup_progress_bar()

            # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
            if settings.DEBUG_LOGFIRE:
                from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
                SQLite3Instrumentor().instrument()

                import logfire

                logfire.configure()
                logfire.instrument_django(is_sql_commentor_enabled=True)
                logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)

        except KeyboardInterrupt:
            raise SystemExit(2)

    # startup finished: detach the (now stopped) progress bar
    INITIAL_STARTUP_PROGRESS = None
    INITIAL_STARTUP_PROGRESS_TASK = None