2024-09-30 22:59:05 +00:00
|
|
|
__package__ = 'archivebox.config'
|
2024-09-25 12:10:09 +00:00
|
|
|
|
|
|
|
|
|
|
|
import os
|
2024-09-26 09:36:59 +00:00
|
|
|
import re
|
2024-10-04 07:08:44 +00:00
|
|
|
import platform
|
2024-10-04 10:24:15 +00:00
|
|
|
import machineid
|
|
|
|
import tempfile
|
2024-10-04 07:08:44 +00:00
|
|
|
|
2024-09-26 09:36:59 +00:00
|
|
|
from typing import Dict
|
2024-09-25 12:10:09 +00:00
|
|
|
from pathlib import Path
|
2024-09-30 22:59:05 +00:00
|
|
|
import importlib.metadata
|
2024-10-01 00:13:55 +00:00
|
|
|
from collections.abc import Mapping
|
2024-09-25 12:10:09 +00:00
|
|
|
|
|
|
|
from benedict import benedict
|
|
|
|
|
2024-09-30 22:59:05 +00:00
|
|
|
from ..misc.logging import DEFAULT_CLI_COLORS
|
2024-09-25 12:10:09 +00:00
|
|
|
|
|
|
|
###################### Config ##########################
|
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
|
2024-10-04 08:40:41 +00:00
|
|
|
DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
|
2024-09-30 23:50:36 +00:00
|
|
|
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
|
2024-09-30 22:59:05 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
def _detect_installed_version(PACKAGE_DIR: Path):
|
2024-10-01 01:12:48 +00:00
|
|
|
"""Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
|
2024-09-30 22:59:05 +00:00
|
|
|
try:
|
2024-10-01 01:12:48 +00:00
|
|
|
# if in production install, use pip-installed package metadata
|
2024-09-30 22:59:05 +00:00
|
|
|
return importlib.metadata.version(__package__ or 'archivebox')
|
|
|
|
except importlib.metadata.PackageNotFoundError:
|
2024-10-01 01:12:48 +00:00
|
|
|
pass
|
|
|
|
|
|
|
|
try:
|
|
|
|
# if in dev Git repo dir, use pyproject.toml file
|
|
|
|
pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
|
|
|
|
for line in pyproject_config:
|
|
|
|
if line.startswith('version = '):
|
|
|
|
return line.split(' = ', 1)[-1].strip('"')
|
|
|
|
except FileNotFoundError:
|
|
|
|
# building docs, pyproject.toml is not available
|
|
|
|
pass
|
|
|
|
|
|
|
|
try:
|
|
|
|
# if in dev but not in Git repo dir, fallback to using package.json file
|
|
|
|
package_json = (PACKAGE_DIR / 'package.json').read_text().split('\n')
|
|
|
|
for line in package_json:
|
|
|
|
if '"version": "' in line:
|
|
|
|
return line.replace('"', '').split(':')[-1].strip(',')
|
|
|
|
except FileNotFoundError:
|
|
|
|
pass
|
|
|
|
|
|
|
|
# raise Exception('Failed to detect installed archivebox version!')
|
|
|
|
return 'dev'
|
2024-09-30 22:59:05 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
VERSION: str = _detect_installed_version(PACKAGE_DIR)
|
2024-09-30 22:59:05 +00:00
|
|
|
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-10-04 10:24:15 +00:00
|
|
|
|
|
|
|
|
2024-10-01 00:13:55 +00:00
|
|
|
class ConstantsDict(Mapping):
|
2024-10-04 07:08:44 +00:00
|
|
|
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'yes')
|
|
|
|
OS = platform.system().lower() # darwin, linux, etc.
|
|
|
|
ARCH = platform.machine().lower() # arm64, x86_64, etc.
|
|
|
|
LIB_DIR_SCOPE = f'{ARCH}-{OS}' + ('-docker' if IN_DOCKER else '')
|
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
PACKAGE_DIR: Path = PACKAGE_DIR # archivebox source code dir
|
|
|
|
DATA_DIR: Path = DATA_DIR # archivebox user data dir
|
|
|
|
ARCHIVE_DIR: Path = ARCHIVE_DIR # archivebox snapshot data dir
|
|
|
|
VERSION: str = VERSION
|
|
|
|
|
|
|
|
PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
|
|
|
|
TEMPLATES_DIR_NAME: str = 'templates'
|
|
|
|
TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
|
|
|
|
STATIC_DIR: Path = TEMPLATES_DIR / 'static'
|
|
|
|
USER_PLUGINS_DIR_NAME: str = 'user_plugins'
|
|
|
|
CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
ARCHIVE_DIR_NAME: str = 'archive'
|
|
|
|
SOURCES_DIR_NAME: str = 'sources'
|
|
|
|
PERSONAS_DIR_NAME: str = 'personas'
|
|
|
|
CRONTABS_DIR_NAME: str = 'crontabs'
|
|
|
|
CACHE_DIR_NAME: str = 'cache'
|
|
|
|
LOGS_DIR_NAME: str = 'logs'
|
|
|
|
LIB_DIR_NAME: str = 'lib'
|
|
|
|
TMP_DIR_NAME: str = 'tmp'
|
2024-09-27 07:38:37 +00:00
|
|
|
|
2024-10-05 04:03:02 +00:00
|
|
|
SYSTEM_TMP_DIR: Path = Path(os.environ['SYSTEM_TMP_DIR']) if 'SYSTEM_TMP_DIR' in os.environ else (Path(tempfile.gettempdir()) / 'archivebox')
|
|
|
|
DATA_DIR_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / machineid.hashed_id('archivebox')[:16]
|
|
|
|
SYSTEM_LIB_DIR: Path = Path(os.environ['SYSTEM_LIB_DIR']) if 'SYSTEM_LIB_DIR' in os.environ else (PACKAGE_DIR / LIB_DIR_NAME)
|
|
|
|
DATA_DIR_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / LIB_DIR_SCOPE
|
2024-10-04 10:24:15 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
|
|
|
|
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
|
|
|
|
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
|
|
|
|
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
|
|
|
|
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
|
2024-10-05 04:03:02 +00:00
|
|
|
LIB_DIR: Path = SYSTEM_LIB_DIR if IN_DOCKER else DATA_DIR_LIB_DIR # e.g. /app/lib or ./data/lib/arm64-darwin-docker
|
|
|
|
TMP_DIR: Path = SYSTEM_TMP_DIR if IN_DOCKER else DATA_DIR_TMP_DIR # e.g. /tmp/archivebox or ./data/tmp/abcwe324234
|
2024-09-30 23:50:36 +00:00
|
|
|
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
|
|
|
|
USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
LIB_PIP_DIR: Path = LIB_DIR / 'pip'
|
|
|
|
LIB_NPM_DIR: Path = LIB_DIR / 'npm'
|
|
|
|
LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
|
|
|
|
LIB_BIN_DIR: Path = LIB_DIR / 'bin'
|
|
|
|
BIN_DIR: Path = LIB_BIN_DIR
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
CONFIG_FILENAME: str = 'ArchiveBox.conf'
|
|
|
|
SQL_INDEX_FILENAME: str = 'index.sqlite3'
|
2024-10-05 02:38:36 +00:00
|
|
|
QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
|
|
|
|
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
|
2024-10-05 02:38:36 +00:00
|
|
|
QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
JSON_INDEX_FILENAME: str = 'index.json'
|
|
|
|
HTML_INDEX_FILENAME: str = 'index.html'
|
|
|
|
ROBOTS_TXT_FILENAME: str = 'robots.txt'
|
|
|
|
FAVICON_FILENAME: str = 'favicon.ico'
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
TIMEZONE: str = 'UTC'
|
|
|
|
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
|
|
|
|
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
|
2024-09-27 07:38:37 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
|
|
|
|
# 99.999% of the time, URLs ending in these extensions are static files
|
|
|
|
# that can be downloaded as-is, not html pages that need to be rendered
|
|
|
|
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
|
|
|
|
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
|
|
|
|
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
|
|
|
|
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
|
|
|
|
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
|
|
|
|
'atom', 'rss', 'css', 'js', 'json',
|
|
|
|
'dmg', 'iso', 'img',
|
|
|
|
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
# Less common extensions to consider adding later
|
|
|
|
# jar, swf, bin, com, exe, dll, deb
|
|
|
|
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
|
|
|
|
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
|
|
|
|
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
# These are always treated as pages, not as static files, never add them:
|
|
|
|
# html, htm, shtml, xhtml, xml, aspx, php, cgi
|
|
|
|
))
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
INGORED_PATHS: frozenset[str] = frozenset((
|
|
|
|
".git",
|
|
|
|
".svn",
|
|
|
|
".DS_Store",
|
|
|
|
".gitignore",
|
|
|
|
"lost+found",
|
|
|
|
".DS_Store",
|
|
|
|
".env",
|
|
|
|
"Dockerfile",
|
|
|
|
))
|
|
|
|
PIP_RELATED_NAMES: frozenset[str] = frozenset((
|
|
|
|
".venv",
|
|
|
|
"venv",
|
|
|
|
"virtualenv",
|
|
|
|
".virtualenv",
|
|
|
|
))
|
|
|
|
NPM_RELATED_NAMES: frozenset[str] = frozenset((
|
|
|
|
"node_modules",
|
|
|
|
"package.json",
|
|
|
|
"package-lock.json",
|
|
|
|
"yarn.lock",
|
|
|
|
))
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
DATA_DIR_NAMES: frozenset[str] = frozenset((
|
|
|
|
ARCHIVE_DIR_NAME,
|
|
|
|
SOURCES_DIR_NAME,
|
|
|
|
LOGS_DIR_NAME,
|
|
|
|
CACHE_DIR_NAME,
|
|
|
|
LIB_DIR_NAME,
|
2024-10-05 02:38:36 +00:00
|
|
|
TMP_DIR_NAME,
|
2024-09-30 23:50:36 +00:00
|
|
|
PERSONAS_DIR_NAME,
|
|
|
|
CUSTOM_TEMPLATES_DIR_NAME,
|
|
|
|
USER_PLUGINS_DIR_NAME,
|
2024-10-05 02:38:36 +00:00
|
|
|
CRONTABS_DIR_NAME,
|
2024-09-30 23:50:36 +00:00
|
|
|
))
|
|
|
|
DATA_DIRS: frozenset[Path] = frozenset(DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
|
|
|
|
DATA_FILE_NAMES: frozenset[str] = frozenset((
|
|
|
|
CONFIG_FILENAME,
|
|
|
|
SQL_INDEX_FILENAME,
|
|
|
|
f"{SQL_INDEX_FILENAME}-wal",
|
|
|
|
f"{SQL_INDEX_FILENAME}-shm",
|
2024-10-05 02:38:36 +00:00
|
|
|
QUEUE_DATABASE_FILENAME,
|
|
|
|
f"{QUEUE_DATABASE_FILENAME}-wal",
|
|
|
|
f"{QUEUE_DATABASE_FILENAME}-shm",
|
2024-09-30 23:50:36 +00:00
|
|
|
"search.sqlite3",
|
|
|
|
JSON_INDEX_FILENAME,
|
|
|
|
HTML_INDEX_FILENAME,
|
|
|
|
ROBOTS_TXT_FILENAME,
|
|
|
|
FAVICON_FILENAME,
|
|
|
|
CONFIG_FILENAME,
|
|
|
|
f"{CONFIG_FILENAME}.bak",
|
|
|
|
"static_index.json",
|
|
|
|
))
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
# When initializing archivebox in a new directory, we check to make sure the dir is
|
|
|
|
# actually empty so that we dont clobber someone's home directory or desktop by accident.
|
|
|
|
# These files are exceptions to the is_empty check when we're trying to init a new dir,
|
|
|
|
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
|
2024-10-01 00:25:15 +00:00
|
|
|
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
|
2024-09-30 23:50:36 +00:00
|
|
|
*INGORED_PATHS,
|
|
|
|
*PIP_RELATED_NAMES,
|
|
|
|
*NPM_RELATED_NAMES,
|
|
|
|
*DATA_DIR_NAMES,
|
|
|
|
*DATA_FILE_NAMES,
|
|
|
|
"static", # created by old static exports <v0.6.0
|
|
|
|
"sonic", # created by docker bind mount
|
|
|
|
))
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-09-30 23:50:36 +00:00
|
|
|
CODE_LOCATIONS = benedict({
|
|
|
|
'PACKAGE_DIR': {
|
|
|
|
'path': (PACKAGE_DIR).resolve(),
|
|
|
|
'enabled': True,
|
|
|
|
'is_valid': (PACKAGE_DIR / '__main__.py').exists(),
|
|
|
|
},
|
|
|
|
'TEMPLATES_DIR': {
|
|
|
|
'path': TEMPLATES_DIR.resolve(),
|
|
|
|
'enabled': True,
|
|
|
|
'is_valid': STATIC_DIR.exists(),
|
|
|
|
},
|
2024-10-01 06:52:00 +00:00
|
|
|
'LIB_DIR': {
|
|
|
|
'path': LIB_DIR.resolve(),
|
|
|
|
'enabled': True,
|
|
|
|
'is_valid': LIB_DIR.is_dir(),
|
|
|
|
},
|
2024-09-30 23:50:36 +00:00
|
|
|
})
|
|
|
|
|
|
|
|
DATA_LOCATIONS = benedict({
|
2024-10-01 00:25:15 +00:00
|
|
|
"DATA_DIR": {
|
2024-09-30 23:50:36 +00:00
|
|
|
"path": DATA_DIR.resolve(),
|
|
|
|
"enabled": True,
|
|
|
|
"is_valid": DATABASE_FILE.exists(),
|
|
|
|
"is_mount": os.path.ismount(DATA_DIR.resolve()),
|
|
|
|
},
|
|
|
|
"CONFIG_FILE": {
|
|
|
|
"path": CONFIG_FILE.resolve(),
|
|
|
|
"enabled": True,
|
|
|
|
"is_valid": CONFIG_FILE.exists(),
|
|
|
|
},
|
|
|
|
"SQL_INDEX": {
|
|
|
|
"path": DATABASE_FILE.resolve(),
|
|
|
|
"enabled": True,
|
|
|
|
"is_valid": DATABASE_FILE.exists(),
|
|
|
|
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
|
|
|
|
},
|
|
|
|
"QUEUE_DATABASE": {
|
|
|
|
"path": QUEUE_DATABASE_FILE.resolve(),
|
|
|
|
"enabled": True,
|
|
|
|
"is_valid": QUEUE_DATABASE_FILE.exists(),
|
|
|
|
"is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
|
|
|
|
},
|
|
|
|
"ARCHIVE_DIR": {
|
|
|
|
"path": ARCHIVE_DIR.resolve(),
|
|
|
|
"enabled": True,
|
|
|
|
"is_valid": ARCHIVE_DIR.exists(),
|
|
|
|
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
|
|
|
|
},
|
|
|
|
"SOURCES_DIR": {
|
|
|
|
"path": SOURCES_DIR.resolve(),
|
|
|
|
"enabled": True,
|
|
|
|
"is_valid": SOURCES_DIR.exists(),
|
|
|
|
},
|
|
|
|
"LOGS_DIR": {
|
|
|
|
"path": LOGS_DIR.resolve(),
|
|
|
|
"enabled": True,
|
|
|
|
"is_valid": LOGS_DIR.is_dir(),
|
|
|
|
},
|
2024-10-05 04:34:19 +00:00
|
|
|
# "CACHE_DIR": {
|
|
|
|
# "path": CACHE_DIR.resolve(),
|
|
|
|
# "enabled": True,
|
|
|
|
# "is_valid": CACHE_DIR.is_dir(),
|
|
|
|
# },
|
|
|
|
'TMP_DIR': {
|
|
|
|
'path': TMP_DIR.resolve(),
|
|
|
|
'enabled': True,
|
|
|
|
'is_valid': TMP_DIR.is_dir(),
|
|
|
|
},
|
|
|
|
"PERSONAS_DIR": {
|
|
|
|
"path": PERSONAS_DIR.resolve(),
|
2024-10-05 04:40:36 +00:00
|
|
|
"enabled": PERSONAS_DIR.exists(),
|
2024-10-05 04:34:19 +00:00
|
|
|
"is_valid": PERSONAS_DIR.is_dir(),
|
|
|
|
},
|
|
|
|
'CUSTOM_TEMPLATES_DIR': {
|
|
|
|
'path': CUSTOM_TEMPLATES_DIR.resolve(),
|
2024-10-05 04:40:36 +00:00
|
|
|
'enabled': CUSTOM_TEMPLATES_DIR.exists(),
|
2024-10-05 04:34:19 +00:00
|
|
|
'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
|
|
|
|
},
|
|
|
|
'USER_PLUGINS_DIR': {
|
|
|
|
'path': USER_PLUGINS_DIR.resolve(),
|
2024-10-05 04:40:36 +00:00
|
|
|
'enabled': USER_PLUGINS_DIR.exists(),
|
2024-10-05 04:34:19 +00:00
|
|
|
'is_valid': USER_PLUGINS_DIR.is_dir(),
|
2024-09-30 23:50:36 +00:00
|
|
|
},
|
|
|
|
})
|
|
|
|
|
2024-10-01 00:13:55 +00:00
|
|
|
@classmethod
|
|
|
|
def __getitem__(cls, key: str):
|
|
|
|
return getattr(cls, key)
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def __benedict__(cls):
|
|
|
|
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def __len__(cls):
|
|
|
|
return len(cls.__benedict__())
|
2024-09-26 09:36:59 +00:00
|
|
|
|
2024-10-01 00:13:55 +00:00
|
|
|
@classmethod
|
|
|
|
def __iter__(cls):
|
|
|
|
return iter(cls.__benedict__())
|
2024-09-25 12:10:09 +00:00
|
|
|
|
2024-10-01 00:13:55 +00:00
|
|
|
CONSTANTS = ConstantsDict()
|
|
|
|
CONSTANTS_CONFIG = CONSTANTS.__benedict__()
|
2024-09-25 12:10:09 +00:00
|
|
|
|
2024-10-01 00:13:55 +00:00
|
|
|
# add all key: values to globals() for easier importing
|
|
|
|
globals().update(CONSTANTS)
|
2024-10-05 04:03:02 +00:00
|
|
|
|
|
|
|
|
|
|
|
# these need to always exist as we need them to run almost everything
|
|
|
|
CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)
|