fix LIB_DIR and TMP_DIR loading when primary option isn't available

Nick Sweeting 2024-10-21 00:35:25 -07:00
parent deb116eed4
commit a211461ffc
21 changed files with 712 additions and 303 deletions

View file

@@ -37,7 +37,8 @@ class BaseBinary(Binary):
     @staticmethod
     def symlink_to_lib(binary, bin_dir=None) -> None:
-        bin_dir = bin_dir or CONSTANTS.LIB_BIN_DIR
+        from archivebox.config.common import STORAGE_CONFIG
+        bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
 
         if not (binary.abspath and os.access(binary.abspath, os.R_OK)):
             return
@@ -55,9 +56,10 @@ class BaseBinary(Binary):
     @validate_call
     def load(self, fresh=False, **kwargs) -> Self:
+        from archivebox.config.common import STORAGE_CONFIG
         if fresh:
             binary = super().load(**kwargs)
-            self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
+            self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
         else:
             # get cached binary from db
             try:
@@ -72,16 +74,18 @@ class BaseBinary(Binary):
     @validate_call
     def install(self, **kwargs) -> Self:
+        from archivebox.config.common import STORAGE_CONFIG
         binary = super().install(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
         return binary
 
     @validate_call
     def load_or_install(self, fresh=False, **kwargs) -> Self:
+        from archivebox.config.common import STORAGE_CONFIG
         try:
             binary = self.load(fresh=fresh)
             if binary and binary.version:
-                self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
+                self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
                 return binary
         except Exception:
             pass
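
The net effect in this file is that the symlink destination is derived from the live STORAGE_CONFIG at call time instead of a CONSTANTS value frozen at import, so a LIB_DIR that gets autofixed later in startup is still respected. A minimal sketch of the resolution pattern (the stand-in StorageConfig class is hypothetical; only the LIB_DIR attribute matters here):

    from pathlib import Path

    class StorageConfig:
        # stand-in for archivebox.config.common.STORAGE_CONFIG
        LIB_DIR = Path('/usr/local/share/archivebox/abc5')

    STORAGE_CONFIG = StorageConfig()

    def resolve_bin_dir(bin_dir=None) -> Path:
        # same pattern as above: an explicit argument wins,
        # otherwise the *currently configured* LIB_DIR is used
        return bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'

    assert resolve_bin_dir() == Path('/usr/local/share/archivebox/abc5/bin')
    assert resolve_bin_dir(Path('/opt/bin')) == Path('/opt/bin')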

View file

@ -1,8 +1,13 @@
__package__ = 'abx.archivebox' __package__ = 'abx.archivebox'
import os import os
import sys
import re
from pathlib import Path from pathlib import Path
from typing import Type, Tuple, Callable, ClassVar from typing import Type, Tuple, Callable, ClassVar, Dict, Any
import toml
from rich import print
from benedict import benedict from benedict import benedict
from pydantic import model_validator, TypeAdapter from pydantic import model_validator, TypeAdapter
@ -18,6 +23,11 @@ from . import toml_util
PACKAGE_DIR = Path(__file__).resolve().parent.parent PACKAGE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = Path(os.getcwd()).resolve() DATA_DIR = Path(os.getcwd()).resolve()
ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
AUTOFIXES_HEADER = "[AUTOFIXES]"
AUTOFIXES_SUBHEADER = "# The following config was added automatically to fix problems detected at startup:"
class FlatTomlConfigSettingsSource(TomlConfigSettingsSource): class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
@ -53,7 +63,7 @@ class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
super(TomlConfigSettingsSource, self).__init__(settings_cls, self.toml_data) super(TomlConfigSettingsSource, self).__init__(settings_cls, self.toml_data)
class ArchiveBoxBaseConfig(BaseSettings): class BaseConfigSet(BaseSettings):
""" """
This is the base class for an ArchiveBox ConfigSet. This is the base class for an ArchiveBox ConfigSet.
It handles loading values from schema defaults, ArchiveBox.conf TOML config, and environment variables. It handles loading values from schema defaults, ArchiveBox.conf TOML config, and environment variables.
@ -83,7 +93,7 @@ class ArchiveBoxBaseConfig(BaseSettings):
loc_by_alias=False, loc_by_alias=False,
validate_assignment=True, validate_assignment=True,
validate_return=True, validate_return=True,
revalidate_instances="always", revalidate_instances="subclass-instances",
) )
load_from_defaults: ClassVar[bool] = True load_from_defaults: ClassVar[bool] = True
@ -101,9 +111,6 @@ class ArchiveBoxBaseConfig(BaseSettings):
) -> Tuple[PydanticBaseSettingsSource, ...]: ) -> Tuple[PydanticBaseSettingsSource, ...]:
"""Defines the config precedence order: Schema defaults -> ArchiveBox.conf (TOML) -> Environment variables""" """Defines the config precedence order: Schema defaults -> ArchiveBox.conf (TOML) -> Environment variables"""
ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
# import ipdb; ipdb.set_trace() # import ipdb; ipdb.set_trace()
precedence_order = {} precedence_order = {}
@ -152,27 +159,36 @@ class ArchiveBoxBaseConfig(BaseSettings):
def fill_defaults(self): def fill_defaults(self):
"""Populate any unset values using function provided as their default""" """Populate any unset values using function provided as their default"""
for key, field in self.model_fields.items(): for key in self.model_fields.keys():
value = getattr(self, key) if isinstance(getattr(self, key), Callable):
if self.load_from_defaults:
if isinstance(value, Callable): computed_default = self.get_default_value(key)
# if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected # set generated default value as final validated value
if func_takes_args_or_kwargs(value): setattr(self, key, computed_default)
# assemble dict of existing field values to pass to default factory functions
config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
computed_default = field.default(config_so_far)
else:
# otherwise it's a pure function with no args, just call it
computed_default = field.default()
# coerce/check to make sure default factory return value matches type annotation
TypeAdapter(field.annotation).validate_python(computed_default)
# set generated default value as final validated value
setattr(self, key, computed_default)
return self return self
def update_in_place(self, warn=True, **kwargs): def get_default_value(self, key):
"""Get the default value for a given config key"""
field = self.model_fields[key]
value = getattr(self, key)
if isinstance(value, Callable):
# if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
if func_takes_args_or_kwargs(value):
# assemble dict of existing field values to pass to default factory functions
config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
computed_default = field.default(config_so_far)
else:
# otherwise it's a pure function with no args, just call it
computed_default = field.default()
# coerce/check to make sure default factory return value matches type annotation
TypeAdapter(field.annotation).validate_python(computed_default)
return computed_default
return value
def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
""" """
Update the config with new values. Use this sparingly! We should almost never be updating config at runtime. Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
@ -180,25 +196,106 @@ class ArchiveBoxBaseConfig(BaseSettings):
Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but sonic_client pip library is not installed so we cannot use it. Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but sonic_client pip library is not installed so we cannot use it.
SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue. SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.
""" """
from archivebox.misc.toml_util import CustomTOMLEncoder
if warn: if warn:
print('[!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:') fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
print(f'[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)
# set the new values in the environment
for key, value in kwargs.items(): for key, value in kwargs.items():
os.environ[key] = str(value) os.environ[key] = str(value)
original_value = getattr(self, key) original_value = getattr(self, key)
if warn: if warn:
print(f' {key}={original_value} -> {value}') print(f' {key}={original_value} -> {value}')
# if persist=True, write config changes to data/ArchiveBox.conf [AUTOFIXES] section
try:
if persist and ARCHIVEBOX_CONFIG_FILE.is_file():
autofixes_to_add = benedict(kwargs).to_toml(encoder=CustomTOMLEncoder())
existing_config = ARCHIVEBOX_CONFIG_FILE.read_text().split(AUTOFIXES_HEADER, 1)[0].strip()
if AUTOFIXES_HEADER in existing_config:
existing_autofixes = existing_config.split(AUTOFIXES_HEADER, 1)[-1].strip().replace(AUTOFIXES_SUBHEADER, '').replace(AUTOFIXES_HEADER, '').strip()
else:
existing_autofixes = ''
new_config = '\n'.join(line for line in [
existing_config,
'\n' + AUTOFIXES_HEADER,
AUTOFIXES_SUBHEADER,
existing_autofixes,
autofixes_to_add,
] if line.strip()).strip() + '\n'
ARCHIVEBOX_CONFIG_FILE.write_text(new_config)
except Exception:
pass
self.__init__() self.__init__()
if warn:
print(file=sys.stderr)
return self return self
def as_legacy_config_schema(self): @property
def toml_section_header(self):
"""Convert the class name to a TOML section header e.g. ShellConfig -> SHELL_CONFIG"""
class_name = self.__class__.__name__
return re.sub('([A-Z]+)', r'_\1', class_name).upper().strip('_')
def from_defaults(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the default values"""
class OnlyDefaultsConfig(self.__class__):
load_from_defaults = True
load_from_configfile = False
load_from_environment = False
return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
def from_configfile(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf"""
class OnlyConfigFileConfig(self.__class__):
load_from_defaults = False
load_from_configfile = True
load_from_environment = False
return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
def from_environment(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the environment variables"""
class OnlyEnvironmentConfig(self.__class__):
load_from_defaults = False
load_from_configfile = False
load_from_environment = True
return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
def from_computed(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the computed fields"""
return benedict(self.model_dump(include=set(self.model_computed_fields.keys())))
def to_toml_dict(self, defaults=False) -> Dict[str, Any]:
"""Get the current config as a TOML-ready dict"""
config_dict = {}
for key, value in benedict(self).items():
if defaults or value != self.get_default_value(key):
config_dict[key] = value
return benedict({self.toml_section_header: config_dict})
def to_toml_str(self, defaults=False) -> str:
"""Get the current config as a TOML string"""
from archivebox.misc.toml_util import CustomTOMLEncoder
toml_dict = self.to_toml_dict(defaults=defaults)
if not toml_dict[self.toml_section_header]:
# if the section is empty, don't write it
toml_dict.pop(self.toml_section_header)
return toml.dumps(toml_dict, encoder=CustomTOMLEncoder())
def as_legacy_config_schema(self) -> Dict[str, Any]:
# shim for backwards compatibility with old config schema style # shim for backwards compatibility with old config schema style
model_values = self.model_dump() model_values = self.model_dump()
return benedict({ return benedict({
key: {'type': field.annotation, 'default': model_values[key]} key: {'type': field.annotation, 'default': model_values[key]}
for key, field in self.model_fields.items() for key, field in self.model_fields.items()
}) })
class BaseConfigSet(ArchiveBoxBaseConfig): # type: ignore[type-arg]
pass
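
The new toml_section_header property maps a config class name to its TOML section purely with the regex shown above; a standalone check of that behavior:

    import re

    def toml_section_header(class_name: str) -> str:
        # 'ShellConfig' -> '_Shell_Config' -> '_SHELL_CONFIG' -> 'SHELL_CONFIG'
        return re.sub('([A-Z]+)', r'_\1', class_name).upper().strip('_')

    assert toml_section_header('ShellConfig') == 'SHELL_CONFIG'
    assert toml_section_header('StorageConfig') == 'STORAGE_CONFIG'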

View file

@@ -18,13 +18,7 @@ def get_PLUGIN() -> Dict[str, Dict[str, Any]]:
 def get_CONFIG() -> Dict[str, BaseConfigSet]:
     return {}
 
-@hookspec
-def get_BINARIES() -> Dict[str, BaseBinary]:
-    return {}
-
-@hookspec
-def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
-    return {}
 
 @hookspec
 def get_EXTRACTORS() -> Dict[str, BaseExtractor]:
@@ -45,3 +39,14 @@ def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]:
 # @hookspec
 # def get_QUEUES():
 #     return {}
+
+##############################################################
+# provided by abx.pydantic_pkgr.hookspec:
+
+# @hookspec
+# def get_BINARIES() -> Dict[str, BaseBinary]:
+#     return {}
+
+# @hookspec
+# def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
+#     return {}

View file

@@ -131,9 +131,12 @@ def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
-def get_scope_config(defaults=settings.CONFIG, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
+def get_scope_config(defaults: benedict | None = None, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
     """Get all the relevant config for the given scope, in correct precedence order"""
 
+    from django.conf import settings
+    default_config: benedict = defaults or settings.CONFIG
+
     snapshot = snapshot or (archiveresult and archiveresult.snapshot)
     crawl = crawl or (snapshot and snapshot.crawl)
     seed = seed or (crawl and crawl.seed)
@@ -147,7 +150,7 @@ def get_scope_config(defaults=settings.CONFIG, persona=None, seed=None, crawl=No
     extra_config = extra_config or {}
 
     return {
-        **defaults,                # defaults / config file / environment variables
+        **default_config,          # defaults / config file / environment variables
         **persona_config,          # lowest precedence
         **seed_config,
         **crawl_config,
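
Because the scope config is assembled with successive `**` unpacking, keys from later (narrower) scopes override earlier ones; a toy illustration of that precedence, with made-up values:

    default_config = {'TIMEOUT': 60, 'SAVE_WGET': True}   # defaults / config file / env
    persona_config = {'TIMEOUT': 120}                     # lowest-precedence override
    crawl_config   = {'SAVE_WGET': False}                 # narrower scope, higher precedence

    merged = {**default_config, **persona_config, **crawl_config}
    assert merged == {'TIMEOUT': 120, 'SAVE_WGET': False}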

View file

@@ -164,13 +164,18 @@ def run_subcommand(subcommand: str,
     # print('DATA_DIR is', DATA_DIR)
     # print('pwd is', os.getcwd())
 
-    cmd_requires_db = subcommand in archive_cmds
+    cmd_requires_db = (subcommand in archive_cmds)
     init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
 
     check_db = cmd_requires_db and not init_pending
 
     setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)
 
+    for ignore_pattern in ('help', '-h', '--help', 'version', '--version'):
+        if ignore_pattern in sys.argv[:4]:
+            cmd_requires_db = False
+            break
+
     if subcommand in archive_cmds:
         if cmd_requires_db:
             check_migrations()
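
The argv scan only inspects the first four tokens, so invocations like `archivebox help` or `archivebox --version` skip the migrations requirement while a 'help'-looking token later in the command line does not; a condensed sketch of that guard (function name hypothetical):

    def requires_db(argv, subcommand_needs_db: bool) -> bool:
        # mirrors the loop above: help/version near the front disables the DB check
        for ignore_pattern in ('help', '-h', '--help', 'version', '--version'):
            if ignore_pattern in argv[:4]:
                return False
        return subcommand_needs_db

    assert requires_db(['archivebox', 'version'], True) is False
    assert requires_db(['archivebox', 'add', 'https://example.com'], True) is True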

View file

@@ -1,18 +1,18 @@
 __package__ = 'archivebox.config'
 
-import os
 import sys
 import shutil
+import tempfile
 
 from typing import Dict, Optional
 from pathlib import Path
 
 from rich import print
-from pydantic import Field, field_validator, computed_field
+from pydantic import Field, field_validator, computed_field, model_validator
 from django.utils.crypto import get_random_string
 
 from abx.archivebox.base_configset import BaseConfigSet
 
 from .constants import CONSTANTS
 from .version import get_COMMIT_HASH, get_BUILD_TIME
 from .permissions import IN_DOCKER
@@ -35,7 +35,6 @@ class ShellConfig(BaseConfigSet):
     VERSIONS_AVAILABLE: bool = False             # .check_for_update.get_versions_available_on_github(c)},
     CAN_UPGRADE: bool = False                    # .check_for_update.can_upgrade(c)},
 
-
     @computed_field
     @property
     def TERM_WIDTH(self) -> int:
@@ -57,6 +56,16 @@ SHELL_CONFIG = ShellConfig()
 
 class StorageConfig(BaseConfigSet):
+    # TMP_DIR must be a local, fast, readable/writable dir by archivebox user,
+    # must be a short path due to unix path length restrictions for socket files (<100 chars)
+    # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets
+    TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)
+
+    # LIB_DIR must be a local, fast, readable/writable dir by archivebox user,
+    # must be able to contain executable binaries (up to 5GB size)
+    # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
+    LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)
+
     OUTPUT_PERMISSIONS: str = Field(default='644')
     RESTRICT_FILE_NAMES: str = Field(default='windows')
     ENFORCE_ATOMIC_WRITES: bool = Field(default=True)
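
The '<100 chars' caveat on TMP_DIR exists because the supervisord control socket lives there and UNIX socket paths are length-limited; a worked check using the same bound that the new check_tmp_dir() applies later in this commit (example path is hypothetical):

    from pathlib import Path

    TMP_DIR = Path('/tmp/archivebox/abc5d851')            # hypothetical example value
    socket_file = TMP_DIR.absolute() / 'supervisord.sock'
    assert len(f'file://{socket_file}') <= 96             # short enough to hold a unix socket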

View file

@@ -1,6 +1,5 @@
 __package__ = 'archivebox.config'
 
-import os
 import re
 import sys
@@ -97,14 +96,10 @@ class ConstantsDict(Mapping):
     # Runtime dirs
     TMP_DIR_NAME: str = 'tmp'
-    TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID
+    DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID       # ./data/tmp/abc3244323
 
     LIB_DIR_NAME: str = 'lib'
-    LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE
-    LIB_PIP_DIR: Path = LIB_DIR / 'pip'
-    LIB_NPM_DIR: Path = LIB_DIR / 'npm'
-    LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
-    LIB_BIN_DIR: Path = LIB_DIR / 'bin'
-    BIN_DIR: Path = LIB_BIN_DIR
+    DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE     # ./data/lib/arm64-linux-docker
 
     # Config constants
     TIMEZONE: str = 'UTC'
@@ -198,91 +193,7 @@ class ConstantsDict(Mapping):
         ".archivebox_id",
         "Dockerfile",
     ))
-
-    CODE_LOCATIONS = benedict({
-        'PACKAGE_DIR': {
-            'path': (PACKAGE_DIR).resolve(),
-            'enabled': True,
-            'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK),    # executable
-        },
-        'TEMPLATES_DIR': {
-            'path': TEMPLATES_DIR.resolve(),
-            'enabled': True,
-            'is_valid': os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK),    # read + list
-        },
-        'CUSTOM_TEMPLATES_DIR': {
-            'path': CUSTOM_TEMPLATES_DIR.resolve(),
-            'enabled': os.path.isdir(CUSTOM_TEMPLATES_DIR),
-            'is_valid': os.path.isdir(CUSTOM_TEMPLATES_DIR) and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK),    # read
-        },
-        'USER_PLUGINS_DIR': {
-            'path': USER_PLUGINS_DIR.resolve(),
-            'enabled': os.path.isdir(USER_PLUGINS_DIR),
-            'is_valid': os.path.isdir(USER_PLUGINS_DIR) and os.access(USER_PLUGINS_DIR, os.R_OK),    # read
-        },
-        'LIB_DIR': {
-            'path': LIB_DIR.resolve(),
-            'enabled': True,
-            'is_valid': os.path.isdir(LIB_DIR) and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.W_OK),    # read + write
-        },
-    })
-
-    DATA_LOCATIONS = benedict({
-        "DATA_DIR": {
-            "path": DATA_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
-            "is_mount": os.path.ismount(DATA_DIR.resolve()),
-        },
-        "CONFIG_FILE": {
-            "path": CONFIG_FILE.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isfile(CONFIG_FILE) and os.access(CONFIG_FILE, os.R_OK) and os.access(CONFIG_FILE, os.W_OK),
-        },
-        "SQL_INDEX": {
-            "path": DATABASE_FILE.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
-            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
-        },
-        "QUEUE_DATABASE": {
-            "path": QUEUE_DATABASE_FILE.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isfile(QUEUE_DATABASE_FILE) and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
-            "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
-        },
-        "ARCHIVE_DIR": {
-            "path": ARCHIVE_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
-            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
-        },
-        "SOURCES_DIR": {
-            "path": SOURCES_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(SOURCES_DIR) and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK),
-        },
-        "PERSONAS_DIR": {
-            "path": PERSONAS_DIR.resolve(),
-            "enabled": os.path.isdir(PERSONAS_DIR),
-            "is_valid": os.path.isdir(PERSONAS_DIR) and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK),    # read + write
-        },
-        "LOGS_DIR": {
-            "path": LOGS_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(LOGS_DIR) and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK),    # read + write
-        },
-        'TMP_DIR': {
-            'path': TMP_DIR.resolve(),
-            'enabled': True,
-            'is_valid': os.path.isdir(TMP_DIR) and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.W_OK),    # read + write
-        },
-        # "CACHE_DIR": {
-        #     "path": CACHE_DIR.resolve(),
-        #     "enabled": True,
-        #     "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK),    # read + write
-        # },
-    })
 
     @classmethod
     def __getitem__(cls, key: str):

View file

@@ -258,6 +258,9 @@ def load_config_val(key: str,
     elif type is list or type is dict:
         return json.loads(val)
 
+    elif type is Path:
+        return Path(val)
+
     raise Exception('Config values can only be str, bool, int, or json')
@@ -574,7 +577,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
         with SudoPermission(uid=0):
             # running as root is a special case where it's ok to be a bit slower
             # make sure data dir is always owned by the correct user
-            os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"')
+            os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
             os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
 
     bump_startup_progress_bar()
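
With the new `type is Path` branch, Path-typed values (like the new TMP_DIR/LIB_DIR fields) survive the string-based config loader; a reduced sketch covering only the branches shown above (the real function also handles bool/int/list/dict):

    from pathlib import Path

    def load_config_val(val: str, type=str):
        # `type` shadows the builtin here, mirroring the original signature
        if type is str:
            return val
        elif type is Path:
            return Path(val)
        raise Exception('Config values can only be str, bool, int, or json')

    assert load_config_val('/data/lib', type=Path) == Path('/data/lib')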

View file

@@ -1,12 +1,16 @@
 __package__ = 'archivebox.config'
 
 import os
+import socket
 import hashlib
+import tempfile
 import platform
 from pathlib import Path
 from functools import cache
 from datetime import datetime
 
+from benedict import benedict
+
 from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
 
 #############################################################################################
@@ -88,7 +92,7 @@ def get_machine_type() -> str:
     return LIB_DIR_SCOPE
 
-def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool:
+def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
     """Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)"""
     current_uid, current_gid = os.geteuid(), os.getegid()
     uid, gid = uid or current_uid, gid or current_gid
@@ -101,10 +105,197 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No
         test_file.unlink()
         return True
     except (IOError, OSError, PermissionError):
-        pass
+        if chown:
+            # try fixing it using sudo permissions
+            with SudoPermission(uid=uid, fallback=fallback):
+                os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
+            return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
     return False
 
+def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
+    """Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)"""
+    from archivebox.logging_util import pretty_path
+
+    try:
+        socket_path = str(dir_path / '.test_socket.sock')
+        s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        try:
+            os.remove(socket_path)
+        except OSError:
+            pass
+        s.bind(socket_path)
+        s.close()
+        try:
+            os.remove(socket_path)
+        except OSError:
+            pass
+    except Exception as e:
+        raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e
+
+    return True
+
+def create_and_chown_dir(dir_path: Path) -> None:
+    with SudoPermission(uid=0, fallback=True):
+        dir_path.mkdir(parents=True, exist_ok=True)
+        os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}" 2>/dev/null')
+        os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &')
+
+@cache
+def get_or_create_working_tmp_dir(autofix=True, quiet=False):
+    from archivebox import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.misc.checks import check_tmp_dir
+
+    # try a few potential directories in order of preference
+    CANDIDATES = [
+        STORAGE_CONFIG.TMP_DIR,                                               # <user-specified>
+        CONSTANTS.DEFAULT_TMP_DIR,                                            # ./data/tmp/<machine_id>
+        Path('/var/run/archivebox') / get_collection_id(),                    # /var/run/archivebox/abc5d8512
+        Path('/tmp') / 'archivebox' / get_collection_id(),                    # /tmp/archivebox/abc5d8512
+        Path('~/.tmp/archivebox').expanduser() / get_collection_id(),         # ~/.tmp/archivebox/abc5d8512
+        Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(),     # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
+        Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
+        Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4],        # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
+    ]
+    for candidate in CANDIDATES:
+        try:
+            create_and_chown_dir(candidate)
+        except Exception:
+            pass
+        if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
+            if autofix and STORAGE_CONFIG.TMP_DIR != candidate:
+                STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet)
+            return candidate
+
+    if not quiet:
+        raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!')
+
+@cache
+def get_or_create_working_lib_dir(autofix=True, quiet=False):
+    from archivebox import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.misc.checks import check_lib_dir
+
+    # try a few potential directories in order of preference
+    CANDIDATES = [
+        STORAGE_CONFIG.LIB_DIR,                                               # <user-specified>
+        CONSTANTS.DEFAULT_LIB_DIR,                                            # ./data/lib/arm64-linux-docker
+        Path('/usr/local/share/archivebox') / get_collection_id(),            # /usr/local/share/archivebox/abc5
+        *([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []),  # /opt/homebrew/share/archivebox/abc5
+        Path('~/.local/share/archivebox').expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5
+    ]
+    for candidate in CANDIDATES:
+        try:
+            create_and_chown_dir(candidate)
+        except Exception:
+            pass
+        if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True):
+            if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
+                STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet)
+            return candidate
+
+    if not quiet:
+        raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!')
+
+@cache
+def get_data_locations():
+    from archivebox.config import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+
+    return benedict({
+        "DATA_DIR": {
+            "path": DATA_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
+            "is_mount": os.path.ismount(DATA_DIR.resolve()),
+        },
+        "CONFIG_FILE": {
+            "path": CONSTANTS.CONFIG_FILE.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
+        },
+        "SQL_INDEX": {
+            "path": DATABASE_FILE.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
+            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
+        },
+        "QUEUE_DATABASE": {
+            "path": CONSTANTS.QUEUE_DATABASE_FILE,
+            "enabled": True,
+            "is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK),
+            "is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE),
+        },
+        "ARCHIVE_DIR": {
+            "path": ARCHIVE_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
+            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
+        },
+        "SOURCES_DIR": {
+            "path": CONSTANTS.SOURCES_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
+        },
+        "PERSONAS_DIR": {
+            "path": CONSTANTS.PERSONAS_DIR.resolve(),
+            "enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
+            "is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK),    # read + write
+        },
+        "LOGS_DIR": {
+            "path": CONSTANTS.LOGS_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK),    # read + write
+        },
+        'TMP_DIR': {
+            'path': STORAGE_CONFIG.TMP_DIR.resolve(),
+            'enabled': True,
+            'is_valid': os.path.isdir(STORAGE_CONFIG.TMP_DIR) and os.access(STORAGE_CONFIG.TMP_DIR, os.R_OK) and os.access(STORAGE_CONFIG.TMP_DIR, os.W_OK),    # read + write
+        },
+        # "CACHE_DIR": {
+        #     "path": CACHE_DIR.resolve(),
+        #     "enabled": True,
+        #     "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK),    # read + write
+        # },
+    })
+
+@cache
+def get_code_locations():
+    from archivebox.config import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+
+    return benedict({
+        'PACKAGE_DIR': {
+            'path': (PACKAGE_DIR).resolve(),
+            'enabled': True,
+            'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK),    # executable
+        },
+        'TEMPLATES_DIR': {
+            'path': CONSTANTS.TEMPLATES_DIR.resolve(),
+            'enabled': True,
+            'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK),    # read + list
+        },
+        'CUSTOM_TEMPLATES_DIR': {
+            'path': CONSTANTS.CUSTOM_TEMPLATES_DIR.resolve(),
+            'enabled': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR),
+            'is_valid': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK),    # read
+        },
+        'USER_PLUGINS_DIR': {
+            'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
+            'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
+            'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK),    # read
+        },
+        'LIB_DIR': {
+            'path': STORAGE_CONFIG.LIB_DIR.resolve(),
+            'enabled': True,
+            'is_valid': os.path.isdir(STORAGE_CONFIG.LIB_DIR) and os.access(STORAGE_CONFIG.LIB_DIR, os.R_OK) and os.access(STORAGE_CONFIG.LIB_DIR, os.W_OK),    # read + write
+        },
+    })
 
 # @cache
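
Both get_or_create_working_*_dir helpers walk their candidate list strictly in order, so a healthy user-specified directory always wins and update_in_place() only rewrites config when a lower-preference fallback had to be chosen. A condensed sketch of that selection loop (function name hypothetical):

    import os
    from pathlib import Path

    def first_working_dir(candidates: list[Path]) -> Path | None:
        for candidate in candidates:
            try:
                candidate.mkdir(parents=True, exist_ok=True)   # best-effort create, like create_and_chown_dir()
            except OSError:
                continue
            if os.access(candidate, os.W_OK):                  # stand-in for check_tmp_dir()/check_lib_dir()
                return candidate
        return None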

View file

@@ -510,7 +510,7 @@ def log_removal_finished(all_links: int, to_remove: int):
 ### Helpers
 
 @enforce_types
-def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR) -> str:
+def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: bool=True) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     pwd = str(Path(pwd))  # .resolve()
     path = str(path)
@@ -520,7 +520,10 @@ def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR) -> str:
     # replace long absolute paths with ./ relative ones to save on terminal output width
     if path.startswith(pwd) and (pwd != '/') and path != pwd:
-        path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
+        if color:
+            path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
+        else:
+            path = path.replace(pwd, '.', 1)
 
     # quote paths containing spaces
     if ' ' in path:
View file

@@ -189,6 +189,7 @@ def version(quiet: bool=False,
     if quiet or '--version' in sys.argv:
         return
 
+    from rich.panel import Panel
     from rich.console import Console
     console = Console()
     prnt = console.print
@@ -197,6 +198,7 @@ def version(quiet: bool=False,
     from django.conf import settings
     from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
     from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
+    from archivebox.config.paths import get_data_locations, get_code_locations
 
     from abx.archivebox.base_binary import BaseBinary, apt, brew, env
@@ -221,7 +223,7 @@ def version(quiet: bool=False,
         f'PLATFORM={platform.platform()}',
         f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
     )
-    OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
+    OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
     DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
     prnt(
         f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
@@ -240,6 +242,21 @@ def version(quiet: bool=False,
         #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
     )
     prnt()
+
+    if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
+        PANEL_TEXT = '\n'.join((
+            # '',
+            # f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
+            '',
+            '[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
+            '      [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
+            '',
+            '      [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
+            '',
+        ))
+        prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
+        prnt()
+        return
 
     prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
     failures = []
@@ -299,13 +316,13 @@ def version(quiet: bool=False,
     prnt()
     prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
-    for name, path in CONSTANTS.CODE_LOCATIONS.items():
+    for name, path in get_code_locations().items():
         prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
     prnt()
 
     if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
         prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
-        for name, path in CONSTANTS.DATA_LOCATIONS.items():
+        for name, path in get_data_locations().items():
             prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
 
     from archivebox.misc.checks import check_data_dir_permissions
@@ -395,7 +412,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
         print(f'    √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
 
     # from django.contrib.auth.models import User
-    # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exists():
+    # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
     #     print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
     #     call_command("createsuperuser", interactive=True)
@@ -486,9 +503,13 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
             html_index.rename(f"{index_name}.html")
 
     CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
-    CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)
-    CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
+    CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
+    CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
+
+    from archivebox.config.common import STORAGE_CONFIG
+    STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
+    STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
 
     if install:
         run_subcommand('install', pwd=out_dir)
@@ -1115,7 +1136,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
     from django.contrib.auth import get_user_model
     User = get_user_model()
 
-    if not User.objects.filter(is_superuser=True).exists():
+    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
         stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
         stderr('    archivebox manage createsuperuser')
         # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
@@ -1399,46 +1420,43 @@ def server(runserver_args: Optional[List[str]]=None,
     from django.core.management import call_command
     from django.contrib.auth.models import User
 
-    print('[green][+] Starting ArchiveBox webserver...[/green]')
-    print('    > Logging errors to ./logs/errors.log')
-    if not User.objects.filter(is_superuser=True).exists():
-        print('[yellow][!] No admin users exist yet, you will not be able to edit links in the UI.[/yellow]')
-        print()
-        print('    [violet]Hint:[/violet] To create an admin user, run:')
-        print('        archivebox manage createsuperuser')
-        print()
+    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
+        print()
+        # print('[yellow][!] No admin accounts exist, you must create one to be able to log in to the Admin UI![/yellow]')
+        print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
+        print('    [green]archivebox manage createsuperuser[/green]')
+        print()
 
+    host = '127.0.0.1'
+    port = '8000'
+
+    try:
+        host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
+        if ':' in host_and_port:
+            host, port = host_and_port.split(':')
+        else:
+            if '.' in host_and_port:
+                host = host_and_port
+            else:
+                port = host_and_port
+    except IndexError:
+        pass
+
+    print('[green][+] Starting ArchiveBox webserver...[/green]')
+    print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+    print(f'    [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
+    print('    > Writing ArchiveBox error log to ./logs/errors.log')
+
     if SHELL_CONFIG.DEBUG:
         if not reload:
             runserver_args.append('--noreload')  # '--insecure'
         call_command("runserver", *runserver_args)
     else:
-        host = '127.0.0.1'
-        port = '8000'
-
-        try:
-            host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
-            if ':' in host_and_port:
-                host, port = host_and_port.split(':')
-            else:
-                if '.' in host_and_port:
-                    host = host_and_port
-                else:
-                    port = host_and_port
-        except IndexError:
-            pass
-
-        print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
-
         from queues.supervisor_util import start_server_workers
 
         print()
         start_server_workers(host=host, port=port, daemonize=False)
         print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")

View file

@@ -5,16 +5,24 @@ import sys
 from pathlib import Path
 
 from rich import print
+from rich.panel import Panel
 
-# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE
+# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE to anything other than builtin python libraries
 # this file is imported by archivebox/__init__.py
 # and any imports here will be imported by EVERYTHING else
 # so this file should only be used for pure python checks
 # that don't need to import other parts of ArchiveBox
+# if a check needs to import other parts of ArchiveBox,
+# the imports should be done inside the check function
+# and you should make sure if you need to import any django stuff
+# that the check is called after django.setup() has been called
 
 def check_data_folder() -> None:
     from archivebox import DATA_DIR, ARCHIVE_DIR
+    from archivebox.config import CONSTANTS
+    from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir
 
     archive_dir_exists = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()
     if not archive_dir_exists:
@@ -30,8 +38,21 @@ def check_data_folder() -> None:
         raise SystemExit(2)
 
+    # Create data dir subdirs
+    create_and_chown_dir(CONSTANTS.SOURCES_DIR)
+    create_and_chown_dir(CONSTANTS.PERSONAS_DIR / 'Default')
+    create_and_chown_dir(CONSTANTS.LOGS_DIR)
+    # create_and_chown_dir(CONSTANTS.CACHE_DIR)
+
+    # Create /tmp and /lib dirs if they don't exist
+    get_or_create_working_tmp_dir(autofix=True, quiet=False)
+    get_or_create_working_lib_dir(autofix=True, quiet=False)
+
+    # Check data dir permissions, /tmp, and /lib permissions
+    check_data_dir_permissions()
 
 def check_migrations():
-    from archivebox import DATA_DIR, CONSTANTS
+    from archivebox import DATA_DIR
     from ..index.sql import list_migrations
 
     pending_migrations = [name for status, name in list_migrations() if not status]
@@ -45,13 +66,6 @@ def check_migrations():
         print('    archivebox init', file=sys.stderr)
         raise SystemExit(3)
 
-    CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
-    CONSTANTS.LOGS_DIR.mkdir(exist_ok=True)
-    # CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
-    (CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
-    (CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
-
 
 def check_io_encoding():
     PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
@@ -128,3 +142,98 @@ def check_data_dir_permissions():
     STDERR.print('        [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
     STDERR.print('        [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
     STDERR.print('        [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
+
+    from archivebox.config.common import STORAGE_CONFIG
+
+    # Check /tmp dir permissions
+    check_tmp_dir(STORAGE_CONFIG.TMP_DIR, throw=False, must_exist=True)
+
+    # Check /lib dir permissions
+    check_lib_dir(STORAGE_CONFIG.LIB_DIR, throw=False, must_exist=True)
+
+def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
+    from archivebox.config.paths import assert_dir_can_contain_unix_sockets, dir_is_writable, get_or_create_working_tmp_dir
+    from archivebox.misc.logging import STDERR
+    from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.logging_util import pretty_path
+
+    tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR
+    socket_file = tmp_dir.absolute().resolve() / "supervisord.sock"
+
+    if not must_exist and not os.path.isdir(tmp_dir):
+        # just check that its viable based on its length (because dir may not exist yet, we cant check if its writable)
+        return len(f'file://{socket_file}') <= 96
+
+    tmp_is_valid = False
+    try:
+        tmp_is_valid = dir_is_writable(tmp_dir)
+        tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
+        assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'
+        assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
+        return True
+    except Exception as e:
+        if not quiet:
+            STDERR.print()
+            ERROR_TEXT = '\n'.join((
+                '',
+                f'[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]',
+                f'    [yellow]{e}[/yellow]',
+                '',
+                '[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.',
+                '  - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).',
+                f'  - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
+                '  - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.',
+                '  - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]',
+                '',
+                '[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:',
+                f'      [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or "/tmp/archivebox"}[/green]',
+                '',
+            ))
+            STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured TMP_DIR[/red]', subtitle='Background workers may fail to start until fixed.'))
+            STDERR.print()
+        if throw:
+            raise OSError(f'TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!') from e
+    return False
+
+def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True):
+    from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
+    from archivebox.misc.logging import STDERR
+    from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.logging_util import pretty_path
+
+    lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
+
+    if not must_exist and not os.path.isdir(lib_dir):
+        return True
+
+    lib_is_valid = False
+    try:
+        lib_is_valid = dir_is_writable(lib_dir)
+        assert lib_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}'
+        return True
+    except Exception as e:
+        if not quiet:
+            STDERR.print()
+            ERROR_TEXT = '\n'.join((
+                '',
+                f'[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]',
+                f'    [yellow]{e}[/yellow]',
+                '',
+                '[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.',
+                f'  - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
+                '  - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).',
+                '  - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]',
+                '',
+                '[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:',
+                f'      [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or "/usr/local/share/archivebox"}[/green]',
+                '',
+            ))
+            STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured LIB_DIR[/red]', subtitle='[yellow]Dependencies may not auto-install properly until fixed.[/yellow]'))
+            STDERR.print()
+        if throw:
+            raise OSError(f'LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.') from e
+    return False

View file

@@ -49,7 +49,7 @@ if __name__ == '__main__':
     prnt('[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!')
     prnt('    [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]')
-    prnt('          [link=https://docs.archivebox.io/en/latest/modules.html]https://docs.archivebox.io/en/latest/modules.html[/link]')
+    prnt('          [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]')
 
     prnt()
     prnt('    :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]')
     prnt('        add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink]     [grey53]# add ? after anything to get help[/]')

View file

@@ -82,10 +82,10 @@ class JSONSchemaWithLambdas(GenerateJsonSchema):
         if isinstance(default, Callable):
             return '{{lambda ' + inspect.getsource(default).split('=lambda ')[-1].strip()[:-1] + '}}'
         return to_jsonable_python(
-                default,
-                timedelta_mode=config.ser_json_timedelta,
-                bytes_mode=config.ser_json_bytes,
-                serialize_unknown=True
+            default,
+            timedelta_mode=config.ser_json_timedelta,
+            bytes_mode=config.ser_json_bytes,
+            serialize_unknown=True
         )
 
 # for computed_field properties render them like this instead:

View file

@@ -104,7 +104,10 @@ class ChromeBinary(BaseBinary):
     }
 
     @staticmethod
-    def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
+    def symlink_to_lib(binary, bin_dir=None) -> None:
+        from archivebox.config.common import STORAGE_CONFIG
+        bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
+
         if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
             return

View file

@ -3,8 +3,6 @@ __package__ = 'plugins_pkg.npm'
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from pydantic import model_validator
from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName
from archivebox.config import DATA_DIR, CONSTANTS from archivebox.config import DATA_DIR, CONSTANTS
@ -14,7 +12,7 @@ from abx.archivebox.base_binary import BaseBinProvider
OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin' OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin' NEW_NODE_BIN_PATH = CONSTANTS.DEFAULT_LIB_DIR / 'npm' / 'node_modules' / '.bin'
class SystemNpmBinProvider(NpmProvider, BaseBinProvider): class SystemNpmBinProvider(NpmProvider, BaseBinProvider):
@ -27,12 +25,16 @@ class LibNpmBinProvider(NpmProvider, BaseBinProvider):
name: BinProviderName = "lib_npm" name: BinProviderName = "lib_npm"
PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}' PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR npm_prefix: Optional[Path] = CONSTANTS.DEFAULT_LIB_DIR / 'npm'
@model_validator(mode='after') def setup(self) -> None:
def validate_path(self): # update paths from config if they aren't the default
assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent from archivebox.config.common import STORAGE_CONFIG
return self if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR:
self.npm_prefix = STORAGE_CONFIG.LIB_DIR / 'npm'
self.PATH = f'{STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules" / ".bin"}:{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
super().setup()
SYS_NPM_BINPROVIDER = SystemNpmBinProvider() SYS_NPM_BINPROVIDER = SystemNpmBinProvider()

View file

@ -49,7 +49,15 @@ class LibPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "lib_pip" name: BinProviderName = "lib_pip"
INSTALLER_BIN: BinName = "pip" INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv' pip_venv: Optional[Path] = CONSTANTS.DEFAULT_LIB_DIR / 'pip' / 'venv'
def setup(self) -> None:
# update paths from config if they aren't the default
from archivebox.config.common import STORAGE_CONFIG
if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR:
self.pip_venv = STORAGE_CONFIG.LIB_DIR / 'pip' / 'venv'
super().setup()
SYS_PIP_BINPROVIDER = SystemPipBinProvider() SYS_PIP_BINPROVIDER = SystemPipBinProvider()
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider() PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
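
The npm and pip providers (and the playwright/puppeteer providers below) all gain the same `setup()` hook: keep a safe import-time default, then lazily swap in `STORAGE_CONFIG.LIB_DIR` if the user changed it. A sketch of the shared pattern, with a hypothetical `current_lib_dir()` helper standing in for the real config lookup:

import os
from pathlib import Path

DEFAULT_LIB_DIR = Path('/usr/local/share/abx')   # assumed default, like CONSTANTS.DEFAULT_LIB_DIR

def current_lib_dir() -> Path:
    # hypothetical helper: in ArchiveBox this would be STORAGE_CONFIG.LIB_DIR
    return Path(os.environ.get('LIB_DIR', str(DEFAULT_LIB_DIR)))

class LibProvider:
    pip_venv: Path = DEFAULT_LIB_DIR / 'pip' / 'venv'   # safe import-time default

    def setup(self) -> None:
        lib_dir = current_lib_dir()
        if lib_dir != DEFAULT_LIB_DIR:                  # only override when config differs
            self.pip_venv = lib_dir / 'pip' / 'venv'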

View file

@ -35,7 +35,7 @@ class PlaywrightBinProvider(BaseBinProvider):
name: BinProviderName = "playwright" name: BinProviderName = "playwright"
INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name
PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}" PATH: PATHStr = f"{CONSTANTS.DEFAULT_LIB_DIR / 'bin'}:{DEFAULT_ENV_PATH}"
playwright_browsers_dir: Path = ( playwright_browsers_dir: Path = (
MACOS_PLAYWRIGHT_CACHE_DIR.expanduser() MACOS_PLAYWRIGHT_CACHE_DIR.expanduser()
@ -56,6 +56,11 @@ class PlaywrightBinProvider(BaseBinProvider):
return PLAYWRIGHT_BINARY.load().abspath return PLAYWRIGHT_BINARY.load().abspath
def setup(self) -> None: def setup(self) -> None:
# update paths from config if they aren't the default
from archivebox.config.common import STORAGE_CONFIG
if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR:
self.PATH = f"{STORAGE_CONFIG.LIB_DIR / 'bin'}:{DEFAULT_ENV_PATH}"
assert SYS_PIP_BINPROVIDER.INSTALLER_BIN_ABSPATH, "Pip bin provider not initialized" assert SYS_PIP_BINPROVIDER.INSTALLER_BIN_ABSPATH, "Pip bin provider not initialized"
if self.playwright_browsers_dir: if self.playwright_browsers_dir:

View file

@ -23,19 +23,16 @@ from abx.archivebox.base_binary import BaseBinProvider
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER
LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
class PuppeteerBinProvider(BaseBinProvider): class PuppeteerBinProvider(BaseBinProvider):
name: BinProviderName = "puppeteer" name: BinProviderName = "puppeteer"
INSTALLER_BIN: BinName = "npx" INSTALLER_BIN: BinName = "npx"
PATH: PATHStr = str(CONSTANTS.LIB_BIN_DIR) PATH: PATHStr = str(CONSTANTS.DEFAULT_LIB_DIR / 'bin')
euid: Optional[int] = ARCHIVEBOX_USER euid: Optional[int] = ARCHIVEBOX_USER
puppeteer_browsers_dir: Path = LIB_DIR_BROWSERS puppeteer_browsers_dir: Path = CONSTANTS.DEFAULT_LIB_DIR / 'browsers'
puppeteer_install_args: List[str] = ['--yes', "@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)] puppeteer_install_args: List[str] = ['--yes', "@puppeteer/browsers", "install"]
packages_handler: BinProviderOverrides = Field(default={ packages_handler: BinProviderOverrides = Field(default={
"chrome": lambda: "chrome": lambda:
@ -45,6 +42,11 @@ class PuppeteerBinProvider(BaseBinProvider):
_browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {} _browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {}
def setup(self) -> None: def setup(self) -> None:
# update paths from config
from archivebox.config.common import STORAGE_CONFIG
self.puppeteer_browsers_dir = STORAGE_CONFIG.LIB_DIR / 'browsers'
self.PATH = str(STORAGE_CONFIG.LIB_DIR / 'bin')
assert SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH, "NPM bin provider not initialized" assert SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH, "NPM bin provider not initialized"
if self.puppeteer_browsers_dir: if self.puppeteer_browsers_dir:
@ -90,7 +92,7 @@ class PuppeteerBinProvider(BaseBinProvider):
# print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}') # print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}')
install_args = [*self.puppeteer_install_args] install_args = [*self.puppeteer_install_args, "--path", str(self.puppeteer_browsers_dir)]
proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=[*install_args, *packages]) proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=[*install_args, *packages])
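
Dropping `--path` from the class-level `puppeteer_install_args` and re-appending it inside the install step means the flag always reflects whatever `puppeteer_browsers_dir` was set to during `setup()`, rather than the import-time default. A small sketch of the idea (names are illustrative):

from pathlib import Path

BASE_INSTALL_ARGS = ['--yes', '@puppeteer/browsers', 'install']

def build_install_args(browsers_dir: Path) -> list[str]:
    # --path is appended per call, after setup() may have relocated browsers_dir
    return [*BASE_INSTALL_ARGS, '--path', str(browsers_dir)]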

View file

@ -1,40 +0,0 @@
import tempfile
from pathlib import Path
from functools import cache
from archivebox.config import CONSTANTS
from archivebox.config.paths import get_collection_id
DATA_DIR = CONSTANTS.DATA_DIR
LOGS_DIR = CONSTANTS.LOGS_DIR
TMP_DIR = CONSTANTS.TMP_DIR
SUPERVISORD_CONFIG_FILE = TMP_DIR / "supervisord.conf"
PID_FILE = TMP_DIR / "supervisord.pid"
SOCK_FILE = TMP_DIR / "supervisord.sock"
LOG_FILE = TMP_DIR / "supervisord.log"
WORKERS_DIR = TMP_DIR / "workers"
@cache
def get_sock_file():
"""Get the path to the supervisord socket file, symlinking to a shorter path if needed due to unix path length limits"""
TMP_DIR.mkdir(parents=True, exist_ok=True)
if len(f'file://{SOCK_FILE.absolute().resolve()}') > 98:
# socket absolute paths cannot be longer than 104 bytes on macos, and 108 bytes on linux
# symlink it to a shorter path and use that instead
# place the actual socket file in a shorter tmp dir
# /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox_supervisord_3d1e544e.sock
shorter_sock_file = Path(tempfile.gettempdir()) / f"archivebox_supervisord_{get_collection_id()}.sock"
# symlink ./data/tmp/<collection_id>/supervisord.sock -> /var/folders/qy/abc234235/T/archivebox_supervisord_3d1e544e.sock
# for convenience/consistency
symlink = SOCK_FILE
symlink.unlink(missing_ok=True)
symlink.symlink_to(shorter_sock_file)
assert len(f'file://{shorter_sock_file}') <= 98, f'Failed to create supervisord SOCK_FILE, system tmp dir location is too long {shorter_sock_file} (unix only allows 108 characters for socket paths)'
return shorter_sock_file
return SOCK_FILE
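
The deleted helper existed because AF_UNIX socket paths have a hard kernel limit on `sun_path` (roughly 104 bytes on macOS, 108 on Linux); binding to a longer path fails with `OSError`. The old code dodged this by placing the real socket in the short system tmp dir and leaving a symlink at the long data-dir path. A sketch of the length check it performed (limits assumed, check your platform headers):

import tempfile
from pathlib import Path

MAX_SUN_PATH = 104   # assumed macOS limit; Linux allows ~108 bytes

def pick_sock_path(preferred: Path, collection_id: str) -> Path:
    if len(str(preferred.resolve()).encode()) < MAX_SUN_PATH:
        return preferred
    # fall back to a short system tmp path, e.g. /tmp/abx_<id>.sock
    return Path(tempfile.gettempdir()) / f'abx_{collection_id}.sock'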

View file

@ -1,23 +1,39 @@
__package__ = 'archivebox.queues' __package__ = 'archivebox.queues'
import sys
import time import time
import signal import signal
import psutil import psutil
import shutil import shutil
import subprocess import subprocess
from typing import Dict, cast, Iterator
from pathlib import Path from pathlib import Path
from functools import cache
from rich import print from rich import print
from typing import Dict, cast
from supervisor.xmlrpc import SupervisorTransport from supervisor.xmlrpc import SupervisorTransport
from xmlrpc.client import ServerProxy from xmlrpc.client import ServerProxy
from archivebox.config import CONSTANTS
from archivebox.config.paths import get_or_create_working_tmp_dir
from archivebox.config.permissions import ARCHIVEBOX_USER from archivebox.config.permissions import ARCHIVEBOX_USER
from archivebox.misc.logging import STDERR
from archivebox.logging_util import pretty_path
from .settings import SUPERVISORD_CONFIG_FILE, DATA_DIR, PID_FILE, get_sock_file, LOG_FILE, WORKERS_DIR, TMP_DIR, LOGS_DIR LOG_FILE_NAME = "supervisord.log"
CONFIG_FILE_NAME = "supervisord.conf"
PID_FILE_NAME = "supervisord.pid"
WORKERS_DIR_NAME = "workers"
from typing import Iterator @cache
def get_sock_file():
"""Get the path to the supervisord socket file, symlinking to a shorter path if needed due to unix path length limits"""
TMP_DIR = get_or_create_working_tmp_dir(autofix=True, quiet=False)
assert TMP_DIR, "Failed to find or create a writable TMP_DIR!"
socket_file = TMP_DIR / "supervisord.sock"
return socket_file
def follow(file, sleep_sec=0.1) -> Iterator[str]: def follow(file, sleep_sec=0.1) -> Iterator[str]:
""" Yield each line from a file as they are written. """ Yield each line from a file as they are written.
@ -35,24 +51,30 @@ def follow(file, sleep_sec=0.1) -> Iterator[str]:
def create_supervisord_config(): def create_supervisord_config():
SOCK_FILE = get_sock_file()
WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME
PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
LOG_FILE = CONSTANTS.LOGS_DIR / LOG_FILE_NAME
config_content = f""" config_content = f"""
[supervisord] [supervisord]
nodaemon = true nodaemon = true
environment = IS_SUPERVISORD_PARENT="true" environment = IS_SUPERVISORD_PARENT="true"
pidfile = {TMP_DIR}/{PID_FILE.name} pidfile = {PID_FILE}
logfile = {LOGS_DIR}/{LOG_FILE.name} logfile = {LOG_FILE}
childlogdir = {LOGS_DIR} childlogdir = {CONSTANTS.LOGS_DIR}
directory = {DATA_DIR} directory = {CONSTANTS.DATA_DIR}
strip_ansi = true strip_ansi = true
nocleanup = true nocleanup = true
user = {ARCHIVEBOX_USER} user = {ARCHIVEBOX_USER}
[unix_http_server] [unix_http_server]
file = {get_sock_file()} file = {SOCK_FILE}
chmod = 0700 chmod = 0700
[supervisorctl] [supervisorctl]
serverurl = unix://{get_sock_file()} serverurl = unix://{SOCK_FILE}
[rpcinterface:supervisor] [rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
@ -61,9 +83,14 @@ supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
files = {WORKERS_DIR}/*.conf files = {WORKERS_DIR}/*.conf
""" """
SUPERVISORD_CONFIG_FILE.write_text(config_content) CONFIG_FILE.write_text(config_content)
Path.mkdir(WORKERS_DIR, exist_ok=True)
(WORKERS_DIR / 'initial_startup.conf').write_text('') # hides error about "no files found to include" when supervisord starts
def create_worker_config(daemon): def create_worker_config(daemon):
SOCK_FILE = get_sock_file()
WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
Path.mkdir(WORKERS_DIR, exist_ok=True) Path.mkdir(WORKERS_DIR, exist_ok=True)
name = daemon['name'] name = daemon['name']
@ -80,13 +107,14 @@ def create_worker_config(daemon):
def get_existing_supervisord_process(): def get_existing_supervisord_process():
SOCK_FILE = get_sock_file()
try: try:
transport = SupervisorTransport(None, None, f"unix://{get_sock_file()}") transport = SupervisorTransport(None, None, f"unix://{SOCK_FILE}")
server = ServerProxy("http://localhost", transport=transport) server = ServerProxy("http://localhost", transport=transport)
current_state = cast(Dict[str, int | str], server.supervisor.getState()) current_state = cast(Dict[str, int | str], server.supervisor.getState())
if current_state["statename"] == "RUNNING": if current_state["statename"] == "RUNNING":
pid = server.supervisor.getPID() pid = server.supervisor.getPID()
print(f"[🦸‍♂️] Supervisord connected (pid={pid}) via unix://{str(get_sock_file()).replace(str(DATA_DIR), '.')}.") print(f"[🦸‍♂️] Supervisord connected (pid={pid}) via unix://{pretty_path(SOCK_FILE)}.")
return server.supervisor return server.supervisor
except FileNotFoundError: except FileNotFoundError:
return None return None
@ -95,58 +123,83 @@ def get_existing_supervisord_process():
return None return None
def stop_existing_supervisord_process(): def stop_existing_supervisord_process():
SOCK_FILE = get_sock_file()
PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
try: try:
pid = int(PID_FILE.read_text()) try:
except FileNotFoundError: pid = int(PID_FILE.read_text())
return except (FileNotFoundError, ValueError):
except ValueError: return
PID_FILE.unlink()
return
try: try:
print(f"[🦸‍♂️] Stopping supervisord process (pid={pid})...") print(f"[🦸‍♂️] Stopping supervisord process (pid={pid})...")
proc = psutil.Process(pid) proc = psutil.Process(pid)
proc.terminate() proc.terminate()
proc.wait() proc.wait()
except Exception: except (Exception, BrokenPipeError, IOError):
pass pass
try: finally:
PID_FILE.unlink() try:
except FileNotFoundError: # clear PID file and socket file
pass PID_FILE.unlink(missing_ok=True)
get_sock_file().unlink(missing_ok=True)
except Exception:
pass
def start_new_supervisord_process(daemonize=False): def start_new_supervisord_process(daemonize=False):
SOCK_FILE = get_sock_file()
WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
LOG_FILE = CONSTANTS.LOGS_DIR / LOG_FILE_NAME
CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME
PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
print(f"[🦸‍♂️] Supervisord starting{' in background' if daemonize else ''}...") print(f"[🦸‍♂️] Supervisord starting{' in background' if daemonize else ''}...")
# Create a config file in the current working directory pretty_log_path = pretty_path(LOG_FILE)
print(f" > Writing supervisord logs to: {pretty_log_path}")
print(f" > Writing task worker logs to: {pretty_log_path.replace('supervisord.log', 'worker_*.log')}")
print(f' > Using supervisord config file: {pretty_path(CONFIG_FILE)}')
print(f" > Using supervisord UNIX socket: {pretty_path(SOCK_FILE)}")
print()
# clear out existing stale state files # clear out existing stale state files
shutil.rmtree(WORKERS_DIR, ignore_errors=True) shutil.rmtree(WORKERS_DIR, ignore_errors=True)
PID_FILE.unlink(missing_ok=True) PID_FILE.unlink(missing_ok=True)
get_sock_file().unlink(missing_ok=True) get_sock_file().unlink(missing_ok=True)
SUPERVISORD_CONFIG_FILE.unlink(missing_ok=True) CONFIG_FILE.unlink(missing_ok=True)
# create the supervisord config file
create_supervisord_config() create_supervisord_config()
# Start supervisord # Start supervisord
# panel = Panel(f"Starting supervisord with config: {SUPERVISORD_CONFIG_FILE}")
# with Live(panel, refresh_per_second=1) as live:
subprocess.Popen( subprocess.Popen(
f"supervisord --configuration={SUPERVISORD_CONFIG_FILE}", f"supervisord --configuration={CONFIG_FILE}",
stdin=None, stdin=None,
shell=True, shell=True,
start_new_session=daemonize, start_new_session=daemonize,
) )
def exit_signal_handler(signum, frame): def exit_signal_handler(signum, frame):
if signum != 13: if signum == 2:
print(f"\n[🦸‍♂️] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...") STDERR.print("\n[🛑] Got Ctrl+C. Terminating child processes...")
elif signum != 13:
STDERR.print(f"\n[🦸‍♂️] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...")
stop_existing_supervisord_process() stop_existing_supervisord_process()
raise SystemExit(0) raise SystemExit(0)
# Monitor for termination signals and cleanup child processes # Monitor for termination signals and cleanup child processes
if not daemonize: if not daemonize:
signal.signal(signal.SIGINT, exit_signal_handler) try:
signal.signal(signal.SIGHUP, exit_signal_handler) signal.signal(signal.SIGINT, exit_signal_handler)
signal.signal(signal.SIGPIPE, exit_signal_handler) signal.signal(signal.SIGHUP, exit_signal_handler)
signal.signal(signal.SIGTERM, exit_signal_handler) signal.signal(signal.SIGPIPE, exit_signal_handler)
signal.signal(signal.SIGTERM, exit_signal_handler)
except Exception:
# signal handlers only work in main thread
pass
# otherwise supervisord will continue in background even if the parent process ends (aka daemon mode) # otherwise supervisord will continue in background even if the parent process ends (aka daemon mode)
time.sleep(2) time.sleep(2)
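
Wrapping the `signal.signal()` calls in try/except is needed because CPython only allows signal handler registration from the main thread; anywhere else it raises `ValueError`. An explicit version of the same guard:

import signal
import threading

def try_register(handler) -> bool:
    # signal.signal() raises ValueError when called off the main thread
    if threading.current_thread() is not threading.main_thread():
        return False
    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)
    return True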
@ -154,14 +207,32 @@ def start_new_supervisord_process(daemonize=False):
return get_existing_supervisord_process() return get_existing_supervisord_process()
def get_or_create_supervisord_process(daemonize=False): def get_or_create_supervisord_process(daemonize=False):
SOCK_FILE = get_sock_file()
WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
supervisor = get_existing_supervisord_process() supervisor = get_existing_supervisord_process()
if supervisor is None: if supervisor is None:
stop_existing_supervisord_process() stop_existing_supervisord_process()
supervisor = start_new_supervisord_process(daemonize=daemonize) supervisor = start_new_supervisord_process(daemonize=daemonize)
time.sleep(0.5) time.sleep(0.5)
# wait up to 5s in case supervisord is slow to start
if not supervisor:
for _ in range(10):
if supervisor is not None:
print()
break
sys.stdout.write('.')
sys.stdout.flush()
time.sleep(0.5)
supervisor = get_existing_supervisord_process()
else:
print()
assert supervisor, "Failed to start supervisord or connect to it!" assert supervisor, "Failed to start supervisord or connect to it!"
supervisor.getPID() # make sure it doesn't throw an exception supervisor.getPID() # make sure it doesn't throw an exception
(WORKERS_DIR / 'initial_startup.conf').unlink(missing_ok=True)
return supervisor return supervisor
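
Once `get_or_create_supervisord_process()` returns, the result is a live XML-RPC proxy, so standard supervisor RPC methods work on it. A hypothetical usage example:

supervisor = get_or_create_supervisord_process(daemonize=False)
for proc in supervisor.getAllProcessInfo():   # standard supervisor XML-RPC method
    print(proc['name'], proc['statename'])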
@ -242,9 +313,9 @@ def tail_worker_logs(log_path: str):
for line in follow(f): for line in follow(f):
if '://' in line: if '://' in line:
live.console.print(f"Working on: {line.strip()}") live.console.print(f"Working on: {line.strip()}")
table.add_row("123124234", line.strip()) # table.add_row("123124234", line.strip())
except KeyboardInterrupt: except (KeyboardInterrupt, BrokenPipeError, IOError):
print("\n[🛑] Got Ctrl+C, stopping gracefully...") STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
except SystemExit: except SystemExit:
pass pass
@ -321,12 +392,12 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
if not daemonize: if not daemonize:
try: try:
watch_worker(supervisor, "worker_daphne") watch_worker(supervisor, "worker_daphne")
except KeyboardInterrupt: except (KeyboardInterrupt, BrokenPipeError, IOError):
print("\n[🛑] Got Ctrl+C, stopping gracefully...") STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
except SystemExit: except SystemExit:
pass pass
except BaseException as e: except BaseException as e:
print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...") STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
raise raise
finally: finally:
stop_worker(supervisor, "worker_daphne") stop_worker(supervisor, "worker_daphne")
@ -350,12 +421,12 @@ def start_cli_workers(watch=False):
if watch: if watch:
try: try:
watch_worker(supervisor, "worker_system_tasks") watch_worker(supervisor, "worker_system_tasks")
except KeyboardInterrupt: except (KeyboardInterrupt, BrokenPipeError, IOError):
print("\n[🛑] Got Ctrl+C, stopping gracefully...") STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
except SystemExit: except SystemExit:
pass pass
except BaseException as e: except BaseException as e:
print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...") STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
raise raise
finally: finally:
stop_worker(supervisor, "worker_system_tasks") stop_worker(supervisor, "worker_system_tasks")