mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-21 19:53:06 +00:00
fix LIB_DIR and TMP_DIR loading when primary option isn't available
This commit is contained in:
parent deb116eed4
commit a211461ffc
21 changed files with 712 additions and 303 deletions
@@ -37,7 +37,8 @@ class BaseBinary(Binary):
     @staticmethod
     def symlink_to_lib(binary, bin_dir=None) -> None:
-        bin_dir = bin_dir or CONSTANTS.LIB_BIN_DIR
+        from archivebox.config.common import STORAGE_CONFIG
+        bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
 
         if not (binary.abspath and os.access(binary.abspath, os.R_OK)):
             return
 
@@ -55,9 +56,10 @@ class BaseBinary(Binary):
     @validate_call
     def load(self, fresh=False, **kwargs) -> Self:
+        from archivebox.config.common import STORAGE_CONFIG
         if fresh:
             binary = super().load(**kwargs)
-            self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
+            self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
         else:
             # get cached binary from db
             try:
 
@@ -72,16 +74,18 @@ class BaseBinary(Binary):
     @validate_call
     def install(self, **kwargs) -> Self:
+        from archivebox.config.common import STORAGE_CONFIG
         binary = super().install(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
         return binary
 
     @validate_call
     def load_or_install(self, fresh=False, **kwargs) -> Self:
+        from archivebox.config.common import STORAGE_CONFIG
         try:
             binary = self.load(fresh=fresh)
             if binary and binary.version:
-                self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
+                self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
                 return binary
         except Exception:
             pass
 
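The recurring pattern in these BaseBinary hunks defers the LIB_DIR lookup to call time (importing STORAGE_CONFIG inside each method) instead of freezing CONSTANTS.LIB_BIN_DIR at import time, so a LIB_DIR that gets autofixed later during startup is still honored. A minimal standalone sketch of the idea, using a hypothetical RUNTIME_CONFIG store rather than the real ArchiveBox classes:

# minimal sketch: resolve config-dependent paths lazily, at call time
from pathlib import Path

RUNTIME_CONFIG = {'LIB_DIR': Path('/usr/local/share/archivebox')}  # hypothetical store, may change at runtime

def symlink_to_lib(abspath: Path, bin_dir: Path | None = None) -> None:
    # read the *current* value on every call instead of capturing it at import time
    bin_dir = bin_dir or RUNTIME_CONFIG['LIB_DIR'] / 'bin'
    bin_dir.mkdir(parents=True, exist_ok=True)
    symlink = bin_dir / abspath.name
    symlink.unlink(missing_ok=True)
    symlink.symlink_to(abspath)
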
@@ -1,8 +1,13 @@
 __package__ = 'abx.archivebox'
 
 import os
+import sys
+import re
 from pathlib import Path
-from typing import Type, Tuple, Callable, ClassVar
+from typing import Type, Tuple, Callable, ClassVar, Dict, Any
 
+import toml
+from rich import print
+
 from benedict import benedict
 from pydantic import model_validator, TypeAdapter
 
@@ -18,6 +23,11 @@ from . import toml_util
 PACKAGE_DIR = Path(__file__).resolve().parent.parent
 DATA_DIR = Path(os.getcwd()).resolve()
 
+ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
+ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
+
+AUTOFIXES_HEADER = "[AUTOFIXES]"
+AUTOFIXES_SUBHEADER = "# The following config was added automatically to fix problems detected at startup:"
+
 
 class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
 
@@ -53,7 +63,7 @@ class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
         super(TomlConfigSettingsSource, self).__init__(settings_cls, self.toml_data)
 
 
-class ArchiveBoxBaseConfig(BaseSettings):
+class BaseConfigSet(BaseSettings):
     """
     This is the base class for an ArchiveBox ConfigSet.
    It handles loading values from schema defaults, ArchiveBox.conf TOML config, and environment variables.
 
@@ -83,7 +93,7 @@ class ArchiveBoxBaseConfig(BaseSettings):
         loc_by_alias=False,
         validate_assignment=True,
         validate_return=True,
-        revalidate_instances="always",
+        revalidate_instances="subclass-instances",
     )
 
     load_from_defaults: ClassVar[bool] = True
 
@@ -101,9 +111,6 @@ class ArchiveBoxBaseConfig(BaseSettings):
     ) -> Tuple[PydanticBaseSettingsSource, ...]:
         """Defines the config precedence order: Schema defaults -> ArchiveBox.conf (TOML) -> Environment variables"""
 
-        ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
-        ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
-
         # import ipdb; ipdb.set_trace()
 
         precedence_order = {}
 
@@ -152,27 +159,36 @@ class ArchiveBoxBaseConfig(BaseSettings):
     def fill_defaults(self):
         """Populate any unset values using function provided as their default"""
 
-        for key, field in self.model_fields.items():
-            value = getattr(self, key)
-
-            if isinstance(value, Callable):
-                # if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
-                if func_takes_args_or_kwargs(value):
-                    # assemble dict of existing field values to pass to default factory functions
-                    config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
-                    computed_default = field.default(config_so_far)
-                else:
-                    # otherwise it's a pure function with no args, just call it
-                    computed_default = field.default()
-
-                # coerce/check to make sure default factory return value matches type annotation
-                TypeAdapter(field.annotation).validate_python(computed_default)
-
-                # set generated default value as final validated value
-                setattr(self, key, computed_default)
+        for key in self.model_fields.keys():
+            if isinstance(getattr(self, key), Callable):
+                if self.load_from_defaults:
+                    computed_default = self.get_default_value(key)
+                    # set generated default value as final validated value
+                    setattr(self, key, computed_default)
         return self
 
-    def update_in_place(self, warn=True, **kwargs):
+    def get_default_value(self, key):
+        """Get the default value for a given config key"""
+        field = self.model_fields[key]
+        value = getattr(self, key)
+
+        if isinstance(value, Callable):
+            # if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
+            if func_takes_args_or_kwargs(value):
+                # assemble dict of existing field values to pass to default factory functions
+                config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
+                computed_default = field.default(config_so_far)
+            else:
+                # otherwise it's a pure function with no args, just call it
+                computed_default = field.default()
+
+            # coerce/check to make sure default factory return value matches type annotation
+            TypeAdapter(field.annotation).validate_python(computed_default)
+
+            return computed_default
+        return value
+
+    def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
         """
         Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
         Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
 
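For context, the default-factory mechanics that fill_defaults()/get_default_value() implement look roughly like this (a simplified sketch of the dispatch, not the actual pydantic wiring):

# simplified sketch of config defaults computed from other config values
import inspect

def func_takes_args_or_kwargs(func) -> bool:
    # True if the callable accepts at least one positional arg or **kwargs
    params = inspect.signature(func).parameters.values()
    return any(p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD, p.VAR_KEYWORD) for p in params)

config_so_far = {'USE_COLOR': True}
default_factories = {
    'SHOW_PROGRESS': lambda c: c['USE_COLOR'],   # derived from other config
    'TERM_WIDTH': lambda: 80,                    # pure function, no args
}

for key, factory in default_factories.items():
    value = factory(config_so_far) if func_takes_args_or_kwargs(factory) else factory()
    config_so_far[key] = value

print(config_so_far)   # {'USE_COLOR': True, 'SHOW_PROGRESS': True, 'TERM_WIDTH': 80}
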
@@ -180,25 +196,106 @@ class ArchiveBoxBaseConfig(BaseSettings):
         Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but sonic_client pip library is not installed so we cannot use it.
         SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.
         """
+        from archivebox.misc.toml_util import CustomTOMLEncoder
+
         if warn:
-            print('[!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:')
+            fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
+            print(f'[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)
 
         # set the new values in the environment
         for key, value in kwargs.items():
             os.environ[key] = str(value)
+            original_value = getattr(self, key)
+            if warn:
+                print(f'    {key}={original_value} -> {value}')
+
+        # if persist=True, write config changes to data/ArchiveBox.conf [AUTOFIXES] section
+        try:
+            if persist and ARCHIVEBOX_CONFIG_FILE.is_file():
+                autofixes_to_add = benedict(kwargs).to_toml(encoder=CustomTOMLEncoder())
+
+                existing_config = ARCHIVEBOX_CONFIG_FILE.read_text().split(AUTOFIXES_HEADER, 1)[0].strip()
+                if AUTOFIXES_HEADER in existing_config:
+                    existing_autofixes = existing_config.split(AUTOFIXES_HEADER, 1)[-1].strip().replace(AUTOFIXES_SUBHEADER, '').replace(AUTOFIXES_HEADER, '').strip()
+                else:
+                    existing_autofixes = ''
+
+                new_config = '\n'.join(line for line in [
+                    existing_config,
+                    '\n' + AUTOFIXES_HEADER,
+                    AUTOFIXES_SUBHEADER,
+                    existing_autofixes,
+                    autofixes_to_add,
+                ] if line.strip()).strip() + '\n'
+                ARCHIVEBOX_CONFIG_FILE.write_text(new_config)
+        except Exception:
+            pass
 
         self.__init__()
+        if warn:
+            print(file=sys.stderr)
 
         return self
 
-    def as_legacy_config_schema(self):
+    @property
+    def toml_section_header(self):
+        """Convert the class name to a TOML section header e.g. ShellConfig -> SHELL_CONFIG"""
+        class_name = self.__class__.__name__
+        return re.sub('([A-Z]+)', r'_\1', class_name).upper().strip('_')
+
+
+    def from_defaults(self) -> Dict[str, Any]:
+        """Get the dictionary of {key: value} config loaded from the default values"""
+        class OnlyDefaultsConfig(self.__class__):
+            load_from_defaults = True
+            load_from_configfile = False
+            load_from_environment = False
+        return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
+
+    def from_configfile(self) -> Dict[str, Any]:
+        """Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf"""
+        class OnlyConfigFileConfig(self.__class__):
+            load_from_defaults = False
+            load_from_configfile = True
+            load_from_environment = False
+        return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
+
+    def from_environment(self) -> Dict[str, Any]:
+        """Get the dictionary of {key: value} config loaded from the environment variables"""
+        class OnlyEnvironmentConfig(self.__class__):
+            load_from_defaults = False
+            load_from_configfile = False
+            load_from_environment = True
+        return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
+
+    def from_computed(self) -> Dict[str, Any]:
+        """Get the dictionary of {key: value} config loaded from the computed fields"""
+        return benedict(self.model_dump(include=set(self.model_computed_fields.keys())))
+
+
+    def to_toml_dict(self, defaults=False) -> Dict[str, Any]:
+        """Get the current config as a TOML-ready dict"""
+        config_dict = {}
+        for key, value in benedict(self).items():
+            if defaults or value != self.get_default_value(key):
+                config_dict[key] = value
+
+        return benedict({self.toml_section_header: config_dict})
+
+    def to_toml_str(self, defaults=False) -> str:
+        """Get the current config as a TOML string"""
+        from archivebox.misc.toml_util import CustomTOMLEncoder
+
+        toml_dict = self.to_toml_dict(defaults=defaults)
+        if not toml_dict[self.toml_section_header]:
+            # if the section is empty, don't write it
+            toml_dict.pop(self.toml_section_header)
+
+        return toml.dumps(toml_dict, encoder=CustomTOMLEncoder())
+
+    def as_legacy_config_schema(self) -> Dict[str, Any]:
         # shim for backwards compatibility with old config schema style
         model_values = self.model_dump()
         return benedict({
             key: {'type': field.annotation, 'default': model_values[key]}
             for key, field in self.model_fields.items()
         })
-
-
-class BaseConfigSet(ArchiveBoxBaseConfig):     # type: ignore[type-arg]
-    pass
 
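The toml_section_header property maps a ConfigSet class name to its TOML section name; the regex inserts an underscore before each run of capitals before uppercasing. A quick standalone check of that expression:

import re

def toml_section_header(class_name: str) -> str:
    # insert '_' before each run of capitals, then uppercase: ShellConfig -> SHELL_CONFIG
    return re.sub('([A-Z]+)', r'_\1', class_name).upper().strip('_')

assert toml_section_header('ShellConfig') == 'SHELL_CONFIG'
assert toml_section_header('StorageConfig') == 'STORAGE_CONFIG'
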
@@ -18,13 +18,7 @@ def get_PLUGIN() -> Dict[str, Dict[str, Any]]:
 def get_CONFIG() -> Dict[str, BaseConfigSet]:
     return {}
 
-@hookspec
-def get_BINARIES() -> Dict[str, BaseBinary]:
-    return {}
-
-@hookspec
-def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
-    return {}
-
 @hookspec
 def get_EXTRACTORS() -> Dict[str, BaseExtractor]:
 
@@ -45,3 +39,14 @@ def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]:
 # @hookspec
 # def get_QUEUES():
 #     return {}
+
+
+##############################################################
+# provided by abx.pydantic_pkgr.hookspec:
+# @hookspec
+# def get_BINARIES() -> Dict[str, BaseBinary]:
+#     return {}
+
+# @hookspec
+# def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
+#     return {}
 
@@ -131,9 +131,12 @@ def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
 
 
 
-def get_scope_config(defaults=settings.CONFIG, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
+def get_scope_config(defaults: benedict | None = None, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
     """Get all the relevant config for the given scope, in correct precedence order"""
 
+    from django.conf import settings
+    default_config: benedict = defaults or settings.CONFIG
+
     snapshot = snapshot or (archiveresult and archiveresult.snapshot)
     crawl = crawl or (snapshot and snapshot.crawl)
     seed = seed or (crawl and crawl.seed)
 
@@ -147,7 +150,7 @@ def get_scope_config(defaults=settings.CONFIG, persona=None, seed=None, crawl=No
     extra_config = extra_config or {}
 
     return {
-        **defaults,              # defaults / config file / environment variables
+        **default_config,        # defaults / config file / environment variables
         **persona_config,        # lowest precedence
         **seed_config,
         **crawl_config,
 
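The precedence order here falls out of plain dict unpacking: keys unpacked later overwrite keys unpacked earlier. A tiny illustration with made-up keys:

# later ** unpacking wins, so the most specific scope overrides the defaults
default_config = {'TIMEOUT': 60, 'SAVE_WGET': True}
persona_config = {'TIMEOUT': 120}
crawl_config   = {'SAVE_WGET': False}

merged = {**default_config, **persona_config, **crawl_config}
print(merged)   # {'TIMEOUT': 120, 'SAVE_WGET': False}
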
@@ -164,13 +164,18 @@ def run_subcommand(subcommand: str,
     # print('DATA_DIR is', DATA_DIR)
     # print('pwd is', os.getcwd())
 
-    cmd_requires_db = subcommand in archive_cmds
+    cmd_requires_db = (subcommand in archive_cmds)
     init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
 
     check_db = cmd_requires_db and not init_pending
 
     setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)
 
-    if subcommand in archive_cmds:
+    for ignore_pattern in ('help', '-h', '--help', 'version', '--version'):
+        if ignore_pattern in sys.argv[:4]:
+            cmd_requires_db = False
+            break
+
+    if cmd_requires_db:
         check_migrations()
 
@@ -1,18 +1,18 @@
 __package__ = 'archivebox.config'
 
 import os
 import sys
-import shutil
+import tempfile
 from typing import Dict, Optional
 from pathlib import Path
 
 from rich import print
-from pydantic import Field, field_validator, computed_field
+from pydantic import Field, field_validator, computed_field, model_validator
 from django.utils.crypto import get_random_string
 
 from abx.archivebox.base_configset import BaseConfigSet
 
 
 from .constants import CONSTANTS
 from .version import get_COMMIT_HASH, get_BUILD_TIME
 from .permissions import IN_DOCKER
 
@@ -35,7 +35,6 @@ class ShellConfig(BaseConfigSet):
     VERSIONS_AVAILABLE: bool = False    # .check_for_update.get_versions_available_on_github(c)},
     CAN_UPGRADE: bool = False           # .check_for_update.can_upgrade(c)},
 
-
     @computed_field
     @property
     def TERM_WIDTH(self) -> int:
 
@@ -57,6 +56,16 @@ SHELL_CONFIG = ShellConfig()
 
 
 class StorageConfig(BaseConfigSet):
+    # TMP_DIR must be a local, fast, readable/writable dir by archivebox user,
+    # must be a short path due to unix path length restrictions for socket files (<100 chars)
+    # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets
+    TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)
+
+    # LIB_DIR must be a local, fast, readable/writable dir by archivebox user,
+    # must be able to contain executable binaries (up to 5GB size)
+    # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
+    LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)
+
     OUTPUT_PERMISSIONS: str = Field(default='644')
     RESTRICT_FILE_NAMES: str = Field(default='windows')
     ENFORCE_ATOMIC_WRITES: bool = Field(default=True)
 
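Because StorageConfig is a pydantic-settings ConfigSet, TMP_DIR and LIB_DIR can now be overridden like any other config key via environment variables or ArchiveBox.conf. A simplified standalone sketch of that behavior (not the real class, which also layers in the TOML config file):

# minimal sketch using pydantic-settings directly
import os
from pathlib import Path
from pydantic_settings import BaseSettings

class StorageConfig(BaseSettings):
    TMP_DIR: Path = Path('./data/tmp')
    LIB_DIR: Path = Path('./data/lib')

os.environ['TMP_DIR'] = '/tmp/archivebox'
print(StorageConfig().TMP_DIR)   # /tmp/archivebox (env var beats the schema default)
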
@@ -1,6 +1,5 @@
 __package__ = 'archivebox.config'
 
-
 import os
 import re
 import sys
 
@@ -97,14 +96,10 @@ class ConstantsDict(Mapping):
 
     # Runtime dirs
     TMP_DIR_NAME: str = 'tmp'
-    TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID
+    DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID      # ./data/tmp/abc3244323
 
     LIB_DIR_NAME: str = 'lib'
-    LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE
-    LIB_PIP_DIR: Path = LIB_DIR / 'pip'
-    LIB_NPM_DIR: Path = LIB_DIR / 'npm'
-    LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
-    LIB_BIN_DIR: Path = LIB_DIR / 'bin'
-    BIN_DIR: Path = LIB_BIN_DIR
+    DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE    # ./data/lib/arm64-linux-docker
 
     # Config constants
     TIMEZONE: str = 'UTC'
 
@@ -198,91 +193,7 @@ class ConstantsDict(Mapping):
         ".archivebox_id",
         "Dockerfile",
     ))
 
-    CODE_LOCATIONS = benedict({
-        'PACKAGE_DIR': {
-            'path': (PACKAGE_DIR).resolve(),
-            'enabled': True,
-            'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK),    # executable
-        },
-        'TEMPLATES_DIR': {
-            'path': TEMPLATES_DIR.resolve(),
-            'enabled': True,
-            'is_valid': os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK),    # read + list
-        },
-        'CUSTOM_TEMPLATES_DIR': {
-            'path': CUSTOM_TEMPLATES_DIR.resolve(),
-            'enabled': os.path.isdir(CUSTOM_TEMPLATES_DIR),
-            'is_valid': os.path.isdir(CUSTOM_TEMPLATES_DIR) and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK),    # read
-        },
-        'USER_PLUGINS_DIR': {
-            'path': USER_PLUGINS_DIR.resolve(),
-            'enabled': os.path.isdir(USER_PLUGINS_DIR),
-            'is_valid': os.path.isdir(USER_PLUGINS_DIR) and os.access(USER_PLUGINS_DIR, os.R_OK),    # read
-        },
-        'LIB_DIR': {
-            'path': LIB_DIR.resolve(),
-            'enabled': True,
-            'is_valid': os.path.isdir(LIB_DIR) and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.W_OK),    # read + write
-        },
-    })
-
-    DATA_LOCATIONS = benedict({
-        "DATA_DIR": {
-            "path": DATA_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
-            "is_mount": os.path.ismount(DATA_DIR.resolve()),
-        },
-        "CONFIG_FILE": {
-            "path": CONFIG_FILE.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isfile(CONFIG_FILE) and os.access(CONFIG_FILE, os.R_OK) and os.access(CONFIG_FILE, os.W_OK),
-        },
-        "SQL_INDEX": {
-            "path": DATABASE_FILE.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
-            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
-        },
-        "QUEUE_DATABASE": {
-            "path": QUEUE_DATABASE_FILE.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isfile(QUEUE_DATABASE_FILE) and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
-            "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
-        },
-        "ARCHIVE_DIR": {
-            "path": ARCHIVE_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
-            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
-        },
-        "SOURCES_DIR": {
-            "path": SOURCES_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(SOURCES_DIR) and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK),
-        },
-        "PERSONAS_DIR": {
-            "path": PERSONAS_DIR.resolve(),
-            "enabled": os.path.isdir(PERSONAS_DIR),
-            "is_valid": os.path.isdir(PERSONAS_DIR) and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK),    # read + write
-        },
-        "LOGS_DIR": {
-            "path": LOGS_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(LOGS_DIR) and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK),    # read + write
-        },
-        'TMP_DIR': {
-            'path': TMP_DIR.resolve(),
-            'enabled': True,
-            'is_valid': os.path.isdir(TMP_DIR) and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.W_OK),    # read + write
-        },
-        # "CACHE_DIR": {
-        #     "path": CACHE_DIR.resolve(),
-        #     "enabled": True,
-        #     "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK),    # read + write
-        # },
-    })
-
     @classmethod
     def __getitem__(cls, key: str):
 
@@ -258,6 +258,9 @@ def load_config_val(key: str,
     elif type is list or type is dict:
         return json.loads(val)
 
+    elif type is Path:
+        return Path(val)
+
     raise Exception('Config values can only be str, bool, int, or json')
 
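The new Path branch means config values annotated as Path get coerced from their string form like the other supported types. A simplified, hypothetical illustration of the dispatch (same parameter naming as the source, including the `type` shadowing):

from pathlib import Path
import json

def load_config_val(val: str, type=str):
    # simplified version of the type dispatch in load_config_val()
    if type is bool:
        return val.lower() in ('true', '1', 'yes')
    elif type is int:
        return int(val)
    elif type in (list, dict):
        return json.loads(val)
    elif type is Path:
        return Path(val)
    return val

assert load_config_val('/tmp/archivebox', type=Path) == Path('/tmp/archivebox')
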
@@ -574,7 +577,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
         with SudoPermission(uid=0):
             # running as root is a special case where it's ok to be a bit slower
             # make sure data dir is always owned by the correct user
-            os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"')
+            os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
+            os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
 
         bump_startup_progress_bar()
 
@@ -1,12 +1,16 @@
 __package__ = 'archivebox.config'
 
 import os
+import socket
 import hashlib
+import tempfile
 import platform
 from pathlib import Path
 from functools import cache
 from datetime import datetime
 
+from benedict import benedict
+
 from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
 
 #############################################################################################
 
@@ -88,7 +92,7 @@ def get_machine_type() -> str:
     return LIB_DIR_SCOPE
 
 
-def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool:
+def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
     """Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)"""
     current_uid, current_gid = os.geteuid(), os.getegid()
     uid, gid = uid or current_uid, gid or current_gid
 
@@ -101,10 +105,197 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No
         test_file.unlink()
         return True
     except (IOError, OSError, PermissionError):
         pass
+
+    if chown:
+        # try fixing it using sudo permissions
+        with SudoPermission(uid=uid, fallback=fallback):
+            os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
+        return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
     return False
 
+def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
+    """Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)"""
+    from archivebox.logging_util import pretty_path
+
+    try:
+        socket_path = str(dir_path / '.test_socket.sock')
+        s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        try:
+            os.remove(socket_path)
+        except OSError:
+            pass
+        s.bind(socket_path)
+        s.close()
+        try:
+            os.remove(socket_path)
+        except OSError:
+            pass
+    except Exception as e:
+        raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e
+
+    return True
+
+
+def create_and_chown_dir(dir_path: Path) -> None:
+    with SudoPermission(uid=0, fallback=True):
+        dir_path.mkdir(parents=True, exist_ok=True)
+        os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}" 2>/dev/null')
+        os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &')
+
+@cache
+def get_or_create_working_tmp_dir(autofix=True, quiet=False):
+    from archivebox import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.misc.checks import check_tmp_dir
+
+    # try a few potential directories in order of preference
+    CANDIDATES = [
+        STORAGE_CONFIG.TMP_DIR,                                                # <user-specified>
+        CONSTANTS.DEFAULT_TMP_DIR,                                             # ./data/tmp/<machine_id>
+        Path('/var/run/archivebox') / get_collection_id(),                     # /var/run/archivebox/abc5d8512
+        Path('/tmp') / 'archivebox' / get_collection_id(),                     # /tmp/archivebox/abc5d8512
+        Path('~/.tmp/archivebox').expanduser() / get_collection_id(),          # ~/.tmp/archivebox/abc5d8512
+        Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(),      # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
+        Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4],  # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
+        Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4],         # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
+    ]
+    for candidate in CANDIDATES:
+        try:
+            create_and_chown_dir(candidate)
+        except Exception:
+            pass
+        if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
+            if autofix and STORAGE_CONFIG.TMP_DIR != candidate:
+                STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet)
+            return candidate
+
+    if not quiet:
+        raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!')
+
+@cache
+def get_or_create_working_lib_dir(autofix=True, quiet=False):
+    from archivebox import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.misc.checks import check_lib_dir
+
+    # try a few potential directories in order of preference
+    CANDIDATES = [
+        STORAGE_CONFIG.LIB_DIR,                                                # <user-specified>
+        CONSTANTS.DEFAULT_LIB_DIR,                                             # ./data/lib/arm64-linux-docker
+        Path('/usr/local/share/archivebox') / get_collection_id(),             # /usr/local/share/archivebox/abc5
+        *([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []),  # /opt/homebrew/share/archivebox/abc5
+        Path('~/.local/share/archivebox').expanduser() / get_collection_id(),  # ~/.local/share/archivebox/abc5
+    ]
+
+    for candidate in CANDIDATES:
+        try:
+            create_and_chown_dir(candidate)
+        except Exception:
+            pass
+        if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True):
+            if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
+                STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet)
+            return candidate
+
+    if not quiet:
+        raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!')
+
+
+@cache
+def get_data_locations():
+    from archivebox.config import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+
+    return benedict({
+        "DATA_DIR": {
+            "path": DATA_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
+            "is_mount": os.path.ismount(DATA_DIR.resolve()),
+        },
+        "CONFIG_FILE": {
+            "path": CONSTANTS.CONFIG_FILE.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
+        },
+        "SQL_INDEX": {
+            "path": DATABASE_FILE.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
+            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
+        },
+        "QUEUE_DATABASE": {
+            "path": CONSTANTS.QUEUE_DATABASE_FILE,
+            "enabled": True,
+            "is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK),
+            "is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE),
+        },
+        "ARCHIVE_DIR": {
+            "path": ARCHIVE_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
+            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
+        },
+        "SOURCES_DIR": {
+            "path": CONSTANTS.SOURCES_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
+        },
+        "PERSONAS_DIR": {
+            "path": CONSTANTS.PERSONAS_DIR.resolve(),
+            "enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
+            "is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK),    # read + write
+        },
+        "LOGS_DIR": {
+            "path": CONSTANTS.LOGS_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK),    # read + write
+        },
+        'TMP_DIR': {
+            'path': STORAGE_CONFIG.TMP_DIR.resolve(),
+            'enabled': True,
+            'is_valid': os.path.isdir(STORAGE_CONFIG.TMP_DIR) and os.access(STORAGE_CONFIG.TMP_DIR, os.R_OK) and os.access(STORAGE_CONFIG.TMP_DIR, os.W_OK),    # read + write
+        },
+        # "CACHE_DIR": {
+        #     "path": CACHE_DIR.resolve(),
+        #     "enabled": True,
+        #     "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK),    # read + write
+        # },
+    })
+
+@cache
+def get_code_locations():
+    from archivebox.config import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+
+    return benedict({
+        'PACKAGE_DIR': {
+            'path': (PACKAGE_DIR).resolve(),
+            'enabled': True,
+            'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK),    # executable
+        },
+        'TEMPLATES_DIR': {
+            'path': CONSTANTS.TEMPLATES_DIR.resolve(),
+            'enabled': True,
+            'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK),    # read + list
+        },
+        'CUSTOM_TEMPLATES_DIR': {
+            'path': CONSTANTS.CUSTOM_TEMPLATES_DIR.resolve(),
+            'enabled': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR),
+            'is_valid': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK),    # read
+        },
+        'USER_PLUGINS_DIR': {
+            'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
+            'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
+            'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK),    # read
+        },
+        'LIB_DIR': {
+            'path': STORAGE_CONFIG.LIB_DIR.resolve(),
+            'enabled': True,
+            'is_valid': os.path.isdir(STORAGE_CONFIG.LIB_DIR) and os.access(STORAGE_CONFIG.LIB_DIR, os.R_OK) and os.access(STORAGE_CONFIG.LIB_DIR, os.W_OK),    # read + write
+        },
+    })
+
+
 
 # @cache
 
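The core pattern in get_or_create_working_tmp_dir()/get_or_create_working_lib_dir() is "first writable candidate wins, then persist the winner back into config". Distilled into a standalone sketch with a stand-in writability probe:

import tempfile
from pathlib import Path

def first_writable(candidates: list[Path]) -> Path | None:
    # return the first candidate dir we can create and write to
    for candidate in candidates:
        try:
            candidate.mkdir(parents=True, exist_ok=True)
            probe = candidate / '.write_test'
            probe.write_text('ok')
            probe.unlink()
            return candidate
        except OSError:
            continue  # fall through to the next-best location
    return None

candidates = [Path('/var/run/archivebox'), Path(tempfile.gettempdir()) / 'archivebox']
print(first_writable(candidates))
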
@@ -510,7 +510,7 @@ def log_removal_finished(all_links: int, to_remove: int):
 ### Helpers
 
 @enforce_types
-def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR) -> str:
+def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: bool=True) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     pwd = str(Path(pwd))  # .resolve()
     path = str(path)
 
@@ -520,7 +520,10 @@ def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR) -> str:
 
     # replace long absolute paths with ./ relative ones to save on terminal output width
     if path.startswith(pwd) and (pwd != '/') and path != pwd:
-        path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
+        if color:
+            path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
+        else:
+            path = path.replace(pwd, '.', 1)
 
     # quote paths containing spaces
     if ' ' in path:
 
@@ -189,6 +189,7 @@ def version(quiet: bool=False,
     if quiet or '--version' in sys.argv:
         return
 
+    from rich.panel import Panel
     from rich.console import Console
     console = Console()
     prnt = console.print
 
@@ -197,6 +198,7 @@ def version(quiet: bool=False,
     from django.conf import settings
     from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
     from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
+    from archivebox.config.paths import get_data_locations, get_code_locations
 
     from abx.archivebox.base_binary import BaseBinary, apt, brew, env
 
@@ -221,7 +223,7 @@ def version(quiet: bool=False,
         f'PLATFORM={platform.platform()}',
         f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
     )
-    OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
+    OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
     DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
     prnt(
         f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
 
@@ -240,6 +242,21 @@ def version(quiet: bool=False,
         #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
     )
     prnt()
 
+    if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
+        PANEL_TEXT = '\n'.join((
+            # '',
+            # f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
+            '',
+            '[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
+            '      [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
+            '',
+            '      [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
+            '',
+        ))
+        prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
+        prnt()
+        return
+
     prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
     failures = []
 
@@ -299,13 +316,13 @@ def version(quiet: bool=False,
 
     prnt()
     prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
-    for name, path in CONSTANTS.CODE_LOCATIONS.items():
+    for name, path in get_code_locations().items():
         prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
 
     prnt()
     if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
         prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
-        for name, path in CONSTANTS.DATA_LOCATIONS.items():
+        for name, path in get_data_locations().items():
             prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
 
     from archivebox.misc.checks import check_data_dir_permissions
 
@@ -395,7 +412,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
         print(f'    √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
 
     # from django.contrib.auth.models import User
-    # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exists():
+    # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
     #     print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
     #     call_command("createsuperuser", interactive=True)
 
@@ -486,9 +503,13 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
             html_index.rename(f"{index_name}.html")
 
     CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
-    CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)
-    CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
+    CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
+    CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
+
+    from archivebox.config.common import STORAGE_CONFIG
+    STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
+    STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
 
     if install:
         run_subcommand('install', pwd=out_dir)
 
@@ -1115,7 +1136,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
     from django.contrib.auth import get_user_model
     User = get_user_model()
 
-    if not User.objects.filter(is_superuser=True).exists():
+    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
         stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
         stderr('    archivebox manage createsuperuser')
         # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
 
@@ -1399,46 +1420,43 @@ def server(runserver_args: Optional[List[str]]=None,
     from django.core.management import call_command
     from django.contrib.auth.models import User
 
+    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
+        print()
+        # print('[yellow][!] No admin accounts exist, you must create one to be able to log in to the Admin UI![/yellow]')
+        print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
+        print('    [green]archivebox manage createsuperuser[/green]')
+        print()
+
-    print('[green][+] Starting ArchiveBox webserver...[/green]')
-    print('    > Logging errors to ./logs/errors.log')
-    if not User.objects.filter(is_superuser=True).exists():
-        print('[yellow][!] No admin users exist yet, you will not be able to edit links in the UI.[/yellow]')
-        print()
-        print('    [violet]Hint:[/violet] To create an admin user, run:')
-        print('        archivebox manage createsuperuser')
-        print()
+    host = '127.0.0.1'
+    port = '8000'
+
+    try:
+        host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
+        if ':' in host_and_port:
+            host, port = host_and_port.split(':')
+        else:
+            if '.' in host_and_port:
+                host = host_and_port
+            else:
+                port = host_and_port
+    except IndexError:
+        pass
+
+    print('[green][+] Starting ArchiveBox webserver...[/green]')
+    print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+    print(f'    [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
+    print('    > Writing ArchiveBox error log to ./logs/errors.log')
 
     if SHELL_CONFIG.DEBUG:
         if not reload:
             runserver_args.append('--noreload')  # '--insecure'
         call_command("runserver", *runserver_args)
     else:
-        host = '127.0.0.1'
-        port = '8000'
-
-        try:
-            host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
-            if ':' in host_and_port:
-                host, port = host_and_port.split(':')
-            else:
-                if '.' in host_and_port:
-                    host = host_and_port
-                else:
-                    port = host_and_port
-        except IndexError:
-            pass
-
-        print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
-
         from queues.supervisor_util import start_server_workers
 
         print()
 
         start_server_workers(host=host, port=port, daemonize=False)
 
         print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
 
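The host/port parsing that got hoisted out of the else-branch uses a simple heuristic: the first runserver arg made up only of digits, dots, and colons wins; a colon means host:port, a dot alone means host, otherwise port. A quick standalone check of that logic:

def parse_host_port(runserver_args: list[str], host='127.0.0.1', port='8000'):
    # same heuristic as in server(): first arg of only digits/dots/colons wins
    try:
        host_and_port = [a for a in runserver_args if a.replace('.', '').replace(':', '').isdigit()][0]
        if ':' in host_and_port:
            host, port = host_and_port.split(':')
        elif '.' in host_and_port:
            host = host_and_port
        else:
            port = host_and_port
    except IndexError:
        pass  # no positional host/port arg given, keep defaults
    return host, port

assert parse_host_port(['0.0.0.0:8001']) == ('0.0.0.0', '8001')
assert parse_host_port(['9000']) == ('127.0.0.1', '9000')
assert parse_host_port(['--debug']) == ('127.0.0.1', '8000')
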
@@ -5,16 +5,24 @@ import sys
 from pathlib import Path
 
 from rich import print
+from rich.panel import Panel
 
-# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE
+# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE to anything other than builtin python libraries
 # this file is imported by archivebox/__init__.py
 # and any imports here will be imported by EVERYTHING else
 # so this file should only be used for pure python checks
 # that don't need to import other parts of ArchiveBox
 
+# if a check needs to import other parts of ArchiveBox,
+# the imports should be done inside the check function
+# and you should make sure if you need to import any django stuff
+# that the check is called after django.setup() has been called
+
 
 def check_data_folder() -> None:
     from archivebox import DATA_DIR, ARCHIVE_DIR
+    from archivebox.config import CONSTANTS
+    from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir
 
     archive_dir_exists = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()
     if not archive_dir_exists:
 
@@ -30,8 +38,21 @@ def check_data_folder() -> None:
         raise SystemExit(2)
 
+    # Create data dir subdirs
+    create_and_chown_dir(CONSTANTS.SOURCES_DIR)
+    create_and_chown_dir(CONSTANTS.PERSONAS_DIR / 'Default')
+    create_and_chown_dir(CONSTANTS.LOGS_DIR)
+    # create_and_chown_dir(CONSTANTS.CACHE_DIR)
+
+    # Create /tmp and /lib dirs if they don't exist
+    get_or_create_working_tmp_dir(autofix=True, quiet=False)
+    get_or_create_working_lib_dir(autofix=True, quiet=False)
+
+    # Check data dir permissions, /tmp, and /lib permissions
+    check_data_dir_permissions()
+
 
 def check_migrations():
-    from archivebox import DATA_DIR, CONSTANTS
+    from archivebox import DATA_DIR
     from ..index.sql import list_migrations
 
     pending_migrations = [name for status, name in list_migrations() if not status]
 
@@ -45,13 +66,6 @@ def check_migrations():
         print('    archivebox init', file=sys.stderr)
         raise SystemExit(3)
 
-    CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
-    CONSTANTS.LOGS_DIR.mkdir(exist_ok=True)
-    # CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
-    (CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
-    (CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
-
-
 def check_io_encoding():
     PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
 
@@ -128,3 +142,98 @@ def check_data_dir_permissions():
     STDERR.print('     [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
     STDERR.print('     [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
     STDERR.print('     [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
 
+    from archivebox.config.common import STORAGE_CONFIG
+
+    # Check /tmp dir permissions
+    check_tmp_dir(STORAGE_CONFIG.TMP_DIR, throw=False, must_exist=True)
+
+    # Check /lib dir permissions
+    check_lib_dir(STORAGE_CONFIG.LIB_DIR, throw=False, must_exist=True)
+
+
+def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
+    from archivebox.config.paths import assert_dir_can_contain_unix_sockets, dir_is_writable, get_or_create_working_tmp_dir
+    from archivebox.misc.logging import STDERR
+    from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.logging_util import pretty_path
+
+    tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR
+    socket_file = tmp_dir.absolute().resolve() / "supervisord.sock"
+
+    if not must_exist and not os.path.isdir(tmp_dir):
+        # just check that its viable based on its length (because dir may not exist yet, we cant check if its writable)
+        return len(f'file://{socket_file}') <= 96
+
+    tmp_is_valid = False
+    try:
+        tmp_is_valid = dir_is_writable(tmp_dir)
+        tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
+        assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'
+        assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
+        return True
+    except Exception as e:
+        if not quiet:
+            STDERR.print()
+            ERROR_TEXT = '\n'.join((
+                '',
+                f'[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]',
+                f'    [yellow]{e}[/yellow]',
+                '',
+                '[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.',
+                '  - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).',
+                f'  - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
+                '  - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.',
+                '  - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]',
+                '',
+                '[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:',
+                f'    [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or "/tmp/archivebox"}[/green]',
+                '',
+            ))
+            STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured TMP_DIR[/red]', subtitle='Background workers may fail to start until fixed.'))
+            STDERR.print()
+        if throw:
+            raise OSError(f'TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!') from e
+    return False
+
+
+def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True):
+    from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
+    from archivebox.misc.logging import STDERR
+    from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.logging_util import pretty_path
+
+    lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
+
+    if not must_exist and not os.path.isdir(lib_dir):
+        return True
+
+    lib_is_valid = False
+    try:
+        lib_is_valid = dir_is_writable(lib_dir)
+        assert lib_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}'
+        return True
+    except Exception as e:
+        if not quiet:
+            STDERR.print()
+            ERROR_TEXT = '\n'.join((
+                '',
+                f'[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]',
+                f'    [yellow]{e}[/yellow]',
+                '',
+                '[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.',
+                f'  - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
+                '  - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).',
+                '  - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]',
+                '',
+                '[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:',
+                f'    [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or "/usr/local/share/archivebox"}[/green]',
+                '',
+            ))
+            STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured LIB_DIR[/red]', subtitle='[yellow]Dependencies may not auto-install properly until fixed.[/yellow]'))
+            STDERR.print()
+        if throw:
+            raise OSError(f'LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.') from e
+    return False
 
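The <=96 checks exist because AF_UNIX socket paths (sun_path) are limited to roughly 104 bytes on macOS and 108 on Linux, so a long TMP_DIR fails at bind() time. A standalone way to verify a directory against that limit, similar in spirit to assert_dir_can_contain_unix_sockets():

import os
import socket
from pathlib import Path

def can_hold_unix_socket(dir_path: Path) -> bool:
    # sun_path is ~104 bytes on macOS / ~108 on Linux, so overly long dirs fail at bind()
    sock_path = str(dir_path / '.probe.sock')
    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    try:
        s.bind(sock_path)
        return True
    except OSError:
        return False
    finally:
        s.close()
        try:
            os.remove(sock_path)
        except OSError:
            pass

print(can_hold_unix_socket(Path('/tmp')))                 # True on most systems
print(can_hold_unix_socket(Path('/tmp') / ('x' * 150)))   # False: path exceeds the sun_path limit
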
@@ -49,7 +49,7 @@ if __name__ == '__main__':
 
     prnt('[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!')
     prnt('    [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]')
-    prnt('          [link=https://docs.archivebox.io/en/latest/modules.html]https://docs.archivebox.io/en/latest/modules.html[/link]')
+    prnt('          [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]')
     prnt()
     prnt('    :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]')
     prnt('        add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink]    [grey53]# add ? after anything to get help[/]')
 
@@ -82,10 +82,10 @@ class JSONSchemaWithLambdas(GenerateJsonSchema):
         if isinstance(default, Callable):
             return '{{lambda ' + inspect.getsource(default).split('=lambda ')[-1].strip()[:-1] + '}}'
         return to_jsonable_python(
-                default,
-                timedelta_mode=config.ser_json_timedelta,
-                bytes_mode=config.ser_json_bytes,
-                serialize_unknown=True
+            default,
+            timedelta_mode=config.ser_json_timedelta,
+            bytes_mode=config.ser_json_bytes,
+            serialize_unknown=True
         )
 
     # for computed_field properties render them like this instead:
 
@@ -104,7 +104,10 @@ class ChromeBinary(BaseBinary):
     }
 
     @staticmethod
-    def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
+    def symlink_to_lib(binary, bin_dir=None) -> None:
+        from archivebox.config.common import STORAGE_CONFIG
+        bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
+
         if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
             return
 
@@ -3,8 +3,6 @@ __package__ = 'plugins_pkg.npm'
 from pathlib import Path
 from typing import Optional
 
-from pydantic import model_validator
-
 from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName
 
 from archivebox.config import DATA_DIR, CONSTANTS
 
@@ -14,7 +12,7 @@ from abx.archivebox.base_binary import BaseBinProvider
 
 
 OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
-NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
+NEW_NODE_BIN_PATH = CONSTANTS.DEFAULT_LIB_DIR / 'npm' / 'node_modules' / '.bin'
 
 
 class SystemNpmBinProvider(NpmProvider, BaseBinProvider):
 
@@ -27,12 +25,16 @@ class LibNpmBinProvider(NpmProvider, BaseBinProvider):
     name: BinProviderName = "lib_npm"
     PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
 
-    npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR
+    npm_prefix: Optional[Path] = CONSTANTS.DEFAULT_LIB_DIR / 'npm'
 
-    @model_validator(mode='after')
-    def validate_path(self):
-        assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
-        return self
+    def setup(self) -> None:
+        # update paths from config if they arent the default
+        from archivebox.config.common import STORAGE_CONFIG
+        if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR:
+            self.npm_prefix = STORAGE_CONFIG.LIB_DIR / 'npm'
+            self.PATH = f'{STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules" / ".bin"}:{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
+
+        super().setup()
 
 
 SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
 
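Each of the Lib*BinProvider classes in this commit follows the same setup() convention: only rewrite the provider's paths when the user's LIB_DIR differs from the compiled-in default, then defer to the parent setup. The generic shape of the pattern, schematic and with stand-in names rather than the real pydantic_pkgr API:

from pathlib import Path

DEFAULT_LIB_DIR = Path('./data/lib')

class BaseProvider:
    def setup(self) -> None:
        pass  # the real parent would create dirs, resolve binaries, etc.

class LibProvider(BaseProvider):
    lib_subdir = 'npm'  # or 'pip', 'browsers', 'bin', ...

    def __init__(self):
        self.prefix = DEFAULT_LIB_DIR / self.lib_subdir

    def setup(self, configured_lib_dir: Path = DEFAULT_LIB_DIR) -> None:
        # only override the defaults when the user actually changed LIB_DIR
        if configured_lib_dir != DEFAULT_LIB_DIR:
            self.prefix = configured_lib_dir / self.lib_subdir
        super().setup()

provider = LibProvider()
provider.setup(configured_lib_dir=Path('/usr/local/share/archivebox'))
print(provider.prefix)   # /usr/local/share/archivebox/npm
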
@@ -49,7 +49,15 @@ class LibPipBinProvider(PipProvider, BaseBinProvider):
     name: BinProviderName = "lib_pip"
     INSTALLER_BIN: BinName = "pip"
 
-    pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv'
+    pip_venv: Optional[Path] = CONSTANTS.DEFAULT_LIB_DIR / 'pip' / 'venv'
+
+    def setup(self) -> None:
+        # update paths from config if they arent the default
+        from archivebox.config.common import STORAGE_CONFIG
+        if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR:
+            self.pip_venv = STORAGE_CONFIG.LIB_DIR / 'pip' / 'venv'
+
+        super().setup()
 
 SYS_PIP_BINPROVIDER = SystemPipBinProvider()
 PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
 
@@ -35,7 +35,7 @@ class PlaywrightBinProvider(BaseBinProvider):
     name: BinProviderName = "playwright"
     INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name
 
-    PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
+    PATH: PATHStr = f"{CONSTANTS.DEFAULT_LIB_DIR / 'bin'}:{DEFAULT_ENV_PATH}"
 
     playwright_browsers_dir: Path = (
         MACOS_PLAYWRIGHT_CACHE_DIR.expanduser()
 
@@ -56,6 +56,11 @@ class PlaywrightBinProvider(BaseBinProvider):
         return PLAYWRIGHT_BINARY.load().abspath
 
     def setup(self) -> None:
+        # update paths from config if they arent the default
+        from archivebox.config.common import STORAGE_CONFIG
+        if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR:
+            self.PATH = f"{STORAGE_CONFIG.LIB_DIR / 'bin'}:{DEFAULT_ENV_PATH}"
+
         assert SYS_PIP_BINPROVIDER.INSTALLER_BIN_ABSPATH, "Pip bin provider not initialized"
 
         if self.playwright_browsers_dir:
 
@@ -23,19 +23,16 @@ from abx.archivebox.base_binary import BaseBinProvider
 from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER
 
 
-LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
-
-
 class PuppeteerBinProvider(BaseBinProvider):
     name: BinProviderName = "puppeteer"
     INSTALLER_BIN: BinName = "npx"
 
-    PATH: PATHStr = str(CONSTANTS.LIB_BIN_DIR)
+    PATH: PATHStr = str(CONSTANTS.DEFAULT_LIB_DIR / 'bin')
 
     euid: Optional[int] = ARCHIVEBOX_USER
 
-    puppeteer_browsers_dir: Path = LIB_DIR_BROWSERS
-    puppeteer_install_args: List[str] = ['--yes', "@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)]
+    puppeteer_browsers_dir: Path = CONSTANTS.DEFAULT_LIB_DIR / 'browsers'
+    puppeteer_install_args: List[str] = ['--yes', "@puppeteer/browsers", "install"]
 
     packages_handler: BinProviderOverrides = Field(default={
         "chrome": lambda:
 
@ -45,6 +42,11 @@ class PuppeteerBinProvider(BaseBinProvider):
|
|||
_browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {}
|
||||
|
||||
def setup(self) -> None:
|
||||
# update paths from config
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
self.puppeteer_browsers_dir = STORAGE_CONFIG.LIB_DIR / 'browsers'
|
||||
self.PATH = str(STORAGE_CONFIG.LIB_DIR / 'bin')
|
||||
|
||||
assert SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH, "NPM bin provider not initialized"
|
||||
|
||||
if self.puppeteer_browsers_dir:
|
||||
|
@ -90,7 +92,7 @@ class PuppeteerBinProvider(BaseBinProvider):
|
|||
|
||||
# print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}')
|
||||
|
||||
install_args = [*self.puppeteer_install_args]
|
||||
install_args = [*self.puppeteer_install_args, "--path", str(self.puppeteer_browsers_dir)]
|
||||
|
||||
proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=[*install_args, *packages])
|
||||
|
||||
|
|
|
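Dropping --path from the static defaults and re-appending it at install time means the browsers dir always tracks the live STORAGE_CONFIG. The resulting invocation is equivalent to something like the following (assumes npx is on PATH; the chrome package name comes from the packages_handler above):

import subprocess

browsers_dir = '/data/lib/browsers'                  # stand-in for STORAGE_CONFIG.LIB_DIR / 'browsers'
install_args = ['--yes', '@puppeteer/browsers', 'install', '--path', browsers_dir]

# shell equivalent: npx --yes @puppeteer/browsers install --path /data/lib/browsers chrome
proc = subprocess.run(['npx', *install_args, 'chrome'], capture_output=True, text=True)
print(proc.stdout or proc.stderr)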
@@ -1,40 +0,0 @@
-import tempfile
-from pathlib import Path
-from functools import cache
-
-from archivebox.config import CONSTANTS
-from archivebox.config.paths import get_collection_id
-
-DATA_DIR = CONSTANTS.DATA_DIR
-LOGS_DIR = CONSTANTS.LOGS_DIR
-TMP_DIR = CONSTANTS.TMP_DIR
-
-SUPERVISORD_CONFIG_FILE = TMP_DIR / "supervisord.conf"
-PID_FILE = TMP_DIR / "supervisord.pid"
-SOCK_FILE = TMP_DIR / "supervisord.sock"
-LOG_FILE = TMP_DIR / "supervisord.log"
-WORKERS_DIR = TMP_DIR / "workers"
-
-@cache
-def get_sock_file():
-    """Get the path to the supervisord socket file, symlinking to a shorter path if needed due to unix path length limits"""
-    TMP_DIR.mkdir(parents=True, exist_ok=True)
-
-    if len(f'file://{SOCK_FILE.absolute().resolve()}') > 98:
-        # socket absolute paths cannot be longer than 104 bytes on macos, and 108 bytes on linux
-        # symlink it to a shorter path and use that instead
-
-        # place the actual socket file in a shorter tmp dir
-        # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox_supervisord_3d1e544e.sock
-        shorter_sock_file = Path(tempfile.gettempdir()) / f"archivebox_supervisord_{get_collection_id()}.sock"
-
-        # symlink ./data/tmp/<collection_id>/supervisord.sock -> /var/folders/qy/abc234235/T/archivebox_supervisord_3d1e544e.sock
-        # for convenience/consistency
-        symlink = SOCK_FILE
-        symlink.unlink(missing_ok=True)
-        symlink.symlink_to(shorter_sock_file)
-
-        assert len(f'file://{shorter_sock_file}') <= 98, f'Failed to create supervisord SOCK_FILE, system tmp dir location is too long {shorter_sock_file} (unix only allows 108 characters for socket paths)'
-        return shorter_sock_file
-
-    return SOCK_FILE
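The 98-character budget this deleted helper enforced is not arbitrary: AF_UNIX socket paths live in a fixed-size sun_path buffer (104 bytes on macOS, 108 on Linux), and bind() fails outright for longer paths. A quick Unix-only demonstration:

import socket
import tempfile
from pathlib import Path

too_long = Path(tempfile.gettempdir()) / ('x' * 120) / 'supervisord.sock'

s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
try:
    s.bind(str(too_long))       # raises OSError: AF_UNIX path too long
except OSError as e:
    print('bind failed as expected:', e)
finally:
    s.close()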
@@ -1,23 +1,39 @@
 __package__ = 'archivebox.queues'
 
+import sys
 import time
 import signal
 import psutil
 import shutil
 import subprocess
 
-from typing import Dict, cast
+from typing import Dict, cast, Iterator
+from pathlib import Path
+from functools import cache
 
 from rich import print
 
 from supervisor.xmlrpc import SupervisorTransport
 from xmlrpc.client import ServerProxy
 
 from archivebox.config import CONSTANTS
+from archivebox.config.paths import get_or_create_working_tmp_dir
+from archivebox.config.permissions import ARCHIVEBOX_USER
+from archivebox.misc.logging import STDERR
+from archivebox.logging_util import pretty_path
 
-from .settings import SUPERVISORD_CONFIG_FILE, DATA_DIR, PID_FILE, get_sock_file, LOG_FILE, WORKERS_DIR, TMP_DIR, LOGS_DIR
-
-from typing import Iterator
+LOG_FILE_NAME = "supervisord.log"
+CONFIG_FILE_NAME = "supervisord.conf"
+PID_FILE_NAME = "supervisord.pid"
+WORKERS_DIR_NAME = "workers"
+
+@cache
+def get_sock_file():
+    """Get the path to the supervisord socket file, symlinking to a shorter path if needed due to unix path length limits"""
+    TMP_DIR = get_or_create_working_tmp_dir(autofix=True, quiet=False)
+    assert TMP_DIR, "Failed to find or create a writable TMP_DIR!"
+    socket_file = TMP_DIR / "supervisord.sock"
+
+    return socket_file
 
 
 def follow(file, sleep_sec=0.1) -> Iterator[str]:
     """ Yield each line from a file as they are written.
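Note the new get_sock_file() no longer symlinks anything itself; it delegates finding a short, writable location to get_or_create_working_tmp_dir() and asserts loudly on failure. The follow() generator whose docstring is truncated above is a tail -f loop; one plausible body matching its signature (a sketch, not the actual ArchiveBox implementation):

import time
from typing import IO, Iterator

def follow(file: IO[str], sleep_sec: float = 0.1) -> Iterator[str]:
    """Yield each line from a file as it is written (like tail -f)."""
    line = ''
    while True:
        chunk = file.readline()
        if not chunk:
            time.sleep(sleep_sec)    # nothing new yet, back off briefly
            continue
        line += chunk
        if line.endswith('\n'):      # only yield complete lines
            yield line
            line = ''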
@@ -35,24 +51,30 @@ def follow(file, sleep_sec=0.1) -> Iterator[str]:
 
 
 def create_supervisord_config():
+    SOCK_FILE = get_sock_file()
+    WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
+    CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME
+    PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
+    LOG_FILE = CONSTANTS.LOGS_DIR / LOG_FILE_NAME
+
     config_content = f"""
 [supervisord]
 nodaemon = true
 environment = IS_SUPERVISORD_PARENT="true"
-pidfile = {TMP_DIR}/{PID_FILE.name}
-logfile = {LOGS_DIR}/{LOG_FILE.name}
-childlogdir = {LOGS_DIR}
-directory = {DATA_DIR}
+pidfile = {PID_FILE}
+logfile = {LOG_FILE}
+childlogdir = {CONSTANTS.LOGS_DIR}
+directory = {CONSTANTS.DATA_DIR}
 strip_ansi = true
 nocleanup = true
 user = {ARCHIVEBOX_USER}
 
 [unix_http_server]
-file = {get_sock_file()}
+file = {SOCK_FILE}
 chmod = 0700
 
 [supervisorctl]
-serverurl = unix://{get_sock_file()}
+serverurl = unix://{SOCK_FILE}
 
 [rpcinterface:supervisor]
 supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
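Every interpolated path now derives from SOCK_FILE.parent (the per-collection tmp dir) or CONSTANTS, so relocating TMP_DIR moves the pidfile, socket, and worker configs together. The rendered file is plain INI, so a post-write sanity check is cheap (assumes create_supervisord_config() already ran; the path below is a stand-in for SOCK_FILE.parent / CONFIG_FILE_NAME):

from configparser import RawConfigParser

parser = RawConfigParser(strict=False)
parser.read('/tmp/abx-demo/supervisord.conf')        # stand-in path

# these two values must agree with what supervisorctl later connects to
print(parser.get('supervisord', 'pidfile'))
print(parser.get('unix_http_server', 'file'))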
@@ -61,9 +83,14 @@ supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
 files = {WORKERS_DIR}/*.conf
 
 """
-    SUPERVISORD_CONFIG_FILE.write_text(config_content)
+    CONFIG_FILE.write_text(config_content)
     Path.mkdir(WORKERS_DIR, exist_ok=True)
     (WORKERS_DIR / 'initial_startup.conf').write_text('')   # hides error about "no files found to include" when supervisord starts
 
 
 def create_worker_config(daemon):
+    SOCK_FILE = get_sock_file()
+    WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
+
+    Path.mkdir(WORKERS_DIR, exist_ok=True)
+
     name = daemon['name']
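create_worker_config() presumably renders one [program:...] fragment per daemon into WORKERS_DIR for the [include] section above to pick up; only the name key is visible in this hunk. A plausible fragment shape, using standard supervisord program options (the command and logfile fields are hypothetical, not what ArchiveBox actually writes):

def render_worker_config(daemon: dict) -> str:
    # minimal [program:x] block; supervisord accepts many more options
    name = daemon['name']
    return (
        f"[program:{name}]\n"
        f"command={daemon['command']}\n"
        "autostart=true\n"
        "autorestart=true\n"
        f"stdout_logfile=logs/worker_{name}.log\n"
        "redirect_stderr=true\n"
    )

print(render_worker_config({'name': 'worker_daphne', 'command': 'daphne core.asgi:application'}))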
@@ -80,13 +107,14 @@ def create_worker_config(daemon):
 
 
 def get_existing_supervisord_process():
+    SOCK_FILE = get_sock_file()
     try:
-        transport = SupervisorTransport(None, None, f"unix://{get_sock_file()}")
+        transport = SupervisorTransport(None, None, f"unix://{SOCK_FILE}")
         server = ServerProxy("http://localhost", transport=transport)
         current_state = cast(Dict[str, int | str], server.supervisor.getState())
         if current_state["statename"] == "RUNNING":
             pid = server.supervisor.getPID()
-            print(f"[🦸♂️] Supervisord connected (pid={pid}) via unix://{str(get_sock_file()).replace(str(DATA_DIR), '.')}.")
+            print(f"[🦸♂️] Supervisord connected (pid={pid}) via unix://{pretty_path(SOCK_FILE)}.")
             return server.supervisor
     except FileNotFoundError:
         return None
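The dummy http://localhost URL is how supervisor's own supervisorctl does it: ServerProxy needs a syntactically valid URL, but SupervisorTransport ignores it and routes the XML-RPC calls over the unix socket instead. Standalone usage against a running supervisord:

from xmlrpc.client import ServerProxy
from supervisor.xmlrpc import SupervisorTransport

sock_file = '/tmp/supervisord.sock'                  # stand-in for get_sock_file()

transport = SupervisorTransport(None, None, f'unix://{sock_file}')
server = ServerProxy('http://localhost', transport=transport)   # hostname is ignored

state = server.supervisor.getState()                 # {'statecode': 1, 'statename': 'RUNNING'} when healthy
print(state['statename'], server.supervisor.getPID())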
@@ -95,58 +123,83 @@ def get_existing_supervisord_process():
     return None
 
 
 def stop_existing_supervisord_process():
+    SOCK_FILE = get_sock_file()
+    PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
+
     try:
         pid = int(PID_FILE.read_text())
-    except FileNotFoundError:
-        return
-    except ValueError:
-        PID_FILE.unlink()
+    except (FileNotFoundError, ValueError):
         return
 
     try:
         print(f"[🦸♂️] Stopping supervisord process (pid={pid})...")
         proc = psutil.Process(pid)
         proc.terminate()
         proc.wait()
-    except Exception:
-        pass
-    try:
-        PID_FILE.unlink()
-    except FileNotFoundError:
+    except (Exception, BrokenPipeError, IOError):
         pass
+    finally:
+        try:
+            # clear PID file and socket file
+            PID_FILE.unlink(missing_ok=True)
+            get_sock_file().unlink(missing_ok=True)
+        except Exception:
+            pass
 
 
 def start_new_supervisord_process(daemonize=False):
+    SOCK_FILE = get_sock_file()
+    WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
+    LOG_FILE = CONSTANTS.LOGS_DIR / LOG_FILE_NAME
+    CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME
+    PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
+
     print(f"[🦸♂️] Supervisord starting{' in background' if daemonize else ''}...")
-    # Create a config file in the current working directory
+    pretty_log_path = pretty_path(LOG_FILE)
+    print(f"    > Writing supervisord logs to: {pretty_log_path}")
+    print(f"    > Writing task worker logs to: {pretty_log_path.replace('supervisord.log', 'worker_*.log')}")
+    print(f'    > Using supervisord config file: {pretty_path(CONFIG_FILE)}')
+    print(f"    > Using supervisord UNIX socket: {pretty_path(SOCK_FILE)}")
+    print()
 
     # clear out existing stale state files
     shutil.rmtree(WORKERS_DIR, ignore_errors=True)
     PID_FILE.unlink(missing_ok=True)
     get_sock_file().unlink(missing_ok=True)
-    SUPERVISORD_CONFIG_FILE.unlink(missing_ok=True)
+    CONFIG_FILE.unlink(missing_ok=True)
 
     # create the supervisord config file
     create_supervisord_config()
 
     # Start supervisord
-    # panel = Panel(f"Starting supervisord with config: {SUPERVISORD_CONFIG_FILE}")
-    # with Live(panel, refresh_per_second=1) as live:
-
     subprocess.Popen(
-        f"supervisord --configuration={SUPERVISORD_CONFIG_FILE}",
+        f"supervisord --configuration={CONFIG_FILE}",
         stdin=None,
         shell=True,
         start_new_session=daemonize,
     )
 
     def exit_signal_handler(signum, frame):
-        if signum != 13:
-            print(f"\n[🦸♂️] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...")
+        if signum == 2:
+            STDERR.print("\n[🛑] Got Ctrl+C. Terminating child processes...")
+        elif signum != 13:
+            STDERR.print(f"\n[🦸♂️] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...")
         stop_existing_supervisord_process()
         raise SystemExit(0)
 
     # Monitor for termination signals and cleanup child processes
     if not daemonize:
-        signal.signal(signal.SIGINT, exit_signal_handler)
-        signal.signal(signal.SIGHUP, exit_signal_handler)
-        signal.signal(signal.SIGPIPE, exit_signal_handler)
-        signal.signal(signal.SIGTERM, exit_signal_handler)
+        try:
+            signal.signal(signal.SIGINT, exit_signal_handler)
+            signal.signal(signal.SIGHUP, exit_signal_handler)
+            signal.signal(signal.SIGPIPE, exit_signal_handler)
+            signal.signal(signal.SIGTERM, exit_signal_handler)
+        except Exception:
+            # signal handlers only work in main thread
+            pass
     # otherwise supervisord will continue in the background even if the parent process ends (aka daemon mode)
 
     time.sleep(2)
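Folding the cleanup into a finally block guarantees stale PID and socket files are removed even when the terminate call fails, so a crashed supervisord can't wedge the next startup. The terminate-and-wait idiom generalizes with a timeout and SIGKILL escalation:

import psutil

def stop_process(pid: int, timeout: float = 5.0) -> None:
    """SIGTERM a process politely, escalating to SIGKILL after a timeout."""
    try:
        proc = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return                       # already gone, nothing to stop
    proc.terminate()                 # SIGTERM first
    try:
        proc.wait(timeout=timeout)
    except psutil.TimeoutExpired:
        proc.kill()                  # SIGKILL as a last resort
        proc.wait()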
@@ -154,14 +207,32 @@ def start_new_supervisord_process(daemonize=False):
     return get_existing_supervisord_process()
 
 
 def get_or_create_supervisord_process(daemonize=False):
+    SOCK_FILE = get_sock_file()
+    WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
+
     supervisor = get_existing_supervisord_process()
     if supervisor is None:
         stop_existing_supervisord_process()
         supervisor = start_new_supervisord_process(daemonize=daemonize)
         time.sleep(0.5)
 
+    # wait up to 5s in case supervisord is slow to start
+    if not supervisor:
+        for _ in range(10):
+            if supervisor is not None:
+                print()
+                break
+            sys.stdout.write('.')
+            sys.stdout.flush()
+            time.sleep(0.5)
+            supervisor = get_existing_supervisord_process()
+        else:
+            print()
+
     assert supervisor, "Failed to start supervisord or connect to it!"
     supervisor.getPID()  # make sure it doesn't throw an exception
 
+    (WORKERS_DIR / 'initial_startup.conf').unlink(missing_ok=True)
+
     return supervisor
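The dotted wait is a bounded poll: at most ten 0.5s attempts (about 5s total) before the assert fires. Extracted as a reusable helper, the shape looks like this:

import sys
import time
from typing import Callable, Optional, TypeVar

T = TypeVar('T')

def wait_for(get: Callable[[], Optional[T]], attempts: int = 10, delay: float = 0.5) -> Optional[T]:
    """Poll get() until it returns a value or the attempts run out."""
    for _ in range(attempts):
        result = get()
        if result is not None:
            return result
        sys.stdout.write('.')        # same progress dots as above
        sys.stdout.flush()
        time.sleep(delay)
    return None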
@@ -242,9 +313,9 @@ def tail_worker_logs(log_path: str):
             for line in follow(f):
                 if '://' in line:
                     live.console.print(f"Working on: {line.strip()}")
-                table.add_row("123124234", line.strip())
-    except KeyboardInterrupt:
-        print("\n[🛑] Got Ctrl+C, stopping gracefully...")
+                # table.add_row("123124234", line.strip())
+    except (KeyboardInterrupt, BrokenPipeError, IOError):
+        STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
     except SystemExit:
         pass
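Broadening the except clause matters here because a closed stdout (e.g. output piped into head) surfaces as BrokenPipeError mid-print, which previously escaped as a traceback. Pairing follow() with a rich Live display looks roughly like this (uses the follow() sketch above; the log path is hypothetical):

from rich.live import Live
from rich.table import Table

table = Table(show_header=False)
with Live(table, refresh_per_second=4) as live:
    try:
        with open('logs/worker_daphne.log') as f:    # hypothetical log path
            for line in follow(f):
                if '://' in line:
                    live.console.print(f'Working on: {line.strip()}')
    except (KeyboardInterrupt, BrokenPipeError, IOError):
        pass                         # exit quietly on Ctrl+C or a closed pipe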
@@ -321,12 +392,12 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
     if not daemonize:
         try:
             watch_worker(supervisor, "worker_daphne")
-        except KeyboardInterrupt:
-            print("\n[🛑] Got Ctrl+C, stopping gracefully...")
+        except (KeyboardInterrupt, BrokenPipeError, IOError):
+            STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
         except SystemExit:
             pass
         except BaseException as e:
-            print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
+            STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
             raise
         finally:
             stop_worker(supervisor, "worker_daphne")
@@ -350,12 +421,12 @@ def start_cli_workers(watch=False):
     if watch:
         try:
             watch_worker(supervisor, "worker_system_tasks")
-        except KeyboardInterrupt:
-            print("\n[🛑] Got Ctrl+C, stopping gracefully...")
+        except (KeyboardInterrupt, BrokenPipeError, IOError):
+            STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
         except SystemExit:
             pass
         except BaseException as e:
-            print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
+            STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
             raise
         finally:
             stop_worker(supervisor, "worker_system_tasks")