rename configfile to collection

This commit is contained in:
Nick Sweeting 2024-10-24 15:40:24 -07:00
parent 63bf902f35
commit 60f0458c77
No known key found for this signature in database
9 changed files with 41 additions and 37 deletions

View file

@ -14,7 +14,6 @@ from pydantic_pkgr import (
EnvProvider, EnvProvider,
) )
from archivebox.config import CONSTANTS
from archivebox.config.permissions import ARCHIVEBOX_USER from archivebox.config.permissions import ARCHIVEBOX_USER
import abx import abx
@ -34,6 +33,7 @@ class BaseBinProvider(BinProvider):
return [self] return [self]
class BaseBinary(Binary): class BaseBinary(Binary):
# TODO: formalize state diagram, final states, transitions, side effects, etc.
@staticmethod @staticmethod
def symlink_to_lib(binary, bin_dir=None) -> None: def symlink_to_lib(binary, bin_dir=None) -> None:

View file

@ -99,7 +99,7 @@ class BaseConfigSet(BaseSettings):
) )
load_from_defaults: ClassVar[bool] = True load_from_defaults: ClassVar[bool] = True
load_from_configfile: ClassVar[bool] = True load_from_collection: ClassVar[bool] = True
load_from_environment: ClassVar[bool] = True load_from_environment: ClassVar[bool] = True
@classmethod @classmethod
@ -128,7 +128,8 @@ class BaseConfigSet(BaseSettings):
try: try:
precedence_order = precedence_order or { precedence_order = precedence_order or {
'defaults': init_settings, 'defaults': init_settings,
'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), # 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
'environment': env_settings, 'environment': env_settings,
} }
except Exception as err: except Exception as err:
@ -144,14 +145,15 @@ class BaseConfigSet(BaseSettings):
precedence_order = { precedence_order = {
'defaults': init_settings, 'defaults': init_settings,
'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), # 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
'environment': env_settings, 'environment': env_settings,
} }
if not cls.load_from_environment: if not cls.load_from_environment:
precedence_order.pop('environment') precedence_order.pop('environment')
if not cls.load_from_configfile: if not cls.load_from_collection:
precedence_order.pop('configfile') precedence_order.pop('collection')
if not cls.load_from_defaults: if not cls.load_from_defaults:
precedence_order.pop('defaults') precedence_order.pop('defaults')
@ -278,15 +280,15 @@ class BaseConfigSet(BaseSettings):
"""Get the dictionary of {key: value} config loaded from the default values""" """Get the dictionary of {key: value} config loaded from the default values"""
class OnlyDefaultsConfig(self.__class__): class OnlyDefaultsConfig(self.__class__):
load_from_defaults = True load_from_defaults = True
load_from_configfile = False load_from_collection = False
load_from_environment = False load_from_environment = False
return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys()))) return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
def from_configfile(self) -> Dict[str, Any]: def from_collection(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf""" """Get the dictionary of {key: value} config loaded from the collection ArchiveBox.conf"""
class OnlyConfigFileConfig(self.__class__): class OnlyConfigFileConfig(self.__class__):
load_from_defaults = False load_from_defaults = False
load_from_configfile = True load_from_collection = True
load_from_environment = False load_from_environment = False
return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys()))) return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
@ -294,7 +296,7 @@ class BaseConfigSet(BaseSettings):
"""Get the dictionary of {key: value} config loaded from the environment variables""" """Get the dictionary of {key: value} config loaded from the environment variables"""
class OnlyEnvironmentConfig(self.__class__): class OnlyEnvironmentConfig(self.__class__):
load_from_defaults = False load_from_defaults = False
load_from_configfile = False load_from_collection = False
load_from_environment = True load_from_environment = True
return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys()))) return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))

View file

@ -4,10 +4,9 @@ import json
import os import os
from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple
from typing_extensions import Self
from pathlib import Path from pathlib import Path
from pydantic import model_validator, AfterValidator from pydantic import AfterValidator
from pydantic_pkgr import BinName from pydantic_pkgr import BinName
from django.utils.functional import cached_property from django.utils.functional import cached_property
from django.utils import timezone from django.utils import timezone
@ -17,36 +16,22 @@ import abx
from .base_binary import BaseBinary from .base_binary import BaseBinary
def no_empty_args(args: List[str]) -> List[str]: def assert_no_empty_args(args: List[str]) -> List[str]:
assert all(len(arg) for arg in args) assert all(len(arg) for arg in args)
return args return args
ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str ExtractorName = Annotated[str, AfterValidator(lambda s: s.isidentifier())]
HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))] HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)] CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(assert_no_empty_args)]
class BaseExtractor: class BaseExtractor:
name: ExtractorName name: ExtractorName
binary: BinName binary: BinName
output_path_func: HandlerFuncStr = 'self.get_output_path'
should_extract_func: HandlerFuncStr = 'self.should_extract'
extract_func: HandlerFuncStr = 'self.extract'
exec_func: HandlerFuncStr = 'self.exec'
default_args: CmdArgsList = [] default_args: CmdArgsList = []
extra_args: CmdArgsList = [] extra_args: CmdArgsList = []
args: Optional[CmdArgsList] = None
@model_validator(mode='after')
def validate_model(self) -> Self:
if self.args is None:
self.args = [*self.default_args, *self.extra_args]
return self
def get_output_path(self, snapshot) -> Path: def get_output_path(self, snapshot) -> Path:
return Path(self.__class__.__name__.lower()) return Path(self.__class__.__name__.lower())
@ -71,7 +56,7 @@ class BaseExtractor:
snapshot = Snapshot.objects.get(id=snapshot_id) snapshot = Snapshot.objects.get(id=snapshot_id)
if not self.should_extract(snapshot): if not self.should_extract(snapshot.url):
return {} return {}
status = 'failed' status = 'failed'

View file

@ -57,7 +57,7 @@ def get_HOOKS() -> Set[str]:
for hook_name in get_PLUGIN(plugin_id).hooks for hook_name in get_PLUGIN(plugin_id).hooks
} }
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']: def get_CONFIGS() -> benedict: # Dict[str, 'BaseConfigSet']
return benedict({ return benedict({
config_id: configset config_id: configset
for plugin_configs in pm.hook.get_CONFIG() for plugin_configs in pm.hook.get_CONFIG()

View file

@ -88,7 +88,7 @@ def create_root_snapshot_from_seed(crawl):
def create_archiveresults_pending_from_snapshot(snapshot, config): def create_archiveresults_pending_from_snapshot(snapshot, config):
config = get_scope_config( config = get_scope_config(
# defaults=settings.CONFIG_FROM_DEFAULTS, # defaults=settings.CONFIG_FROM_DEFAULTS,
# configfile=settings.CONFIG_FROM_FILE, # collection=settings.CONFIG_FROM_FILE,
# environment=settings.CONFIG_FROM_ENVIRONMENT, # environment=settings.CONFIG_FROM_ENVIRONMENT,
persona=archiveresult.snapshot.crawl.persona, persona=archiveresult.snapshot.crawl.persona,
seed=archiveresult.snapshot.crawl.seed, seed=archiveresult.snapshot.crawl.seed,

View file

@ -15,7 +15,7 @@ from archivebox.misc.logging import stderr
def get_real_name(key: str) -> str: def get_real_name(key: str) -> str:
"""get the current canonical name for a given deprecated config key""" """get the up-to-date canonical name for a given old alias or current key"""
from django.conf import settings from django.conf import settings
for section in settings.CONFIGS.values(): for section in settings.CONFIGS.values():

View file

@ -1,3 +1,15 @@
"""
Constants are for things that never change at runtime.
(but they can change from run-to-run or machine-to-machine)
DATA_DIR will never change at runtime, but you can run
archivebox from inside a different DATA_DIR on the same machine.
This is loaded very early in the archivebox startup flow, so nothing in this file
or imported by this file should import anything from archivebox.config.common,
django, other INSTALLED_APPS, or anything else that is not in the standard library.
"""
__package__ = 'archivebox.config' __package__ = 'archivebox.config'
import re import re
@ -197,10 +209,12 @@ class ConstantsDict(Mapping):
@classmethod @classmethod
def __getitem__(cls, key: str): def __getitem__(cls, key: str):
# item access falls back to attribute lookup, so obj['KEY'] returns the same value as obj.KEY
return getattr(cls, key) return getattr(cls, key)
@classmethod @classmethod
def __benedict__(cls): def __benedict__(cls):
# when casting to benedict, only include uppercase keys that don't start with an underscore
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')}) return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
@classmethod @classmethod
@ -214,5 +228,6 @@ class ConstantsDict(Mapping):
CONSTANTS = ConstantsDict() CONSTANTS = ConstantsDict()
CONSTANTS_CONFIG = CONSTANTS.__benedict__() CONSTANTS_CONFIG = CONSTANTS.__benedict__()
# add all key: values to globals() for easier importing # add all key: values to globals() for easier importing, e.g.:
globals().update(CONSTANTS) # from archivebox.config.constants import IS_ROOT, PERSONAS_DIR, ...
# globals().update(CONSTANTS)

View file

@ -22,7 +22,7 @@ from archivebox.misc.logging import stderr, hint
from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
from archivebox.config.permissions import SudoPermission, IN_DOCKER from archivebox.config.permissions import SudoPermission, IN_DOCKER
from archivebox.config.configfile import ( from archivebox.config.collection import (
write_config_file, write_config_file,
load_all_config, load_all_config,
get_real_name, get_real_name,

View file

@ -126,6 +126,7 @@ def is_static_file(url: str):
def enforce_types(func): def enforce_types(func):
""" """
Enforce function arg and kwarg types at runtime using its python3 type hints Enforce function arg and kwarg types at runtime using its python3 type hints
This is a simpler version of pydantic's @validate_call decorator.
""" """
# TODO: check return type as well # TODO: check return type as well
@ -283,6 +284,7 @@ def get_headers(url: str, timeout: int=None) -> str:
def ansi_to_html(text: str) -> str: def ansi_to_html(text: str) -> str:
""" """
Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
A simple way to render colored CLI stdout/stderr properly in HTML (Textual/rich is probably a better option, though).
""" """
TEMPLATE = '<span style="color: rgb{}"><br>' TEMPLATE = '<span style="color: rgb{}"><br>'