mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-14 00:17:08 +00:00
rename configfile to collection
This commit is contained in:
parent
63bf902f35
commit
60f0458c77
9 changed files with 41 additions and 37 deletions
|
@ -14,7 +14,6 @@ from pydantic_pkgr import (
|
|||
EnvProvider,
|
||||
)
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER
|
||||
|
||||
import abx
|
||||
|
@ -34,6 +33,7 @@ class BaseBinProvider(BinProvider):
|
|||
return [self]
|
||||
|
||||
class BaseBinary(Binary):
|
||||
# TODO: formalize state diagram, final states, transitions, side effects, etc.
|
||||
|
||||
@staticmethod
|
||||
def symlink_to_lib(binary, bin_dir=None) -> None:
|
||||
|
|
|
@ -99,7 +99,7 @@ class BaseConfigSet(BaseSettings):
|
|||
)
|
||||
|
||||
load_from_defaults: ClassVar[bool] = True
|
||||
load_from_configfile: ClassVar[bool] = True
|
||||
load_from_collection: ClassVar[bool] = True
|
||||
load_from_environment: ClassVar[bool] = True
|
||||
|
||||
@classmethod
|
||||
|
@ -128,7 +128,8 @@ class BaseConfigSet(BaseSettings):
|
|||
try:
|
||||
precedence_order = precedence_order or {
|
||||
'defaults': init_settings,
|
||||
'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
||||
# 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
||||
'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
||||
'environment': env_settings,
|
||||
}
|
||||
except Exception as err:
|
||||
|
@ -144,14 +145,15 @@ class BaseConfigSet(BaseSettings):
|
|||
|
||||
precedence_order = {
|
||||
'defaults': init_settings,
|
||||
'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
||||
# 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
||||
'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
||||
'environment': env_settings,
|
||||
}
|
||||
|
||||
if not cls.load_from_environment:
|
||||
precedence_order.pop('environment')
|
||||
if not cls.load_from_configfile:
|
||||
precedence_order.pop('configfile')
|
||||
if not cls.load_from_collection:
|
||||
precedence_order.pop('collection')
|
||||
if not cls.load_from_defaults:
|
||||
precedence_order.pop('defaults')
|
||||
|
||||
|
@ -278,15 +280,15 @@ class BaseConfigSet(BaseSettings):
|
|||
"""Get the dictionary of {key: value} config loaded from the default values"""
|
||||
class OnlyDefaultsConfig(self.__class__):
|
||||
load_from_defaults = True
|
||||
load_from_configfile = False
|
||||
load_from_collection = False
|
||||
load_from_environment = False
|
||||
return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
|
||||
|
||||
def from_configfile(self) -> Dict[str, Any]:
|
||||
"""Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf"""
|
||||
def from_collection(self) -> Dict[str, Any]:
|
||||
"""Get the dictionary of {key: value} config loaded from the collection ArchiveBox.conf"""
|
||||
class OnlyConfigFileConfig(self.__class__):
|
||||
load_from_defaults = False
|
||||
load_from_configfile = True
|
||||
load_from_collection = True
|
||||
load_from_environment = False
|
||||
return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
|
||||
|
||||
|
@ -294,7 +296,7 @@ class BaseConfigSet(BaseSettings):
|
|||
"""Get the dictionary of {key: value} config loaded from the environment variables"""
|
||||
class OnlyEnvironmentConfig(self.__class__):
|
||||
load_from_defaults = False
|
||||
load_from_configfile = False
|
||||
load_from_collection = False
|
||||
load_from_environment = True
|
||||
return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
|
||||
|
||||
|
|
|
@ -4,10 +4,9 @@ import json
|
|||
import os
|
||||
|
||||
from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple
|
||||
from typing_extensions import Self
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import model_validator, AfterValidator
|
||||
from pydantic import AfterValidator
|
||||
from pydantic_pkgr import BinName
|
||||
from django.utils.functional import cached_property
|
||||
from django.utils import timezone
|
||||
|
@ -17,36 +16,22 @@ import abx
|
|||
from .base_binary import BaseBinary
|
||||
|
||||
|
||||
def no_empty_args(args: List[str]) -> List[str]:
|
||||
def assert_no_empty_args(args: List[str]) -> List[str]:
|
||||
assert all(len(arg) for arg in args)
|
||||
return args
|
||||
|
||||
ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str
|
||||
ExtractorName = Annotated[str, AfterValidator(lambda s: s.isidentifier())]
|
||||
|
||||
HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
|
||||
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
|
||||
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(assert_no_empty_args)]
|
||||
|
||||
|
||||
class BaseExtractor:
|
||||
|
||||
name: ExtractorName
|
||||
binary: BinName
|
||||
|
||||
output_path_func: HandlerFuncStr = 'self.get_output_path'
|
||||
should_extract_func: HandlerFuncStr = 'self.should_extract'
|
||||
extract_func: HandlerFuncStr = 'self.extract'
|
||||
exec_func: HandlerFuncStr = 'self.exec'
|
||||
|
||||
default_args: CmdArgsList = []
|
||||
extra_args: CmdArgsList = []
|
||||
args: Optional[CmdArgsList] = None
|
||||
|
||||
@model_validator(mode='after')
|
||||
def validate_model(self) -> Self:
|
||||
if self.args is None:
|
||||
self.args = [*self.default_args, *self.extra_args]
|
||||
return self
|
||||
|
||||
|
||||
def get_output_path(self, snapshot) -> Path:
|
||||
return Path(self.__class__.__name__.lower())
|
||||
|
@ -71,7 +56,7 @@ class BaseExtractor:
|
|||
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
|
||||
if not self.should_extract(snapshot):
|
||||
if not self.should_extract(snapshot.url):
|
||||
return {}
|
||||
|
||||
status = 'failed'
|
||||
|
|
|
@ -57,7 +57,7 @@ def get_HOOKS() -> Set[str]:
|
|||
for hook_name in get_PLUGIN(plugin_id).hooks
|
||||
}
|
||||
|
||||
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
|
||||
def get_CONFIGS() -> benedict: # Dict[str, 'BaseConfigSet']
|
||||
return benedict({
|
||||
config_id: configset
|
||||
for plugin_configs in pm.hook.get_CONFIG()
|
||||
|
|
|
@ -88,7 +88,7 @@ def create_root_snapshot_from_seed(crawl):
|
|||
def create_archiveresults_pending_from_snapshot(snapshot, config):
|
||||
config = get_scope_config(
|
||||
# defaults=settings.CONFIG_FROM_DEFAULTS,
|
||||
# configfile=settings.CONFIG_FROM_FILE,
|
||||
# collection=settings.CONFIG_FROM_FILE,
|
||||
# environment=settings.CONFIG_FROM_ENVIRONMENT,
|
||||
persona=archiveresult.snapshot.crawl.persona,
|
||||
seed=archiveresult.snapshot.crawl.seed,
|
||||
|
|
|
@ -15,7 +15,7 @@ from archivebox.misc.logging import stderr
|
|||
|
||||
|
||||
def get_real_name(key: str) -> str:
|
||||
"""get the current canonical name for a given deprecated config key"""
|
||||
"""get the up-to-date canonical name for a given old alias or current key"""
|
||||
from django.conf import settings
|
||||
|
||||
for section in settings.CONFIGS.values():
|
||||
|
|
|
@ -1,3 +1,15 @@
|
|||
"""
|
||||
Constants are for things that never change at runtime.
|
||||
(but they can change from run-to-run or machine-to-machine)
|
||||
|
||||
DATA_DIR will never change at runtime, but you can run
|
||||
archivebox from inside a different DATA_DIR on the same machine.
|
||||
|
||||
This is loaded very early in the archivebox startup flow, so nothing in this file
|
||||
or imported from this file should import anything from archivebox.config.common,
|
||||
django, other INSTALLED_APPS, or anything else that is not in a standard library.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.config'
|
||||
|
||||
import re
|
||||
|
@ -197,10 +209,12 @@ class ConstantsDict(Mapping):
|
|||
|
||||
@classmethod
|
||||
def __getitem__(cls, key: str):
|
||||
# so it behaves like a dict[key] == dict.key or object attr
|
||||
return getattr(cls, key)
|
||||
|
||||
@classmethod
|
||||
def __benedict__(cls):
|
||||
# when casting to benedict, only include uppercase keys that don't start with an underscore
|
||||
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
|
||||
|
||||
@classmethod
|
||||
|
@ -214,5 +228,6 @@ class ConstantsDict(Mapping):
|
|||
CONSTANTS = ConstantsDict()
|
||||
CONSTANTS_CONFIG = CONSTANTS.__benedict__()
|
||||
|
||||
# add all key: values to globals() for easier importing
|
||||
globals().update(CONSTANTS)
|
||||
# add all key: values to globals() for easier importing, e.g.:
|
||||
# from archivebox.config.constants import IS_ROOT, PERSONAS_DIR, ...
|
||||
# globals().update(CONSTANTS)
|
||||
|
|
|
@ -22,7 +22,7 @@ from archivebox.misc.logging import stderr, hint
|
|||
from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
|
||||
from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
|
||||
from archivebox.config.permissions import SudoPermission, IN_DOCKER
|
||||
from archivebox.config.configfile import (
|
||||
from archivebox.config.collection import (
|
||||
write_config_file,
|
||||
load_all_config,
|
||||
get_real_name,
|
||||
|
|
|
@ -126,6 +126,7 @@ def is_static_file(url: str):
|
|||
def enforce_types(func):
|
||||
"""
|
||||
Enforce function arg and kwarg types at runtime using its python3 type hints
|
||||
Simpler version of pydantic @validate_call decorator
|
||||
"""
|
||||
# TODO: check return type as well
|
||||
|
||||
|
@ -283,6 +284,7 @@ def get_headers(url: str, timeout: int=None) -> str:
|
|||
def ansi_to_html(text: str) -> str:
|
||||
"""
|
||||
Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
|
||||
Simple way to render colored CLI stdout/stderr in HTML properly, Textual/rich is probably better though.
|
||||
"""
|
||||
|
||||
TEMPLATE = '<span style="color: rgb{}"><br>'
|
||||
|
|
Loading…
Reference in a new issue