mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-14 00:17:08 +00:00
rename configfile to collection
This commit is contained in:
parent
63bf902f35
commit
60f0458c77
9 changed files with 41 additions and 37 deletions
|
@ -14,7 +14,6 @@ from pydantic_pkgr import (
|
||||||
EnvProvider,
|
EnvProvider,
|
||||||
)
|
)
|
||||||
|
|
||||||
from archivebox.config import CONSTANTS
|
|
||||||
from archivebox.config.permissions import ARCHIVEBOX_USER
|
from archivebox.config.permissions import ARCHIVEBOX_USER
|
||||||
|
|
||||||
import abx
|
import abx
|
||||||
|
@ -34,6 +33,7 @@ class BaseBinProvider(BinProvider):
|
||||||
return [self]
|
return [self]
|
||||||
|
|
||||||
class BaseBinary(Binary):
|
class BaseBinary(Binary):
|
||||||
|
# TODO: formalize state diagram, final states, transitions, side effects, etc.
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def symlink_to_lib(binary, bin_dir=None) -> None:
|
def symlink_to_lib(binary, bin_dir=None) -> None:
|
||||||
|
|
|
@ -99,7 +99,7 @@ class BaseConfigSet(BaseSettings):
|
||||||
)
|
)
|
||||||
|
|
||||||
load_from_defaults: ClassVar[bool] = True
|
load_from_defaults: ClassVar[bool] = True
|
||||||
load_from_configfile: ClassVar[bool] = True
|
load_from_collection: ClassVar[bool] = True
|
||||||
load_from_environment: ClassVar[bool] = True
|
load_from_environment: ClassVar[bool] = True
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -128,7 +128,8 @@ class BaseConfigSet(BaseSettings):
|
||||||
try:
|
try:
|
||||||
precedence_order = precedence_order or {
|
precedence_order = precedence_order or {
|
||||||
'defaults': init_settings,
|
'defaults': init_settings,
|
||||||
'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
# 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
||||||
|
'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
||||||
'environment': env_settings,
|
'environment': env_settings,
|
||||||
}
|
}
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
|
@ -144,14 +145,15 @@ class BaseConfigSet(BaseSettings):
|
||||||
|
|
||||||
precedence_order = {
|
precedence_order = {
|
||||||
'defaults': init_settings,
|
'defaults': init_settings,
|
||||||
'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
# 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
||||||
|
'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
|
||||||
'environment': env_settings,
|
'environment': env_settings,
|
||||||
}
|
}
|
||||||
|
|
||||||
if not cls.load_from_environment:
|
if not cls.load_from_environment:
|
||||||
precedence_order.pop('environment')
|
precedence_order.pop('environment')
|
||||||
if not cls.load_from_configfile:
|
if not cls.load_from_collection:
|
||||||
precedence_order.pop('configfile')
|
precedence_order.pop('collection')
|
||||||
if not cls.load_from_defaults:
|
if not cls.load_from_defaults:
|
||||||
precedence_order.pop('defaults')
|
precedence_order.pop('defaults')
|
||||||
|
|
||||||
|
@ -278,15 +280,15 @@ class BaseConfigSet(BaseSettings):
|
||||||
"""Get the dictionary of {key: value} config loaded from the default values"""
|
"""Get the dictionary of {key: value} config loaded from the default values"""
|
||||||
class OnlyDefaultsConfig(self.__class__):
|
class OnlyDefaultsConfig(self.__class__):
|
||||||
load_from_defaults = True
|
load_from_defaults = True
|
||||||
load_from_configfile = False
|
load_from_collection = False
|
||||||
load_from_environment = False
|
load_from_environment = False
|
||||||
return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
|
return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
|
||||||
|
|
||||||
def from_configfile(self) -> Dict[str, Any]:
|
def from_collection(self) -> Dict[str, Any]:
|
||||||
"""Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf"""
|
"""Get the dictionary of {key: value} config loaded from the collection ArchiveBox.conf"""
|
||||||
class OnlyConfigFileConfig(self.__class__):
|
class OnlyConfigFileConfig(self.__class__):
|
||||||
load_from_defaults = False
|
load_from_defaults = False
|
||||||
load_from_configfile = True
|
load_from_collection = True
|
||||||
load_from_environment = False
|
load_from_environment = False
|
||||||
return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
|
return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
|
||||||
|
|
||||||
|
@ -294,7 +296,7 @@ class BaseConfigSet(BaseSettings):
|
||||||
"""Get the dictionary of {key: value} config loaded from the environment variables"""
|
"""Get the dictionary of {key: value} config loaded from the environment variables"""
|
||||||
class OnlyEnvironmentConfig(self.__class__):
|
class OnlyEnvironmentConfig(self.__class__):
|
||||||
load_from_defaults = False
|
load_from_defaults = False
|
||||||
load_from_configfile = False
|
load_from_collection = False
|
||||||
load_from_environment = True
|
load_from_environment = True
|
||||||
return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
|
return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
|
||||||
|
|
||||||
|
|
|
@ -4,10 +4,9 @@ import json
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple
|
from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple
|
||||||
from typing_extensions import Self
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from pydantic import model_validator, AfterValidator
|
from pydantic import AfterValidator
|
||||||
from pydantic_pkgr import BinName
|
from pydantic_pkgr import BinName
|
||||||
from django.utils.functional import cached_property
|
from django.utils.functional import cached_property
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
@ -17,36 +16,22 @@ import abx
|
||||||
from .base_binary import BaseBinary
|
from .base_binary import BaseBinary
|
||||||
|
|
||||||
|
|
||||||
def no_empty_args(args: List[str]) -> List[str]:
|
def assert_no_empty_args(args: List[str]) -> List[str]:
|
||||||
assert all(len(arg) for arg in args)
|
assert all(len(arg) for arg in args)
|
||||||
return args
|
return args
|
||||||
|
|
||||||
ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str
|
ExtractorName = Annotated[str, AfterValidator(lambda s: s.isidentifier())]
|
||||||
|
|
||||||
HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
|
HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
|
||||||
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
|
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(assert_no_empty_args)]
|
||||||
|
|
||||||
|
|
||||||
class BaseExtractor:
|
class BaseExtractor:
|
||||||
|
|
||||||
name: ExtractorName
|
name: ExtractorName
|
||||||
binary: BinName
|
binary: BinName
|
||||||
|
|
||||||
output_path_func: HandlerFuncStr = 'self.get_output_path'
|
|
||||||
should_extract_func: HandlerFuncStr = 'self.should_extract'
|
|
||||||
extract_func: HandlerFuncStr = 'self.extract'
|
|
||||||
exec_func: HandlerFuncStr = 'self.exec'
|
|
||||||
|
|
||||||
default_args: CmdArgsList = []
|
default_args: CmdArgsList = []
|
||||||
extra_args: CmdArgsList = []
|
extra_args: CmdArgsList = []
|
||||||
args: Optional[CmdArgsList] = None
|
|
||||||
|
|
||||||
@model_validator(mode='after')
|
|
||||||
def validate_model(self) -> Self:
|
|
||||||
if self.args is None:
|
|
||||||
self.args = [*self.default_args, *self.extra_args]
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
def get_output_path(self, snapshot) -> Path:
|
def get_output_path(self, snapshot) -> Path:
|
||||||
return Path(self.__class__.__name__.lower())
|
return Path(self.__class__.__name__.lower())
|
||||||
|
@ -71,7 +56,7 @@ class BaseExtractor:
|
||||||
|
|
||||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||||
|
|
||||||
if not self.should_extract(snapshot):
|
if not self.should_extract(snapshot.url):
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
|
|
|
@ -57,7 +57,7 @@ def get_HOOKS() -> Set[str]:
|
||||||
for hook_name in get_PLUGIN(plugin_id).hooks
|
for hook_name in get_PLUGIN(plugin_id).hooks
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
|
def get_CONFIGS() -> benedict: # Dict[str, 'BaseConfigSet']
|
||||||
return benedict({
|
return benedict({
|
||||||
config_id: configset
|
config_id: configset
|
||||||
for plugin_configs in pm.hook.get_CONFIG()
|
for plugin_configs in pm.hook.get_CONFIG()
|
||||||
|
|
|
@ -88,7 +88,7 @@ def create_root_snapshot_from_seed(crawl):
|
||||||
def create_archiveresults_pending_from_snapshot(snapshot, config):
|
def create_archiveresults_pending_from_snapshot(snapshot, config):
|
||||||
config = get_scope_config(
|
config = get_scope_config(
|
||||||
# defaults=settings.CONFIG_FROM_DEFAULTS,
|
# defaults=settings.CONFIG_FROM_DEFAULTS,
|
||||||
# configfile=settings.CONFIG_FROM_FILE,
|
# collection=settings.CONFIG_FROM_FILE,
|
||||||
# environment=settings.CONFIG_FROM_ENVIRONMENT,
|
# environment=settings.CONFIG_FROM_ENVIRONMENT,
|
||||||
persona=archiveresult.snapshot.crawl.persona,
|
persona=archiveresult.snapshot.crawl.persona,
|
||||||
seed=archiveresult.snapshot.crawl.seed,
|
seed=archiveresult.snapshot.crawl.seed,
|
||||||
|
|
|
@ -15,7 +15,7 @@ from archivebox.misc.logging import stderr
|
||||||
|
|
||||||
|
|
||||||
def get_real_name(key: str) -> str:
|
def get_real_name(key: str) -> str:
|
||||||
"""get the current canonical name for a given deprecated config key"""
|
"""get the up-to-date canonical name for a given old alias or current key"""
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
for section in settings.CONFIGS.values():
|
for section in settings.CONFIGS.values():
|
||||||
|
|
|
@ -1,3 +1,15 @@
|
||||||
|
"""
|
||||||
|
Constants are for things that never change at runtime.
|
||||||
|
(but they can change from run-to-run or machine-to-machine)
|
||||||
|
|
||||||
|
DATA_DIR will never change at runtime, but you can run
|
||||||
|
archivebox from inside a different DATA_DIR on the same machine.
|
||||||
|
|
||||||
|
This is loaded very early in the archivebox startup flow, so nothing in this file
|
||||||
|
or imported from this file should import anything from archivebox.config.common,
|
||||||
|
django, other INSTALLED_APPS, or anything else that is not in a standard library.
|
||||||
|
"""
|
||||||
|
|
||||||
__package__ = 'archivebox.config'
|
__package__ = 'archivebox.config'
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
@ -197,10 +209,12 @@ class ConstantsDict(Mapping):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def __getitem__(cls, key: str):
|
def __getitem__(cls, key: str):
|
||||||
|
# so it behaves like a dict[key] == dict.key or object attr
|
||||||
return getattr(cls, key)
|
return getattr(cls, key)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def __benedict__(cls):
|
def __benedict__(cls):
|
||||||
|
# when casting to benedict, only include uppercase keys that don't start with an underscore
|
||||||
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
|
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -214,5 +228,6 @@ class ConstantsDict(Mapping):
|
||||||
CONSTANTS = ConstantsDict()
|
CONSTANTS = ConstantsDict()
|
||||||
CONSTANTS_CONFIG = CONSTANTS.__benedict__()
|
CONSTANTS_CONFIG = CONSTANTS.__benedict__()
|
||||||
|
|
||||||
# add all key: values to globals() for easier importing
|
# add all key: values to globals() for easier importing, e.g.:
|
||||||
globals().update(CONSTANTS)
|
# from archivebox.config.constants import IS_ROOT, PERSONAS_DIR, ...
|
||||||
|
# globals().update(CONSTANTS)
|
||||||
|
|
|
@ -22,7 +22,7 @@ from archivebox.misc.logging import stderr, hint
|
||||||
from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
|
from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
|
||||||
from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
|
from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
|
||||||
from archivebox.config.permissions import SudoPermission, IN_DOCKER
|
from archivebox.config.permissions import SudoPermission, IN_DOCKER
|
||||||
from archivebox.config.configfile import (
|
from archivebox.config.collection import (
|
||||||
write_config_file,
|
write_config_file,
|
||||||
load_all_config,
|
load_all_config,
|
||||||
get_real_name,
|
get_real_name,
|
||||||
|
|
|
@ -126,6 +126,7 @@ def is_static_file(url: str):
|
||||||
def enforce_types(func):
|
def enforce_types(func):
|
||||||
"""
|
"""
|
||||||
Enforce function arg and kwarg types at runtime using its python3 type hints
|
Enforce function arg and kwarg types at runtime using its python3 type hints
|
||||||
|
Simpler version of pydantic @validate_call decorator
|
||||||
"""
|
"""
|
||||||
# TODO: check return type as well
|
# TODO: check return type as well
|
||||||
|
|
||||||
|
@ -283,6 +284,7 @@ def get_headers(url: str, timeout: int=None) -> str:
|
||||||
def ansi_to_html(text: str) -> str:
|
def ansi_to_html(text: str) -> str:
|
||||||
"""
|
"""
|
||||||
Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
|
Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
|
||||||
|
Simple way to render colored CLI stdout/stderr in HTML properly, Textual/rich is probably better though.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
TEMPLATE = '<span style="color: rgb{}"><br>'
|
TEMPLATE = '<span style="color: rgb{}"><br>'
|
||||||
|
|
Loading…
Reference in a new issue