finish migrating almost all config to new system

This commit is contained in:
Nick Sweeting 2024-09-30 23:21:34 -07:00
parent 4b6a2a3e50
commit d21bc86075
No known key found for this signature in database
25 changed files with 246 additions and 349 deletions

View file

@ -13,43 +13,6 @@ HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', '
hook_type_names: Tuple[HookType] = get_args(HookType)
class BaseHook(BaseModel):
"""
A Plugin consists of a list of Hooks, applied to django.conf.settings when AppConfig.ready() -> Plugin.register() is called.
Plugin.register() then calls each Hook.register() on the provided settings.
Each Hook.register() function (ideally pure) either takes django.conf.settings as input and returns a new one back,
or modifies django.conf.settings in-place to add the changes corresponding to its HookType.
e.g. for a HookType.CONFIG, the Hook.register() function places the hook in settings.CONFIG (and settings.HOOKS)
An example of an impure Hook would be a CHECK that modifies settings but also calls django.core.checks.register(check).
In practice any object that subclasses BaseHook and provides a .register() function can behave as a Hook.
setup_django() -> imports all settings.INSTALLED_APPS...
# django imports AppConfig, models, migrations, admins, etc. for all installed apps
# django then calls AppConfig.ready() on each installed app...
plugins_pkg.npm.NpmPlugin().AppConfig.ready() # called by django
plugins_pkg.npm.NpmPlugin().register(settings) ->
plugins_pkg.npm.NpmConfigSet().register(settings)
abx.archivebox.base_configset.BaseConfigSet().register(settings)
abx.archivebox.base_hook.BaseHook().register(settings, parent_plugin=plugins_pkg.npm.NpmPlugin())
...
...
Both core ArchiveBox code and plugin code depend on python >= 3.10 and django >= 5.0 w/ sqlite and a filesystem.
Core ArchiveBox code can depend only on python and the pip libraries it ships with, and can never depend on plugin code / node / other binaries.
Plugin code can depend on archivebox core, other django apps, other pip libraries, and other plugins.
Plugins can provide BinProviders + Binaries which can depend on arbitrary other binaries / package managers like curl / wget / yt-dlp / etc.
The execution interface between plugins is simply calling builtinplugins.npm.... functions directly; Django handles
importing all plugin code. There is no need to manually register methods/classes; registration is only needed to call
impure setup functions or to provide runtime state.
settings.CONFIGS / settings.BINPROVIDERS / settings.BINARIES /... etc. are reserved for dynamic runtime state only.
This state is exposed to the broader system in a flat namespace, e.g. CONFIG.IS_DOCKER=True, or BINARIES = [
..., Binary('node', abspath='/usr/local/bin/node', version='22.2.0'), ...
]
"""
model_config = ConfigDict(
extra="allow",
arbitrary_types_allowed=True,
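
To make the register() contract in the docstring above concrete, here is a minimal, self-contained sketch. The SimpleNamespace stands in for django.conf.settings, and ExampleConfigHook is hypothetical, not a real ArchiveBox class:

    from types import SimpleNamespace

    class ExampleConfigHook:
        # a CONFIG-type hook, per the HookType literal above
        hook_type = 'CONFIG'
        id = 'EXAMPLE_CONFIG'

        def register(self, settings, parent_plugin=None):
            # impure variant: mutate settings in-place, as the docstring describes
            settings.HOOKS[self.id] = self
            settings.CONFIG[self.id] = self   # CONFIG hooks also land in settings.CONFIG

    settings = SimpleNamespace(HOOKS={}, CONFIG={})
    ExampleConfigHook().register(settings)
    assert 'EXAMPLE_CONFIG' in settings.CONFIG and 'EXAMPLE_CONFIG' in settings.HOOKS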

View file

@ -13,7 +13,7 @@ from ..main import (
schedule,
)
from archivebox.misc.util import ansi_to_html
from ..config.legacy import ONLY_NEW
from archivebox.config import ARCHIVING_CONFIG
from .auth import API_AUTH_METHODS
@ -58,7 +58,7 @@ class AddCommandSchema(Schema):
urls: List[str]
tag: str = ""
depth: int = 0
update: bool = not ONLY_NEW # Default to the opposite of ONLY_NEW
update: bool = not ARCHIVING_CONFIG.ONLY_NEW # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
update_all: bool = False
index_only: bool = False
overwrite: bool = False
@ -68,7 +68,7 @@ class AddCommandSchema(Schema):
class UpdateCommandSchema(Schema):
resume: Optional[float] = 0
only_new: bool = ONLY_NEW
only_new: bool = ARCHIVING_CONFIG.ONLY_NEW
index_only: bool = False
overwrite: bool = False
after: Optional[float] = 0
@ -85,7 +85,7 @@ class ScheduleCommandSchema(Schema):
tag: str = ''
depth: int = 0
overwrite: bool = False
update: bool = not ONLY_NEW
update: bool = not ARCHIVING_CONFIG.ONLY_NEW
clear: bool = False
class ListCommandSchema(Schema):
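
Aside on the schema defaults above: `update: bool = not ARCHIVING_CONFIG.ONLY_NEW` is evaluated once, when the schema module is imported. A minimal sketch with plain pydantic (ninja's Schema is a pydantic model under the hood; ArchivingConfig here is a stand-in for the real ConfigSet):

    from typing import List
    from pydantic import BaseModel

    class ArchivingConfig(BaseModel):       # stand-in for ARCHIVING_CONFIG's ConfigSet
        ONLY_NEW: bool = True

    ARCHIVING_CONFIG = ArchivingConfig()

    class AddCommandSchema(BaseModel):
        urls: List[str]
        update: bool = not ARCHIVING_CONFIG.ONLY_NEW   # frozen at import time -> False

    print(AddCommandSchema(urls=['https://example.com']).update)  # False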

View file

@ -152,18 +152,15 @@ def run_subcommand(subcommand: str,
subcommand_args = subcommand_args or []
if subcommand not in meta_cmds:
from ..config.legacy import setup_django, CONFIG
from archivebox.config.legacy import setup_django
cmd_requires_db = subcommand in archive_cmds
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
if cmd_requires_db:
check_data_folder(CONFIG)
setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
if cmd_requires_db:
check_migrations(CONFIG)
check_migrations()
module = import_module('.archivebox_{}'.format(subcommand), __package__)
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore

View file

@ -1,6 +1,6 @@
__package__ = 'archivebox.config'
from .constants import CONSTANTS, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
from .defaults import (
SHELL_CONFIG,
STORAGE_CONFIG,
@ -23,4 +23,5 @@ __all__ = [
'SERVER_CONFIG',
'ARCHIVING_CONFIG',
'SEARCH_BACKEND_CONFIG',
'CONSTANTS_CONFIG',
]

View file

@ -60,6 +60,7 @@ from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CON
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG
ANSI = SHELL_CONFIG.ANSI
LDAP = LDAP_CONFIG.LDAP_ENABLED
@ -81,9 +82,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
# 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
# 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
# 'CURL_CONFIG': CURL_CONFIG.as_legacy_config_schema(),
'ARCHIVE_METHOD_TOGGLES': {
@ -109,7 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'ARCHIVE_METHOD_OPTIONS': {
'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
# 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
@ -144,15 +147,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
]},
'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
'CURL_ARGS': {'type': list, 'default': ['--silent',
'--location',
'--compressed'
]},
'CURL_EXTRA_ARGS': {'type': list, 'default': None},
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
'SINGLEFILE_ARGS': {'type': list, 'default': None},
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
},
'DEPENDENCY_CONFIG': {
@ -164,9 +158,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'USE_YOUTUBEDL': {'type': bool, 'default': True},
'USE_RIPGREP': {'type': bool, 'default': True},
'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'},
'NODE_BINARY': {'type': str, 'default': 'node'},
# 'GIT_BINARY': {'type': str, 'default': 'git'},
# 'CURL_BINARY': {'type': str, 'default': 'curl'},
# 'NODE_BINARY': {'type': str, 'default': 'node'},
# 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
# 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
# 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
@ -209,21 +203,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
# 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
# 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
# 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
# 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
# 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
# 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
# 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
@ -613,13 +598,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
# 'is_valid': True,
# },
'CURL_BINARY': {
'path': bin_path(config['CURL_BINARY']),
'version': config['CURL_VERSION'],
'hash': bin_hash(config['CURL_BINARY']),
'enabled': config['USE_CURL'],
'is_valid': bool(config['CURL_VERSION']),
},
# 'CURL_BINARY': {
# 'path': bin_path(config['CURL_BINARY']),
# 'version': config['CURL_VERSION'],
# 'hash': bin_hash(config['CURL_BINARY']),
# 'enabled': config['USE_CURL'],
# 'is_valid': bool(config['CURL_VERSION']),
# },
# 'WGET_BINARY': {
# 'path': bin_path(config['WGET_BINARY']),
# 'version': config['WGET_VERSION'],
@ -641,13 +626,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
# 'enabled': config['USE_MERCURY'],
# 'is_valid': bool(config['MERCURY_VERSION']),
# },
'GIT_BINARY': {
'path': bin_path(config['GIT_BINARY']),
'version': config['GIT_VERSION'],
'hash': bin_hash(config['GIT_BINARY']),
'enabled': config['USE_GIT'],
'is_valid': bool(config['GIT_VERSION']),
},
# 'GIT_BINARY': {
# 'path': bin_path(config['GIT_BINARY']),
# 'version': config['GIT_VERSION'],
# 'hash': bin_hash(config['GIT_BINARY']),
# 'enabled': config['USE_GIT'],
# 'is_valid': bool(config['GIT_VERSION']),
# },
# 'SINGLEFILE_BINARY': {
# 'path': bin_path(config['SINGLEFILE_BINARY']),
# 'version': config['SINGLEFILE_VERSION'],

View file

@ -76,7 +76,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
relevant_configs = {
key: val
for key, val in settings.CONFIG.items()
for key, val in settings.FLAT_CONFIG.items()
if '_BINARY' in key or '_VERSION' in key
}
@ -105,6 +105,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
for config_key, config_value in relevant_configs.items()
if str(binary.name).lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
or config_value.lower().endswith(binary.name.lower())
# or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
)))
# if not binary.provider_overrides:
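
The key-matching expression above normalizes the binary name before substring-matching it against config keys; a hedged restatement as a standalone helper (matches_config_key is illustrative, not a real helper in the codebase):

    def matches_config_key(binary_name: str, config_key: str) -> bool:
        # 'yt-dlp' -> 'ytdlp' -> 'youtubedl', so it matches YOUTUBEDL_BINARY
        normalized = (binary_name.lower()
                      .replace('-', '')
                      .replace('_', '')
                      .replace('ytdlp', 'youtubedl'))
        return normalized in config_key.lower()

    assert matches_config_key('yt-dlp', 'YOUTUBEDL_BINARY')
    assert matches_config_key('curl', 'CURL_VERSION')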

View file

@ -36,7 +36,7 @@ from main import remove
from extractors import archive_links
CONFIG = settings.CONFIG
CONFIG = settings.FLAT_CONFIG
GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}

View file

@ -1,13 +1,11 @@
__package__ = 'archivebox.core'
from ..config.legacy import (
LDAP
)
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
def register_signals():
if LDAP:
if LDAP_CONFIG.LDAP_ENABLED:
import django_auth_ldap.backend
from .auth_ldap import create_user

View file

@ -1,9 +1,7 @@
from ..config.legacy import (
LDAP_CREATE_SUPERUSER
)
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
def create_user(sender, user=None, ldap_user=None, **kwargs):
if not user.id and LDAP_CREATE_SUPERUSER:
if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
user.is_superuser = True
user.is_staff = True

View file

@ -5,7 +5,7 @@ from django.utils import timezone
from django.contrib.auth.middleware import RemoteUserMiddleware
from django.core.exceptions import ImproperlyConfigured
from ..config.legacy import PUBLIC_SNAPSHOTS, REVERSE_PROXY_USER_HEADER, REVERSE_PROXY_WHITELIST
from archivebox.config import SERVER_CONFIG
def detect_timezone(request, activate: bool=True):
@ -32,7 +32,7 @@ def CacheControlMiddleware(get_response):
response = get_response(request)
if '/archive/' in request.path or '/static/' in request.path:
policy = 'public' if PUBLIC_SNAPSHOTS else 'private'
policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
# print('Set Cache-Control header to', response['Cache-Control'])
return response
@ -40,15 +40,15 @@ def CacheControlMiddleware(get_response):
return middleware
class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
header = 'HTTP_{normalized}'.format(normalized=REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
def process_request(self, request):
if REVERSE_PROXY_WHITELIST == '':
if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == '':
return
ip = request.META.get('REMOTE_ADDR')
for cidr in REVERSE_PROXY_WHITELIST.split(','):
for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(','):
try:
network = ipaddress.ip_network(cidr)
except ValueError:
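
Two behaviors in the middleware above are easy to verify in isolation: Django exposes an incoming X-Remote-User header as request.META['HTTP_X_REMOTE_USER'], and REVERSE_PROXY_WHITELIST is a comma-separated list of CIDR networks. A minimal sketch (the function names are illustrative):

    import ipaddress

    def normalize_header(name: str) -> str:
        # 'X-Remote-User' -> 'HTTP_X_REMOTE_USER', matching the header= line above
        return 'HTTP_{}'.format(name.replace('-', '_').upper())

    def ip_in_whitelist(ip: str, whitelist: str) -> bool:
        for cidr in whitelist.split(','):
            try:
                network = ipaddress.ip_network(cidr)
            except ValueError:
                continue                  # skip malformed entries, as above
            if ipaddress.ip_address(ip) in network:
                return True
        return False

    assert normalize_header('X-Remote-User') == 'HTTP_X_REMOTE_USER'
    assert ip_in_whitelist('10.0.0.5', '10.0.0.0/8,192.168.0.0/16')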

View file

@ -13,9 +13,7 @@ import abx.archivebox
import abx.archivebox.use
import abx.django.use
from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
from ..config.legacy import CONFIG
from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG, SERVER_CONFIG # noqa
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
@ -80,7 +78,7 @@ LOGOUT_REDIRECT_URL = os.environ.get('LOGOUT_REDIRECT_URL', '/')
PASSWORD_RESET_URL = '/accounts/password_reset/'
APPEND_SLASH = True
DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
DEBUG = SHELL_CONFIG.DEBUG or ('--debug' in sys.argv)
INSTALLED_APPS = [
@ -364,10 +362,10 @@ STORAGES = {
### Security Settings
################################################################################
SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(',')
CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
# automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
# but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
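
A sketch of the auto-fix the trailing comment describes, assuming the intent is to append an https:// origin for each ALLOWED_HOSTS entry that is missing from CSRF_TRUSTED_ORIGINS (the exact implementation isn't shown in this hunk):

    ALLOWED_HOSTS = ['archivebox.example.com', 'localhost']
    CSRF_TRUSTED_ORIGINS = ['http://localhost:8000']

    for host in ALLOWED_HOSTS:
        if host and host != '*':
            origin = f'https://{host}'
            if origin not in CSRF_TRUSTED_ORIGINS:
                CSRF_TRUSTED_ORIGINS.append(origin)

    # CSRF_TRUSTED_ORIGINS now also contains https://archivebox.example.com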

View file

@ -10,7 +10,7 @@ from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthC
from .serve_static import serve_static
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from .config.legacy import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}

View file

@ -1,7 +1,7 @@
__package__ = 'archivebox.core'
from typing import Callable
from benedict import benedict
import inspect
from typing import Callable, get_type_hints
from pathlib import Path
from django.shortcuts import render, redirect
@ -27,21 +27,13 @@ from core.admin import result_url
from queues.tasks import bg_add
from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from ..config.legacy import (
CONFIG_SCHEMA,
DYNAMIC_CONFIG_SCHEMA,
USER_CONFIG,
CONFIG,
)
from ..logging_util import printable_filesize
from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from ..search import query_search_index
from .serve_static import serve_static_with_byterange_support
CONFIG = benedict({**CONSTANTS, **CONFIG, **settings.FLAT_CONFIG})
from .serve_static import serve_static_with_byterange_support
from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from ..logging_util import printable_filesize
from ..search import query_search_index
class HomepageView(View):
@ -502,27 +494,43 @@ class HealthCheckView(View):
def find_config_section(key: str) -> str:
if key in CONSTANTS:
if key in CONSTANTS_CONFIG:
return 'CONSTANT'
matching_sections = [
name for name, opts in CONFIG_SCHEMA.items() if key in opts
section.id for section in settings.CONFIGS.values() if key in section.model_fields
]
section = matching_sections[0] if matching_sections else 'DYNAMIC'
return section
def find_config_default(key: str) -> str:
default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
if key in CONSTANTS_CONFIG:
return str(CONSTANTS_CONFIG[key])
default_val = None
for config in settings.CONFIGS.values():
if key in config.model_fields:
default_val = config.model_fields[key].default
break
if isinstance(default_val, Callable):
return None
default_val = inspect.getsource(default_val).split('lambda', 1)[-1].split(':', 1)[-1].replace('\n', ' ').strip()
if default_val.count(')') > default_val.count('('):
default_val = default_val[:-1]
else:
default_val = repr(default_val)
default_val = str(default_val)
return default_val
def find_config_type(key: str) -> str:
if key in USER_CONFIG:
return str(USER_CONFIG[key]['type'])
elif key in DYNAMIC_CONFIG_SCHEMA:
return str(type(CONFIG[key]))
for config in settings.CONFIGS.values():
if hasattr(config, key):
type_hints = get_type_hints(config)
try:
return str(type_hints[key].__name__)
except AttributeError:
return str(type_hints[key])
return 'str'
def key_is_safe(key: str) -> bool:
@ -543,40 +551,29 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
"Value": [],
"Default": [],
# "Documentation": [],
"Aliases": [],
# "Aliases": [],
}
for section in CONFIG_SCHEMA.keys():
for key in CONFIG_SCHEMA[section].keys():
rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '')
for section in reversed(list(settings.CONFIGS.values())):
for key, field in section.model_fields.items():
rows['Section'].append(section.id) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
rows['Type'].append(format_html('<code>{}</code>', find_config_type(key)))
rows['Value'].append(mark_safe(f'<code>{getattr(section, key)}</code>') if key_is_safe(key) else '******** (redacted)')
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
section = 'DYNAMIC'
for key in DYNAMIC_CONFIG_SCHEMA.keys():
if key in CONSTANTS:
continue
rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
# rows['Aliases'].append(', '.join(find_config_aliases(key)))
section = 'CONSTANT'
for key in CONSTANTS.keys():
for key in CONSTANTS_CONFIG.keys():
rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
rows['Type'].append(format_html('<code>{}</code>', getattr(type(CONSTANTS_CONFIG[key]), '__name__', repr(CONSTANTS_CONFIG[key]))))
rows['Value'].append(format_html('<code>{}</code>', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)')
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
# rows['Aliases'].append('')
return TableContext(
@ -589,11 +586,12 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
aliases = USER_CONFIG.get(key, {}).get("aliases", [])
# aliases = USER_CONFIG.get(key, {}).get("aliases", [])
aliases = []
if key in CONSTANTS:
if key in CONSTANTS_CONFIG:
section_header = mark_safe(f'[CONSTANTS] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, hardcoded by ArchiveBox)</small>')
elif key in USER_CONFIG:
elif key in settings.FLAT_CONFIG:
section_header = mark_safe(f'data / ArchiveBox.conf &nbsp; [{find_config_section(key)}] &nbsp; <b><code style="color: lightgray">{key}</code></b>')
else:
section_header = mark_safe(f'[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
@ -609,7 +607,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
"fields": {
'Key': key,
'Type': find_config_type(key),
'Value': CONFIG[key] if key_is_safe(key) else '********',
'Value': settings.FLAT_CONFIG[key] if key_is_safe(key) else '********',
},
"help_texts": {
'Key': mark_safe(f'''
@ -619,25 +617,25 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
</span>
'''),
'Type': mark_safe(f'''
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code">
See full definition in <code>archivebox/config.py</code>...
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
See full definition in <code>archivebox/config</code>...
</a>
'''),
'Value': mark_safe(f'''
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
<br/><hr/><br/>
Default: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code">
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
<code>{find_config_default(key) or '↗️ See in ArchiveBox source code...'}</code>
</a>
<br/><br/>
<p style="display: {"block" if key in USER_CONFIG else "none"}">
<p style="display: {"block" if key in settings.FLAT_CONFIG else "none"}">
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
<br/><br/>
<code>archivebox config --set {key}="{
val.strip("'")
if (val := find_config_default(key)) else
(repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
(repr(settings.FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
}"</code>
</p>
'''),
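
The rewritten find_config_section/find_config_default above lean on pydantic's model_fields instead of the legacy CONFIG_SCHEMA dicts; a minimal sketch of that lookup (ServerConfigSketch is a stand-in, assuming each entry in settings.CONFIGS is a pydantic BaseModel-style instance):

    from pydantic import BaseModel, Field

    class ServerConfigSketch(BaseModel):
        PUBLIC_SNAPSHOTS: bool = Field(default=True)
        SECRET_KEY: str = Field(default='')

    config = ServerConfigSketch()

    # section membership check, as in find_config_section:
    assert 'PUBLIC_SNAPSHOTS' in config.model_fields

    # default lookup, as in find_config_default:
    assert config.model_fields['PUBLIC_SNAPSHOTS'].default is True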

View file

@ -7,21 +7,10 @@ from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config.legacy import (
TIMEOUT,
CURL_ARGS,
CURL_EXTRA_ARGS,
CHECK_SSL_VALIDITY,
SAVE_ARCHIVE_DOT_ORG,
CURL_BINARY,
CURL_VERSION,
CURL_USER_AGENT,
)
from archivebox.misc.util import enforce_types, is_static_file, dedupe
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from ..logging_util import TimedProgress
@ -39,27 +28,30 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
# if open(path, 'r', encoding='utf-8').read().strip() != 'None':
return False
return SAVE_ARCHIVE_DOT_ORG
return ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
@enforce_types
def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
"""submit site to archive.org for archiving via their service, save returned archive url"""
curl_binary = CURL_BINARY.load()
assert curl_binary.abspath and curl_binary.version
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
*CURL_CONFIG.CURL_ARGS,
*CURL_CONFIG.CURL_EXTRA_ARGS,
'--head',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
*(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
*([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
str(curl_binary.abspath),
*dedupe(options),
submit_url,
]
@ -97,22 +89,22 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
return ArchiveResult(
cmd=cmd,
pwd=str(out_dir),
cmd_version=CURL_VERSION,
cmd_version=str(curl_binary.version),
output=output,
status=status,
**timer.stats,
)
@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
def parse_archive_dot_org_response(response: str) -> Tuple[List[str], List[str]]:
# Parse archive.org response headers
headers: Dict[str, List[str]] = defaultdict(list)
# lowercase all the header names and store in dict
for header in response.splitlines():
if b':' not in header or not header.strip():
if ':' not in header or not header.strip():
continue
name, val = header.decode().split(':', 1)
name, val = header.split(':', 1)
headers[name.lower().strip()].append(val.strip())
# Get successful archive url in "content-location" header or any errors
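
parse_archive_dot_org_response now takes str instead of bytes, so the b':' checks and .decode() calls go away; a quick standalone check of the str-based parsing above:

    from collections import defaultdict

    response = (
        'HTTP/2 200\n'
        'content-location: /web/20240101000000/https://example.com\n'
    )

    headers = defaultdict(list)
    for header in response.splitlines():
        if ':' not in header or not header.strip():
            continue
        name, val = header.split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    assert headers['content-location'] == ['/web/20240101000000/https://example.com']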

View file

@ -2,16 +2,11 @@ __package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from archivebox.misc.system import chmod_file, run
from archivebox.misc.util import (
enforce_types,
domain,
dedupe,
)
from ..config.legacy import CONFIG
from archivebox.misc.util import enforce_types, domain, dedupe
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..logging_util import TimedProgress
@ -22,7 +17,7 @@ def should_save_favicon(link: Link, out_dir: str | Path | None=None, overwrite:
if not overwrite and (out_dir / 'favicon.ico').exists():
return False
return CONFIG.SAVE_FAVICON
return FAVICON_CONFIG.SAVE_FAVICON
@enforce_types
def get_output_path():
@ -30,26 +25,29 @@ def get_output_path():
@enforce_types
def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
"""download site favicon from google's favicon api"""
curl_binary = CURL_BINARY.load()
assert curl_binary.abspath and curl_binary.version
out_dir = Path(out_dir or link.link_dir)
assert out_dir.exists()
output: ArchiveOutput = 'favicon.ico'
# later options take precedence
options = [
*CONFIG.CURL_ARGS,
*CONFIG.CURL_EXTRA_ARGS,
*CURL_CONFIG.CURL_ARGS,
*CURL_CONFIG.CURL_EXTRA_ARGS,
'--max-time', str(timeout),
'--output', str(output),
*(['--user-agent', '{}'.format(CONFIG.CURL_USER_AGENT)] if CONFIG.CURL_USER_AGENT else []),
*([] if CONFIG.CHECK_SSL_VALIDITY else ['--insecure']),
*(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
*([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CONFIG.CURL_BINARY,
str(curl_binary.abspath),
*dedupe(options),
CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
FAVICON_CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
]
status = 'failed'
timer = TimedProgress(timeout, prefix=' ')
@ -65,7 +63,7 @@ def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFI
return ArchiveResult(
cmd=cmd,
pwd=str(out_dir),
cmd_version=CONFIG.CURL_VERSION,
cmd_version=str(curl_binary.version),
output=output,
status=status,
**timer.stats,
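
All of the curl-based extractors build their argv with dedupe(options) under the comment "later options take precedence"; a hedged sketch of that behavior, assuming dedupe keys options on their leading flag and keeps the last occurrence (the real helper lives in archivebox.misc.util and the '=' form here is a simplification of the separate-token argv above):

    def dedupe_sketch(options: list) -> list:
        # keep only the last occurrence of each flag, so CURL_EXTRA_ARGS
        # and per-call flags override the CURL_ARGS defaults
        deduped = {}
        for option in options:
            deduped[option.split('=', 1)[0]] = option
        return list(deduped.values())

    options = ['--max-time=60', '--silent', '--max-time=10']
    assert dedupe_sketch(options) == ['--max-time=10', '--silent']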

View file

@ -4,7 +4,6 @@ __package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import (
enforce_types,
@ -14,8 +13,9 @@ from archivebox.misc.util import (
without_query,
without_fragment,
)
from ..config.legacy import CONFIG
from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
def get_output_path():
@ -42,28 +42,31 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
return False
is_clonable_url = (
(domain(link.url) in CONFIG.GIT_DOMAINS)
(domain(link.url) in GIT_CONFIG.GIT_DOMAINS)
or (extension(link.url) == 'git')
)
if not is_clonable_url:
return False
return CONFIG.SAVE_GIT
return GIT_CONFIG.SAVE_GIT
@enforce_types
def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=GIT_CONFIG.GIT_TIMEOUT) -> ArchiveResult:
"""download full site using git"""
git_binary = GIT_BINARY.load()
assert git_binary.abspath and git_binary.version
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
output_path = out_dir / output
output_path.mkdir(exist_ok=True)
cmd = [
CONFIG.GIT_BINARY,
str(git_binary.abspath),
'clone',
*CONFIG.GIT_ARGS,
*([] if CONFIG.CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
*GIT_CONFIG.GIT_ARGS,
*([] if GIT_CONFIG.GIT_CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
without_query(without_fragment(link.url)),
]
status = 'succeeded'
@ -88,7 +91,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEO
return ArchiveResult(
cmd=cmd,
pwd=str(out_dir),
cmd_version=CONFIG.GIT_VERSION,
cmd_version=str(git_binary.version),
output=output,
status=status,
**timer.stats,
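
The should_save_git gate above boils down to a domain allowlist plus a .git extension check; a minimal standalone version (is_clonable_url and the domain set are illustrative, with domains taken from the GIT_DOMAINS default earlier in this commit):

    from urllib.parse import urlparse

    GIT_DOMAINS = {'github.com', 'bitbucket.org', 'gitlab.com', 'gist.github.com'}

    def is_clonable_url(url: str) -> bool:
        parsed = urlparse(url)
        return (parsed.netloc in GIT_DOMAINS) or parsed.path.endswith('.git')

    assert is_clonable_url('https://github.com/ArchiveBox/ArchiveBox')
    assert is_clonable_url('https://example.com/some/repo.git')
    assert not is_clonable_url('https://example.com/article')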

View file

@ -4,23 +4,14 @@ from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from archivebox.misc.system import atomic_write
from archivebox.misc.util import (
enforce_types,
get_headers,
dedupe,
)
from ..config.legacy import (
TIMEOUT,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_USER_AGENT,
CURL_VERSION,
CHECK_SSL_VALIDITY,
SAVE_HEADERS
)
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..logging_util import TimedProgress
def get_output_path():
@ -29,34 +20,38 @@ def get_output_path():
@enforce_types
def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / get_output_path()).exists():
out_dir_path = Path(out_dir or link.link_dir)
assert out_dir_path
if not overwrite and (out_dir_path / get_output_path()).exists():
return False
return SAVE_HEADERS
return CURL_CONFIG.SAVE_HEADERS
@enforce_types
def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
"""Download site headers"""
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute()
curl_binary = CURL_BINARY.load()
assert curl_binary.abspath and curl_binary.version
out_dir_path = Path(out_dir or link.link_dir)
output_folder = out_dir_path.absolute()
output: ArchiveOutput = get_output_path()
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
timer = TimedProgress(timeout + 1, prefix=' ')
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
*CURL_CONFIG.CURL_ARGS,
*CURL_CONFIG.CURL_EXTRA_ARGS,
'--head',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
*(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
*([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
str(curl_binary.abspath),
*dedupe(options),
link.url,
]
@ -72,8 +67,8 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
return ArchiveResult(
cmd=cmd,
pwd=str(out_dir),
cmd_version=CURL_VERSION,
pwd=str(out_dir_path),
cmd_version=str(curl_binary.version),
output=output,
status=status,
**timer.stats,

View file

@ -5,18 +5,13 @@ import io
from pathlib import Path
from typing import Optional
from archivebox.config import VERSION
from ..config.legacy import (
SAVE_HTMLTOTEXT,
TIMEOUT,
)
from ..index.schema import Link, ArchiveResult, ArchiveError
from ..logging_util import TimedProgress
from archivebox.config import VERSION, ARCHIVING_CONFIG
from archivebox.config.legacy import SAVE_HTMLTOTEXT
from archivebox.misc.system import atomic_write
from archivebox.misc.util import (
enforce_types,
is_static_file,
)
from archivebox.misc.util import enforce_types, is_static_file
from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveError
from .title import get_html
@ -122,7 +117,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
@enforce_types
def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=ARCHIVING_CONFIG.TIMEOUT) -> ArchiveResult:
"""extract search-indexing-friendly text from an HTML document"""
out_dir = Path(out_dir or link.link_dir)

View file

@ -5,23 +5,14 @@ from html.parser import HTMLParser
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.util import (
enforce_types,
download_url,
htmldecode,
dedupe,
)
from ..config.legacy import (
TIMEOUT,
CHECK_SSL_VALIDITY,
SAVE_TITLE,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_VERSION,
CURL_USER_AGENT,
)
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress
@ -62,7 +53,7 @@ class TitleParser(HTMLParser):
@enforce_types
def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
def get_html(link: Link, path: Path, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> str:
"""
Try to find wget, singlefile and then dom files.
If none is found, download the url again.
@ -98,7 +89,7 @@ def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Option
if not overwrite and link.title and not link.title.lower().startswith('http'):
return False
return SAVE_TITLE
return CURL_CONFIG.SAVE_TITLE
def extract_title_with_regex(html):
match = re.search(HTML_TITLE_REGEX, html)
@ -106,22 +97,25 @@ def extract_title_with_regex(html):
return output
@enforce_types
def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
"""try to guess the page's title from its content"""
from core.models import Snapshot
curl_binary = CURL_BINARY.load()
assert curl_binary.abspath and curl_binary.version
output: ArchiveOutput = None
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
*CURL_CONFIG.CURL_ARGS,
*CURL_CONFIG.CURL_EXTRA_ARGS,
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
*(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
*([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
str(curl_binary.abspath),
*dedupe(options),
link.url,
]
@ -161,7 +155,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
return ArchiveResult(
cmd=cmd,
pwd=str(out_dir),
cmd_version=CURL_VERSION,
cmd_version=str(curl_binary.version),
output=output,
status=status,
**timer.stats,

View file

@ -430,7 +430,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
def status(out_dir: Path=DATA_DIR) -> None:
"""Print out some info and statistics about the archive collection"""
check_data_folder(CONFIG)
check_data_folder()
from core.models import Snapshot
from django.contrib.auth import get_user_model
@ -573,7 +573,7 @@ def add(urls: Union[str, List[str]],
run_subcommand('init', stdin=None, pwd=out_dir)
# Load list of links from the existing index
check_data_folder(CONFIG)
check_data_folder()
# worker = start_cli_workers()
@ -673,7 +673,7 @@ def remove(filter_str: Optional[str]=None,
out_dir: Path=DATA_DIR) -> List[Link]:
"""Remove the specified URLs from the archive"""
check_data_folder(CONFIG)
check_data_folder()
if snapshots is None:
if filter_str and filter_patterns:
@ -762,7 +762,7 @@ def update(resume: Optional[float]=None,
# from .queues.supervisor_util import start_cli_workers
check_data_folder(CONFIG)
check_data_folder()
# start_cli_workers()
new_links: List[Link] = [] # TODO: Remove input argument: only_new
@ -833,7 +833,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
out_dir: Path=DATA_DIR) -> Iterable[Link]:
"""List, filter, and export information about archive entries"""
check_data_folder(CONFIG)
check_data_folder()
if filter_patterns and filter_patterns_str:
stderr(
@ -881,7 +881,7 @@ def list_links(snapshots: Optional[QuerySet]=None,
before: Optional[float]=None,
out_dir: Path=DATA_DIR) -> Iterable[Link]:
check_data_folder(CONFIG)
check_data_folder()
if snapshots:
all_snapshots = snapshots
@ -905,7 +905,7 @@ def list_folders(links: List[Link],
status: str,
out_dir: Path=DATA_DIR) -> Dict[str, Optional[Link]]:
check_data_folder(CONFIG)
check_data_folder()
STATUS_FUNCTIONS = {
"indexed": get_indexed_folders,
@ -926,7 +926,7 @@ def list_folders(links: List[Link],
raise ValueError('Status not recognized.')
@enforce_types
def setup(out_dir: Path=DATA_DIR) -> None:
def install(out_dir: Path=DATA_DIR) -> None:
"""Automatically install all ArchiveBox dependencies and extras"""
from rich import print
@ -937,40 +937,20 @@ def setup(out_dir: Path=DATA_DIR) -> None:
stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')
for binary in settings.BINARIES.values():
for binary in reversed(list(settings.BINARIES.values())):
try:
print(binary.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
except Exception as e:
print(f'[X] Failed to install {binary.name}: {e}')
# from plugins_extractor.curl.apps import CURL_BINARY
# print(CURL_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
# from plugins_extractor.wget.apps import WGET_BINARY
# print(WGET_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
# from plugins_extractor.ytdlp.apps import YTDLP_BINARY
# print(YTDLP_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
# from plugins_extractor.chrome.apps import CHROME_BINARY
# print(CHROME_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
# from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
# print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
# from plugins_extractor.readability.apps import READABILITY_BINARY
# print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
# from plugins_extractor.mercury.apps import MERCURY_BINARY
# print(MERCURY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
from django.contrib.auth import get_user_model
User = get_user_model()
if not User.objects.filter(is_superuser=True).exists():
stderr('\n[+] Creating new admin user for the Web UI...', color='green')
run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
stderr(' archivebox manage createsuperuser')
# run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
@ -978,6 +958,10 @@ def setup(out_dir: Path=DATA_DIR) -> None:
run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)
# backwards-compatibility:
setup = install
@enforce_types
def config(config_options_str: Optional[str]=None,
config_options: Optional[List[str]]=None,
@ -989,7 +973,7 @@ def config(config_options_str: Optional[str]=None,
from rich import print
check_data_folder(CONFIG)
check_data_folder()
if config_options and config_options_str:
stderr(
'[X] You should either pass config values as an arguments '
@ -1090,8 +1074,8 @@ def schedule(add: bool=False,
out_dir: Path=DATA_DIR):
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
check_data_folder(CONFIG)
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
check_data_folder()
from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
@ -1228,7 +1212,7 @@ def server(runserver_args: Optional[List[str]]=None,
print()
check_data_folder(CONFIG)
check_data_folder()
from django.core.management import call_command
from django.contrib.auth.models import User
@ -1280,7 +1264,7 @@ def server(runserver_args: Optional[List[str]]=None,
def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
"""Run an ArchiveBox Django management command"""
check_data_folder(CONFIG)
check_data_folder()
from django.core.management import execute_from_command_line
if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
@ -1297,7 +1281,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
def shell(out_dir: Path=DATA_DIR) -> None:
"""Enter an interactive ArchiveBox Django shell"""
check_data_folder(CONFIG)
check_data_folder()
from django.core.management import call_command
call_command("shell_plus")

View file

@ -1,13 +1,11 @@
__package__ = 'archivebox.misc'
from benedict import benedict
from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG
from .logging import stderr
def check_data_folder(config: benedict) -> None:
def check_data_folder() -> None:
archive_dir_exists = ARCHIVE_DIR.exists()
if not archive_dir_exists:
@ -23,7 +21,7 @@ def check_data_folder(config: benedict) -> None:
raise SystemExit(2)
def check_migrations(config: benedict):
def check_migrations():
from ..index.sql import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]

View file

@ -1,10 +1,10 @@
__package__ = 'plugins_extractor.curl'
from typing import List, Optional, Dict
from typing import List, Optional
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
@ -12,15 +12,26 @@ from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config import ARCHIVING_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
class CurlConfig(BaseConfigSet):
SAVE_CURL: bool = True
# USE_CURL: bool = Field(default=lambda c: c.SAVE_HEADERS or c.SAVE_FAVICON)
SAVE_TITLE: bool = Field(default=True)
SAVE_HEADERS: bool = Field(default=True)
USE_CURL: bool = Field(default=lambda c:
ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
or FAVICON_CONFIG.SAVE_FAVICON
or c.SAVE_HEADERS
or c.SAVE_TITLE
)
CURL_BINARY: str = Field(default='curl')
CURL_ARGS: List[str] = [
'--silent',
'--location',
'--compressed',
]
CURL_EXTRA_ARGS: List[str] = []
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
@ -35,12 +46,6 @@ CURL_CONFIG = CurlConfig()
class CurlBinary(BaseBinary):
name: BinName = CURL_CONFIG.CURL_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
brew.name: {
'abspath': lambda: bin_abspath(CURL_CONFIG.CURL_BINARY, PATH=f'/opt/homebrew/opt/curl/bin:{brew.PATH}'),
},
}
CURL_BINARY = CurlBinary()
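
The USE_CURL field above uses the BaseConfigSet convention of Field(default=lambda c: ...), where the lambda receives the config instance. With plain pydantic the same derived default can be sketched via a model_validator (CurlConfigSketch is a stand-in, not the real class, and it omits the cross-plugin SAVE_ARCHIVE_DOT_ORG/SAVE_FAVICON terms):

    from typing import Optional
    from pydantic import BaseModel, model_validator

    class CurlConfigSketch(BaseModel):
        SAVE_TITLE: bool = True
        SAVE_HEADERS: bool = True
        USE_CURL: Optional[bool] = None     # None means "derive from the others"

        @model_validator(mode='after')
        def derive_use_curl(self):
            if self.USE_CURL is None:
                self.USE_CURL = self.SAVE_HEADERS or self.SAVE_TITLE
            return self

    assert CurlConfigSketch().USE_CURL is True
    assert CurlConfigSketch(SAVE_TITLE=False, SAVE_HEADERS=False).USE_CURL is False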

View file

@ -1,13 +1,13 @@
__package__ = 'plugins_extractor.wget'
import sys
from typing import List, Optional, Dict
from typing import List, Optional
from pathlib import Path
from subprocess import run, DEVNULL
from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
@ -80,12 +80,6 @@ WGET_CONFIG = WgetConfig()
class WgetBinary(BaseBinary):
name: BinName = WGET_CONFIG.WGET_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
brew.name: {
'abspath': lambda: bin_abspath(WGET_CONFIG.WGET_BINARY, PATH=f'/opt/homebrew/opt/wget/bin:{brew.PATH}'),
},
}
WGET_BINARY = WgetBinary()

View file

@ -11,7 +11,7 @@ from archivebox.misc.util import enforce_types
from archivebox.misc.logging import stderr
from archivebox.config.legacy import ANSI
# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
from archivebox.config import SEARCH_BACKEND_CONFIG
def log_index_started(url):
@ -58,13 +58,13 @@ def get_indexable_content(results: QuerySet):
def import_backend():
for backend in settings.SEARCH_BACKENDS.values():
if backend.name == settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE:
if backend.name == SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE:
return backend
raise Exception(f'Could not load {settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE} as search backend')
raise Exception(f'Could not load {SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE} as search backend')
@enforce_types
def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND:
if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND:
return
if not skip_text_index and texts:
@ -86,7 +86,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
from core.models import Snapshot
if settings.CONFIGS.SearchBackendConfig.USE_SEARCHING_BACKEND:
if SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
backend = import_backend()
try:
snapshot_pks = backend.search(query)
@ -106,7 +106,7 @@ def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
@enforce_types
def flush_search_index(snapshots: QuerySet):
if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND or not snapshots:
if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND or not snapshots:
return
backend = import_backend()
snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
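
import_backend above now compares each backend's .name against the typed SEARCH_BACKEND_CONFIG instead of the settings.CONFIGS.SearchBackendConfig indirection; a minimal sketch of that selection loop (the Backend namedtuple and registry contents are illustrative):

    from collections import namedtuple

    Backend = namedtuple('Backend', 'name search')

    SEARCH_BACKENDS = {
        'ripgrep': Backend(name='ripgrep', search=lambda q: []),
        'sqlite':  Backend(name='sqlitefts', search=lambda q: []),
    }

    def import_backend(engine_name: str) -> Backend:
        for backend in SEARCH_BACKENDS.values():
            if backend.name == engine_name:
                return backend
        raise Exception(f'Could not load {engine_name} as search backend')

    assert import_backend('ripgrep').name == 'ripgrep'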

@ -1 +1 @@
Subproject commit 4f9486ab86a65f83ad1bfd94320795b8e09871aa
Subproject commit 4f31b355fbf319a54b38953795b17b1b04db4348