mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2025-02-16 13:28:29 +00:00
finish migrating almost all config to new system
This commit is contained in:
parent 4b6a2a3e50
commit d21bc86075
25 changed files with 246 additions and 349 deletions
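In short, call sites stop importing loose values from the flat legacy config module and instead read attributes off the new per-concern ConfigSet objects exported by archivebox.config. A condensed sketch of the before/after pattern repeated throughout the hunks below (names taken directly from this diff):

# Before: flat, untyped values imported from the legacy config module
from ..config.legacy import ONLY_NEW, TIMEOUT

update = not ONLY_NEW
timeout = TIMEOUT

# After: typed ConfigSet objects exported by archivebox.config
from archivebox.config import ARCHIVING_CONFIG

update = not ARCHIVING_CONFIG.ONLY_NEW
timeout = ARCHIVING_CONFIG.TIMEOUT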
@@ -13,43 +13,6 @@ HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', '
 hook_type_names: Tuple[HookType] = get_args(HookType)

 class BaseHook(BaseModel):
-    """
-    A Plugin consists of a list of Hooks, applied to django.conf.settings when AppConfig.ready() -> Plugin.register() is called.
-    Plugin.register() then calls each Hook.register() on the provided settings.
-    Each Hook.register() function (ideally pure) takes django.conf.settings as input and returns a new one back,
-    or it modifies django.conf.settings in-place to add changes corresponding to its HookType.
-    e.g. for a HookType.CONFIG, the Hook.register() function places the hook in settings.CONFIG (and settings.HOOKS).
-    An example of an impure Hook would be a CHECK that modifies settings but also calls django.core.checks.register(check).
-    In practice, any object that subclasses BaseHook and provides a .register() function can behave as a Hook.
-
-    setup_django() -> imports all settings.INSTALLED_APPS...
-        # django imports AppConfig, models, migrations, admins, etc. for all installed apps
-        # django then calls AppConfig.ready() on each installed app...
-
-        plugins_pkg.npm.NpmPlugin().AppConfig.ready()  # called by django
-            plugins_pkg.npm.NpmPlugin().register(settings) ->
-                plugins_pkg.npm.NpmConfigSet().register(settings)
-                    abx.archivebox.base_configset.BaseConfigSet().register(settings)
-                        abx.archivebox.base_hook.BaseHook().register(settings, parent_plugin=plugins_pkg.npm.NpmPlugin())
-        ...
-        ...
-
-    Both core ArchiveBox code and plugin code depend on python >= 3.10 and django >= 5.0 w/ sqlite and a filesystem.
-    Core ArchiveBox code can depend only on python and the pip libraries it ships with, and can never depend on plugin code / node / other binaries.
-    Plugin code can depend on archivebox core, other django apps, other pip libraries, and other plugins.
-    Plugins can provide BinProviders + Binaries, which can depend on arbitrary other binaries / package managers like curl / wget / yt-dlp / etc.
-
-    The execution interface between plugins is simply calling builtinplugins.npm.... functions directly; django handles
-    importing all plugin code. There is no need to manually register methods/classes, only register to call
-    impure setup functions or provide runtime state.
-    settings.CONFIGS / settings.BINPROVIDERS / settings.BINARIES / ... etc. are reserved for dynamic runtime state only.
-    This state is exposed to the broader system in a flat namespace, e.g. CONFIG.IS_DOCKER=True, or BINARIES = [
-        ..., Binary('node', abspath='/usr/local/bin/node', version='22.2.0'), ...
-    ]
-    """
     model_config = ConfigDict(
         extra="allow",
         arbitrary_types_allowed=True,
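The call chain the removed docstring describes can be boiled down to this sketch (illustrative stand-in classes only, not the real abx.archivebox base classes):

# Illustrative sketch -- the real BaseHook/BasePlugin live in abx.archivebox.
class ExampleHook:
    hook_type = 'CONFIG'

    def register(self, settings, parent_plugin=None):
        # A CONFIG hook adds itself to the flat runtime state on django settings.
        settings.HOOKS.append(self)

class ExamplePlugin:
    hooks = [ExampleHook()]

    def register(self, settings):
        # Invoked from AppConfig.ready() when django imports the app.
        for hook in self.hooks:
            hook.register(settings, parent_plugin=self)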
@@ -13,7 +13,7 @@ from ..main import (
     schedule,
 )
 from archivebox.misc.util import ansi_to_html
-from ..config.legacy import ONLY_NEW
+from archivebox.config import ARCHIVING_CONFIG

 from .auth import API_AUTH_METHODS

@@ -58,7 +58,7 @@ class AddCommandSchema(Schema):
     urls: List[str]
     tag: str = ""
     depth: int = 0
-    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
     update_all: bool = False
     index_only: bool = False
     overwrite: bool = False

@@ -68,7 +68,7 @@ class AddCommandSchema(Schema):

 class UpdateCommandSchema(Schema):
     resume: Optional[float] = 0
-    only_new: bool = ONLY_NEW
+    only_new: bool = ARCHIVING_CONFIG.ONLY_NEW
     index_only: bool = False
     overwrite: bool = False
     after: Optional[float] = 0

@@ -85,7 +85,7 @@ class ScheduleCommandSchema(Schema):
     tag: str = ''
     depth: int = 0
     overwrite: bool = False
-    update: bool = not ONLY_NEW
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
     clear: bool = False

 class ListCommandSchema(Schema):
@@ -152,18 +152,15 @@ def run_subcommand(subcommand: str,
     subcommand_args = subcommand_args or []

     if subcommand not in meta_cmds:
-        from ..config.legacy import setup_django, CONFIG
+        from archivebox.config.legacy import setup_django

         cmd_requires_db = subcommand in archive_cmds
         init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args

-        if cmd_requires_db:
-            check_data_folder(CONFIG)
-
         setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)

         if cmd_requires_db:
-            check_migrations(CONFIG)
+            check_migrations()

         module = import_module('.archivebox_{}'.format(subcommand), __package__)
         module.main(args=subcommand_args, stdin=stdin, pwd=pwd)  # type: ignore
@@ -1,6 +1,6 @@
 __package__ = 'archivebox.config'

-from .constants import CONSTANTS, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
+from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
 from .defaults import (
     SHELL_CONFIG,
     STORAGE_CONFIG,

@@ -23,4 +23,5 @@ __all__ = [
     'SERVER_CONFIG',
     'ARCHIVING_CONFIG',
     'SEARCH_BACKEND_CONFIG',
+    'CONSTANTS_CONFIG',
 ]
@@ -60,6 +60,7 @@ from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CON
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG

 ANSI = SHELL_CONFIG.ANSI
 LDAP = LDAP_CONFIG.LDAP_ENABLED

@@ -81,9 +82,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {

     'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),

-    'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
+    # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),

-    'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
+    # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),

+    # 'CURL_CONFIG': CURL_CONFIG.as_legacy_config_schema(),

     'ARCHIVE_METHOD_TOGGLES': {

@@ -109,7 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {

     'ARCHIVE_METHOD_OPTIONS': {
         'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
-        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
+        # 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
         'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
         'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

@@ -144,15 +147,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         ]},
         'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},

-        'CURL_ARGS': {'type': list, 'default': ['--silent',
-                                                '--location',
-                                                '--compressed'
-                                               ]},
-        'CURL_EXTRA_ARGS': {'type': list, 'default': None},
         'GIT_ARGS': {'type': list, 'default': ['--recursive']},
         'SINGLEFILE_ARGS': {'type': list, 'default': None},
         'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
     },

     'DEPENDENCY_CONFIG': {

@@ -164,9 +158,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
         'USE_RIPGREP': {'type': bool, 'default': True},

-        'CURL_BINARY': {'type': str, 'default': 'curl'},
-        'GIT_BINARY': {'type': str, 'default': 'git'},
-        'NODE_BINARY': {'type': str, 'default': 'node'},
+        # 'GIT_BINARY': {'type': str, 'default': 'git'},
+        # 'CURL_BINARY': {'type': str, 'default': 'curl'},
+        # 'NODE_BINARY': {'type': str, 'default': 'node'},
         # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},  # also can accept youtube-dl
         # 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
         # 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
@@ -209,21 +203,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},

-    'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
-    'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
-    # 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
-    'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
-    'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
-    'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
-    'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
-
-    'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
-    'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
-    'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
+    # 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
+    # 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
+    # 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},

-    'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
+    # 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
     # 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
     # 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},

@@ -613,13 +598,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
         #     'is_valid': True,
         # },
-        'CURL_BINARY': {
-            'path': bin_path(config['CURL_BINARY']),
-            'version': config['CURL_VERSION'],
-            'hash': bin_hash(config['CURL_BINARY']),
-            'enabled': config['USE_CURL'],
-            'is_valid': bool(config['CURL_VERSION']),
-        },
+        # 'CURL_BINARY': {
+        #     'path': bin_path(config['CURL_BINARY']),
+        #     'version': config['CURL_VERSION'],
+        #     'hash': bin_hash(config['CURL_BINARY']),
+        #     'enabled': config['USE_CURL'],
+        #     'is_valid': bool(config['CURL_VERSION']),
+        # },
         # 'WGET_BINARY': {
         #     'path': bin_path(config['WGET_BINARY']),
         #     'version': config['WGET_VERSION'],

@@ -641,13 +626,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
         #     'enabled': config['USE_MERCURY'],
         #     'is_valid': bool(config['MERCURY_VERSION']),
         # },
-        'GIT_BINARY': {
-            'path': bin_path(config['GIT_BINARY']),
-            'version': config['GIT_VERSION'],
-            'hash': bin_hash(config['GIT_BINARY']),
-            'enabled': config['USE_GIT'],
-            'is_valid': bool(config['GIT_VERSION']),
-        },
+        # 'GIT_BINARY': {
+        #     'path': bin_path(config['GIT_BINARY']),
+        #     'version': config['GIT_VERSION'],
+        #     'hash': bin_hash(config['GIT_BINARY']),
+        #     'enabled': config['USE_GIT'],
+        #     'is_valid': bool(config['GIT_VERSION']),
+        # },
         # 'SINGLEFILE_BINARY': {
         #     'path': bin_path(config['SINGLEFILE_BINARY']),
         #     'version': config['SINGLEFILE_VERSION'],
@@ -76,7 +76,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:

     relevant_configs = {
         key: val
-        for key, val in settings.CONFIG.items()
+        for key, val in settings.FLAT_CONFIG.items()
         if '_BINARY' in key or '_VERSION' in key
     }

@@ -105,6 +105,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
             f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
             for config_key, config_value in relevant_configs.items()
             if str(binary.name).lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
+               or config_value.lower().endswith(binary.name.lower())
             # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
         )))
         # if not binary.provider_overrides:
@@ -36,7 +36,7 @@ from main import remove
 from extractors import archive_links


-CONFIG = settings.CONFIG
+CONFIG = settings.FLAT_CONFIG

 GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -1,13 +1,11 @@
 __package__ = 'archivebox.core'


-from ..config.legacy import (
-    LDAP
-)
+from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG

 def register_signals():

-    if LDAP:
+    if LDAP_CONFIG.LDAP_ENABLED:
         import django_auth_ldap.backend
         from .auth_ldap import create_user
@@ -1,9 +1,7 @@
-from ..config.legacy import (
-    LDAP_CREATE_SUPERUSER
-)
+from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG

 def create_user(sender, user=None, ldap_user=None, **kwargs):
-    if not user.id and LDAP_CREATE_SUPERUSER:
+    if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
         user.is_superuser = True

     user.is_staff = True
@@ -5,7 +5,7 @@ from django.utils import timezone
 from django.contrib.auth.middleware import RemoteUserMiddleware
 from django.core.exceptions import ImproperlyConfigured

-from ..config.legacy import PUBLIC_SNAPSHOTS, REVERSE_PROXY_USER_HEADER, REVERSE_PROXY_WHITELIST
+from archivebox.config import SERVER_CONFIG


 def detect_timezone(request, activate: bool=True):

@@ -32,7 +32,7 @@ def CacheControlMiddleware(get_response):
         response = get_response(request)

         if '/archive/' in request.path or '/static/' in request.path:
-            policy = 'public' if PUBLIC_SNAPSHOTS else 'private'
+            policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
             response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
             # print('Set Cache-Control header to', response['Cache-Control'])
         return response

@@ -40,15 +40,15 @@ def CacheControlMiddleware(get_response):
     return middleware

 class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
-    header = 'HTTP_{normalized}'.format(normalized=REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
+    header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())

     def process_request(self, request):
-        if REVERSE_PROXY_WHITELIST == '':
+        if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == '':
             return

         ip = request.META.get('REMOTE_ADDR')

-        for cidr in REVERSE_PROXY_WHITELIST.split(','):
+        for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(','):
             try:
                 network = ipaddress.ip_network(cidr)
             except ValueError:
@@ -13,9 +13,7 @@ import abx.archivebox
 import abx.archivebox.use
 import abx.django.use

-from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS  # noqa
-
-from ..config.legacy import CONFIG
+from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG, SERVER_CONFIG  # noqa

 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ

@@ -80,7 +78,7 @@ LOGOUT_REDIRECT_URL = os.environ.get('LOGOUT_REDIRECT_URL', '/')
 PASSWORD_RESET_URL = '/accounts/password_reset/'
 APPEND_SLASH = True

-DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
+DEBUG = SHELL_CONFIG.DEBUG or ('--debug' in sys.argv)


 INSTALLED_APPS = [

@@ -364,10 +362,10 @@ STORAGES = {
 ### Security Settings
 ################################################################################

-SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
+SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')

-ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
-CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
+ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(',')
+CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))

 # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
 # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
@@ -10,7 +10,7 @@ from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthC
 from .serve_static import serve_static

 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
-# from .config.legacy import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
+# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
 # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
@@ -1,7 +1,7 @@
 __package__ = 'archivebox.core'

-from typing import Callable
-from benedict import benedict
+import inspect
+from typing import Callable, get_type_hints
 from pathlib import Path

 from django.shortcuts import render, redirect

@@ -27,21 +27,13 @@ from core.admin import result_url

 from queues.tasks import bg_add

-from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
-from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
-
-from ..config.legacy import (
-    CONFIG_SCHEMA,
-    DYNAMIC_CONFIG_SCHEMA,
-    USER_CONFIG,
-    CONFIG,
-)
-from ..logging_util import printable_filesize
+from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
-from ..search import query_search_index
-from .serve_static import serve_static_with_byterange_support
-
-CONFIG = benedict({**CONSTANTS, **CONFIG, **settings.FLAT_CONFIG})
+from .serve_static import serve_static_with_byterange_support
+from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from ..logging_util import printable_filesize
+from ..search import query_search_index


 class HomepageView(View):
@@ -502,27 +494,43 @@ class HealthCheckView(View):


 def find_config_section(key: str) -> str:
-    if key in CONSTANTS:
+    if key in CONSTANTS_CONFIG:
         return 'CONSTANT'
     matching_sections = [
-        name for name, opts in CONFIG_SCHEMA.items() if key in opts
+        section.id for section in settings.CONFIGS.values() if key in section.model_fields
     ]
     section = matching_sections[0] if matching_sections else 'DYNAMIC'
     return section

 def find_config_default(key: str) -> str:
-    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
+    if key in CONSTANTS_CONFIG:
+        return str(CONSTANTS_CONFIG[key])
+
+    default_val = None
+
+    for config in settings.CONFIGS.values():
+        if key in config.model_fields:
+            default_val = config.model_fields[key].default
+            break
+
     if isinstance(default_val, Callable):
+        return None
-        default_val = inspect.getsource(default_val).split('lambda', 1)[-1].split(':', 1)[-1].replace('\n', ' ').strip()
-        if default_val.count(')') > default_val.count('('):
-            default_val = default_val[:-1]
     else:
-        default_val = repr(default_val)
+        default_val = str(default_val)

     return default_val

 def find_config_type(key: str) -> str:
-    if key in USER_CONFIG:
-        return str(USER_CONFIG[key]['type'])
-    elif key in DYNAMIC_CONFIG_SCHEMA:
-        return str(type(CONFIG[key]))
+    for config in settings.CONFIGS.values():
+        if hasattr(config, key):
+            type_hints = get_type_hints(config)
+            try:
+                return str(type_hints[key].__name__)
+            except AttributeError:
+                return str(type_hints[key])
     return 'str'

 def key_is_safe(key: str) -> bool:
@@ -543,40 +551,29 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
         "Value": [],
         "Default": [],
         # "Documentation": [],
-        "Aliases": [],
+        # "Aliases": [],
     }

-    for section in CONFIG_SCHEMA.keys():
-        for key in CONFIG_SCHEMA[section].keys():
-            rows['Section'].append(section)    # section.replace('_', ' ').title().replace(' Config', '')
+    for section in reversed(list(settings.CONFIGS.values())):
+        for key, field in section.model_fields.items():
+            rows['Section'].append(section.id)    # section.replace('_', ' ').title().replace(' Config', '')
             rows['Key'].append(ItemLink(key, key=key))
-            rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-            rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
+            rows['Type'].append(format_html('<code>{}</code>', find_config_type(key)))
+            rows['Value'].append(mark_safe(f'<code>{getattr(section, key)}</code>') if key_is_safe(key) else '******** (redacted)')
+            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
             # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-            rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
-
-    section = 'DYNAMIC'
-    for key in DYNAMIC_CONFIG_SCHEMA.keys():
-        if key in CONSTANTS:
-            continue
-        rows['Section'].append(section)    # section.replace('_', ' ').title().replace(' Config', '')
-        rows['Key'].append(ItemLink(key, key=key))
-        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
-        # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+            # rows['Aliases'].append(', '.join(find_config_aliases(key)))


     section = 'CONSTANT'
-    for key in CONSTANTS.keys():
+    for key in CONSTANTS_CONFIG.keys():
         rows['Section'].append(section)    # section.replace('_', ' ').title().replace(' Config', '')
         rows['Key'].append(ItemLink(key, key=key))
-        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
+        rows['Type'].append(format_html('<code>{}</code>', getattr(type(CONSTANTS_CONFIG[key]), '__name__', repr(CONSTANTS_CONFIG[key]))))
+        rows['Value'].append(format_html('<code>{}</code>', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)')
+        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
         # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+        # rows['Aliases'].append('')


     return TableContext(
@@ -589,11 +586,12 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont

     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'

-    aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+    # aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+    aliases = []

-    if key in CONSTANTS:
+    if key in CONSTANTS_CONFIG:
         section_header = mark_safe(f'[CONSTANTS] <b><code style="color: lightgray">{key}</code></b> <small>(read-only, hardcoded by ArchiveBox)</small>')
-    elif key in USER_CONFIG:
+    elif key in settings.FLAT_CONFIG:
         section_header = mark_safe(f'data / ArchiveBox.conf [{find_config_section(key)}] <b><code style="color: lightgray">{key}</code></b>')
     else:
         section_header = mark_safe(f'[DYNAMIC CONFIG] <b><code style="color: lightgray">{key}</code></b> <small>(read-only, calculated at runtime)</small>')
@@ -609,7 +607,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
         "fields": {
             'Key': key,
             'Type': find_config_type(key),
-            'Value': CONFIG[key] if key_is_safe(key) else '********',
+            'Value': settings.FLAT_CONFIG[key] if key_is_safe(key) else '********',
         },
         "help_texts": {
             'Key': mark_safe(f'''

@@ -619,25 +617,25 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
                 </span>
             '''),
             'Type': mark_safe(f'''
-                <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code">
-                    See full definition in <code>archivebox/config.py</code>...
+                <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
+                    See full definition in <code>archivebox/config</code>...
                 </a>
             '''),
             'Value': mark_safe(f'''
                 {'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
                 <br/><hr/><br/>
                 Default:
-                <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code">
+                <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
                     <code>{find_config_default(key) or '↗️ See in ArchiveBox source code...'}</code>
                 </a>
                 <br/><br/>
-                <p style="display: {"block" if key in USER_CONFIG else "none"}">
+                <p style="display: {"block" if key in settings.FLAT_CONFIG else "none"}">
                     <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
                     <br/><br/>
                     <code>archivebox config --set {key}="{
                         val.strip("'")
                         if (val := find_config_default(key)) else
-                        (repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
+                        (repr(settings.FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
                     }"</code>
                 </p>
             '''),
@@ -7,21 +7,10 @@ from collections import defaultdict

 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
-from archivebox.misc.util import (
-    enforce_types,
-    is_static_file,
-    dedupe,
-)
-from ..config.legacy import (
-    TIMEOUT,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CHECK_SSL_VALIDITY,
-    SAVE_ARCHIVE_DOT_ORG,
-    CURL_BINARY,
-    CURL_VERSION,
-    CURL_USER_AGENT,
-)
+from archivebox.misc.util import enforce_types, is_static_file, dedupe
+from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY

 from ..logging_util import TimedProgress

@@ -39,27 +28,30 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
         # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False

-    return SAVE_ARCHIVE_DOT_ORG
+    return ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG

 @enforce_types
-def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """submit site to archive.org for archiving via their service, save returned archive url"""

+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         submit_url,
     ]

@@ -97,22 +89,22 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,
     )

 @enforce_types
-def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
+def parse_archive_dot_org_response(response: str) -> Tuple[List[str], List[str]]:
     # Parse archive.org response headers
     headers: Dict[str, List[str]] = defaultdict(list)

     # lowercase all the header names and store in dict
     for header in response.splitlines():
-        if b':' not in header or not header.strip():
+        if ':' not in header or not header.strip():
             continue
-        name, val = header.decode().split(':', 1)
+        name, val = header.split(':', 1)
         headers[name.lower().strip()].append(val.strip())

     # Get successful archive url in "content-location" header or any errors
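The signature change from bytes to str above removes the per-header decode step. Stood alone, the new parsing logic amounts to the following (a self-contained restatement of the diff, not new behavior):

from collections import defaultdict
from typing import Dict, List

def parse_headers(response: str) -> Dict[str, List[str]]:
    """Lowercase header names and collect repeated headers into lists."""
    headers: Dict[str, List[str]] = defaultdict(list)
    for header in response.splitlines():
        if ':' not in header or not header.strip():
            continue
        name, val = header.split(':', 1)
        headers[name.lower().strip()].append(val.strip())
    return headers

# parse_headers('Content-Location: /web/20240101/https://example.com')
# -> {'content-location': ['/web/20240101/https://example.com']}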
@@ -2,16 +2,11 @@ __package__ = 'archivebox.extractors'

 from pathlib import Path
 from typing import Optional

-from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from archivebox.misc.system import chmod_file, run
-from archivebox.misc.util import (
-    enforce_types,
-    domain,
-    dedupe,
-)
-from ..config.legacy import CONFIG
+from archivebox.misc.util import enforce_types, domain, dedupe
+from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
@@ -22,7 +17,7 @@ def should_save_favicon(link: Link, out_dir: str | Path | None=None, overwrite:
     if not overwrite and (out_dir / 'favicon.ico').exists():
         return False

-    return CONFIG.SAVE_FAVICON
+    return FAVICON_CONFIG.SAVE_FAVICON

 @enforce_types
 def get_output_path():

@@ -30,26 +25,29 @@ def get_output_path():


 @enforce_types
-def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
+def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""

+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     out_dir = Path(out_dir or link.link_dir)
     assert out_dir.exists()

     output: ArchiveOutput = 'favicon.ico'
     # later options take precedence
     options = [
-        *CONFIG.CURL_ARGS,
-        *CONFIG.CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         '--output', str(output),
-        *(['--user-agent', '{}'.format(CONFIG.CURL_USER_AGENT)] if CONFIG.CURL_USER_AGENT else []),
-        *([] if CONFIG.CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CONFIG.CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
-        CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
+        FAVICON_CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
     ]
     status = 'failed'
     timer = TimedProgress(timeout, prefix='      ')

@@ -65,7 +63,7 @@ def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFI
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CONFIG.CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,
@@ -4,7 +4,6 @@ __package__ = 'archivebox.extractors'
 from pathlib import Path
 from typing import Optional

-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import (
     enforce_types,

@@ -14,8 +13,9 @@ from archivebox.misc.util import (
     without_query,
     without_fragment,
 )
-from ..config.legacy import CONFIG
+from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
 from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError


 def get_output_path():

@@ -42,28 +42,31 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False

     is_clonable_url = (
-        (domain(link.url) in CONFIG.GIT_DOMAINS)
+        (domain(link.url) in GIT_CONFIG.GIT_DOMAINS)
         or (extension(link.url) == 'git')
     )
     if not is_clonable_url:
         return False

-    return CONFIG.SAVE_GIT
+    return GIT_CONFIG.SAVE_GIT


 @enforce_types
-def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
+def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=GIT_CONFIG.GIT_TIMEOUT) -> ArchiveResult:
     """download full site using git"""

+    git_binary = GIT_BINARY.load()
+    assert git_binary.abspath and git_binary.version
+
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     cmd = [
-        CONFIG.GIT_BINARY,
+        str(git_binary.abspath),
         'clone',
-        *CONFIG.GIT_ARGS,
-        *([] if CONFIG.CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
+        *GIT_CONFIG.GIT_ARGS,
+        *([] if GIT_CONFIG.GIT_CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
         without_query(without_fragment(link.url)),
     ]
     status = 'succeeded'

@@ -88,7 +91,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEO
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CONFIG.GIT_VERSION,
+        cmd_version=str(git_binary.version),
         output=output,
         status=status,
         **timer.stats,
@@ -4,23 +4,14 @@ from pathlib import Path

 from typing import Optional

-from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from archivebox.misc.system import atomic_write
 from archivebox.misc.util import (
     enforce_types,
     get_headers,
     dedupe,
 )
-from ..config.legacy import (
-    TIMEOUT,
-    CURL_BINARY,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CURL_USER_AGENT,
-    CURL_VERSION,
-    CHECK_SSL_VALIDITY,
-    SAVE_HEADERS
-)
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress

 def get_output_path():

@@ -29,34 +20,38 @@ def get_output_path():

 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
-    out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / get_output_path()).exists():
+    out_dir_path = Path(out_dir or link.link_dir)
+    assert out_dir_path
+    if not overwrite and (out_dir_path / get_output_path()).exists():
         return False

-    return SAVE_HEADERS
+    return CURL_CONFIG.SAVE_HEADERS


 @enforce_types
-def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """Download site headers"""

-    out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute()
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
+    out_dir_path = Path(out_dir or link.link_dir)
+    output_folder = out_dir_path.absolute()
     output: ArchiveOutput = get_output_path()

     status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
+    timer = TimedProgress(timeout + 1, prefix='      ')
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         link.url,
     ]

@@ -72,8 +67,8 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)

     return ArchiveResult(
         cmd=cmd,
-        pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        pwd=str(out_dir_path),
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,
@@ -5,18 +5,13 @@ import io
 from pathlib import Path
 from typing import Optional

-from archivebox.config import VERSION
-from ..config.legacy import (
-    SAVE_HTMLTOTEXT,
-    TIMEOUT,
-)
-from ..index.schema import Link, ArchiveResult, ArchiveError
-from ..logging_util import TimedProgress
+from archivebox.config import VERSION, ARCHIVING_CONFIG
+from archivebox.config.legacy import SAVE_HTMLTOTEXT
 from archivebox.misc.system import atomic_write
-from archivebox.misc.util import (
-    enforce_types,
-    is_static_file,
-)
+from archivebox.misc.util import enforce_types, is_static_file

+from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveError
 from .title import get_html

@@ -122,7 +117,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:

 @enforce_types
-def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=ARCHIVING_CONFIG.TIMEOUT) -> ArchiveResult:
     """extract search-indexing-friendly text from an HTML document"""

     out_dir = Path(out_dir or link.link_dir)
@@ -5,23 +5,14 @@ from html.parser import HTMLParser
 from pathlib import Path
 from typing import Optional

-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.util import (
     enforce_types,
     download_url,
     htmldecode,
     dedupe,
 )
-from ..config.legacy import (
-    TIMEOUT,
-    CHECK_SSL_VALIDITY,
-    SAVE_TITLE,
-    CURL_BINARY,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CURL_VERSION,
-    CURL_USER_AGENT,
-)
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..logging_util import TimedProgress

@@ -62,7 +53,7 @@ class TitleParser(HTMLParser):

 @enforce_types
-def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
+def get_html(link: Link, path: Path, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> str:
     """
     Try to find wget, singlefile and then dom files.
    If none is found, download the url again.

@@ -98,7 +89,7 @@ def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Option
     if not overwrite and link.title and not link.title.lower().startswith('http'):
         return False

-    return SAVE_TITLE
+    return CURL_CONFIG.SAVE_TITLE

 def extract_title_with_regex(html):
     match = re.search(HTML_TITLE_REGEX, html)

@@ -106,22 +97,25 @@ def extract_title_with_regex(html):
     return output

 @enforce_types
-def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""

     from core.models import Snapshot

+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     output: ArchiveOutput = None
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         link.url,
     ]

@@ -161,7 +155,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,
@@ -430,7 +430,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
 def status(out_dir: Path=DATA_DIR) -> None:
     """Print out some info and statistics about the archive collection"""

-    check_data_folder(CONFIG)
+    check_data_folder()

     from core.models import Snapshot
     from django.contrib.auth import get_user_model

@@ -573,7 +573,7 @@ def add(urls: Union[str, List[str]],
         run_subcommand('init', stdin=None, pwd=out_dir)

     # Load list of links from the existing index
-    check_data_folder(CONFIG)
+    check_data_folder()

     # worker = start_cli_workers()

@@ -673,7 +673,7 @@ def remove(filter_str: Optional[str]=None,
            out_dir: Path=DATA_DIR) -> List[Link]:
     """Remove the specified URLs from the archive"""

-    check_data_folder(CONFIG)
+    check_data_folder()

     if snapshots is None:
         if filter_str and filter_patterns:

@@ -762,7 +762,7 @@ def update(resume: Optional[float]=None,
     # from .queues.supervisor_util import start_cli_workers


-    check_data_folder(CONFIG)
+    check_data_folder()
     # start_cli_workers()
     new_links: List[Link] = []  # TODO: Remove input argument: only_new

@@ -833,7 +833,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
             out_dir: Path=DATA_DIR) -> Iterable[Link]:
     """List, filter, and export information about archive entries"""

-    check_data_folder(CONFIG)
+    check_data_folder()

     if filter_patterns and filter_patterns_str:
         stderr(

@@ -881,7 +881,7 @@ def list_links(snapshots: Optional[QuerySet]=None,
               before: Optional[float]=None,
               out_dir: Path=DATA_DIR) -> Iterable[Link]:

-    check_data_folder(CONFIG)
+    check_data_folder()

     if snapshots:
         all_snapshots = snapshots

@@ -905,7 +905,7 @@ def list_folders(links: List[Link],
                 status: str,
                 out_dir: Path=DATA_DIR) -> Dict[str, Optional[Link]]:

-    check_data_folder(CONFIG)
+    check_data_folder()

     STATUS_FUNCTIONS = {
         "indexed": get_indexed_folders,

@@ -926,7 +926,7 @@ def list_folders(links: List[Link],
     raise ValueError('Status not recognized.')

 @enforce_types
-def setup(out_dir: Path=DATA_DIR) -> None:
+def install(out_dir: Path=DATA_DIR) -> None:
     """Automatically install all ArchiveBox dependencies and extras"""

     from rich import print
@@ -937,40 +937,20 @@ def setup(out_dir: Path=DATA_DIR) -> None:

     stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')

-    for binary in settings.BINARIES.values():
+    for binary in reversed(list(settings.BINARIES.values())):
         try:
             print(binary.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
         except Exception as e:
             print(f'[X] Failed to install {binary.name}: {e}')

-    # from plugins_extractor.curl.apps import CURL_BINARY
-    # print(CURL_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.wget.apps import WGET_BINARY
-    # print(WGET_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.ytdlp.apps import YTDLP_BINARY
-    # print(YTDLP_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.chrome.apps import CHROME_BINARY
-    # print(CHROME_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
-    # print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.readability.apps import READABILITY_BINARY
-    # print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.mercury.apps import MERCURY_BINARY
-    # print(MERCURY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
     from django.contrib.auth import get_user_model
     User = get_user_model()

     if not User.objects.filter(is_superuser=True).exists():
-        stderr('\n[+] Creating new admin user for the Web UI...', color='green')
-        run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
+        stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
+        stderr('    archivebox manage createsuperuser')
+        # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)

     stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')

@@ -978,6 +958,10 @@ def setup(out_dir: Path=DATA_DIR) -> None:

     run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)

+# backwards-compatibility:
+setup = install
+

 @enforce_types
 def config(config_options_str: Optional[str]=None,
            config_options: Optional[List[str]]=None,

@@ -989,7 +973,7 @@ def config(config_options_str: Optional[str]=None,

     from rich import print

-    check_data_folder(CONFIG)
+    check_data_folder()
     if config_options and config_options_str:
         stderr(
             '[X] You should either pass config values as an arguments '

@@ -1090,8 +1074,8 @@ def schedule(add: bool=False,
             out_dir: Path=DATA_DIR):
     """Set ArchiveBox to regularly import URLs at specific times using cron"""

-    check_data_folder(CONFIG)
-    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    check_data_folder()
+    from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY

     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)

@@ -1228,7 +1212,7 @@ def server(runserver_args: Optional[List[str]]=None,
         print()


-    check_data_folder(CONFIG)
+    check_data_folder()

     from django.core.management import call_command
     from django.contrib.auth.models import User

@@ -1280,7 +1264,7 @@ def server(runserver_args: Optional[List[str]]=None,
 def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
     """Run an ArchiveBox Django management command"""

-    check_data_folder(CONFIG)
+    check_data_folder()
     from django.core.management import execute_from_command_line

     if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):

@@ -1297,7 +1281,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
 def shell(out_dir: Path=DATA_DIR) -> None:
     """Enter an interactive ArchiveBox Django shell"""

-    check_data_folder(CONFIG)
+    check_data_folder()

     from django.core.management import call_command
     call_command("shell_plus")
@@ -1,13 +1,11 @@
 __package__ = 'archivebox.misc'

-from benedict import benedict
-
 from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG

 from .logging import stderr


-def check_data_folder(config: benedict) -> None:
+def check_data_folder() -> None:

     archive_dir_exists = ARCHIVE_DIR.exists()
     if not archive_dir_exists:

@@ -23,7 +21,7 @@ def check_data_folder(config: benedict) -> None:
         raise SystemExit(2)


-def check_migrations(config: benedict):
+def check_migrations():
     from ..index.sql import list_migrations

     pending_migrations = [name for status, name in list_migrations() if not status]
@@ -1,10 +1,10 @@
 __package__ = 'plugins_extractor.curl'

-from typing import List, Optional, Dict
+from typing import List, Optional
 from pathlib import Path

 from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
+from pydantic_pkgr import BinProvider, BinName

 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet

@@ -12,15 +12,26 @@ from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 # from abx.archivebox.base_extractor import BaseExtractor, ExtractorName

 from archivebox.config import ARCHIVING_CONFIG

+from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG

 class CurlConfig(BaseConfigSet):

     SAVE_CURL: bool = True

-    # USE_CURL: bool = Field(default=lambda c: c.SAVE_HEADERS or c.SAVE_FAVICON)
+    SAVE_TITLE: bool = Field(default=True)
+    SAVE_HEADERS: bool = Field(default=True)
+    USE_CURL: bool = Field(default=lambda c:
+        ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
+        or FAVICON_CONFIG.SAVE_FAVICON
+        or c.SAVE_HEADERS
+        or c.SAVE_TITLE
+    )

     CURL_BINARY: str = Field(default='curl')
     CURL_ARGS: List[str] = [
         '--silent',
         '--location',
         '--compressed',
     ]
     CURL_EXTRA_ARGS: List[str] = []

     CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)

@@ -35,12 +46,6 @@ CURL_CONFIG = CurlConfig()
 class CurlBinary(BaseBinary):
     name: BinName = CURL_CONFIG.CURL_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
-        brew.name: {
-            'abspath': lambda: bin_abspath(CURL_CONFIG.CURL_BINARY, PATH=f'/opt/homebrew/opt/curl/bin:{brew.PATH}'),
-        },
-    }
-
 CURL_BINARY = CurlBinary()
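Taken together with the extractor hunks above, consuming the new plugin config looks roughly like this (a sketch of the pattern mirroring the diff, not an exact API reference):

from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY

def build_curl_cmd(url: str) -> list:
    curl = CURL_BINARY.load()          # resolves abspath + version via binproviders
    assert curl.abspath and curl.version
    return [
        str(curl.abspath),
        *CURL_CONFIG.CURL_ARGS,        # defaults defined on CurlConfig
        *CURL_CONFIG.CURL_EXTRA_ARGS,  # user extras; later options take precedence
        '--max-time', str(CURL_CONFIG.CURL_TIMEOUT),
        url,
    ]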
@@ -1,13 +1,13 @@
 __package__ = 'plugins_extractor.wget'

 import sys
-from typing import List, Optional, Dict
+from typing import List, Optional
 from pathlib import Path
 from subprocess import run, DEVNULL

 from rich import print
 from pydantic import InstanceOf, Field, model_validator
-from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
+from pydantic_pkgr import BinProvider, BinName

 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet

@@ -80,12 +80,6 @@ WGET_CONFIG = WgetConfig()
 class WgetBinary(BaseBinary):
     name: BinName = WGET_CONFIG.WGET_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
-        brew.name: {
-            'abspath': lambda: bin_abspath(WGET_CONFIG.WGET_BINARY, PATH=f'/opt/homebrew/opt/wget/bin:{brew.PATH}'),
-        },
-    }
-
 WGET_BINARY = WgetBinary()
@@ -11,7 +11,7 @@ from archivebox.misc.util import enforce_types
 from archivebox.misc.logging import stderr
 from archivebox.config.legacy import ANSI

-# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
+from archivebox.config import SEARCH_BACKEND_CONFIG


 def log_index_started(url):

@@ -58,13 +58,13 @@ def get_indexable_content(results: QuerySet):

 def import_backend():
     for backend in settings.SEARCH_BACKENDS.values():
-        if backend.name == settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE:
+        if backend.name == SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE:
             return backend
-    raise Exception(f'Could not load {settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE} as search backend')
+    raise Exception(f'Could not load {SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE} as search backend')

 @enforce_types
 def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
-    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND:
+    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND:
         return

     if not skip_text_index and texts:

@@ -86,7 +86,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
 def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
     from core.models import Snapshot

-    if settings.CONFIGS.SearchBackendConfig.USE_SEARCHING_BACKEND:
+    if SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
         backend = import_backend()
         try:
             snapshot_pks = backend.search(query)

@@ -106,7 +106,7 @@ def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:

 @enforce_types
 def flush_search_index(snapshots: QuerySet):
-    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND or not snapshots:
+    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND or not snapshots:
         return
     backend = import_backend()
     snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
archivebox/vendor/pydantic-pkgr (vendored submodule)
@@ -1 +1 @@
-Subproject commit 4f9486ab86a65f83ad1bfd94320795b8e09871aa
+Subproject commit 4f31b355fbf319a54b38953795b17b1b04db4348