From 0285aa52a03b90e03ed3bc49d932f766adf06acd Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 20 Aug 2024 18:31:21 -0700 Subject: [PATCH] config and attr access improvements --- archivebox/abid_utils/abid.py | 2 +- archivebox/abid_utils/models.py | 19 +++--- archivebox/api/auth.py | 9 ++- archivebox/api/models.py | 26 ++++---- archivebox/api/v1_core.py | 12 ++-- archivebox/config.py | 45 +++++++++++--- archivebox/config_stubs.py | 7 ++- archivebox/core/admin.py | 46 +++++++------- archivebox/core/models.py | 41 +++++++------ archivebox/core/settings.py | 102 +++++++++++++------------------ archivebox/core/urls.py | 2 +- archivebox/extractors/favicon.py | 39 +++++------- archivebox/extractors/git.py | 24 +++----- archivebox/index/schema.py | 12 ++-- archivebox/manage.py | 4 +- 15 files changed, 203 insertions(+), 187 deletions(-) diff --git a/archivebox/abid_utils/abid.py b/archivebox/abid_utils/abid.py index 3c90e83c..a0e71937 100644 --- a/archivebox/abid_utils/abid.py +++ b/archivebox/abid_utils/abid.py @@ -115,7 +115,7 @@ def uri_hash(uri: Union[str, bytes], salt: str=DEFAULT_ABID_URI_SALT) -> str: if isinstance(uri, bytes): uri_str: str = uri.decode() else: - uri_str = uri + uri_str = str(uri) # only hash the domain part of URLs if '://' in uri_str: diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py index c27f85ec..054336c5 100644 --- a/archivebox/abid_utils/models.py +++ b/archivebox/abid_utils/models.py @@ -15,6 +15,7 @@ from charidfield import CharIDField # type: ignore[import-untyped] from django.conf import settings from django.db import models +from django.utils import timezone from django.db.utils import OperationalError from django.contrib.auth import get_user_model @@ -115,7 +116,8 @@ class ABIDModel(models.Model): raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})') if not ts: - ts = datetime.utcfromtimestamp(0) + # default to unix epoch with 00:00:00 UTC + ts = datetime.fromtimestamp(0, timezone.utc) # equivalent to: ts = datetime.utcfromtimestamp(0) print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat()) if not uri: @@ -146,7 +148,13 @@ class ABIDModel(models.Model): """ ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE') """ - abid = None + + # if object is not yet saved to DB, always generate fresh ABID from values + if self._state.adding: + return self.generate_abid() + + # otherwise DB is single source of truth, load ABID from existing db pk + abid: ABID | None = None try: abid = abid or ABID.parse(self.pk) except Exception: @@ -158,12 +166,7 @@ class ABIDModel(models.Model): pass try: - abid = abid or ABID.parse(self.uuid) - except Exception: - pass - - try: - abid = abid or ABID.parse(self.abid) + abid = abid or ABID.parse(cast(str, self.abid)) except Exception: pass diff --git a/archivebox/api/auth.py b/archivebox/api/auth.py index efa7d103..1af564e2 100644 --- a/archivebox/api/auth.py +++ b/archivebox/api/auth.py @@ -1,6 +1,6 @@ __package__ = 'archivebox.api' -from typing import Optional +from typing import Optional, cast from django.http import HttpRequest from django.contrib.auth import login @@ -18,12 +18,13 @@ def auth_using_token(token, request: Optional[HttpRequest]=None) -> Optional[Abs submitted_empty_form = token in ('string', '', None) if submitted_empty_form: + assert request is not None, 'No request provided for API key authentication' user = request.user # see if user is authed via django session and use that as the default else: try: token = APIToken.objects.get(token=token) if token.is_valid(): - user = token.user + user = token.created_by except APIToken.DoesNotExist: pass @@ -38,6 +39,7 @@ def auth_using_password(username, password, request: Optional[HttpRequest]=None) submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None)) if submitted_empty_form: + assert request is not None, 'No request provided for API key authentication' user = request.user # see if user is authed via django session and use that as the default else: user = authenticate( @@ -47,8 +49,9 @@ def auth_using_password(username, password, request: Optional[HttpRequest]=None) if not user: print('[❌] Failed to authenticate API user using API Key:', request) + user = None - return user + return cast(AbstractBaseUser | None, user) ### Base Auth Types diff --git a/archivebox/api/models.py b/archivebox/api/models.py index b3861000..dfa6d3dc 100644 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -12,7 +12,8 @@ from signal_webhooks.models import WebhookBase from django_stubs_ext.db.models import TypedModelMeta -from abid_utils.models import ABIDModel, ABIDField +from abid_utils.models import ABIDModel, ABIDField, get_or_create_system_user_pk + def generate_secret_token() -> str: @@ -32,15 +33,13 @@ class APIToken(ABIDModel): abid_rand_src = 'self.id' id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) - uuid = models.UUIDField(blank=True, null=True, editable=False, unique=True) abid = ABIDField(prefix=abid_prefix) - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE) - token = models.CharField(max_length=32, default=generate_secret_token, unique=True) - + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk) created = models.DateTimeField(auto_now_add=True) + + token = models.CharField(max_length=32, default=generate_secret_token, unique=True) expires = models.DateTimeField(null=True, blank=True) - class Meta(TypedModelMeta): verbose_name = "API Key" @@ -50,7 +49,7 @@ class APIToken(ABIDModel): return self.token def __repr__(self) -> str: - return f'' + return f'' def __json__(self) -> dict: return { @@ -63,10 +62,6 @@ class APIToken(ABIDModel): "expires": self.expires_as_iso8601, } - @property - def ulid(self): - return self.get_abid().ulid - @property def expires_as_iso8601(self): """Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none.""" @@ -100,10 +95,15 @@ class OutboundWebhook(ABIDModel, WebhookBase): abid_subtype_src = 'self.ref' abid_rand_src = 'self.id' - id = models.UUIDField(blank=True, null=True, unique=True, editable=True) - uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) abid = ABIDField(prefix=abid_prefix) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk) + created = models.DateTimeField(auto_now_add=True) + modified = models.DateTimeField(auto_now=True) + + # More fields here: WebhookBase... + WebhookBase._meta.get_field('name').help_text = ( 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).') WebhookBase._meta.get_field('signal').help_text = ( diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 0c701104..38510cd8 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -309,9 +309,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): # snapshot = Snapshot.objects.create(**payload.dict()) # return snapshot # -# @router.put("/snapshot/{snapshot_uuid}", response=SnapshotSchema) -# def update_snapshot(request, snapshot_uuid: str, payload: SnapshotSchema): -# snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid) +# @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema) +# def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema): +# snapshot = get_object_or_404(Snapshot, uuid=snapshot_id) # # for attr, value in payload.dict().items(): # setattr(snapshot, attr, value) @@ -319,9 +319,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): # # return snapshot # -# @router.delete("/snapshot/{snapshot_uuid}") -# def delete_snapshot(request, snapshot_uuid: str): -# snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid) +# @router.delete("/snapshot/{snapshot_id}") +# def delete_snapshot(request, snapshot_id: str): +# snapshot = get_object_or_404(Snapshot, uuid=snapshot_id) # snapshot.delete() # return {"success": True} diff --git a/archivebox/config.py b/archivebox/config.py index afa334c6..aac32756 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -44,6 +44,7 @@ from collections import defaultdict import importlib.metadata from .config_stubs import ( + AttrDict, SimpleConfigValueDict, ConfigValue, ConfigDict, @@ -379,6 +380,29 @@ ALLOWED_IN_OUTPUT_DIR = { ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE +CONSTANTS = { + "PACKAGE_DIR_NAME": {'default': lambda c: PACKAGE_DIR_NAME}, + "TEMPLATES_DIR_NAME": {'default': lambda c: TEMPLATES_DIR_NAME}, + "ARCHIVE_DIR_NAME": {'default': lambda c: ARCHIVE_DIR_NAME}, + "SOURCES_DIR_NAME": {'default': lambda c: SOURCES_DIR_NAME}, + "LOGS_DIR_NAME": {'default': lambda c: LOGS_DIR_NAME}, + "CACHE_DIR_NAME": {'default': lambda c: CACHE_DIR_NAME}, + "PERSONAS_DIR_NAME": {'default': lambda c: PERSONAS_DIR_NAME}, + "CRONTABS_DIR_NAME": {'default': lambda c: CRONTABS_DIR_NAME}, + "SQL_INDEX_FILENAME": {'default': lambda c: SQL_INDEX_FILENAME}, + "JSON_INDEX_FILENAME": {'default': lambda c: JSON_INDEX_FILENAME}, + "HTML_INDEX_FILENAME": {'default': lambda c: HTML_INDEX_FILENAME}, + "ROBOTS_TXT_FILENAME": {'default': lambda c: ROBOTS_TXT_FILENAME}, + "FAVICON_FILENAME": {'default': lambda c: FAVICON_FILENAME}, + "CONFIG_FILENAME": {'default': lambda c: CONFIG_FILENAME}, + "DEFAULT_CLI_COLORS": {'default': lambda c: DEFAULT_CLI_COLORS}, + "ANSI": {'default': lambda c: ANSI}, + "COLOR_DICT": {'default': lambda c: COLOR_DICT}, + "STATICFILE_EXTENSIONS": {'default': lambda c: STATICFILE_EXTENSIONS}, + "ALLOWED_IN_OUTPUT_DIR": {'default': lambda c: ALLOWED_IN_OUTPUT_DIR}, + "ALLOWDENYLIST_REGEX_FLAGS": {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS}, +} + ############################## Version Config ################################## def get_system_user() -> str: @@ -498,9 +522,13 @@ def can_upgrade(config): ############################## Derived Config ################################## + + # These are derived/computed values calculated *after* all user-provided config values are ingested # they appear in `archivebox config` output and are intended to be read-only for the user DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { + **CONSTANTS, + 'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns}, 'USER': {'default': lambda c: get_system_user()}, 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}}, @@ -678,28 +706,29 @@ def load_config_val(key: str, raise Exception('Config values can only be str, bool, int, or json') -def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]: +def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]: """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf""" out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() + assert out_dir and out_dir.is_dir() config_path = Path(out_dir) / CONFIG_FILENAME if config_path.exists(): config_file = ConfigParser() config_file.optionxform = str config_file.read(config_path) # flatten into one namespace - config_file_vars = { + config_file_vars = ConfigDict({ key.upper(): val for section, options in config_file.items() for key, val in options.items() - } + }) # print('[i] Loaded config file', os.path.abspath(config_path)) # print(config_file_vars) return config_file_vars return None -def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: +def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict: """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf""" from .system import atomic_write @@ -740,7 +769,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: existing_config = dict(config_file[section]) else: existing_config = {} - config_file[section] = {**existing_config, key: val} + config_file[section] = ConfigDict({**existing_config, key: val}) # always make sure there's a SECRET_KEY defined for Django existing_secret_key = None @@ -815,7 +844,7 @@ def load_config(defaults: ConfigDefaultDict, # raise raise SystemExit(2) - return extended_config + return AttrDict(extended_config) def parse_version_string(version: str) -> Tuple[int, int, int]: @@ -1198,14 +1227,14 @@ def get_chrome_info(config: ConfigDict) -> ConfigValue: def load_all_config(): - CONFIG: ConfigDict = {} + CONFIG: ConfigDict = ConfigDict() for section_name, section_config in CONFIG_SCHEMA.items(): CONFIG = load_config(section_config, CONFIG) return load_config(DYNAMIC_CONFIG_SCHEMA, CONFIG) # add all final config values in CONFIG to globals in this file -CONFIG = load_all_config() +CONFIG: ConfigDict = load_all_config() globals().update(CONFIG) # this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ... diff --git a/archivebox/config_stubs.py b/archivebox/config_stubs.py index c8cc9ecb..d4bca2d6 100644 --- a/archivebox/config_stubs.py +++ b/archivebox/config_stubs.py @@ -9,11 +9,15 @@ SimpleConfigValueDict = Dict[str, SimpleConfigValue] SimpleConfigValueGetter = Callable[[], SimpleConfigValue] ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter] +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__dict__ = self class BaseConfig(TypedDict): pass -class ConfigDict(BaseConfig, total=False): +class ConfigDict(BaseConfig, AttrDict, total=False): """ # Regenerate by pasting this quine into `archivebox shell` 🥚 from archivebox.config import ConfigDict, CONFIG_DEFAULTS @@ -28,6 +32,7 @@ class ConfigDict(BaseConfig, total=False): print(f' {key}: {Type.__name__}') print() """ + IS_TTY: bool USE_COLOR: bool SHOW_PROGRESS: bool diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 36ed74df..530e9b71 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -7,6 +7,7 @@ from io import StringIO from pathlib import Path from contextlib import redirect_stdout from datetime import datetime, timezone +from typing import Dict, Any from django.contrib import admin from django.db.models import Count, Q @@ -16,10 +17,12 @@ from django.utils.safestring import mark_safe from django.shortcuts import render, redirect from django.contrib.auth import get_user_model from django.core.exceptions import ValidationError +from django.conf import settings from django import forms -from signal_webhooks.admin import WebhookAdmin, get_webhook_model +from signal_webhooks.admin import WebhookAdmin +from signal_webhooks.utils import get_webhook_model # from plugantic.admin import CustomPlugin from ..util import htmldecode, urldecode, ansi_to_html @@ -34,16 +37,11 @@ from index.html import snapshot_icons from logging_util import printable_filesize from main import add, remove from extractors import archive_links -from config import ( - OUTPUT_DIR, - SNAPSHOTS_PER_PAGE, - VERSION, - VERSIONS_AVAILABLE, - CAN_UPGRADE -) -GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE} +CONFIG = settings.CONFIG + +GLOBAL_CONTEXT = {'VERSION': CONFIG.VERSION, 'VERSIONS_AVAILABLE': CONFIG.VERSIONS_AVAILABLE, 'CAN_UPGRADE': CONFIG.CAN_UPGRADE} # Admin URLs # /admin/ @@ -74,7 +72,7 @@ class ArchiveBoxAdmin(admin.AdminSite): return redirect(f'/admin/login/?next={request.path}') request.current_app = self.name - context = { + context: Dict[str, Any] = { **self.each_context(request), 'title': 'Add URLs', } @@ -92,7 +90,7 @@ class ArchiveBoxAdmin(admin.AdminSite): "urls": url, "depth": depth, "update_all": False, - "out_dir": OUTPUT_DIR, + "out_dir": CONFIG.OUTPUT_DIR, } add_stdout = StringIO() with redirect_stdout(add_stdout): @@ -101,7 +99,7 @@ class ArchiveBoxAdmin(admin.AdminSite): context.update({ "stdout": ansi_to_html(add_stdout.getvalue().strip()), - "form": AddLinkForm() + "form": AddLinkForm(), }) else: context["form"] = form @@ -118,12 +116,14 @@ archivebox_admin.disable_action('delete_selected') # archivebox_admin.register(CustomPlugin) # patch admin with methods to add data views (implemented by admin_data_views package) +# https://github.com/MrThearMan/django-admin-data-views +# https://mrthearman.github.io/django-admin-data-views/setup/ ############### Additional sections are defined in settings.ADMIN_DATA_VIEWS ######### from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin) -archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) -archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) +archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore +archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin) @@ -146,7 +146,7 @@ class ArchiveResultInline(admin.TabularInline): class TagInline(admin.TabularInline): - model = Tag.snapshot_set.through + model = Tag.snapshot_set.through # type: ignore # fk_name = 'snapshot' fields = ('id', 'tag') extra = 1 @@ -241,7 +241,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] autocomplete_fields = ['tags'] inlines = [TagInline, ArchiveResultInline] - list_per_page = SNAPSHOTS_PER_PAGE + list_per_page = CONFIG.SNAPSHOTS_PER_PAGE action_form = SnapshotActionForm @@ -433,7 +433,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): # Monkey patch here plus core_tags.py self.change_list_template = 'private_index_grid.html' - self.list_per_page = SNAPSHOTS_PER_PAGE + self.list_per_page = CONFIG.SNAPSHOTS_PER_PAGE self.list_max_show_all = self.list_per_page # Call monkey patched view @@ -458,7 +458,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): archive_links([ snapshot.as_link() for snapshot in queryset - ], out_dir=OUTPUT_DIR) + ], out_dir=CONFIG.OUTPUT_DIR) @admin.action( description="⬇️ Title" @@ -467,7 +467,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): archive_links([ snapshot.as_link() for snapshot in queryset - ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR) + ], overwrite=True, methods=('title','favicon'), out_dir=CONFIG.OUTPUT_DIR) @admin.action( description="Re-Snapshot" @@ -485,13 +485,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): archive_links([ snapshot.as_link() for snapshot in queryset - ], overwrite=True, out_dir=OUTPUT_DIR) + ], overwrite=True, out_dir=CONFIG.OUTPUT_DIR) @admin.action( description="Delete" ) def delete_snapshots(self, request, queryset): - remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR) + remove(snapshots=queryset, yes=True, delete=True, out_dir=CONFIG.OUTPUT_DIR) @admin.action( @@ -578,7 +578,7 @@ class ArchiveResultAdmin(admin.ModelAdmin): list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') ordering = ['-start_ts'] - list_per_page = SNAPSHOTS_PER_PAGE + list_per_page = CONFIG.SNAPSHOTS_PER_PAGE @admin.display( description='Snapshot Info' @@ -620,7 +620,7 @@ class ArchiveResultAdmin(admin.ModelAdmin): ) def output_summary(self, result): - snapshot_dir = Path(OUTPUT_DIR) / str(result.pwd).split('data/', 1)[-1] + snapshot_dir = Path(CONFIG.OUTPUT_DIR) / str(result.pwd).split('data/', 1)[-1] output_str = format_html( '
{}

', result.output, diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 372e68a0..a8a2522c 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,7 +1,7 @@ __package__ = 'archivebox.core' -from typing import Optional, List, Dict +from typing import Optional, List, Dict, Iterable from django_stubs_ext.db.models import TypedModelMeta import json @@ -17,10 +17,10 @@ from django.utils.text import slugify from django.core.cache import cache from django.urls import reverse, reverse_lazy from django.db.models import Case, When, Value, IntegerField +from django.conf import settings from abid_utils.models import ABIDModel, ABIDField -from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME from ..system import get_dir_size from ..util import parse_date, base_url from ..index.schema import Link @@ -72,6 +72,7 @@ class Tag(ABIDModel): slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False) # slug is autoset on save from name, never set it manually + snapshot_set: models.Manager['Snapshot'] class Meta(TypedModelMeta): verbose_name = "Tag" @@ -154,6 +155,8 @@ class Snapshot(ABIDModel): keys = ('url', 'timestamp', 'title', 'tags', 'updated') + archiveresult_set: models.Manager['ArchiveResult'] + @property def uuid(self): return self.id @@ -246,11 +249,11 @@ class Snapshot(ABIDModel): @cached_property def link_dir(self): - return str(ARCHIVE_DIR / self.timestamp) + return str(settings.CONFIG.ARCHIVE_DIR / self.timestamp) @cached_property def archive_path(self): - return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) + return '{}/{}'.format(settings.CONFIG.ARCHIVE_DIR_NAME, self.timestamp) @cached_property def archive_size(self): @@ -284,7 +287,7 @@ class Snapshot(ABIDModel): @cached_property def status_code(self) -> Optional[str]: - return self.headers and self.headers.get('Status-Code') + return self.headers.get('Status-Code') if self.headers else None @cached_property def history(self) -> dict: @@ -322,7 +325,7 @@ class Snapshot(ABIDModel): return None - def save_tags(self, tags: List[str]=()) -> None: + def save_tags(self, tags: Iterable[str]=()) -> None: tags_id = [] for tag in tags: if tag.strip(): @@ -334,17 +337,17 @@ class Snapshot(ABIDModel): # def get_storage_dir(self, create=True, symlink=True) -> Path: # date_str = self.added.strftime('%Y%m%d') # domain_str = domain(self.url) - # abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid) + # abs_storage_dir = Path(settings.CONFIG.ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid) # if create and not abs_storage_dir.is_dir(): # abs_storage_dir.mkdir(parents=True, exist_ok=True) # if symlink: # LINK_PATHS = [ - # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), - # # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid), - # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid), - # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid), + # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), + # # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid), + # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid), + # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid), # ] # for link_path in LINK_PATHS: # link_path.parent.mkdir(parents=True, exist_ok=True) @@ -439,8 +442,8 @@ class ArchiveResult(ABIDModel): should be used for user-facing iframe embeds of this result """ - if hasattr(self.extractor_module, 'get_embed_path'): - return self.extractor_module.get_embed_path(self) + if get_embed_path_func := getattr(self.extractor_module, 'get_embed_path', None): + return get_embed_path_func(self) return self.extractor_module.get_output_path() @@ -455,18 +458,18 @@ class ArchiveResult(ABIDModel): # def get_storage_dir(self, create=True, symlink=True): # date_str = self.snapshot.added.strftime('%Y%m%d') # domain_str = domain(self.snapshot.url) - # abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid) + # abs_storage_dir = Path(settings.CONFIG.ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid) # if create and not abs_storage_dir.is_dir(): # abs_storage_dir.mkdir(parents=True, exist_ok=True) # if symlink: # LINK_PATHS = [ - # Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), - # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid), - # # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid), - # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid), - # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid), + # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid), + # # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid), + # # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid), + # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid), + # Path(settings.CONFIG.ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid), # ] # for link_path in LINK_PATHS: # link_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index cac65ee6..da03ffd8 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -9,32 +9,9 @@ import tempfile from pathlib import Path from django.utils.crypto import get_random_string -from ..config import ( - CONFIG, - DEBUG, - SECRET_KEY, - ALLOWED_HOSTS, - PACKAGE_DIR, - TEMPLATES_DIR_NAME, - CUSTOM_TEMPLATES_DIR, - SQL_INDEX_FILENAME, - OUTPUT_DIR, - ARCHIVE_DIR, - LOGS_DIR, - CACHE_DIR, - TIMEZONE, - - LDAP, - LDAP_SERVER_URI, - LDAP_BIND_DN, - LDAP_BIND_PASSWORD, - LDAP_USER_BASE, - LDAP_USER_FILTER, - LDAP_USERNAME_ATTR, - LDAP_FIRSTNAME_ATTR, - LDAP_LASTNAME_ATTR, - LDAP_EMAIL_ATTR, -) +from ..config import CONFIG +from ..config_stubs import AttrDict +assert isinstance(CONFIG, AttrDict) IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3] IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ @@ -53,12 +30,12 @@ LOGOUT_REDIRECT_URL = os.environ.get('LOGOUT_REDIRECT_URL', '/') PASSWORD_RESET_URL = '/accounts/password_reset/' APPEND_SLASH = True -DEBUG = DEBUG or ('--debug' in sys.argv) +DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv) # add plugins folders to system path, and load plugins in installed_apps -BUILTIN_PLUGINS_DIR = PACKAGE_DIR / 'plugins' -USER_PLUGINS_DIR = OUTPUT_DIR / 'plugins' +BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'plugins' +USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'plugins' sys.path.insert(0, str(BUILTIN_PLUGINS_DIR)) sys.path.insert(0, str(USER_PLUGINS_DIR)) @@ -127,7 +104,7 @@ AUTHENTICATION_BACKENDS = [ 'django.contrib.auth.backends.ModelBackend', ] -if LDAP: +if CONFIG.LDAP: try: import ldap from django_auth_ldap.config import LDAPSearch @@ -138,23 +115,23 @@ if LDAP: global AUTH_LDAP_USER_SEARCH global AUTH_LDAP_USER_ATTR_MAP - AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI - AUTH_LDAP_BIND_DN = LDAP_BIND_DN - AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD + AUTH_LDAP_SERVER_URI = CONFIG.LDAP_SERVER_URI + AUTH_LDAP_BIND_DN = CONFIG.LDAP_BIND_DN + AUTH_LDAP_BIND_PASSWORD = CONFIG.LDAP_BIND_PASSWORD - assert AUTH_LDAP_SERVER_URI and LDAP_USERNAME_ATTR and LDAP_USER_FILTER, 'LDAP_* config options must all be set if LDAP=True' + assert AUTH_LDAP_SERVER_URI and CONFIG.LDAP_USERNAME_ATTR and CONFIG.LDAP_USER_FILTER, 'LDAP_* config options must all be set if LDAP=True' AUTH_LDAP_USER_SEARCH = LDAPSearch( - LDAP_USER_BASE, + CONFIG.LDAP_USER_BASE, ldap.SCOPE_SUBTREE, - '(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')', + '(&(' + CONFIG.LDAP_USERNAME_ATTR + '=%(user)s)' + CONFIG.LDAP_USER_FILTER + ')', ) AUTH_LDAP_USER_ATTR_MAP = { - 'username': LDAP_USERNAME_ATTR, - 'first_name': LDAP_FIRSTNAME_ATTR, - 'last_name': LDAP_LASTNAME_ATTR, - 'email': LDAP_EMAIL_ATTR, + 'username': CONFIG.LDAP_USERNAME_ATTR, + 'first_name': CONFIG.LDAP_FIRSTNAME_ATTR, + 'last_name': CONFIG.LDAP_LASTNAME_ATTR, + 'email': CONFIG.LDAP_EMAIL_ATTR, } AUTHENTICATION_BACKENDS = [ @@ -206,6 +183,15 @@ if DEBUG_TOOLBAR: ] MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware'] +if DEBUG: + from django_autotyping.typing import AutotypingSettingsDict + + INSTALLED_APPS += ['django_autotyping'] + AUTOTYPING: AutotypingSettingsDict = { + "STUBS_GENERATION": { + "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings", + } + } # https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar) # Must delete archivebox/templates/admin to use because it relies on some things we override @@ -224,15 +210,15 @@ if DEBUG_REQUESTS_TRACKER: STATIC_URL = '/static/' STATICFILES_DIRS = [ - *([str(CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_DIR else []), - str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'static'), + *([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []), + str(Path(CONFIG.PACKAGE_DIR) / CONFIG.TEMPLATES_DIR_NAME / 'static'), ] TEMPLATE_DIRS = [ - *([str(CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_DIR else []), - str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'core'), - str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'admin'), - str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME), + *([str(CONFIG.CUSTOM_TEMPLATES_DIR)] if CONFIG.CUSTOM_TEMPLATES_DIR else []), + str(Path(CONFIG.PACKAGE_DIR) / CONFIG.TEMPLATES_DIR_NAME / 'core'), + str(Path(CONFIG.PACKAGE_DIR) / CONFIG.TEMPLATES_DIR_NAME / 'admin'), + str(Path(CONFIG.PACKAGE_DIR) / CONFIG.TEMPLATES_DIR_NAME), ] TEMPLATES = [ @@ -258,10 +244,10 @@ TEMPLATES = [ CACHE_DB_FILENAME = 'cache.sqlite3' -CACHE_DB_PATH = CACHE_DIR / CACHE_DB_FILENAME +CACHE_DB_PATH = CONFIG.CACHE_DIR / CACHE_DB_FILENAME CACHE_DB_TABLE = 'django_cache' -DATABASE_FILE = Path(OUTPUT_DIR) / SQL_INDEX_FILENAME +DATABASE_FILE = Path(CONFIG.OUTPUT_DIR) / CONFIG.SQL_INDEX_FILENAME DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE)) DATABASES = { @@ -272,7 +258,7 @@ DATABASES = { 'timeout': 60, 'check_same_thread': False, }, - 'TIME_ZONE': TIMEZONE, + 'TIME_ZONE': CONFIG.TIMEZONE, # DB setup is sometimes modified at runtime by setup_django() in config.py }, # 'cache': { @@ -282,7 +268,7 @@ DATABASES = { # 'timeout': 60, # 'check_same_thread': False, # }, - # 'TIME_ZONE': TIMEZONE, + # 'TIME_ZONE': CONFIG.TIMEZONE, # }, } MIGRATION_MODULES = {'signal_webhooks': None} @@ -312,7 +298,7 @@ STORAGES = { "BACKEND": "django.core.files.storage.FileSystemStorage", "OPTIONS": { "base_url": "/archive/", - "location": ARCHIVE_DIR, + "location": CONFIG.ARCHIVE_DIR, }, }, # "personas": { @@ -328,9 +314,9 @@ STORAGES = { ### Security Settings ################################################################################ -SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_') +SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_') -ALLOWED_HOSTS = ALLOWED_HOSTS.split(',') +ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',') SECURE_BROWSER_XSS_FILTER = True SECURE_CONTENT_TYPE_NOSNIFF = True @@ -361,7 +347,7 @@ SHELL_PLUS_PRINT_SQL = False IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner'] IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell' if IS_SHELL: - os.environ['PYTHONSTARTUP'] = str(Path(PACKAGE_DIR) / 'core' / 'welcome_message.py') + os.environ['PYTHONSTARTUP'] = str(Path(CONFIG.PACKAGE_DIR) / 'core' / 'welcome_message.py') ################################################################################ @@ -373,10 +359,10 @@ USE_I18N = True USE_TZ = True DATETIME_FORMAT = 'Y-m-d g:iA' SHORT_DATETIME_FORMAT = 'Y-m-d h:iA' -TIME_ZONE = TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent +TIME_ZONE = CONFIG.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent -from django.conf.locale.en import formats as en_formats +from django.conf.locale.en import formats as en_formats # type: ignore en_formats.DATETIME_FORMAT = DATETIME_FORMAT en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT @@ -410,8 +396,8 @@ class NoisyRequestsFilter(logging.Filter): return 1 -if LOGS_DIR.exists(): - ERROR_LOG = (LOGS_DIR / 'errors.log') +if CONFIG.LOGS_DIR.exists(): + ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log') else: # historically too many edge cases here around creating log dir w/ correct permissions early on # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index 04382c99..ab9bd275 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -46,7 +46,7 @@ urlpatterns = [ # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django path('index.html', RedirectView.as_view(url='/')), - path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}), + path('index.json', static.serve, {'document_root': settings.CONFIG.OUTPUT_DIR, 'path': 'index.json'}), path('', HomepageView.as_view(), name='Home'), ] urlpatterns += staticfiles_urlpatterns() diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 31473b1a..b9b5c3a7 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -11,27 +11,18 @@ from ..util import ( domain, dedupe, ) -from ..config import ( - TIMEOUT, - SAVE_FAVICON, - FAVICON_PROVIDER, - CURL_BINARY, - CURL_ARGS, - CURL_EXTRA_ARGS, - CURL_VERSION, - CHECK_SSL_VALIDITY, - CURL_USER_AGENT, -) +from ..config import CONFIG from ..logging_util import TimedProgress @enforce_types -def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: - out_dir = out_dir or Path(link.link_dir) +def should_save_favicon(link: Link, out_dir: str | Path | None=None, overwrite: bool=False) -> bool: + assert link.link_dir + out_dir = Path(out_dir or link.link_dir) if not overwrite and (out_dir / 'favicon.ico').exists(): return False - return SAVE_FAVICON + return CONFIG.SAVE_FAVICON @enforce_types def get_output_path(): @@ -39,24 +30,26 @@ def get_output_path(): @enforce_types -def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: +def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult: """download site favicon from google's favicon api""" - out_dir = out_dir or link.link_dir + out_dir = Path(out_dir or link.link_dir) + assert out_dir.exists() + output: ArchiveOutput = 'favicon.ico' # later options take precedence options = [ - *CURL_ARGS, - *CURL_EXTRA_ARGS, + *CONFIG.CURL_ARGS, + *CONFIG.CURL_EXTRA_ARGS, '--max-time', str(timeout), '--output', str(output), - *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), - *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *(['--user-agent', '{}'.format(CONFIG.CURL_USER_AGENT)] if CONFIG.CURL_USER_AGENT else []), + *([] if CONFIG.CHECK_SSL_VALIDITY else ['--insecure']), ] cmd = [ - CURL_BINARY, + CONFIG.CURL_BINARY, *dedupe(options), - FAVICON_PROVIDER.format(domain(link.url)), + CONFIG.FAVICON_PROVIDER.format(domain(link.url)), ] status = 'failed' timer = TimedProgress(timeout, prefix=' ') @@ -72,7 +65,7 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) return ArchiveResult( cmd=cmd, pwd=str(out_dir), - cmd_version=CURL_VERSION, + cmd_version=CONFIG.CURL_VERSION, output=output, status=status, **timer.stats, diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py index 029e8022..3b8a4b9d 100644 --- a/archivebox/extractors/git.py +++ b/archivebox/extractors/git.py @@ -14,15 +14,7 @@ from ..util import ( without_query, without_fragment, ) -from ..config import ( - TIMEOUT, - SAVE_GIT, - GIT_BINARY, - GIT_ARGS, - GIT_VERSION, - GIT_DOMAINS, - CHECK_SSL_VALIDITY -) +from ..config import CONFIG from ..logging_util import TimedProgress @@ -50,17 +42,17 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona return False is_clonable_url = ( - (domain(link.url) in GIT_DOMAINS) + (domain(link.url) in CONFIG.GIT_DOMAINS) or (extension(link.url) == 'git') ) if not is_clonable_url: return False - return SAVE_GIT + return CONFIG.SAVE_GIT @enforce_types -def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: +def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult: """download full site using git""" out_dir = out_dir or Path(link.link_dir) @@ -68,10 +60,10 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> output_path = out_dir / output output_path.mkdir(exist_ok=True) cmd = [ - GIT_BINARY, + CONFIG.GIT_BINARY, 'clone', - *GIT_ARGS, - *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']), + *CONFIG.GIT_ARGS, + *([] if CONFIG.CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']), without_query(without_fragment(link.url)), ] status = 'succeeded' @@ -96,7 +88,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> return ArchiveResult( cmd=cmd, pwd=str(out_dir), - cmd_version=GIT_VERSION, + cmd_version=CONFIG.GIT_VERSION, output=output, status=status, **timer.stats, diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 5dfe4630..0f0d5b83 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -192,12 +192,12 @@ class Link: if extended: info.update({ 'snapshot_id': self.snapshot_id, - 'snapshot_uuid': self.snapshot_uuid, + 'snapshot_old_id': self.snapshot_old_id, 'snapshot_abid': self.snapshot_abid, 'link_dir': self.link_dir, 'archive_path': self.archive_path, - + 'hash': self.url_hash, 'base_url': self.base_url, 'scheme': self.scheme, @@ -206,7 +206,7 @@ class Link: 'basename': self.basename, 'extension': self.extension, 'is_static': self.is_static, - + 'tags_str': (self.tags or '').strip(','), # only used to render static index in index/html.py, remove if no longer needed there 'icons': None, # only used to render static index in index/html.py, remove if no longer needed there @@ -266,15 +266,15 @@ class Link: @cached_property def snapshot(self): from core.models import Snapshot - return Snapshot.objects.only('id').get(url=self.url) + return Snapshot.objects.only('id', 'old_id', 'abid').get(url=self.url) @cached_property def snapshot_id(self): return str(self.snapshot.pk) @cached_property - def snapshot_uuid(self): - return str(self.snapshot.id) + def snapshot_old_id(self): + return str(self.snapshot.old_id) @cached_property def snapshot_abid(self): diff --git a/archivebox/manage.py b/archivebox/manage.py index 6e8c578a..195a0ec1 100755 --- a/archivebox/manage.py +++ b/archivebox/manage.py @@ -7,7 +7,9 @@ if __name__ == '__main__': # versions of ./manage.py commands whenever possible. When that's not possible # (e.g. makemigrations), you can comment out this check temporarily - if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'startapp' in sys.argv or 'squashmigrations' in sys.argv): + allowed_commands = ['makemigrations', 'migrate', 'startapp','squashmigrations', 'generate_stubs'] + + if not any(cmd in sys.argv for cmd in allowed_commands): print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):") print() print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')