From 24fe958ff35a6b5d815db571bb452541defe1111 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 26 Aug 2024 20:16:43 -0700 Subject: [PATCH] massively improve Snapshot admin list view query performance --- archivebox/config.py | 2 +- archivebox/core/admin.py | 133 +++++++++++++++++++++++++++------- archivebox/core/models.py | 75 ++++++++++++++++--- archivebox/extractors/wget.py | 13 +++- archivebox/index/html.py | 10 ++- 5 files changed, 194 insertions(+), 39 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 8fcc5352..8d4a0695 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -103,7 +103,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'}, - 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, + 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 100}, 'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None}, 'TIME_ZONE': {'type': str, 'default': 'UTC'}, 'TIMEZONE': {'type': str, 'default': 'UTC'}, diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index b87f6874..d8b3854d 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -10,12 +10,15 @@ from datetime import datetime, timezone from typing import Dict, Any from django.contrib import admin -from django.db.models import Count, Q -from django.urls import path, reverse +from django.db.models import Count, Q, Prefetch +from django.urls import path, reverse, resolve +from django.utils import timezone +from django.utils.functional import cached_property from django.utils.html import format_html from django.utils.safestring import mark_safe from django.shortcuts import render, redirect from django.contrib.auth import get_user_model +from django.core.paginator import Paginator from 
django.core.exceptions import ValidationError from django.conf import settings from django import forms @@ -126,22 +129,99 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_ad archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin) +class AccelleratedPaginator(Paginator): + """ + Accelerated Paginator ignores DISTINCT when counting total number of rows. + Speeds up SELECT Count(*) on Admin views by >20x. + https://hakibenita.com/optimizing-the-django-admin-paginator + """ + + @cached_property + def count(self): + if self.object_list._has_filters(): + # fallback to normal count method on filtered queryset + return super().count + else: + # otherwise count total rows in a separate fast query + return self.object_list.model.objects.count() + + # Alternative approach for PostgreSQL: fallback count takes > 200ms + # from django.db import connection, transaction, OperationalError + # with transaction.atomic(), connection.cursor() as cursor: + # cursor.execute('SET LOCAL statement_timeout TO 200;') + # try: + # return super().count + # except OperationalError: + # return 9999999999999 + + class ArchiveResultInline(admin.TabularInline): name = 'Archive Results Log' model = ArchiveResult + parent_model = Snapshot # fk_name = 'snapshot' - extra = 1 - readonly_fields = ('result_id', 'start_ts', 'end_ts', 'extractor', 'command', 'cmd_version') - fields = ('id', *readonly_fields, 'status', 'output') + extra = 0 + sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version') + readonly_fields = ('result_id', 'completed', 'extractor', 'command', 'version') + fields = ('id', 'start_ts', 'end_ts', *readonly_fields, 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output') + # exclude = ('id',) + ordering = ('end_ts',) show_change_link = True # # classes = ['collapse'] # # list_display_links = ['abid'] + def get_parent_object_from_request(self, request): + resolved = 
resolve(request.path_info) + return self.parent_model.objects.get(pk=resolved.kwargs['object_id']) + + @admin.display( + description='Completed', + ordering='end_ts', + ) + def completed(self, obj): + return format_html('

{}

', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S')) + def result_id(self, obj): - return format_html('[{}]', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid) + return format_html('[{}]', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid) def command(self, obj): return format_html('{}', " ".join(obj.cmd or [])) + + def version(self, obj): + return format_html('{}', obj.cmd_version or '-') + + def get_formset(self, request, obj=None, **kwargs): + formset = super().get_formset(request, obj, **kwargs) + snapshot = self.get_parent_object_from_request(request) + + # import ipdb; ipdb.set_trace() + formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget() + + # default values for new entries + formset.form.base_fields['status'].initial = 'succeeded' + formset.form.base_fields['start_ts'].initial = timezone.now() + formset.form.base_fields['end_ts'].initial = timezone.now() + formset.form.base_fields['cmd_version'].initial = '-' + formset.form.base_fields['pwd'].initial = str(snapshot.link_dir) + formset.form.base_fields['created_by'].initial = request.user + formset.form.base_fields['cmd'] = forms.JSONField(initial=['-']) + formset.form.base_fields['output'].initial = 'Manually recorded cmd output...' 
+ + if obj is not None: + # hidden values for existing entries and new entries + formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget() + formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget() + formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget() + formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget() + formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget() + formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget() + return formset + + def get_readonly_fields(self, request, obj=None): + if obj is not None: + return self.readonly_fields + else: + return [] class TagInline(admin.TabularInline): @@ -222,25 +302,22 @@ def get_abid_info(self, obj): @admin.register(Snapshot, site=archivebox_admin) class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): - class Meta: - model = Snapshot - list_display = ('added', 'title_str', 'files', 'size', 'url_str') - # list_editable = ('title',) sort_fields = ('title_str', 'url_str', 'added', 'files') - readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir') + readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir') search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name') - list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags') + list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags__name') fields = ('url', 'created_by', 'title', *readonly_fields) ordering = ['-added'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] - 
autocomplete_fields = ['tags'] inlines = [TagInline, ArchiveResultInline] - list_per_page = CONFIG.SNAPSHOTS_PER_PAGE + list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000) action_form = SnapshotActionForm + paginator = AccelleratedPaginator save_on_top = True + show_full_result_count = False def changelist_view(self, request, extra_context=None): extra_context = extra_context or {} @@ -286,12 +363,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): ] return custom_urls + urls - def get_queryset(self, request): - self.request = request - return super().get_queryset(request).prefetch_related('tags', 'archiveresult_set').annotate(archiveresult_count=Count('archiveresult')) + # def get_queryset(self, request): + # # tags_qs = SnapshotTag.objects.all().select_related('tag') + # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs) + + # self.request = request + # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult')) def tag_list(self, obj): - return ', '.join(obj.tags.values_list('name', flat=True)) + return ', '.join(tag.name for tag in obj.tags.all()) # TODO: figure out a different way to do this, you cant nest forms so this doenst work # def action(self, obj): @@ -360,21 +440,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): ordering='title', ) def title_str(self, obj): - canon = obj.as_link().canonical_outputs() tags = ''.join( - format_html('{} ', tag.id, tag) + format_html('{} ', tag.pk, tag.name) for tag in obj.tags.all() - if str(tag).strip() + if str(tag.name).strip() ) return format_html( '' - '' + '' '' '' '{}' '', obj.archive_path, - obj.archive_path, canon['favicon_path'], + obj.archive_path, obj.archive_path, 'fetched' if obj.latest_title or obj.title else 'pending', urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...' 
@@ -382,14 +461,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): @admin.display( description='Files Saved', - ordering='archiveresult_count', + # ordering='archiveresult_count', ) def files(self, obj): return snapshot_icons(obj) @admin.display( - ordering='archiveresult_count' + # ordering='archiveresult_count' ) def size(self, obj): archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size @@ -536,6 +615,8 @@ class TagAdmin(ABIDModelAdmin): actions = ['delete_selected'] ordering = ['-created'] + paginator = AccelleratedPaginator + def API(self, obj): try: return get_abid_info(self, obj) @@ -574,6 +655,8 @@ class ArchiveResultAdmin(ABIDModelAdmin): list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') ordering = ['-start_ts'] list_per_page = CONFIG.SNAPSHOTS_PER_PAGE + + paginator = AccelleratedPaginator @admin.display( description='Snapshot Info' diff --git a/archivebox/core/models.py b/archivebox/core/models.py index c9266bd9..bed9e65a 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -125,6 +125,12 @@ class SnapshotTag(models.Model): db_table = 'core_snapshot_tags' unique_together = [('snapshot', 'tag')] + +class SnapshotManager(models.Manager): + def get_queryset(self): + return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct() + + class Snapshot(ABIDModel): abid_prefix = 'snp_' abid_ts_src = 'self.added' @@ -150,6 +156,8 @@ class Snapshot(ABIDModel): archiveresult_set: models.Manager['ArchiveResult'] + objects = SnapshotManager() + @property def uuid(self): return self.id @@ -177,8 +185,7 @@ class Snapshot(ABIDModel): def as_json(self, *args) -> dict: args = args or self.keys return { - key: getattr(self, key) - if key != 'tags' else self.tags_str() + key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False) for key in args } @@ -190,8 +197,14 @@ class Snapshot(ABIDModel): 
return load_link_details(self.as_link()) def tags_str(self, nocache=True) -> str | None: + calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all())) cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags' - calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True)) + + if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache: + # tags are pre-fetched already, use them directly (best because db is always freshest) + tags_str = calc_tags_str() + return tags_str + if nocache: tags_str = calc_tags_str() cache.set(cache_key, tags_str) @@ -234,7 +247,10 @@ class Snapshot(ABIDModel): @cached_property def num_outputs(self) -> int: - return self.archiveresult_set.filter(status='succeeded').count() + # DONT DO THIS: it will trigger a separate query for every snapshot + # return self.archiveresult_set.filter(status='succeeded').count() + # this is better: + return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded')) @cached_property def base_url(self): @@ -262,10 +278,21 @@ class Snapshot(ABIDModel): @cached_property def thumbnail_url(self) -> Optional[str]: - result = self.archiveresult_set.filter( - extractor='screenshot', - status='succeeded' - ).only('output').last() + if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache: + result = (sorted( + ( + result + for result in self.archiveresult_set.all() + if result.extractor == 'screenshot' and result.status =='succeeded' and result.output + ), + key=lambda result: result.created, + ) or [None])[-1] + else: + result = self.archiveresult_set.filter( + extractor='screenshot', + status='succeeded' + ).only('output').last() + if result: return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}']) return None @@ -292,6 +319,21 @@ class Snapshot(ABIDModel): if self.title: return self.title # whoopdedoo that was easy + # check if 
ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again + if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache: + try: + return (sorted( + ( + result.output.strip() + for result in self.archiveresult_set.all() + if result.extractor == 'title' and result.status =='succeeded' and result.output + ), + key=lambda title: len(title), + ) or [None])[-1] + except IndexError: + pass + + try: # take longest successful title from ArchiveResult db history return sorted( @@ -355,12 +397,23 @@ class Snapshot(ABIDModel): class ArchiveResultManager(models.Manager): def indexable(self, sorted: bool = True): + """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)""" + INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] - qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded') + qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded') if sorted: - precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] - qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence') + precedence = [ + When(extractor=method, then=Value(precedence)) + for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE + ] + qs = qs.annotate( + indexing_precedence=Case( + *precedence, + default=Value(1000), + output_field=IntegerField() + ) + ).order_by('indexing_precedence') return qs class ArchiveResult(ABIDModel): diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index cd72be4e..c97b2f28 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -197,7 +197,7 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]: @enforce_types -def wget_output_path(link: Link) -> Optional[str]: 
+def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]: """calculate the path to the wgetted .html file, since wget may adjust some paths to be different than the base_url path. @@ -245,6 +245,15 @@ def wget_output_path(link: Link) -> Optional[str]: # https://example.com/abc/test/?v=zzVa_tX1OiI # > example.com/abc/test/index.html@v=zzVa_tX1OiI.html + cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path' + + if not nocache: + from django.core.cache import cache + cached_result = cache.get(cache_key) + if cached_result: + return cached_result + + # There's also lots of complexity around how the urlencoding and renaming # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc, # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than @@ -271,6 +280,8 @@ def wget_output_path(link: Link) -> Optional[str]: output_path = None if output_path: + if not nocache: + cache.set(cache_key, output_path) return output_path # fallback to just the domain dir diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 339f9429..2e5d18bc 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -124,7 +124,15 @@ def snapshot_icons(snapshot) -> str: from core.models import ArchiveResult # start = datetime.now(timezone.utc) - archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) + if hasattr(snapshot, '_prefetched_objects_cache') and 'archiveresult_set' in snapshot._prefetched_objects_cache: + archive_results = [ + result + for result in snapshot.archiveresult_set.all() + if result.status == "succeeded" and result.output + ] + else: + archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) + link = snapshot.as_link() path = link.archive_path canon = link.canonical_outputs()