mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2025-02-16 13:28:29 +00:00
massively improve Snapshot admin list view query performance
This commit is contained in:
parent
6c4f3fc83a
commit
24fe958ff3
5 changed files with 194 additions and 39 deletions
|
@ -103,7 +103,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
|
'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
|
||||||
'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
|
'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
|
||||||
'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
|
'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
|
||||||
'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
|
'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 100},
|
||||||
'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
|
'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
|
||||||
'TIME_ZONE': {'type': str, 'default': 'UTC'},
|
'TIME_ZONE': {'type': str, 'default': 'UTC'},
|
||||||
'TIMEZONE': {'type': str, 'default': 'UTC'},
|
'TIMEZONE': {'type': str, 'default': 'UTC'},
|
||||||
|
|
|
@ -10,12 +10,15 @@ from datetime import datetime, timezone
|
||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
|
|
||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
from django.db.models import Count, Q
|
from django.db.models import Count, Q, Prefetch
|
||||||
from django.urls import path, reverse
|
from django.urls import path, reverse, resolve
|
||||||
|
from django.utils import timezone
|
||||||
|
from django.utils.functional import cached_property
|
||||||
from django.utils.html import format_html
|
from django.utils.html import format_html
|
||||||
from django.utils.safestring import mark_safe
|
from django.utils.safestring import mark_safe
|
||||||
from django.shortcuts import render, redirect
|
from django.shortcuts import render, redirect
|
||||||
from django.contrib.auth import get_user_model
|
from django.contrib.auth import get_user_model
|
||||||
|
from django.core.paginator import Paginator
|
||||||
from django.core.exceptions import ValidationError
|
from django.core.exceptions import ValidationError
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django import forms
|
from django import forms
|
||||||
|
@ -126,22 +129,99 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_ad
|
||||||
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
|
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
|
||||||
|
|
||||||
|
|
||||||
|
class AccelleratedPaginator(Paginator):
    """
    Accelerated Paginator that avoids the expensive COUNT over the full
    changelist query when the queryset has no filters applied.

    Speeds up SELECT Count(*) on Admin views by >20x.
    https://hakibenita.com/optimizing-the-django-admin-paginator
    """

    @cached_property
    def count(self):
        """
        Total number of rows to paginate over.

        Unfiltered querysets are counted with a plain ``model.objects.count()``.
        Filtered querysets, and any ``object_list`` that is not a QuerySet
        (e.g. a plain list), use the stock ``Paginator.count``.
        """
        # `_has_filters` is a private QuerySet method; look it up defensively so a
        # non-QuerySet object_list falls back to the default count instead of
        # raising AttributeError.
        has_filters = getattr(self.object_list, '_has_filters', None)
        if has_filters is None or has_filters():
            # fallback to normal count method on filtered queryset
            return super().count

        # otherwise count total rows in a separate fast query
        return self.object_list.model.objects.count()

        # Alternative approach for PostgreSQL: fallback count takes > 200ms
        # from django.db import connection, transaction, OperationalError
        # with transaction.atomic(), connection.cursor() as cursor:
        #     cursor.execute('SET LOCAL statement_timeout TO 200;')
        #     try:
        #         return super().count
        #     except OperationalError:
        #         return 9999999999999
|
||||||
|
|
||||||
|
|
||||||
class ArchiveResultInline(admin.TabularInline):
    """Tabular inline listing the ArchiveResult rows of a Snapshot on its admin page."""

    name = 'Archive Results Log'
    model = ArchiveResult
    parent_model = Snapshot
    # fk_name = 'snapshot'
    extra = 0
    sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version')
    readonly_fields = ('result_id', 'completed', 'extractor', 'command', 'version')
    fields = ('id', 'start_ts', 'end_ts', *readonly_fields, 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output')
    # exclude = ('id',)
    ordering = ('end_ts',)
    show_change_link = True
    # # classes = ['collapse']
    # # list_display_links = ['abid']

    def get_parent_object_from_request(self, request):
        """
        Load the parent object whose pk is the ``object_id`` kwarg of the resolved request URL.

        Raises KeyError when the URL carries no ``object_id`` and the model's
        DoesNotExist when no row matches.
        """
        resolved = resolve(request.path_info)
        return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])

    @admin.display(
        description='Completed',
        ordering='end_ts',
    )
    def completed(self, obj):
        """Render the result's end timestamp on a single non-wrapping line."""
        return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))

    def result_id(self, obj):
        """Render the result's ABID as a link to its own admin change page."""
        return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)

    def command(self, obj):
        """Render the stored command argv as one space-joined string."""
        return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))

    def version(self, obj):
        """Render the command version, or '-' when none is recorded."""
        return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')

    def get_formset(self, request, obj=None, **kwargs):
        """
        Build the inline formset, pre-filling defaults for manually added
        results and hiding bookkeeping fields when editing an existing parent.
        """
        formset = super().get_formset(request, obj, **kwargs)
        base_fields = formset.form.base_fields

        # Prefer the parent object handed in by the admin: it avoids a second
        # query and works on URLs that carry no object_id (e.g. the add view).
        if obj is not None:
            snapshot = obj
        else:
            try:
                snapshot = self.get_parent_object_from_request(request)
            except (KeyError, self.parent_model.DoesNotExist):
                snapshot = None

        base_fields['id'].widget = base_fields['id'].hidden_widget()

        # default values for new entries
        now = timezone.now()
        base_fields['status'].initial = 'succeeded'
        base_fields['start_ts'].initial = now
        base_fields['end_ts'].initial = now
        base_fields['cmd_version'].initial = '-'
        if snapshot is not None:
            base_fields['pwd'].initial = str(snapshot.link_dir)
        base_fields['created_by'].initial = request.user
        base_fields['cmd'] = forms.JSONField(initial=['-'])
        base_fields['output'].initial = 'Manually recorded cmd output...'

        if obj is not None:
            # hidden values for existing entries and new entries
            for field_name in ('start_ts', 'end_ts', 'cmd', 'pwd', 'created_by', 'cmd_version'):
                base_fields[field_name].widget = base_fields[field_name].hidden_widget()
        return formset

    def get_readonly_fields(self, request, obj=None):
        """Return the configured readonly fields when a parent object is given, otherwise none."""
        if obj is not None:
            return self.readonly_fields
        else:
            return []
|
||||||
|
|
||||||
|
|
||||||
class TagInline(admin.TabularInline):
|
class TagInline(admin.TabularInline):
|
||||||
|
@ -222,25 +302,22 @@ def get_abid_info(self, obj):
|
||||||
|
|
||||||
@admin.register(Snapshot, site=archivebox_admin)
|
@admin.register(Snapshot, site=archivebox_admin)
|
||||||
class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
|
class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
|
||||||
class Meta:
|
|
||||||
model = Snapshot
|
|
||||||
|
|
||||||
list_display = ('added', 'title_str', 'files', 'size', 'url_str')
|
list_display = ('added', 'title_str', 'files', 'size', 'url_str')
|
||||||
# list_editable = ('title',)
|
|
||||||
sort_fields = ('title_str', 'url_str', 'added', 'files')
|
sort_fields = ('title_str', 'url_str', 'added', 'files')
|
||||||
readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
|
readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
|
||||||
search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name')
|
search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name')
|
||||||
list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags')
|
list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags__name')
|
||||||
fields = ('url', 'created_by', 'title', *readonly_fields)
|
fields = ('url', 'created_by', 'title', *readonly_fields)
|
||||||
ordering = ['-added']
|
ordering = ['-added']
|
||||||
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
|
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
|
||||||
autocomplete_fields = ['tags']
|
|
||||||
inlines = [TagInline, ArchiveResultInline]
|
inlines = [TagInline, ArchiveResultInline]
|
||||||
list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
|
list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000)
|
||||||
|
|
||||||
action_form = SnapshotActionForm
|
action_form = SnapshotActionForm
|
||||||
|
paginator = AccelleratedPaginator
|
||||||
|
|
||||||
save_on_top = True
|
save_on_top = True
|
||||||
|
show_full_result_count = False
|
||||||
|
|
||||||
def changelist_view(self, request, extra_context=None):
|
def changelist_view(self, request, extra_context=None):
|
||||||
extra_context = extra_context or {}
|
extra_context = extra_context or {}
|
||||||
|
@ -286,12 +363,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
|
||||||
]
|
]
|
||||||
return custom_urls + urls
|
return custom_urls + urls
|
||||||
|
|
||||||
def get_queryset(self, request):
|
# def get_queryset(self, request):
|
||||||
self.request = request
|
# # tags_qs = SnapshotTag.objects.all().select_related('tag')
|
||||||
return super().get_queryset(request).prefetch_related('tags', 'archiveresult_set').annotate(archiveresult_count=Count('archiveresult'))
|
# # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
|
||||||
|
|
||||||
|
# self.request = request
|
||||||
|
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
|
||||||
|
|
||||||
def tag_list(self, obj):
    """Return the names of all of obj's tags as a single ', '-separated string."""
    # iterate obj.tags.all() rather than values_list() so an already
    # prefetched tag set can be reused
    names = [tag.name for tag in obj.tags.all()]
    return ', '.join(names)
|
||||||
|
|
||||||
# TODO: figure out a different way to do this, you can't nest forms so this doesn't work
|
# TODO: figure out a different way to do this, you can't nest forms so this doesn't work
|
||||||
# def action(self, obj):
|
# def action(self, obj):
|
||||||
|
@ -360,21 +440,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
|
||||||
ordering='title',
|
ordering='title',
|
||||||
)
|
)
|
||||||
def title_str(self, obj):
|
def title_str(self, obj):
|
||||||
canon = obj.as_link().canonical_outputs()
|
|
||||||
tags = ''.join(
|
tags = ''.join(
|
||||||
format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
|
format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.pk, tag.name)
|
||||||
for tag in obj.tags.all()
|
for tag in obj.tags.all()
|
||||||
if str(tag).strip()
|
if str(tag.name).strip()
|
||||||
)
|
)
|
||||||
return format_html(
|
return format_html(
|
||||||
'<a href="/{}">'
|
'<a href="/{}">'
|
||||||
'<img src="/{}/{}" class="favicon" onerror="this.remove()">'
|
'<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
|
||||||
'</a>'
|
'</a>'
|
||||||
'<a href="/{}/index.html">'
|
'<a href="/{}/index.html">'
|
||||||
'<b class="status-{}">{}</b>'
|
'<b class="status-{}">{}</b>'
|
||||||
'</a>',
|
'</a>',
|
||||||
obj.archive_path,
|
obj.archive_path,
|
||||||
obj.archive_path, canon['favicon_path'],
|
obj.archive_path,
|
||||||
obj.archive_path,
|
obj.archive_path,
|
||||||
'fetched' if obj.latest_title or obj.title else 'pending',
|
'fetched' if obj.latest_title or obj.title else 'pending',
|
||||||
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
|
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
|
||||||
|
@ -382,14 +461,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
|
||||||
|
|
||||||
@admin.display(description='Files Saved')  # ordering='archiveresult_count' is disabled
def files(self, obj):
    """Render the 'Files Saved' column by delegating to snapshot_icons()."""
    icons = snapshot_icons(obj)
    return icons
|
||||||
|
|
||||||
|
|
||||||
@admin.display(
|
@admin.display(
|
||||||
ordering='archiveresult_count'
|
# ordering='archiveresult_count'
|
||||||
)
|
)
|
||||||
def size(self, obj):
|
def size(self, obj):
|
||||||
archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
|
archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
|
||||||
|
@ -536,6 +615,8 @@ class TagAdmin(ABIDModelAdmin):
|
||||||
actions = ['delete_selected']
|
actions = ['delete_selected']
|
||||||
ordering = ['-created']
|
ordering = ['-created']
|
||||||
|
|
||||||
|
paginator = AccelleratedPaginator
|
||||||
|
|
||||||
def API(self, obj):
|
def API(self, obj):
|
||||||
try:
|
try:
|
||||||
return get_abid_info(self, obj)
|
return get_abid_info(self, obj)
|
||||||
|
@ -574,6 +655,8 @@ class ArchiveResultAdmin(ABIDModelAdmin):
|
||||||
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
|
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
|
||||||
ordering = ['-start_ts']
|
ordering = ['-start_ts']
|
||||||
list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
|
list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
|
||||||
|
|
||||||
|
paginator = AccelleratedPaginator
|
||||||
|
|
||||||
@admin.display(
|
@admin.display(
|
||||||
description='Snapshot Info'
|
description='Snapshot Info'
|
||||||
|
|
|
@ -125,6 +125,12 @@ class SnapshotTag(models.Model):
|
||||||
db_table = 'core_snapshot_tags'
|
db_table = 'core_snapshot_tags'
|
||||||
unique_together = [('snapshot', 'tag')]
|
unique_together = [('snapshot', 'tag')]
|
||||||
|
|
||||||
|
|
||||||
|
class SnapshotManager(models.Manager):
    """Manager whose querysets always prefetch each Snapshot's tags and archive results."""

    def get_queryset(self):
        """Return the base queryset with 'tags' and 'archiveresult_set' prefetched."""
        base_qs = super().get_queryset()
        # disabled: .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
        return base_qs.prefetch_related('tags', 'archiveresult_set')
|
||||||
|
|
||||||
|
|
||||||
class Snapshot(ABIDModel):
|
class Snapshot(ABIDModel):
|
||||||
abid_prefix = 'snp_'
|
abid_prefix = 'snp_'
|
||||||
abid_ts_src = 'self.added'
|
abid_ts_src = 'self.added'
|
||||||
|
@ -150,6 +156,8 @@ class Snapshot(ABIDModel):
|
||||||
|
|
||||||
archiveresult_set: models.Manager['ArchiveResult']
|
archiveresult_set: models.Manager['ArchiveResult']
|
||||||
|
|
||||||
|
objects = SnapshotManager()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def uuid(self):
|
def uuid(self):
|
||||||
return self.id
|
return self.id
|
||||||
|
@ -177,8 +185,7 @@ class Snapshot(ABIDModel):
|
||||||
def as_json(self, *args) -> dict:
    """
    Build a dict of the requested attributes of this object.

    With no arguments, the attribute names in ``self.keys`` are used.
    The 'tags' key is special-cased: its value comes from
    ``self.tags_str(nocache=False)`` instead of the raw attribute.
    """
    wanted = args or self.keys
    payload = {}
    for field_name in wanted:
        if field_name == 'tags':
            payload[field_name] = self.tags_str(nocache=False)
        else:
            payload[field_name] = getattr(self, field_name)
    return payload
|
||||||
|
|
||||||
|
@ -190,8 +197,14 @@ class Snapshot(ABIDModel):
|
||||||
return load_link_details(self.as_link())
|
return load_link_details(self.as_link())
|
||||||
|
|
||||||
def tags_str(self, nocache=True) -> str | None:
|
def tags_str(self, nocache=True) -> str | None:
|
||||||
|
calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
|
||||||
cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
|
cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
|
||||||
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
|
|
||||||
|
if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
|
||||||
|
# tags are pre-fetched already, use them directly (best because db is always freshest)
|
||||||
|
tags_str = calc_tags_str()
|
||||||
|
return tags_str
|
||||||
|
|
||||||
if nocache:
|
if nocache:
|
||||||
tags_str = calc_tags_str()
|
tags_str = calc_tags_str()
|
||||||
cache.set(cache_key, tags_str)
|
cache.set(cache_key, tags_str)
|
||||||
|
@ -234,7 +247,10 @@ class Snapshot(ABIDModel):
|
||||||
|
|
||||||
@cached_property
def num_outputs(self) -> int:
    """Count this snapshot's archive results whose status is 'succeeded'."""
    # DONT use self.archiveresult_set.filter(status='succeeded').count() here:
    # it will trigger a separate query for every snapshot.
    # Filtering the .all() results in Python is better.
    succeeded = [
        result
        for result in self.archiveresult_set.all()
        if result.status == 'succeeded'
    ]
    return len(succeeded)
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def base_url(self):
|
def base_url(self):
|
||||||
|
@ -262,10 +278,21 @@ class Snapshot(ABIDModel):
|
||||||
|
|
||||||
@cached_property
def thumbnail_url(self) -> Optional[str]:
    """
    Return the URL of a successful screenshot result for this snapshot, or None.

    When 'archiveresult_set' is already in the prefetch cache the results are
    filtered and ordered in Python (by ``created``, taking the last one);
    otherwise the database is queried for the last matching row.
    """
    if 'archiveresult_set' in getattr(self, '_prefetched_objects_cache', ()):
        screenshots = [
            candidate
            for candidate in self.archiveresult_set.all()
            if candidate.extractor == 'screenshot' and candidate.status == 'succeeded' and candidate.output
        ]
        screenshots.sort(key=lambda candidate: candidate.created)
        result = screenshots[-1] if screenshots else None
    else:
        matching = self.archiveresult_set.filter(
            extractor='screenshot',
            status='succeeded'
        )
        result = matching.only('output').last()

    if not result:
        return None
    return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
|
||||||
|
@ -292,6 +319,21 @@ class Snapshot(ABIDModel):
|
||||||
if self.title:
|
if self.title:
|
||||||
return self.title # whoopdedoo that was easy
|
return self.title # whoopdedoo that was easy
|
||||||
|
|
||||||
|
# check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
|
||||||
|
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
|
||||||
|
try:
|
||||||
|
return (sorted(
|
||||||
|
(
|
||||||
|
result.output.strip()
|
||||||
|
for result in self.archiveresult_set.all()
|
||||||
|
if result.extractor == 'title' and result.status =='succeeded' and result.output
|
||||||
|
),
|
||||||
|
key=lambda title: len(title),
|
||||||
|
) or [None])[-1]
|
||||||
|
except IndexError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# take longest successful title from ArchiveResult db history
|
# take longest successful title from ArchiveResult db history
|
||||||
return sorted(
|
return sorted(
|
||||||
|
@ -355,12 +397,23 @@ class Snapshot(ABIDModel):
|
||||||
|
|
||||||
class ArchiveResultManager(models.Manager):
    """Manager adding an `indexable()` query for ArchiveResults."""

    def indexable(self, sorted: bool = True):
        """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""

        method_names = [entry[0] for entry in ARCHIVE_METHODS_INDEXING_PRECEDENCE]
        results = self.get_queryset().filter(extractor__in=method_names, status='succeeded')

        if not sorted:
            return results

        # map each extractor to its configured precedence; unlisted ones sort last (1000)
        rank_cases = [
            When(extractor=name, then=Value(rank))
            for name, rank in ARCHIVE_METHODS_INDEXING_PRECEDENCE
        ]
        rank_expr = Case(
            *rank_cases,
            default=Value(1000),
            output_field=IntegerField()
        )
        return results.annotate(indexing_precedence=rank_expr).order_by('indexing_precedence')
|
||||||
|
|
||||||
class ArchiveResult(ABIDModel):
|
class ArchiveResult(ABIDModel):
|
||||||
|
|
|
@ -197,7 +197,7 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def wget_output_path(link: Link) -> Optional[str]:
|
def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]:
|
||||||
"""calculate the path to the wgetted .html file, since wget may
|
"""calculate the path to the wgetted .html file, since wget may
|
||||||
adjust some paths to be different than the base_url path.
|
adjust some paths to be different than the base_url path.
|
||||||
|
|
||||||
|
@ -245,6 +245,15 @@ def wget_output_path(link: Link) -> Optional[str]:
|
||||||
# https://example.com/abc/test/?v=zzVa_tX1OiI
|
# https://example.com/abc/test/?v=zzVa_tX1OiI
|
||||||
# > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
|
# > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
|
||||||
|
|
||||||
|
cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path'
|
||||||
|
|
||||||
|
if not nocache:
|
||||||
|
from django.core.cache import cache
|
||||||
|
cached_result = cache.get(cache_key)
|
||||||
|
if cached_result:
|
||||||
|
return cached_result
|
||||||
|
|
||||||
|
|
||||||
# There's also lots of complexity around how the urlencoding and renaming
|
# There's also lots of complexity around how the urlencoding and renaming
|
||||||
# is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
|
# is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
|
||||||
# unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
|
# unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
|
||||||
|
@ -271,6 +280,8 @@ def wget_output_path(link: Link) -> Optional[str]:
|
||||||
output_path = None
|
output_path = None
|
||||||
|
|
||||||
if output_path:
|
if output_path:
|
||||||
|
if not nocache:
|
||||||
|
cache.set(cache_key, output_path)
|
||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
# fallback to just the domain dir
|
# fallback to just the domain dir
|
||||||
|
|
|
@ -124,7 +124,15 @@ def snapshot_icons(snapshot) -> str:
|
||||||
from core.models import ArchiveResult
|
from core.models import ArchiveResult
|
||||||
# start = datetime.now(timezone.utc)
|
# start = datetime.now(timezone.utc)
|
||||||
|
|
||||||
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
|
if hasattr(snapshot, '_prefetched_objects_cache') and 'archiveresult_set' in snapshot._prefetched_objects_cache:
|
||||||
|
archive_results = [
|
||||||
|
result
|
||||||
|
for result in snapshot.archiveresult_set.all()
|
||||||
|
if result.status == "succeeded" and result.output
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
|
||||||
|
|
||||||
link = snapshot.as_link()
|
link = snapshot.as_link()
|
||||||
path = link.archive_path
|
path = link.archive_path
|
||||||
canon = link.canonical_outputs()
|
canon = link.canonical_outputs()
|
||||||
|
|
Loading…
Add table
Reference in a new issue