From c565fad75cf5f6256a0ce70febb7c2246cbd1b42 Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 12 Nov 2020 11:37:56 -0500 Subject: [PATCH] feat: Use prefetch related to reduce the number of queries to the database on public index view --- archivebox/core/utils.py | 83 +++++++++++++--------------------------- archivebox/core/views.py | 1 + 2 files changed, 27 insertions(+), 57 deletions(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 6266024b..14c40eaf 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -1,15 +1,16 @@ from django.utils.html import format_html from core.models import Snapshot, EXTRACTORS +from core.settings import DEBUG from pathlib import Path def get_icons(snapshot: Snapshot) -> str: - archive_results = snapshot.archiveresult_set + archive_results = list(snapshot.archiveresult_set.all()) link = snapshot.as_link() canon = link.canonical_outputs() output = "" - output_template = '{} ' + output_template = '{} ' icons = { "singlefile": "❶", "wget": "🆆", @@ -27,62 +28,30 @@ def get_icons(snapshot: Snapshot) -> str: # Missing specific entry for WARC for extractor, _ in EXTRACTORS: - result = archive_results.filter(extractor=extractor, status="succeeded") - path, exists = link.archive_path, result.exists() - try: - if extractor not in exclude: - output += output_template.format(path, canon[f"{extractor}_path"], - exists, extractor, icons.get(extractor, "?")) - if extractor == "wget": - # warc isn't technically it's own extractor, so we have to add it after wget - exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) - if exists: - output += output_template.format(exists[0], "", - True, "warc", icons.get("warc", "?")) + for result in archive_results: + if result.extractor != extractor or result.status != "succeeded": + continue + path = link.archive_path + try: + if extractor not in exclude: + output += output_template.format(path, canon[f"{extractor}_path"], + extractor, icons.get(extractor, "?")) + if extractor == "wget": + # warc isn't technically it's own extractor, so we have to add it after wget + exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) + if exists: + output += output_template.format(exists[0], "", + "warc", icons.get("warc", "?")) - if extractor == "archive_org" and exists: - # The check for archive_org is different, so it has to be handled separately - target_path = Path(path) / "archive.org.txt" - exists = target_path.exists() - if exists: - output += '{} '.format(canon["archive_org_path"], - True, "archive_org", icons.get("archive_org", "?")) + if extractor == "archive_org": + # The check for archive_org is different, so it has to be handled separately + target_path = Path(path) / "archive.org.txt" + exists = target_path.exists() + if exists: + output += '{} '.format(canon["archive_org_path"], + "archive_org", icons.get("archive_org", "?")) - except Exception as e: - print(e) + except Exception as e: + print(e) return format_html(f'{output}') - -#def get_icons(snapshot: Snapshot) -> str: -# link = snapshot.as_link() -# canon = link.canonical_outputs() -# out_dir = Path(link.link_dir) -# -# # slow version: highlights icons based on whether files exist or not for that output -# # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) -# # fast version: all icons are highlighted without checking for outputs in filesystem -# link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists()) -# -# return format_html( -# '' -# '' -# '🆆 ' -# '🅷 ' -# '📄 ' -# '💻 ' -# '📦 ' -# '📼 ' -# '🅶 ' -# '🏛 ' -# '', -# *link_tuple(link, 'singlefile_path'), -# *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')), -# *link_tuple(link, 'pdf_path'), -# *link_tuple(link, 'screenshot_path'), -# *link_tuple(link, 'dom_path'), -# *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')), -# *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')), -# *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), -# canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(), -# ) -# diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 7cd8b104..ee540821 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -98,6 +98,7 @@ class PublicArchiveView(ListView): query = self.request.GET.get('q') if query: qs = Snapshot.objects.filter(title__icontains=query) + qs = qs.prefetch_related("archiveresult_set").all() for snapshot in qs: snapshot.icons = get_icons(snapshot) return qs