mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 14:44:18 +00:00
Speed up the Snapshot handling view and show an index page when extractor output is missing or multiple snapshots are returned
This commit is contained in:
parent
0375853683
commit
b6d7c74680
1 changed file with 144 additions and 37 deletions
|
@ -4,8 +4,8 @@ from io import StringIO
|
||||||
from contextlib import redirect_stdout
|
from contextlib import redirect_stdout
|
||||||
|
|
||||||
from django.shortcuts import render, redirect
|
from django.shortcuts import render, redirect
|
||||||
|
from django.http import HttpResponse, Http404
|
||||||
from django.http import HttpResponse
|
from django.utils.html import format_html, mark_safe
|
||||||
from django.views import View, static
|
from django.views import View, static
|
||||||
from django.views.generic.list import ListView
|
from django.views.generic.list import ListView
|
||||||
from django.views.generic import FormView
|
from django.views.generic import FormView
|
||||||
|
@ -44,10 +44,6 @@ class SnapshotView(View):
|
||||||
# render static html index from filesystem archive/<timestamp>/index.html
|
# render static html index from filesystem archive/<timestamp>/index.html
|
||||||
|
|
||||||
def get(self, request, path):
|
def get(self, request, path):
|
||||||
# missing trailing slash -> redirect to index
|
|
||||||
if '/' not in path:
|
|
||||||
return redirect(f'{path}/index.html')
|
|
||||||
|
|
||||||
if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
|
if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
|
||||||
return redirect(f'/admin/login/?next={request.path}')
|
return redirect(f'/admin/login/?next={request.path}')
|
||||||
|
|
||||||
|
@ -56,41 +52,152 @@ class SnapshotView(View):
|
||||||
except (IndexError, ValueError):
|
except (IndexError, ValueError):
|
||||||
slug, archivefile = path.split('/', 1)[0], 'index.html'
|
slug, archivefile = path.split('/', 1)[0], 'index.html'
|
||||||
|
|
||||||
all_pages = list(Snapshot.objects.all())
|
|
||||||
|
|
||||||
# slug is a timestamp
|
# slug is a timestamp
|
||||||
by_ts = {page.timestamp: page for page in all_pages}
|
if slug.replace('.','').isdigit():
|
||||||
try:
|
|
||||||
# print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path)
|
|
||||||
response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True)
|
|
||||||
response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"'
|
|
||||||
return response
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# slug is a hash
|
# missing trailing slash -> redirect to index
|
||||||
by_hash = {page.url_hash: page for page in all_pages}
|
if '/' not in path:
|
||||||
try:
|
return redirect(f'{path}/index.html')
|
||||||
timestamp = by_hash[slug].timestamp
|
|
||||||
return redirect(f'/archive/{timestamp}/{archivefile}')
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
snapshot = Snapshot.objects.get(timestamp=slug)
|
||||||
|
except Snapshot.DoesNotExist:
|
||||||
|
if Snapshot.objects.filter(timestamp__startswith=slug).exists():
|
||||||
|
raise Snapshot.MultipleObjectsReturned
|
||||||
|
response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
|
||||||
|
response["Link"] = f'<{snapshot.url}>; rel="canonical"'
|
||||||
|
return response
|
||||||
|
except Snapshot.DoesNotExist:
|
||||||
|
# Snapshot does not exist
|
||||||
|
return HttpResponse(
|
||||||
|
format_html(
|
||||||
|
(
|
||||||
|
'<center><br/><br/><br/>'
|
||||||
|
'No Snapshots match the given timestamp: <code>{}</code><br/><br/>'
|
||||||
|
'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
|
||||||
|
'</center>'
|
||||||
|
),
|
||||||
|
slug,
|
||||||
|
path,
|
||||||
|
),
|
||||||
|
content_type="text/html",
|
||||||
|
status=404,
|
||||||
|
)
|
||||||
|
except Snapshot.MultipleObjectsReturned:
|
||||||
|
snapshot_hrefs = mark_safe('<br/>').join(
|
||||||
|
format_html(
|
||||||
|
'{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
|
||||||
|
snap.added.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
snap.timestamp,
|
||||||
|
snap.timestamp,
|
||||||
|
snap.url,
|
||||||
|
snap.title or '',
|
||||||
|
)
|
||||||
|
for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added')
|
||||||
|
)
|
||||||
|
return HttpResponse(
|
||||||
|
format_html(
|
||||||
|
(
|
||||||
|
'Multiple Snapshots match the given timestamp <code>{}</code><br/><pre>'
|
||||||
|
),
|
||||||
|
slug,
|
||||||
|
) + snapshot_hrefs + format_html(
|
||||||
|
(
|
||||||
|
'</pre><br/>'
|
||||||
|
'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
content_type="text/html",
|
||||||
|
status=404,
|
||||||
|
)
|
||||||
|
except Http404:
|
||||||
|
# Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
|
||||||
|
return HttpResponse(
|
||||||
|
format_html(
|
||||||
|
(
|
||||||
|
'<center><br/><br/><br/>'
|
||||||
|
'<a href="/archive/{}/index.html" target="_top">Snapshot <b><code>{}</code></b></a> exists but no file or folder <b><code>/{}</code></b> exists within.<br/><br/>'
|
||||||
|
'<small>Maybe this output type is not availabe for this URL,<br/>or the archiving process has not completed for this Snapshot yet?<br/>'
|
||||||
|
'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {}</code></pre></small><br/><br/>'
|
||||||
|
'You can go back to the <a href="/archive/{}/index.html" target="_top">Snapshot <b><code>{}</code></b></a> detail page, or return to the <a href="/" target="_top">Main Index</a>'
|
||||||
|
'</center>'
|
||||||
|
),
|
||||||
|
snapshot.timestamp,
|
||||||
|
snapshot.timestamp,
|
||||||
|
archivefile,
|
||||||
|
snapshot.timestamp,
|
||||||
|
snapshot.timestamp,
|
||||||
|
snapshot.timestamp,
|
||||||
|
),
|
||||||
|
content_type="text/html",
|
||||||
|
status=404,
|
||||||
|
)
|
||||||
# slug is a URL
|
# slug is a URL
|
||||||
by_url = {page.base_url: page for page in all_pages}
|
else:
|
||||||
try:
|
try:
|
||||||
# TODO: add multiple snapshot support by showing index of all snapshots
|
try:
|
||||||
# for given url instead of redirecting to timestamp index
|
# try exact match on full url first
|
||||||
timestamp = by_url[base_url(path)].timestamp
|
snapshot = Snapshot.objects.get(
|
||||||
return redirect(f'/archive/{timestamp}/index.html')
|
Q(url='http://' + path) | Q(url='https://' + path)
|
||||||
except KeyError:
|
)
|
||||||
pass
|
except Snapshot.DoesNotExist:
|
||||||
|
# fall back to match on exact base_url
|
||||||
return HttpResponse(
|
try:
|
||||||
'No archived link matches the given timestamp or hash.',
|
snapshot = Snapshot.objects.get(
|
||||||
content_type="text/plain",
|
Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
|
||||||
status=404,
|
)
|
||||||
)
|
except Snapshot.DoesNotExist:
|
||||||
|
# fall back to matching base_url as prefix
|
||||||
|
snapshot = Snapshot.objects.get(
|
||||||
|
Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
|
||||||
|
)
|
||||||
|
return redirect(f'/archive/{snapshot.timestamp}/index.html')
|
||||||
|
except Snapshot.DoesNotExist:
|
||||||
|
return HttpResponse(
|
||||||
|
format_html(
|
||||||
|
(
|
||||||
|
'<center><br/><br/><br/>'
|
||||||
|
'No Snapshots match the given url: <code>{}</code><br/><br/>'
|
||||||
|
'You can <a href="/add/?url=https://{}" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
|
||||||
|
'</center>'
|
||||||
|
),
|
||||||
|
base_url(path),
|
||||||
|
path,
|
||||||
|
),
|
||||||
|
content_type="text/html",
|
||||||
|
status=404,
|
||||||
|
)
|
||||||
|
except Snapshot.MultipleObjectsReturned:
|
||||||
|
snapshot_hrefs = mark_safe('<br/>').join(
|
||||||
|
format_html(
|
||||||
|
'{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
|
||||||
|
snap.added.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
snap.timestamp,
|
||||||
|
snap.timestamp,
|
||||||
|
snap.url,
|
||||||
|
snap.title or '',
|
||||||
|
)
|
||||||
|
for snap in Snapshot.objects.filter(
|
||||||
|
Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
|
||||||
|
).only('url', 'timestamp', 'title', 'added').order_by('-added')
|
||||||
|
)
|
||||||
|
return HttpResponse(
|
||||||
|
format_html(
|
||||||
|
(
|
||||||
|
'Multiple Snapshots match the given URL <code>{}</code><br/><pre>'
|
||||||
|
),
|
||||||
|
base_url(path),
|
||||||
|
) + snapshot_hrefs + format_html(
|
||||||
|
(
|
||||||
|
'</pre><br/>'
|
||||||
|
'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
content_type="text/html",
|
||||||
|
status=404,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class PublicIndexView(ListView):
|
class PublicIndexView(ListView):
|
||||||
template_name = 'public_index.html'
|
template_name = 'public_index.html'
|
||||||
|
|
Loading…
Reference in a new issue