2019-05-01 03:13:21 +00:00
__package__ = ' archivebox.core '
2019-04-02 20:36:41 +00:00
2020-08-25 18:15:42 +00:00
from io import StringIO
from contextlib import redirect_stdout
2019-05-01 03:13:21 +00:00
from django . shortcuts import render , redirect
2021-02-16 01:52:08 +00:00
from django . http import HttpResponse , Http404
from django . utils . html import format_html , mark_safe
2019-05-01 03:13:21 +00:00
from django . views import View , static
2020-08-20 14:04:34 +00:00
from django . views . generic . list import ListView
2020-08-28 14:58:32 +00:00
from django . views . generic import FormView
2021-01-29 14:08:03 +00:00
from django . db . models import Q
2020-08-28 14:58:32 +00:00
from django . contrib . auth . mixins import UserPassesTestMixin
2021-07-02 00:55:51 +00:00
from django . views . decorators . csrf import csrf_exempt
from django . utils . decorators import method_decorator
2020-08-20 20:43:28 +00:00
2019-05-01 03:44:51 +00:00
from core . models import Snapshot
2020-08-25 18:15:42 +00:00
from core . forms import AddLinkForm
2019-05-02 23:15:16 +00:00
from . . config import (
OUTPUT_DIR ,
PUBLIC_INDEX ,
PUBLIC_SNAPSHOTS ,
2020-11-28 06:29:34 +00:00
PUBLIC_ADD_VIEW ,
VERSION ,
FOOTER_INFO ,
2021-02-16 01:42:00 +00:00
SNAPSHOTS_PER_PAGE ,
2019-05-02 23:15:16 +00:00
)
2021-04-10 09:13:56 +00:00
from . . main import add
2020-08-25 18:15:42 +00:00
from . . util import base_url , ansi_to_html
2021-04-10 09:13:56 +00:00
from . . search import query_search_index
2020-07-02 20:54:25 +00:00
2019-04-17 09:42:21 +00:00
2021-01-30 10:35:07 +00:00
class HomepageView(View):
    """Entry-point view that only redirects; it never renders a page itself."""

    def get(self, request):
        """Redirect to the admin snapshot list, the public index, or the login page.

        Logged-in users go to the admin snapshot list. Anonymous users go to
        the public index when PUBLIC_INDEX is enabled, otherwise to the admin
        login page with ``next`` set to the requested path.
        """
        if request.user.is_authenticated:
            destination = '/admin/core/snapshot/'
        elif PUBLIC_INDEX:
            destination = '/public'
        else:
            destination = f'/admin/login/?next={request.path}'

        return redirect(destination)
2019-04-22 23:08:01 +00:00
2021-01-30 10:35:07 +00:00
class SnapshotView(View):
    """Serve files out of a Snapshot's directory, or resolve a URL/UUID to a Snapshot.

    The first segment of ``path`` (the "slug") decides the behaviour:

    * digits (optionally containing dots): treated as a Snapshot timestamp or
      id prefix, and the rest of ``path`` is served as a static file from that
      Snapshot's ``link_dir``;
    * anything else: treated as a URL (or id prefix) and redirected to
      ``/archive/<timestamp>/index.html`` of the matching Snapshot.

    Every failure mode (no match, several matches, file missing) produces a
    small HTML page with status 404.
    """

    # render static html index from filesystem archive/<timestamp>/index.html

    def get(self, request, path):
        """Dispatch a request for ``path`` to the timestamp or URL handler.

        Anonymous users are sent to the admin login page unless
        PUBLIC_SNAPSHOTS is enabled.
        """
        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        # "<slug>/<file...>"; a bare "<slug>" falls back to index.html
        slug, separator, remainder = path.partition('/')
        archivefile = remainder if separator else 'index.html'

        # slug is a timestamp
        if slug.replace('.', '').isdigit():
            return self._serve_by_timestamp(request, path, slug, archivefile)

        # slug is a URL
        return self._redirect_by_url(path)

    def _serve_by_timestamp(self, request, path, slug, archivefile):
        """Serve ``archivefile`` from the Snapshot matching ``slug`` (timestamp or id prefix)."""
        # missing trailing slash -> redirect to index
        if '/' not in path:
            return redirect(f'{path}/index.html')

        # Everything that could have made the lookup ambiguous: the lookup
        # itself matches on id prefix, the fallback matches on timestamp prefix.
        candidates = Snapshot.objects.filter(
            Q(timestamp__startswith=slug) | Q(id__startswith=slug)
        )

        try:
            snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
        except Snapshot.MultipleObjectsReturned:
            return self._multiple_matches_response('timestamp/UUID', slug, candidates)
        except Snapshot.DoesNotExist:
            # A timestamp prefix that matches something is reported as
            # ambiguous so the user can pick the Snapshot they meant.
            if Snapshot.objects.filter(timestamp__startswith=slug).exists():
                return self._multiple_matches_response('timestamp/UUID', slug, candidates)
            return self._html_404(format_html(
                (
                    '<center><br/><br/><br/>'
                    'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
                    'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
                    '</center>'
                ),
                slug,
            ))

        try:
            response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
        except Http404:
            # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
            return self._missing_file_response(snapshot, archivefile)

        response["Link"] = f'<{snapshot.url}>; rel="canonical"'
        return response

    def _redirect_by_url(self, path):
        """Redirect to the index page of the Snapshot whose URL (or id prefix) matches ``path``."""
        try:
            snapshot = self._find_snapshot_by_url(path)
        except Snapshot.DoesNotExist:
            # Local import keeps this block self-contained; the value goes
            # into a query string, so it must be percent-encoded as well as
            # HTML-escaped (format_html only does the latter).
            from urllib.parse import quote

            add_url = path if '://' in path else f'https://{path}'
            return self._html_404(format_html(
                (
                    '<center><br/><br/><br/>'
                    'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
                    'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
                    '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
                    '</center>'
                ),
                base_url(path),
                quote(add_url, safe=''),
                path,
            ))
        except Snapshot.MultipleObjectsReturned:
            url_base = base_url(path)
            # Union of every lookup tried, so each Snapshot that made the
            # match ambiguous is actually listed.
            candidates = Snapshot.objects.filter(
                Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
                | Q(url__startswith='http://' + url_base) | Q(url__startswith='https://' + url_base)
            )
            return self._multiple_matches_response('URL', url_base, candidates)

        return redirect(f'/archive/{snapshot.timestamp}/index.html')

    @staticmethod
    def _find_snapshot_by_url(path):
        """Return the single Snapshot matching ``path``, trying progressively looser lookups.

        Raises Snapshot.DoesNotExist if nothing matches at the loosest level,
        and Snapshot.MultipleObjectsReturned as soon as any level is ambiguous.
        """
        # try exact match on full url first
        try:
            return Snapshot.objects.get(
                Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
            )
        except Snapshot.DoesNotExist:
            pass

        url_base = base_url(path)

        # fall back to match on exact base_url
        try:
            return Snapshot.objects.get(
                Q(url='http://' + url_base) | Q(url='https://' + url_base)
            )
        except Snapshot.DoesNotExist:
            pass

        # fall back to matching base_url as prefix
        return Snapshot.objects.get(
            Q(url__startswith='http://' + url_base) | Q(url__startswith='https://' + url_base)
        )

    @staticmethod
    def _html_404(html):
        """Wrap an HTML fragment in a 404 response."""
        return HttpResponse(html, content_type="text/html", status=404)

    @classmethod
    def _multiple_matches_response(cls, kind, query, candidates):
        """Build the 404 page listing every Snapshot in ``candidates``, most recently added first.

        ``kind`` is the human-readable name of what was searched
        ('timestamp/UUID' or 'URL') and ``query`` the value that was ambiguous.
        """
        snapshot_hrefs = mark_safe('<br/>'.join(
            format_html(
                '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                snap.added.strftime('%Y-%m-%d %H:%M:%S'),
                snap.timestamp,
                snap.timestamp,
                snap.url,
                snap.title or '',
            )
            for snap in candidates.only('url', 'timestamp', 'title', 'added').order_by('-added')
        ))
        # Each row is already escaped by format_html above, so the joined
        # listing is safe to insert as-is.
        return cls._html_404(format_html(
            (
                'Multiple Snapshots match the given {} <code>{}</code><br/><pre>'
                '{}'
                '</pre><br/>'
                'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
            ),
            kind,
            query,
            snapshot_hrefs,
        ))

    @classmethod
    def _missing_file_response(cls, snapshot, archivefile):
        """Build the 404 page for a Snapshot that exists but lacks ``archivefile``."""
        # All dynamic values go through format_html placeholders so they are
        # HTML-escaped and cannot be misread as format fields.
        return cls._html_404(format_html(
            (
                '<center><br/><br/><br/>'
                'Snapshot <a href="/archive/{timestamp}/index.html" target="_top"><b><code>[{timestamp}]</code></b></a> exists in DB, but resource <b><code>{timestamp}/'
                '{archivefile}'
                '</code></b> does not exist in <a href="/archive/{timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
                'Maybe this resource type is not available for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
                '<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {timestamp}</code></pre><br/><br/>'
                '<div style="text-align: left; width: 100%; max-width: 400px">'
                '<i><b>Next steps:</b></i><br/>'
                '- list all the <a href="/archive/{timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
                '- view the <a href="/archive/{timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
                '- go to the <a href="/admin/core/snapshot/{snapshot_id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
                '- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot_id}" target="_top">Snapshot actions</a> to re-archive<br/>'
                '- or return to <a href="/" target="_top">the main index...</a></div>'
                '</center>'
            ),
            timestamp=snapshot.timestamp,
            snapshot_id=snapshot.id,
            archivefile=archivefile,
        ))
2021-10-03 17:12:03 +00:00
2020-08-20 14:04:34 +00:00
2021-01-30 10:35:07 +00:00
class PublicIndexView(ListView):
    """Paginated list of Snapshots ordered by ``added`` descending, with optional ``?q=`` search.

    Anonymous users may only view it when PUBLIC_INDEX is enabled; otherwise
    they are redirected to the admin login page.
    """
    template_name = 'public_index.html'
    model = Snapshot
    paginate_by = SNAPSHOTS_PER_PAGE
    ordering = ['-added']

    def get_context_data(self, **kwargs):
        """Extend the default list context with the version and footer text."""
        return {
            **super().get_context_data(**kwargs),
            'VERSION': VERSION,
            'FOOTER_INFO': FOOTER_INFO,
        }

    def get_queryset(self, **kwargs):
        """Return all Snapshots, narrowed by the ``q`` GET parameter when it is non-blank.

        A Snapshot matches when the query appears (case-insensitively) in its
        title, url, timestamp or any tag name, or when the search backend
        returns it. Search backend failures are printed and otherwise ignored,
        so the plain database matches are still shown.
        """
        qs = super().get_queryset(**kwargs)
        query = (self.request.GET.get('q') or '').strip()
        if not query:
            return qs

        qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query))
        try:
            qs = qs | query_search_index(query)
        except Exception as err:
            # best-effort: a broken search backend must not take the index down
            print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')

        # Filtering across the tags relation can return the same Snapshot once
        # per matching tag; collapse those so rows and page counts are correct.
        return qs.distinct()

    def get(self, *args, **kwargs):
        """Render the list if the viewer is allowed to see it, else redirect to login."""
        if PUBLIC_INDEX or self.request.user.is_authenticated:
            response = super().get(*args, **kwargs)
            return response
        else:
            return redirect(f'/admin/login/?next={self.request.path}')
2021-07-02 00:55:51 +00:00
@method_decorator(csrf_exempt, name='dispatch')
class AddView(UserPassesTestMixin, FormView):
    """Form page for submitting URLs to archive; shows the captured output of the add run.

    CSRF checks are disabled for the whole view via the decorator above.
    Access is controlled by ``test_func``.
    """
    template_name = "add.html"
    form_class = AddLinkForm

    def get_initial(self):
        """Prefill the AddLinkForm with the 'url' GET parameter"""
        prefill = self.request.GET.get('url', None) if self.request.method == 'GET' else None
        if not prefill:
            return super().get_initial()

        # scheme-less values are assumed to be https
        if '://' not in prefill:
            prefill = f'https://{prefill}'
        return {'url': prefill}

    def test_func(self):
        """Allow access when PUBLIC_ADD_VIEW is enabled or the user is logged in."""
        return PUBLIC_ADD_VIEW or self.request.user.is_authenticated

    def get_context_data(self, **kwargs):
        """Extend the default form context with page metadata and an empty ``stdout``."""
        context = dict(super().get_context_data(**kwargs))
        context.update({
            'title': "Add URLs",
            # We can't just call request.build_absolute_uri in the template, because it would include query parameters
            'absolute_add_path': self.request.build_absolute_uri(self.request.path),
            'VERSION': VERSION,
            'FOOTER_INFO': FOOTER_INFO,
            'stdout': '',
        })
        return context

    def form_valid(self, form):
        """Run ``add`` for the submitted form and re-render the page with its output.

        Everything ``add`` prints is captured, echoed to the real stdout, and
        passed to the template as HTML alongside a fresh, empty form.
        """
        cleaned = form.cleaned_data
        url = cleaned["url"]
        print(f'[+] Adding URL: {url}')

        add_kwargs = {
            "urls": url,
            "tag": cleaned["tag"],
            # anything other than the literal "0" means depth 1
            "depth": 1 if cleaned["depth"] != "0" else 0,
            "parser": cleaned["parser"],
            "update_all": False,
            "out_dir": OUTPUT_DIR,
        }
        extractors = ','.join(cleaned["archive_methods"])
        if extractors:
            add_kwargs["extractors"] = extractors

        captured = StringIO()
        with redirect_stdout(captured):
            add(**add_kwargs)
        output = captured.getvalue()
        print(output)

        context = self.get_context_data()
        context["stdout"] = ansi_to_html(output.strip())
        context["form"] = AddLinkForm()

        return render(request=self.request, template_name=self.template_name, context=context)
2021-10-03 17:12:03 +00:00
class HealthCheckView(View):
    """
    Minimal liveness endpoint: answers with plain-text "OK" for service discovery tools
    """

    def get(self, request):
        """
        Respond to a GET request with a 200 plain-text "OK"
        """
        ok_response = HttpResponse('OK', status=200, content_type='text/plain')
        return ok_response