2019-05-01 03:13:21 +00:00
__package__ = 'archivebox.core'

from typing import Callable, Optional
from io import StringIO
from pathlib import Path
from contextlib import redirect_stdout

from django.shortcuts import render, redirect
from django.http import HttpRequest, HttpResponse, Http404
from django.utils.html import format_html, mark_safe
from django.views import View, static
from django.views.generic.list import ListView
from django.views.generic import FormView
from django.db.models import Q
from django.contrib.auth.mixins import UserPassesTestMixin
from django.views.decorators.csrf import csrf_exempt
from django.utils.decorators import method_decorator

from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink

from core.models import Snapshot
from core.forms import AddLinkForm

from ..config import (
    OUTPUT_DIR,
    PUBLIC_INDEX,
    PUBLIC_SNAPSHOTS,
    PUBLIC_ADD_VIEW,
    VERSION,
    COMMIT_HASH,
    FOOTER_INFO,
    SNAPSHOTS_PER_PAGE,
    CONFIG,
    CONFIG_SCHEMA,
    DYNAMIC_CONFIG_SCHEMA,
    USER_CONFIG,
    SAVE_ARCHIVE_DOT_ORG,
    PREVIEW_ORIGINALS,
)
from ..logging_util import printable_filesize
from ..main import add
from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str
from ..search import query_search_index
from ..extractors.wget import wget_output_path
2020-07-02 20:54:25 +00:00
2019-04-17 09:42:21 +00:00
2021-01-30 10:35:07 +00:00
class HomepageView(View):
    """Root URL handler: route each visitor to the most relevant index for them."""

    def get(self, request):
        # logged-in users go straight to the admin snapshot list
        if request.user.is_authenticated:
            return redirect('/admin/core/snapshot/')

        # anonymous visitors get the public index when it's enabled
        if PUBLIC_INDEX:
            return redirect('/public')

        # otherwise require login, returning here afterwards
        return redirect(f'/admin/login/?next={request.path}')
2019-04-22 23:08:01 +00:00
2021-01-30 10:35:07 +00:00
class SnapshotView(View):
    """Serve a Snapshot's archived content from /archive/<timestamp-or-url>/...

    Renders a live index template for the snapshot index page, serves static
    archive files for everything else, and produces helpful 404 pages when the
    slug is ambiguous or no files exist yet.
    """

    @staticmethod
    def render_live_index(request, snapshot):
        """Render the snapshot index page live from the DB + filesystem
        (instead of the static archive/<timestamp>/index.html)."""
        TITLE_LOADING_MSG = 'Not yet archived...'
        # extractors whose outputs are not useful to embed/preview directly
        HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
        archiveresults = {}

        results = snapshot.archiveresult_set.all()

        # collect one entry per successful, embeddable extractor result
        for result in results:
            embed_path = result.embed_path()
            abs_path = result.snapshot_dir / (embed_path or 'None')

            if (result.status == 'succeeded'
                and (result.extractor not in HIDDEN_RESULTS)
                and embed_path
                and abs_path.exists()):
                # skip directories that contain no actual output files
                if abs_path.is_dir() and not any(abs_path.glob('*.*')):
                    continue

                result_info = {
                    'name': result.extractor,
                    'path': embed_path,
                    'ts': ts_to_date_str(result.end_ts),
                    'size': abs_path.stat().st_size or '?',
                }
                archiveresults[result.extractor] = result_info

        existing_files = {result['path'] for result in archiveresults.values()}
        min_size_threshold = 10_000  # bytes
        allowed_extensions = {
            'txt',
            'html',
            'htm',
            'png',
            'jpg',
            'jpeg',
            'gif',
            'webp',  # BUGFIX: missing comma here used to concatenate to 'webpsvg', excluding both webp and svg
            'svg',
            'webm',
            'mp4',
            'mp3',
            'opus',
            'pdf',
            'md',
        }

        # iterate through all the files in the snapshot dir and add the biggest ones to the result list
        snap_dir = Path(snapshot.link_dir)
        for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
            extension = result_file.suffix.lstrip('.').lower()
            if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
                continue
            # skip files already covered by an extractor result above
            if result_file.name in existing_files or result_file.name == 'index.html':
                continue

            file_size = result_file.stat().st_size or 0

            if file_size > min_size_threshold:
                archiveresults[result_file.name] = {
                    'name': result_file.stem,
                    'path': result_file.relative_to(snap_dir),
                    'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
                    'size': file_size,
                }

        # order results so the most useful preview formats appear first
        preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury')
        all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)

        best_result = {'path': 'None'}
        for result_type in preferred_types:
            if result_type in archiveresults:
                best_result = archiveresults[result_type]
                break

        link = snapshot.as_link()

        link_info = link._asdict(extended=True)

        try:
            warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
        except IndexError:
            warc_path = 'warc/'

        context = {
            **link_info,
            **link_info['canonical'],
            'title': htmlencode(
                link.title
                or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
            ),
            'extension': link.extension or 'html',
            'tags': link.tags or 'untagged',
            'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
            'status': 'archived' if link.is_archived else 'not yet archived',
            'status_color': 'success' if link.is_archived else 'danger',
            'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
            'warc_path': warc_path,
            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
            'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
            'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
            'best_result': best_result,
        }
        return render(template_name='core/snapshot_live.html', request=request, context=context)

    def get(self, request, path):
        """Resolve *path* (timestamp slug, partial ID, or URL) to a Snapshot and serve it.

        Returns a redirect, a served static file, a live-rendered index, or a
        descriptive 404 page when the slug is missing or ambiguous.
        """
        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        snapshot = None

        try:
            slug, archivefile = path.split('/', 1)
        except (IndexError, ValueError):
            slug, archivefile = path.split('/', 1)[0], 'index.html'

        # slug is a timestamp
        if slug.replace('.', '').isdigit():

            # missing trailing slash -> redirect to index
            if '/' not in path:
                return redirect(f'{path}/index.html')

            try:
                try:
                    snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
                    if archivefile == 'index.html':
                        # if they requested snapshot index, serve live rendered template instead of static html
                        response = self.render_live_index(request, snapshot)
                    else:
                        response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
                    response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                    return response
                except Snapshot.DoesNotExist:
                    # a prefix match on timestamp means the slug was ambiguous, not missing
                    if Snapshot.objects.filter(timestamp__startswith=slug).exists():
                        raise Snapshot.MultipleObjectsReturned
                    else:
                        raise
            except Snapshot.DoesNotExist:
                # Snapshot does not exist
                return HttpResponse(
                    format_html(
                        (
                            '<center><br/><br/><br/>'
                            'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
                            'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
                            '</center>'
                        ),
                        slug,
                        path,
                    ),
                    content_type="text/html",
                    status=404,
                )
            except Snapshot.MultipleObjectsReturned:
                # several snapshots share the slug prefix: list them all so the user can pick one
                snapshot_hrefs = mark_safe('<br/>').join(
                    format_html(
                        '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                        snap.added.strftime('%Y-%m-%d %H:%M:%S'),
                        snap.timestamp,
                        snap.timestamp,
                        snap.url,
                        snap.title_stripped[:64] or '',
                    )
                    for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added')
                )
                return HttpResponse(
                    format_html(
                        (
                            'Multiple Snapshots match the given timestamp/UUID <code>{}</code><br/><pre>'
                        ),
                        slug,
                    ) + snapshot_hrefs + format_html(
                        (
                            '</pre><br/>'
                            'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
                        )
                    ),
                    content_type="text/html",
                    status=404,
                )
            except Http404:
                assert snapshot     # (Snapshot.DoesNotExist is already handled above)

                # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
                return HttpResponse(
                    format_html(
                        (
                            '<center><br/><br/><br/>'
                            f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
                            f'was queued on {str(snapshot.added).split(".")[0]}, '
                            f'but no files have been saved yet in:<br/><b><a href="/archive/{snapshot.timestamp}/" target="_top"><code>{snapshot.timestamp}</code></a><code>/'
                            '{}'
                            f'</code></b><br/><br/>'
                            'It\'s possible {} '
                            f'during the last capture on {str(snapshot.added).split(".")[0]},<br/>or that the archiving process has not completed yet.<br/>'
                            f'<pre><code># run this cmd to finish/retry archiving this Snapshot</code><br/>'
                            f'<code style="user-select: all; color: #333">archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
                            '<div class="text-align: left; width: 100%; max-width: 400px">'
                            '<i><b>Next steps:</i></b><br/>'
                            f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
                            f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
                            f'- go to the <a href="/admin/core/snapshot/{snapshot.pk}/change/" target="_top">Snapshot admin</a> to edit<br/>'
                            f'- go to the <a href="/admin/core/snapshot/?uuid__startswith={snapshot.uuid}" target="_top">Snapshot actions</a> to re-archive<br/>'
                            '- or return to <a href="/" target="_top">the main index...</a></div>'
                            '</center>'
                        ),
                        archivefile if str(archivefile) != 'None' else '',
                        f'the {archivefile} resource could not be fetched' if str(archivefile) != 'None' else 'the original site was not available',
                    ),
                    content_type="text/html",
                    status=404,
                )

        # slug is a URL
        try:
            try:
                # try exact match on full url / ABID first
                snapshot = Snapshot.objects.get(
                    Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
                    | Q(abid__icontains=path) | Q(id__icontains=path) | Q(old_id__icontains=path)
                )
            except Snapshot.DoesNotExist:
                # fall back to match on exact base_url
                try:
                    snapshot = Snapshot.objects.get(
                        Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
                    )
                except Snapshot.DoesNotExist:
                    # fall back to matching base_url as prefix
                    snapshot = Snapshot.objects.get(
                        Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
                    )
            return redirect(f'/archive/{snapshot.timestamp}/index.html')
        except Snapshot.DoesNotExist:
            return HttpResponse(
                format_html(
                    (
                        '<center><br/><br/><br/>'
                        'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
                        'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
                        '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
                        '</center>'
                    ),
                    base_url(path),
                    path if '://' in path else f'https://{path}',
                    path,
                ),
                content_type="text/html",
                status=404,
            )
        except Snapshot.MultipleObjectsReturned:
            # several snapshots match the URL prefix: list them all so the user can pick one
            snapshot_hrefs = mark_safe('<br/>').join(
                format_html(
                    '{} <code style="font-size: 0.8em">{}</code> <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                    snap.added.strftime('%Y-%m-%d %H:%M:%S'),
                    snap.abid,
                    snap.timestamp,
                    snap.timestamp,
                    snap.url,
                    snap.title_stripped[:64] or '',
                )
                for snap in Snapshot.objects.filter(
                    Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
                    | Q(abid__icontains=path) | Q(id__icontains=path) | Q(old_id__icontains=path)
                ).only('url', 'timestamp', 'title', 'added').order_by('-added')
            )
            return HttpResponse(
                format_html(
                    (
                        'Multiple Snapshots match the given URL <code>{}</code><br/><pre>'
                    ),
                    base_url(path),
                ) + snapshot_hrefs + format_html(
                    (
                        '</pre><br/>'
                        'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
                    )
                ),
                content_type="text/html",
                status=404,
            )
2021-10-03 17:12:03 +00:00
2020-08-20 14:04:34 +00:00
2021-01-30 10:35:07 +00:00
class PublicIndexView(ListView):
    """Paginated public listing of all Snapshots (the /public index), with search."""

    template_name = 'public_index.html'
    model = Snapshot
    paginate_by = SNAPSHOTS_PER_PAGE
    ordering = ['-added']

    def get_context_data(self, **kwargs):
        # extend the default ListView context with version/footer info for the template
        context = super().get_context_data(**kwargs)
        context.update({
            'VERSION': VERSION,
            'COMMIT_HASH': COMMIT_HASH,
            'FOOTER_INFO': FOOTER_INFO,
        })
        return context

    def get_queryset(self, **kwargs):
        qs = super().get_queryset(**kwargs)
        query = self.request.GET.get('q', default='').strip()
        if not query:
            return qs.distinct()

        query_type = self.request.GET.get('query_type')

        def filter_meta(queryset):
            # substring match across title, url, timestamp, and tag names
            return queryset.filter(
                Q(title__icontains=query)
                | Q(url__icontains=query)
                | Q(timestamp__icontains=query)
                | Q(tags__name__icontains=query)
            )

        def add_fulltext(queryset):
            # merge in matches from the configured full-text search backend (best-effort)
            try:
                return queryset | query_search_index(query)
            except Exception as err:
                print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
                return queryset

        if not query_type or query_type == 'all':
            qs = add_fulltext(filter_meta(qs))
        elif query_type == 'fulltext':
            qs = add_fulltext(qs)
        elif query_type == 'meta':
            qs = filter_meta(qs)
        elif query_type == 'url':
            qs = qs.filter(Q(url__icontains=query))
        elif query_type == 'title':
            qs = qs.filter(Q(title__icontains=query))
        elif query_type == 'timestamp':
            qs = qs.filter(Q(timestamp__icontains=query))
        elif query_type == 'tags':
            qs = qs.filter(Q(tags__name__icontains=query))
        else:
            print(f'[!] Unknown value for query_type: "{query_type}"')

        return qs.distinct()

    def get(self, *args, **kwargs):
        # the public index is only visible anonymously when PUBLIC_INDEX is enabled
        if not (PUBLIC_INDEX or self.request.user.is_authenticated):
            return redirect(f'/admin/login/?next={self.request.path}')
        return super().get(*args, **kwargs)
2021-07-02 00:55:51 +00:00
@method_decorator(csrf_exempt, name='dispatch')
class AddView(UserPassesTestMixin, FormView):
    """Form view for adding new URLs to the archive (the /add page)."""

    template_name = "add.html"
    form_class = AddLinkForm

    def get_initial(self):
        """Prefill the AddLinkForm with the 'url' GET parameter"""
        if self.request.method == 'GET':
            url = self.request.GET.get('url', None)
            if url:
                # default to https:// when no scheme was provided
                return {'url': url if '://' in url else f'https://{url}'}
        return super().get_initial()

    def test_func(self):
        # anonymous users may only submit URLs when PUBLIC_ADD_VIEW is enabled
        return PUBLIC_ADD_VIEW or self.request.user.is_authenticated

    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        context.update({
            'title': "Add URLs",
            # We can't just call request.build_absolute_uri in the template, because it would include query parameters
            'absolute_add_path': self.request.build_absolute_uri(self.request.path),
            'VERSION': VERSION,
            'FOOTER_INFO': FOOTER_INFO,
            'stdout': '',
        })
        return context

    def form_valid(self, form):
        """Run the CLI `add` command with the submitted form values, capturing its output."""
        url = form.cleaned_data["url"]
        print(f'[+] Adding URL: {url}')

        add_kwargs = {
            "urls": url,
            "tag": form.cleaned_data["tag"],
            "depth": 0 if form.cleaned_data["depth"] == "0" else 1,
            "parser": form.cleaned_data["parser"],
            "update_all": False,
            "out_dir": OUTPUT_DIR,
        }
        extractors = ','.join(form.cleaned_data["archive_methods"])
        if extractors:
            add_kwargs["extractors"] = extractors

        # capture the CLI output so it can be echoed back into the page
        captured = StringIO()
        with redirect_stdout(captured):
            add(**add_kwargs)
        print(captured.getvalue())

        context = self.get_context_data()
        context.update({
            "stdout": ansi_to_html(captured.getvalue().strip()),
            "form": AddLinkForm(),
        })
        return render(template_name=self.template_name, request=self.request, context=context)
2021-10-03 17:12:03 +00:00
class HealthCheckView(View):
    """
    A Django view that renders plain text "OK" for service discovery tools
    """

    def get(self, request):
        """
        Handle a GET request
        """
        return HttpResponse('OK', content_type='text/plain', status=200)
2024-05-06 18:06:42 +00:00
def find_config_section(key: str) -> str:
    """Return the first CONFIG_SCHEMA section that defines *key*, or 'DYNAMIC' if none do."""
    for name, opts in CONFIG_SCHEMA.items():
        if key in opts:
            return name
    return 'DYNAMIC'
def find_config_default(key: str) -> Optional[str]:
    """Return the repr() of the static default value for config *key*.

    Returns None when the default is computed at runtime (a callable) or the
    key is unknown — the missing-key sentinel is a lambda so both cases fall
    through the same Callable check.
    """
    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
    if isinstance(default_val, Callable):
        return None
    return repr(default_val)
def find_config_type(key: str) -> str:
    """Best-effort name of the type of a config value ('str' when unknown)."""
    if key in USER_CONFIG:
        # user-configurable keys declare their type in the schema
        return USER_CONFIG[key]['type'].__name__
    if key in DYNAMIC_CONFIG_SCHEMA:
        # dynamic keys have no declared type; inspect the computed value
        return type(CONFIG[key]).__name__
    return 'str'
def key_is_safe(key: str) -> bool:
    """Return False if *key* looks like it holds a secret that must be redacted in the UI."""
    # hoist the lowercasing out of the membership tests; any() replaces the manual loop
    key_lower = key.lower()
    return not any(term in key_lower for term in ('key', 'password', 'secret', 'token'))
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
    """Admin-only table listing every computed configuration value."""

    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'

    rows = {
        "Section": [],
        "Key": [],
        "Type": [],
        "Value": [],
        "Default": [],
        # "Documentation": [],
        "Aliases": [],
    }

    def append_row(section, key, aliases):
        # one table row per config key; secret values are redacted
        rows['Section'].append(section)    # section.replace('_', ' ').title().replace(' Config', '')
        rows['Key'].append(ItemLink(key, key=key))
        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
        # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
        rows['Aliases'].append(aliases)

    # statically-declared config keys, grouped by schema section
    for section in CONFIG_SCHEMA.keys():
        for key in CONFIG_SCHEMA[section].keys():
            append_row(section, key, ', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))

    # runtime-computed config keys
    for key in DYNAMIC_CONFIG_SCHEMA.keys():
        append_row('DYNAMIC', key, ItemLink(key, key=key) if key in USER_CONFIG else '')

    return TableContext(
        title="Computed Configuration Values",
        table=rows,
    )
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    """Admin-only detail page for a single configuration key: value, docs, and edit instructions."""

    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'

    aliases = USER_CONFIG.get(key, {}).get("aliases", [])

    return ItemContext(
        slug=key,
        title=key,
        data=[
            {
                "name": mark_safe(f'data / ArchiveBox.conf [{find_config_section(key)}] <b><code style="color: lightgray">{key}</code></b>' if key in USER_CONFIG else f'[DYNAMIC CONFIG] <b><code style="color: lightgray">{key}</code></b> <small>(calculated at runtime)</small>'),
                "description": None,
                "fields": {
                    'Key': key,
                    'Type': find_config_type(key),
                    # secrets are never shown in the web UI
                    'Value': CONFIG[key] if key_is_safe(key) else '********',
                },
                "help_texts": {
                    'Key': mark_safe(f'''
                        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a> &nbsp;
                        <span style="display: {"inline" if aliases else "none"}">
                            Aliases: {", ".join(aliases)}
                        </span>
                    '''),
                    'Type': mark_safe(f'''
                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
                            See full definition in <code>archivebox/config.py</code>...
                        </a>
                    '''),
                    # BUGFIX: user-facing text read 'See 1here...' (typo), now matches the list view's 'See here...'
                    'Value': mark_safe(f'''
                        {'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
                        Default: <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
                            <code>{find_config_default(key) or 'See here...'}</code>
                        </a>
                        <br/><br/>
                        <p style="display: {"block" if key in USER_CONFIG else "none"}">
                            <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
                            <br/><br/>
                            <code>archivebox config --set {key}="{
                                val.strip("'")
                                if (val := find_config_default(key)) else
                                (repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
                            }"</code>
                        </p>
                    '''),
                },
            },
        ],
    )