Merge branch 'dev' into link-removal2

This commit is contained in:
Nick Sweeting 2021-02-01 02:46:57 -05:00
commit 3eaf580fc0
16 changed files with 226 additions and 129 deletions

View file

@ -73,7 +73,7 @@ archivebox help
- `archivebox add/remove/update/list` to manage Snapshots in the archive - `archivebox add/remove/update/list` to manage Snapshots in the archive
- `archivebox schedule` to pull in fresh URLs regularly from [bookmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats) - `archivebox schedule` to pull in fresh URLs regularly from [bookmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats)
- `archivebox oneshot` archive single URLs without starting a whole collection - `archivebox oneshot` archive single URLs without starting a whole collection
- `archivebox shell` open a REPL to use the [Python API](https://docs.archivebox.io/en/latest/modules.html) (alpha) - `archivebox shell/manage dbshell` open a REPL to use the [Python API](https://docs.archivebox.io/en/latest/modules.html) (alpha), or SQL API
<div align="center"> <div align="center">
<br/> <br/>
@ -639,7 +639,7 @@ archivebox config --set DEBUG=True
archivebox server --debug ... archivebox server --debug ...
``` ```
### Build and run a Github branch #### Build and run a Github branch
```bash ```bash
docker build -t archivebox:dev https://github.com/ArchiveBox/ArchiveBox.git#dev docker build -t archivebox:dev https://github.com/ArchiveBox/ArchiveBox.git#dev
@ -669,6 +669,7 @@ cd archivebox/
cd path/to/test/data/ cd path/to/test/data/
archivebox shell archivebox shell
archivebox manage dbshell
``` ```
(uses `pytest -s`) (uses `pytest -s`)

View file

@ -27,6 +27,7 @@ import re
import sys import sys
import json import json
import getpass import getpass
import platform
import shutil import shutil
import django import django
@ -51,7 +52,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SHELL_CONFIG': { 'SHELL_CONFIG': {
'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()}, 'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']}, 'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']}, 'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now
'IN_DOCKER': {'type': bool, 'default': False}, 'IN_DOCKER': {'type': bool, 'default': False},
# TODO: 'SHOW_HINTS': {'type: bool, 'default': True}, # TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
}, },
@ -914,7 +915,12 @@ os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8)) # noqa: F821
NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin')) NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
sys.path.append(NODE_BIN_PATH) sys.path.append(NODE_BIN_PATH)
# disable stderr "you really shouldn't disable ssl" warnings with library config
if not CONFIG['CHECK_SSL_VALIDITY']:
import urllib3
import requests
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
########################### Config Validity Checkers ########################### ########################### Config Validity Checkers ###########################

View file

@ -99,13 +99,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'url_str', 'files', 'size') list_display = ('added', 'title_str', 'url_str', 'files', 'size')
sort_fields = ('title_str', 'url_str', 'added') sort_fields = ('title_str', 'url_str', 'added')
readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated') readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
search_fields = ['url', 'timestamp', 'title', 'tags__name'] search_fields = ['url__icontains', 'timestamp', 'title', 'tags__name']
fields = (*readonly_fields, 'title', 'tags') fields = (*readonly_fields, 'title', 'tags')
list_filter = ('added', 'updated', 'tags') list_filter = ('added', 'updated', 'tags')
ordering = ['-added'] ordering = ['-added']
actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots] actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
actions_template = 'admin/actions_as_select.html' actions_template = 'admin/actions_as_select.html'
form = SnapshotAdminForm form = SnapshotAdminForm
list_per_page = 40
def get_urls(self): def get_urls(self):
urls = super().get_urls() urls = super().get_urls()

View file

@ -33,6 +33,8 @@ LOGOUT_REDIRECT_URL = '/'
PASSWORD_RESET_URL = '/accounts/password_reset/' PASSWORD_RESET_URL = '/accounts/password_reset/'
APPEND_SLASH = True APPEND_SLASH = True
DEBUG = DEBUG or ('--debug' in sys.argv)
INSTALLED_APPS = [ INSTALLED_APPS = [
'django.contrib.auth', 'django.contrib.auth',
'django.contrib.contenttypes', 'django.contrib.contenttypes',

View file

@ -47,14 +47,13 @@ def save_favicon(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIM
*([] if CHECK_SSL_VALIDITY else ['--insecure']), *([] if CHECK_SSL_VALIDITY else ['--insecure']),
'https://www.google.com/s2/favicons?domain={}'.format(domain(snapshot.url)), 'https://www.google.com/s2/favicons?domain={}'.format(domain(snapshot.url)),
] ]
status = 'pending' status = 'failed'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
run(cmd, cwd=str(out_dir), timeout=timeout) run(cmd, cwd=str(out_dir), timeout=timeout)
chmod_file(output, cwd=str(out_dir)) chmod_file(output, cwd=str(out_dir))
status = 'succeeded' status = 'succeeded'
except Exception as err: except Exception as err:
status = 'failed'
output = err output = err
finally: finally:
timer.end() timer.end()

View file

@ -42,7 +42,7 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=
"""download full site using single-file""" """download full site using single-file"""
out_dir = out_dir or Path(snapshot.snapshot_dir) out_dir = out_dir or Path(snapshot.snapshot_dir)
output = str(out_dir.absolute() / "singlefile.html") output = "singlefile.html"
browser_args = chrome_args(TIMEOUT=0) browser_args = chrome_args(TIMEOUT=0)
@ -54,6 +54,7 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=
browser_args, browser_args,
snapshot.url, snapshot.url,
output output
output,
] ]
status = 'succeeded' status = 'succeeded'
@ -74,9 +75,9 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=
) )
# Check for common failure cases # Check for common failure cases
if (result.returncode > 0): if (result.returncode > 0) or not (out_dir / output).is_file():
raise ArchiveError('SingleFile was not able to archive the page', hints) raise ArchiveError('SingleFile was not able to archive the page', hints)
chmod_file(output) chmod_file(output, cwd=str(out_dir))
except (Exception, OSError) as err: except (Exception, OSError) as err:
status = 'failed' status = 'failed'
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes). # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).

View file

@ -10,7 +10,6 @@ from django.db.models import Model
from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
from ..util import ( from ..util import (
enforce_types, enforce_types,
is_static_file,
download_url, download_url,
htmldecode, htmldecode,
) )
@ -65,11 +64,8 @@ class TitleParser(HTMLParser):
# output = '{title}' # output = '{title}'
@enforce_types @enforce_types
def should_save_title(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[str]=None) -> bool: def should_save_title(snapshot: Model, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(snapshot.url): # if link already has valid title, skip it
False
# if snapshot already has valid title, skip it
if not overwrite and snapshot.title and not snapshot.title.lower().startswith('http'): if not overwrite and snapshot.title and not snapshot.title.lower().startswith('http'):
return False return False
@ -118,7 +114,11 @@ def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEO
.update(title=output) .update(title=output)
snapshot.title = output snapshot.title = output
else: else:
raise ArchiveError('Unable to detect page title') # if no content was returned, dont save a title (because it might be a temporary error)
if not html:
raise ArchiveError('Unable to detect page title')
# output = html[:128] # use first bit of content as the title
output = link.base_url # use the filename as the title (better UX)
except Exception as err: except Exception as err:
status = 'failed' status = 'failed'
output = err output = err

View file

@ -12,8 +12,6 @@ from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file from ..system import run, chmod_file
from ..util import ( from ..util import (
enforce_types, enforce_types,
is_static_file,
without_scheme,
without_fragment, without_fragment,
without_query, without_query,
path, path,
@ -107,7 +105,12 @@ def save_wget(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOU
if b'ERROR 500: Internal Server Error' in result.stderr: if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints) raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Wget failed or got an error from the server', hints) raise ArchiveError('Wget failed or got an error from the server', hints)
chmod_file(output, cwd=str(out_dir))
if (out_dir / output).exists():
chmod_file(output, cwd=str(out_dir))
else:
print(f' {out_dir}/{output}')
raise ArchiveError('Failed to find wget output after running', hints)
except Exception as err: except Exception as err:
status = 'failed' status = 'failed'
output = err output = err
@ -131,8 +134,6 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
See docs on wget --adjust-extension (-E) See docs on wget --adjust-extension (-E)
""" """
if is_static_file(snapshot.url):
return without_scheme(without_fragment(snapshot.url))
# Wget downloads can save in a number of different ways depending on the url: # Wget downloads can save in a number of different ways depending on the url:
# https://example.com # https://example.com
@ -184,7 +185,7 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1]) last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
for file_present in search_dir.iterdir(): for file_present in search_dir.iterdir():
if file_present == last_part_of_url: if file_present == last_part_of_url:
return str(search_dir / file_present) return str((search_dir / file_present).relative_to(snapshot.snapshot_dir))
# Move up one directory level # Move up one directory level
search_dir = search_dir.parent search_dir = search_dir.parent
@ -192,10 +193,15 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
if search_dir == snapshot.snapshot_dir: if search_dir == snapshot.snapshot_dir:
break break
# check for literally any file present that isnt an empty folder
domain_dir = Path(domain(snapshot.url).replace(":", "+"))
files_within = list((Path(snapshot.snapshot_dir) / domain_dir).glob('**/*.*'))
if files_within:
return str((domain_dir / files_within[-1]).relative_to(snapshot.snapshot_dir))
search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+") / urldecode(full_path) # fallback to just the domain dir, dont try to introspect further inside it
if not search_dir.is_dir(): search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+")
return str(search_dir.relative_to(snapshot.snapshot_dir)) if search_dir.is_dir():
return domain(snapshot.url).replace(":", "+")
return None return None

View file

@ -2,7 +2,6 @@ __package__ = 'archivebox.index'
import os import os
import shutil import shutil
import json as pyjson
from pathlib import Path from pathlib import Path
from itertools import chain from itertools import chain
@ -42,6 +41,7 @@ from .html import (
write_html_snapshot_details, write_html_snapshot_details,
) )
from .json import ( from .json import (
pyjson,
load_json_snapshot, load_json_snapshot,
write_json_snapshot_details, write_json_snapshot_details,
) )
@ -320,7 +320,7 @@ def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model
""" """
out_dir = out_dir or Path(snapshot.snapshot_dir) out_dir = out_dir or Path(snapshot.snapshot_dir)
existing_snapshot = load_json_snapshot_details(Path(out_dir)) existing_snapshot = load_json_snapshot(Path(out_dir))
if existing_snapshot: if existing_snapshot:
return merge_snapshots(existing_snapshot, snapshot) return merge_snapshots(existing_snapshot, snapshot)

View file

@ -24,6 +24,7 @@ from ..config import (
GIT_SHA, GIT_SHA,
FOOTER_INFO, FOOTER_INFO,
HTML_INDEX_FILENAME, HTML_INDEX_FILENAME,
SAVE_ARCHIVE_DOT_ORG,
) )
MAIN_INDEX_TEMPLATE = 'static_index.html' MAIN_INDEX_TEMPLATE = 'static_index.html'
@ -97,11 +98,12 @@ def snapshot_details_template(snapshot: Model) -> str:
or (snapshot.domain if snapshot.is_archived else '') or (snapshot.domain if snapshot.is_archived else '')
) or 'about:blank', ) or 'about:blank',
'extension': snapshot.extension or 'html', 'extension': snapshot.extension or 'html',
'tags': snapshot.tags_str() or "untagged", 'tags': snapshot.tags_str() or 'untagged',
'size': printable_filesize(snapshot.archive_size) if snapshot.archive_size else 'pending', 'size': printable_filesize(snapshot.archive_size) if snapshot.archive_size else 'pending',
'status': 'archived' if snapshot.is_archived else 'not yet archived', 'status': 'archived' if snapshot.is_archived else 'not yet archived',
'status_color': 'success' if snapshot.is_archived else 'danger', 'status_color': 'success' if snapshot.is_archived else 'danger',
'oldest_archive_date': ts_to_date(snapshot.oldest_archive_date), 'oldest_archive_date': ts_to_date(snapshot.oldest_archive_date),
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
}) })
@enforce_types @enforce_types
@ -115,6 +117,8 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
def snapshot_icons(snapshot) -> str: def snapshot_icons(snapshot) -> str:
from core.models import EXTRACTORS from core.models import EXTRACTORS
# start = datetime.now()
archive_results = snapshot.archiveresult_set.filter(status="succeeded") archive_results = snapshot.archiveresult_set.filter(status="succeeded")
path = snapshot.archive_path path = snapshot.archive_path
canon = snapshot.canonical_outputs() canon = snapshot.canonical_outputs()
@ -136,33 +140,45 @@ def snapshot_icons(snapshot) -> str:
exclude = ["favicon", "title", "headers", "archive_org"] exclude = ["favicon", "title", "headers", "archive_org"]
# Missing specific entry for WARC # Missing specific entry for WARC
extractor_items = defaultdict(lambda: None) extractor_outputs = defaultdict(lambda: None)
for extractor, _ in EXTRACTORS: for extractor, _ in EXTRACTORS:
for result in archive_results: for result in archive_results:
if result.extractor == extractor: if result.extractor == extractor and result:
extractor_items[extractor] = result extractor_outputs[extractor] = result
for extractor, _ in EXTRACTORS: for extractor, _ in EXTRACTORS:
if extractor not in exclude: if extractor not in exclude:
exists = False existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
if extractor_items[extractor] is not None: # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
outpath = (Path(path) / canon[f"{extractor}_path"]) # if existing:
if outpath.is_dir(): # existing = (Path(path) / existing)
exists = any(outpath.glob('*.*')) # if existing.is_file():
elif outpath.is_file(): # existing = True
exists = outpath.stat().st_size > 100 # elif existing.is_dir():
output += format_html(output_template, path, canon[f"{extractor}_path"], str(exists), # existing = any(existing.glob('*.*'))
output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
extractor, icons.get(extractor, "?")) extractor, icons.get(extractor, "?"))
if extractor == "wget": if extractor == "wget":
# warc isn't technically its own extractor, so we have to add it after wget # warc isn't technically its own extractor, so we have to add it after wget
exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
output += format_html(output_template, exists[0] if exists else '#', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) # get from db (faster but less truthful)
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# get from filesystem (slower but more accurate)
# exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
if extractor == "archive_org": if extractor == "archive_org":
# The check for archive_org is different, so it has to be handled separately # The check for archive_org is different, so it has to be handled separately
target_path = Path(path) / "archive.org.txt"
exists = target_path.exists() # get from db (faster)
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# get from filesystem (slower)
# target_path = Path(path) / "archive.org.txt"
# exists = target_path.exists()
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists), output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
"archive_org", icons.get("archive_org", "?")) "archive_org", icons.get("archive_org", "?"))
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output)) result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
# end = datetime.now()
# print(((end - start).total_seconds()*1000) // 1, 'ms')
return result

View file

@ -413,6 +413,8 @@ class Link:
"""predict the expected output paths that should be present after archiving""" """predict the expected output paths that should be present after archiving"""
from ..extractors.wget import wget_output_path from ..extractors.wget import wget_output_path
# TODO: banish this awful duplication from the codebase and import these
# from their respective extractor files
canonical = { canonical = {
'index_path': 'index.html', 'index_path': 'index.html',
'favicon_path': 'favicon.ico', 'favicon_path': 'favicon.ico',
@ -428,6 +430,7 @@ class Link:
'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url), 'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
'git_path': 'git/', 'git_path': 'git/',
'media_path': 'media/', 'media_path': 'media/',
'headers_path': 'headers.json',
} }
if self.is_static: if self.is_static:
# static binary files like PDF and images are handled slightly differently. # static binary files like PDF and images are handled slightly differently.

View file

@ -54,9 +54,9 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Model]:
# look inside the URL for any sub-urls, e.g. for archive.org links # look inside the URL for any sub-urls, e.g. for archive.org links
# https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
# -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
for url in re.findall(URL_REGEX, line[1:]): for sub_url in re.findall(URL_REGEX, line[1:]):
yield Snapshot( yield Snapshot(
url=htmldecode(url), url=htmldecode(sub_url),
timestamp=str(datetime.now().timestamp()), timestamp=str(datetime.now().timestamp()),
title=None, title=None,
#tags=None, #tags=None,

View file

@ -47,7 +47,7 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
try: try:
tags = str_between(get_row('category'), 'label="', '" />') tags = str_between(get_row('category'), 'label="', '" />')
except: except Exception:
tags = None tags = None
yield Snapshot( yield Snapshot(

View file

@ -34,10 +34,11 @@ def get_indexable_content(results: QuerySet):
return [] return []
# This should come from a plugin interface # This should come from a plugin interface
# TODO: banish this duplication and get these from the extractor file
if method == 'readability': if method == 'readability':
return get_file_result_content(res, 'content.txt') return get_file_result_content(res, 'content.txt')
elif method == 'singlefile': elif method == 'singlefile':
return get_file_result_content(res, '') return get_file_result_content(res,'',use_pwd=True)
elif method == 'dom': elif method == 'dom':
return get_file_result_content(res,'',use_pwd=True) return get_file_result_content(res,'',use_pwd=True)
elif method == 'wget': elif method == 'wget':

View file

@ -33,7 +33,7 @@
} }
.nav > div { .nav > div {
min-height: 30px; min-height: 30px;
margin: 8px 0px; line-height: 1.3;
} }
.header-top a { .header-top a {
text-decoration: none; text-decoration: none;
@ -68,6 +68,11 @@
vertical-align: -2px; vertical-align: -2px;
margin-right: 4px; margin-right: 4px;
} }
.header-toggle {
line-height: 14px;
font-size: 70px;
vertical-align: -8px;
}
.info-row { .info-row {
margin-top: 2px; margin-top: 2px;
@ -76,24 +81,30 @@
.info-row .alert { .info-row .alert {
margin-bottom: 0px; margin-bottom: 0px;
} }
.card { .header-bottom-frames .card {
overflow: hidden; overflow: hidden;
box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02); box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
margin-top: 10px; margin-top: 10px;
border: 1px solid rgba(0,0,0,3);
border-radius: 14px;
background-color: black;
} }
.card h4 { .card h4 {
font-size: 1.4vw; font-size: 1.4vw;
} }
.card-body { .card-body {
font-size: 1vw; font-size: 15px;
padding-top: 1.2vw; padding: 13px 10px;
padding-left: 1vw; padding-bottom: 6px;
padding-right: 1vw; /* padding-left: 3px; */
padding-bottom: 1vw; /* padding-right: 3px; */
/* padding-bottom: 3px; */
line-height: 1.1; line-height: 1.1;
word-wrap: break-word; word-wrap: break-word;
max-height: 102px; max-height: 102px;
overflow: hidden; overflow: hidden;
background-color: #1a1a1a;
color: #d3d3d3;
} }
.card-title { .card-title {
margin-bottom: 4px; margin-bottom: 4px;
@ -126,7 +137,7 @@
border-top: 3px solid #aa1e55; border-top: 3px solid #aa1e55;
} }
.card.selected-card { .card.selected-card {
border: 2px solid orange; border: 1px solid orange;
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05); box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
} }
.iframe-large { .iframe-large {
@ -174,12 +185,13 @@
width: 98%; width: 98%;
border: 1px solid rgba(0,0,0,0.2); border: 1px solid rgba(0,0,0,0.2);
box-shadow: 4px 4px 4px rgba(0,0,0,0.2); box-shadow: 4px 4px 4px rgba(0,0,0,0.2);
margin-top: 5px; margin-top: 0px;
} }
.header-bottom-info { .header-bottom-info {
color: #6f6f6f; color: #6f6f6f;
padding-top: 8px; padding-top: 0px;
padding-bottom: 13px; padding-bottom: 0px;
margin: 0px -15px;
} }
.header-bottom-info > div { .header-bottom-info > div {
@ -203,12 +215,30 @@
margin-top: 5px; margin-top: 5px;
} }
.header-bottom-frames .card-title { .header-bottom-frames .card-title {
padding-bottom: 0px; width: 100%;
font-size: 1.2vw; text-align: center;
font-size: 18px;
margin-bottom: 5px; margin-bottom: 5px;
display: inline-block;
color: #d3d3d3;
font-weight: 200;
vertical-align: 0px;
margin-top: -6px;
} }
.header-bottom-frames .card-text { .header-bottom-frames .card-text {
width: 100%;
text-align: center;
font-size: 0.9em; font-size: 0.9em;
display: inline-block;
position: relative;
top: -11px;
}
.card-text code {
padding: .2rem .4rem;
font-size: 90%;
color: #bd4147;
background-color: #101010;
border-radius: .25rem;
} }
@media(max-width: 1092px) { @media(max-width: 1092px) {
@ -247,7 +277,7 @@
</a> </a>
</div> </div>
<div class="col-lg-8"> <div class="col-lg-8">
<img src="favicon.ico" alt="Favicon"> <img src="favicon.ico" onerror="this.style.opacity=0" alt="Favicon">
&nbsp;&nbsp; &nbsp;&nbsp;
{{title}} {{title}}
&nbsp;&nbsp; &nbsp;&nbsp;
@ -316,120 +346,145 @@
</div> </div>
</div> </div>
<div class="row header-bottom-frames"> <div class="row header-bottom-frames">
<div class="col-lg-3"> <div class="col-lg-2">
<div class="card selected-card"> <div class="card selected-card">
<iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{archive_url}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="{{archive_url}}" target="preview"><h4 class="card-title">Wget &gt; WARC</h4></a>
<p class="card-text">archive/{{domain}}</p>
</div>
</div>
</div>
<div class="col-lg-3">
<div class="card">
<iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe> <iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body"> <div class="card-body">
<a href="{{singlefile_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener"> <a href="{{singlefile_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/> <p class="card-text"><code>./singlefile.html</code></p>
</a> </a>
<a href="{{singlefile_path}}" target="preview"><h4 class="card-title">Chrome &gt; SingleFile</h4></a> <a href="{{singlefile_path}}" target="preview"><h4 class="card-title">Chrome &gt; SingleFile</h4></a>
<p class="card-text">archive/singlefile.html</p>
</div> </div>
</div> </div>
</div> </div>
<div class="col-lg-3"> <div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{archive_org_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="{{archive_org_path}}" target="preview"><h4 class="card-title">Archive.Org</h4></a>
<p class="card-text">web.archive.org/web/...</p>
</div>
</div>
</div>
<div class="col-lg-3">
<div class="card">
<iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{url}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="{{url}}" target="preview"><h4 class="card-title">Original</h4></a>
<p class="card-text">{{domain}}</p>
</div>
</div>
</div>
<br/>
<div class="col-lg-3">
<div class="card"> <div class="card">
<iframe class="card-img-top pdf-frame" src="{{pdf_path}}" scrolling="no"></iframe> <iframe class="card-img-top pdf-frame" src="{{pdf_path}}" scrolling="no"></iframe>
<div class="card-body"> <div class="card-body">
<a href="{{pdf_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener"> <a href="{{pdf_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/> <p class="card-text"><code>./output.pdf</code></p>
</a> </a>
<a href="{{pdf_path}}" target="preview" id="pdf-btn"><h4 class="card-title">Chrome &gt; PDF</h4></a> <a href="{{pdf_path}}" target="preview" id="pdf-btn"><h4 class="card-title">Chrome &gt; PDF</h4></a>
<p class="card-text">archive/output.pdf</p>
</div> </div>
</div> </div>
</div> </div>
<div class="col-lg-3"> <div class="col-lg-2">
<div class="card"> <div class="card">
<img class="card-img-top screenshot" src="{{screenshot_path}}"></iframe> <img class="card-img-top" src="{{screenshot_path}}" onerror="this.style.opacity=0.2"/>
<div class="card-body"> <div class="card-body">
<a href="{{screenshot_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener"> <a href="{{screenshot_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/> <p class="card-text"><code>./screenshot.png</code></p>
</a> </a>
<a href="{{screenshot_path}}" target="preview"><h4 class="card-title">Chrome &gt; Screenshot</h4></a> <a href="{{screenshot_path}}" target="preview"><h4 class="card-title">Chrome &gt; Screenshot</h4></a>
<p class="card-text">archive/screenshot.png</p>
</div> </div>
</div> </div>
</div> </div>
<div class="col-lg-3"> <div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{archive_url}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./{{domain}}</code></p>
</a>
<a href="{{archive_url}}" target="preview"><h4 class="card-title">Wget &gt; HTML</h4></a>
</div>
</div>
</div>
{% if SAVE_ARCHIVE_DOT_ORG %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{archive_org_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>🌐 web.archive.org/web/...</code></p>
</a>
<a href="{{archive_org_path}}" target="preview"><h4 class="card-title">Archive.Org</h4></a>
</div>
</div>
</div>
{% endif %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{url}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>🌐 {{domain}}</code></p>
</a>
<a href="{{url}}" target="preview"><h4 class="card-title">Original</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{headers_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./headers.json</code></p>
</a>
<a href="{{headers_path}}" target="preview"><h4 class="card-title">Headers</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card"> <div class="card">
<iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe> <iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body"> <div class="card-body">
<a href="{{dom_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener"> <a href="{{dom_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/> <p class="card-text"><code>./output.html</code></p>
</a> </a>
<a href="{{dom_path}}" target="preview"><h4 class="card-title">Chrome &gt; HTML</h4></a> <a href="{{dom_path}}" target="preview"><h4 class="card-title">Chrome &gt; HTML</h4></a>
<p class="card-text">archive/output.html</p>
</div> </div>
</div> </div>
</div> </div>
<div class="col-lg-3"> <div class="col-lg-2">
<div class="card"> <div class="card">
<iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe> <iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body"> <div class="card-body">
<a href="{{readability_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener"> <a href="{{readability_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/> <p class="card-text"><code>./readability/content.html</code></p>
</a> </a>
<a href="{{readability_path}}" target="preview"><h4 class="card-title">Readability</h4></a> <a href="{{readability_path}}" target="preview"><h4 class="card-title">Readability</h4></a>
<p class="card-text">archive/readability/...</p>
</div> </div>
</div> </div>
</div> </div>
<br/> <br/>
<div class="col-lg-3"> <div class="col-lg-2">
<div class="card"> <div class="card">
<iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe> <iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body"> <div class="card-body">
<a href="{{mercury_path}}" style="float:right" title="Open in new tab..." target="_blank" rel="noopener"> <a href="{{mercury_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/> <p class="card-text"><code>./mercury/content.html</code></p>
</a> </a>
<a href="{{mercury_path}}" target="preview"><h4 class="card-title">mercury</h4></a> <a href="{{mercury_path}}" target="preview"><h4 class="card-title">Mercury</h4></a>
<p class="card-text">archive/mercury/...</p> </div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{media_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./media/*.mp4</code></p>
</a>
<a href="{{media_path}}" target="preview"><h4 class="card-title">Media</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{git_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./git/*.git</code></p>
</a>
<a href="{{git_path}}" target="preview"><h4 class="card-title">Git</h4></a>
</div> </div>
</div> </div>
</div> </div>
</div> </div>
</div> </div>
</header> </header>
<iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="{{archive_url}}" name="preview"></iframe> <iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
<script <script
src="https://code.jquery.com/jquery-3.2.1.slim.min.js" src="https://code.jquery.com/jquery-3.2.1.slim.min.js"

View file

@ -200,7 +200,13 @@ def get_headers(url: str, timeout: int=None) -> str:
stream=True stream=True
) )
return pyjson.dumps(dict(response.headers), indent=4) return pyjson.dumps(
{
'Status-Code': response.status_code,
**dict(response.headers),
},
indent=4,
)
@enforce_types @enforce_types