__package__ = 'archivebox.index'

from datetime import datetime
from typing import List, Optional, Iterator, Mapping
from pathlib import Path
from collections import defaultdict

from django.utils.html import format_html, mark_safe

from .schema import Link
from ..system import atomic_write
from ..logging_util import printable_filesize
from ..util import (
    enforce_types,
    ts_to_date,
    urlencode,
    htmlencode,
    urldecode,
)
from ..config import (
    OUTPUT_DIR,
    VERSION,
    GIT_SHA,
    FOOTER_INFO,
    HTML_INDEX_FILENAME,
)


MAIN_INDEX_TEMPLATE = 'main_index.html'
MINIMAL_INDEX_TEMPLATE = 'main_index_minimal.html'
LINK_DETAILS_TEMPLATE = 'link_details.html'
TITLE_LOADING_MSG = 'Not yet archived...'


### Main Links Index

@enforce_types
def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
    """parse an archive index html file and return the list of urls"""

    index_path = Path(out_dir) / HTML_INDEX_FILENAME
    if index_path.exists():
        with open(index_path, 'r', encoding='utf-8') as f:
            for line in f:
                if 'class="link-url"' in line:
                    yield line.split('"')[1]
    return ()
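
# Usage sketch (hedged example, not called anywhere in this module): recover the
# archived URLs from a previously rendered index.html, e.g. to check what is
# already present. Assumes out_dir already contains an HTML_INDEX_FILENAME
# written by the main index renderer below.
#
#   archived_urls = set(parse_html_main_index(Path(OUTPUT_DIR)))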


@enforce_types
def generate_index_from_links(links: List[Link], with_headers: bool):
    if with_headers:
        output = main_index_template(links)
    else:
        output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE)
    return output


@enforce_types
def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
    """render the template for the entire main index"""

    return render_django_template(template, {
        'version': VERSION,
        'git_sha': GIT_SHA,
        'num_links': str(len(links)),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'links': [link._asdict(extended=True) for link in links],
        'FOOTER_INFO': FOOTER_INFO,
    })
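
# Rendering sketch (hedged example, not part of this module's public surface):
# render a minimal main index for some already-loaded Link objects and write it
# atomically next to the archive output. `some_links` is a placeholder for a
# List[Link] loaded elsewhere (e.g. from the JSON or SQL index).
#
#   html = generate_index_from_links(some_links, with_headers=False)
#   atomic_write(str(Path(OUTPUT_DIR) / HTML_INDEX_FILENAME), html)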


### Link Details Index

@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    out_dir = out_dir or link.link_dir

    rendered_html = link_details_template(link)
    atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)
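
# Usage sketch (assumes `link` is a Link whose link_dir already exists on disk):
#
#   write_html_link_details(link)                       # writes HTML_INDEX_FILENAME inside link.link_dir
#   write_html_link_details(link, out_dir='/tmp/demo')  # or into an explicit directory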


@enforce_types
def link_details_template(link: Link) -> str:

    from ..extractors.wget import wget_output_path

    link_info = link._asdict(extended=True)

    return render_django_template(LINK_DETAILS_TEMPLATE, {
        **link_info,
        **link_info['canonical'],
        'title': htmlencode(
            link.title
            or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
        ),
        'url_str': htmlencode(urldecode(link.base_url)),
        'archive_url': urlencode(
            wget_output_path(link)
            or (link.domain if link.is_archived else '')
        ) or 'about:blank',
        'extension': link.extension or 'html',
        'tags': link.tags or 'untagged',
        'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
        'status': 'archived' if link.is_archived else 'not yet archived',
        'status_color': 'success' if link.is_archived else 'danger',
        'oldest_archive_date': ts_to_date(link.oldest_archive_date),
    })


@enforce_types
def render_django_template(template: str, context: Mapping[str, str]) -> str:
    """render a given html template by name with the given template context"""
    from django.template.loader import render_to_string

    return render_to_string(template, context)
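
# Note: render_to_string() resolves the template name through Django's configured
# template loaders, so Django settings must already be set up before any of the
# *_template() helpers above are called. Usage sketch (hedged, with a deliberately
# minimal context):
#
#   html = render_django_template(MINIMAL_INDEX_TEMPLATE, {'links': [], 'version': VERSION})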


def snapshot_icons(snapshot) -> str:
    from core.models import EXTRACTORS

    # all of the extractor outputs that finished successfully for this snapshot
    archive_results = snapshot.archiveresult_set.filter(status="succeeded")
    link = snapshot.as_link()
    path = link.archive_path
    canon = link.canonical_outputs()
    output = ""
    output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
    icons = {
        "singlefile": "❶",
        "wget": "🆆",
        "dom": "🅷",
        "pdf": "📄",
        "screenshot": "💻",
        "media": "📼",
        "git": "🅶",
        "archive_org": "🏛",
        "readability": "🆁",
        "mercury": "🅼",
        "warc": "📦",
    }
    exclude = ["favicon", "title", "headers", "archive_org"]
    # Missing specific entry for WARC

    # index the successful results by extractor name (None if an extractor never succeeded)
    extractor_items = defaultdict(lambda: None)
    for extractor, _ in EXTRACTORS:
        for result in archive_results:
            if result.extractor == extractor:
                extractor_items[extractor] = result

    for extractor, _ in EXTRACTORS:
        if extractor not in exclude:
            exists = extractor_items[extractor] is not None
            output += format_html(output_template, path, canon[f"{extractor}_path"], str(exists),
                                  extractor, icons.get(extractor, "?"))
        if extractor == "wget":
            # warc isn't technically its own extractor, so we have to add it after wget
            exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
            output += format_html(output_template, exists[0] if exists else '#', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))

        if extractor == "archive_org":
            # The check for archive_org is different, so it has to be handled separately
            target_path = Path(path) / "archive.org.txt"
            exists = target_path.exists()
            output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
                                                                                 "archive_org", icons.get("archive_org", "?"))

    return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
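
# Usage sketch (hedged): snapshot_icons() expects a core.models.Snapshot instance
# and returns an HTML fragment already marked safe, e.g. for use as a column in a
# Django admin list display:
#
#   icons_html = snapshot_icons(snapshot)   # '<span class="files-icons" ...>...</span>'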