ArchiveBox/archivebox/index/html.py

__package__ = 'archivebox.index'

from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
from typing import List, Optional, Iterator, Mapping

from django.utils.html import format_html, mark_safe
from django.core.cache import cache

from .schema import Link
from ..system import atomic_write
from ..logging_util import printable_filesize
from ..util import (
    enforce_types,
    ts_to_date_str,
    urlencode,
    htmlencode,
    urldecode,
)
from ..config import (
    OUTPUT_DIR,
    VERSION,
    FOOTER_INFO,
    HTML_INDEX_FILENAME,
    SAVE_ARCHIVE_DOT_ORG,
    PREVIEW_ORIGINALS,
)

MAIN_INDEX_TEMPLATE = 'static_index.html'
MINIMAL_INDEX_TEMPLATE = 'minimal_index.html'
LINK_DETAILS_TEMPLATE = 'snapshot.html'
TITLE_LOADING_MSG = 'Not yet archived...'


### Main Links Index

@enforce_types
def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
    """Lazily extract link URLs from the HTML main index inside out_dir.

    Every line of the index file containing ``class="link-url"`` contributes
    one item: the text between the first pair of double quotes on that line.
    If the index file does not exist, nothing is yielded.
    """

    index_path = Path(out_dir) / HTML_INDEX_FILENAME
    if index_path.exists():
        with index_path.open('r', encoding='utf-8') as index_file:
            # The file stays open for as long as the caller keeps iterating.
            yield from (
                html_line.split('"')[1]
                for html_line in index_file
                if 'class="link-url"' in html_line
            )
    # In a generator this value only rides on StopIteration; callers that
    # simply iterate never see it.
    return ()

@enforce_types
def generate_index_from_links(links: List[Link], with_headers: bool):
    """Render the main index for links, either full or minimal.

    With with_headers set, main_index_template's default template is used;
    otherwise MINIMAL_INDEX_TEMPLATE is rendered instead.
    """
    if with_headers:
        return main_index_template(links)
    return main_index_template(links, template=MINIMAL_INDEX_TEMPLATE)

@enforce_types
def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
    """Render the template for the entire main index.

    links:    the links to list; each one is passed to the template as
              link._asdict(extended=True)
    template: name of the template to render (defaults to MAIN_INDEX_TEMPLATE)

    Returns the rendered page as a string.
    """

    # Read the clock once so 'date_updated' and 'time_updated' always agree,
    # even when rendering straddles a minute or midnight boundary.
    now = datetime.now(timezone.utc)

    return render_django_template(template, {
        'version': VERSION,
        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
        'num_links': str(len(links)),
        'date_updated': now.strftime('%Y-%m-%d'),
        'time_updated': now.strftime('%Y-%m-%d %H:%M'),
        'links': [link._asdict(extended=True) for link in links],
        'FOOTER_INFO': FOOTER_INFO,
    })


### Link Details Index

@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """Render the details page for link and write it to its HTML index file.

    The file is written as HTML_INDEX_FILENAME inside out_dir, falling back
    to link.link_dir when out_dir is not given (or is empty).
    """
    target_dir = out_dir if out_dir else link.link_dir

    page_html = link_details_template(link)
    index_file = Path(target_dir) / HTML_INDEX_FILENAME
    atomic_write(str(index_file), page_html)


@enforce_types
def link_details_template(link: Link) -> str:
    """Render the LINK_DETAILS_TEMPLATE page for a single link.

    The template context is the link's extended dict, with its 'canonical'
    entries flattened into the top level, and then a set of display-oriented
    values (title, status, size, ...) layered on top.
    """

    from ..extractors.wget import wget_output_path

    link_info = link._asdict(extended=True)

    # Later updates override earlier keys while keeping their original position.
    context = dict(link_info)
    context.update(link_info['canonical'])

    # Title falls back to the base url once archived, or a placeholder before that.
    display_title = link.title
    if not display_title:
        display_title = link.base_url if link.is_archived else TITLE_LOADING_MSG

    # Prefer the wget output; otherwise the domain (archived) or nothing (pending).
    archive_target = wget_output_path(link)
    if not archive_target:
        archive_target = link.domain if link.is_archived else ''

    context.update({
        'title': htmlencode(display_title),
        'url_str': htmlencode(urldecode(link.base_url)),
        'archive_url': urlencode(archive_target) or 'about:blank',
        'extension': link.extension or 'html',
        'tags': link.tags or 'untagged',
        'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
        'status': 'archived' if link.is_archived else 'not yet archived',
        'status_color': 'success' if link.is_archived else 'danger',
        'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
        'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
        'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
    })

    return render_django_template(LINK_DETAILS_TEMPLATE, context)

@enforce_types
def render_django_template(template: str, context: Mapping[str, object]) -> str:
    """Render the named Django template with the given context.

    template: a template *name* (e.g. 'snapshot.html') handed to Django's
              render_to_string, not raw template source
    context:  mapping of template variable names to values; callers in this
              module pass strings as well as lists and nested dicts, so the
              values are not restricted to str

    Returns the rendered output as a string.
    """
    # Django's loader is imported at call time rather than at module import.
    from django.template.loader import render_to_string

    return render_to_string(template, context)


def snapshot_icons(snapshot) -> str:
    """Return the HTML row of per-extractor output icons for a snapshot.

    One ``<a>`` icon is emitted for each entry in ``core.models.EXTRACTORS``
    (in that order), except favicon/title/headers. A ``warc`` icon is added
    right after ``wget``, and ``archive_org`` is rendered with its own link
    format because its canonical path is used as the href as-is.

    Each icon carries an ``exists-True`` / ``exists-False`` CSS class telling
    whether the database holds a succeeded ArchiveResult with a non-empty
    output for that extractor. The filesystem is deliberately not consulted.

    The rendered markup is stored in the Django cache under a key built from
    the snapshot id and its ``updated`` (or ``added``) timestamp.
    """
    cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'

    def calc_snapshot_icons():
        from core.models import EXTRACTORS

        link = snapshot.as_link()
        path = link.archive_path
        canon = link.canonical_outputs()

        # Icons for outputs stored under the snapshot's archive path.
        local_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
        # archive_org's canonical path is linked directly, without the archive path prefix.
        archive_org_template = '<a href="{}" class="exists-{}" title="{}">{}</a> '
        icons = {
            "singlefile": "❶",
            "wget": "🆆",
            "dom": "🅷",
            "pdf": "📄",
            "screenshot": "💻",
            "media": "📼",
            "git": "🅶",
            "archive_org": "🏛",
            "readability": "🆁",
            "mercury": "🅼",
            "warc": "📦"
        }
        # archive_org is excluded from the generic branch and handled separately below.
        exclude = ["favicon", "title", "headers", "archive_org"]

        # One pass over the succeeded results; when an extractor has several,
        # the last one in queryset order wins.
        extractor_outputs = {}
        for result in snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False):
            extractor_outputs[result.extractor] = result

        def has_output(extractor: str) -> bool:
            # The queryset already guarantees status == "succeeded"; only the
            # output can still be empty. Checking the DB is faster than
            # checking the filesystem, at the cost of possibly being stale.
            result = extractor_outputs.get(extractor)
            return result is not None and bool(result.output)

        fragments = []
        for extractor, _ in EXTRACTORS:
            if extractor not in exclude:
                fragments.append(format_html(
                    local_template,
                    path, canon[f"{extractor}_path"], str(has_output(extractor)),
                    extractor, icons.get(extractor, "?"),
                ))

            if extractor == "wget":
                # warc isn't technically its own extractor, so its icon is
                # added after wget and reuses wget's result as its status.
                fragments.append(format_html(
                    local_template,
                    path, canon["warc_path"], str(has_output("wget")),
                    "warc", icons.get("warc", "?"),
                ))

            if extractor == "archive_org":
                # format_html (not str.format) so the URL is escaped before the
                # combined markup is marked safe, and a real True/False so the
                # CSS class matches the other icons.
                fragments.append(format_html(
                    archive_org_template,
                    canon["archive_org_path"], str(has_output("archive_org")),
                    "archive_org", icons.get("archive_org", "?"),
                ))

        # Every fragment came out of format_html, so the joined string is safe.
        return format_html(
            '<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>',
            mark_safe(''.join(fragments)),
        )

    return cache.get_or_set(cache_key, calc_snapshot_icons)
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`__package__ = 'archivebox.index'`
add package headers 2019-04-17 22:00:54 -04:00
fix warc path in snapshot_icons 2021-02-16 06:18:05 -05:00			`from pathlib import Path`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 04:19:30 -04:00			`from datetime import datetime, timezone`
fix warc path in snapshot_icons 2021-02-16 06:18:05 -05:00			`from collections import defaultdict`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`from typing import List, Optional, Iterator, Mapping`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
fix files icons escaping 2021-01-28 22:27:17 -05:00			`from django.utils.html import format_html, mark_safe`
cache dir size, snapshot icons, tags str, and title in django cache 2021-02-16 15:49:29 -05:00			`from django.core.cache import cache`
remove redundant utils file 2020-11-28 02:12:27 -05:00
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`from .schema import Link`
remove redundant utils file 2020-11-28 02:12:27 -05:00			`from ..system import atomic_write`
show archive dir size in link details page 2020-08-18 09:17:37 -04:00			`from ..logging_util import printable_filesize`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`from ..util import (`
			`enforce_types,`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 04:19:30 -04:00			`ts_to_date_str,`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`urlencode,`
			`htmlencode,`
			`urldecode,`
			`)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`from ..config import (`
			`OUTPUT_DIR,`
			`VERSION,`
			`FOOTER_INFO,`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`HTML_INDEX_FILENAME,`
only show archive.org if enabled 2021-01-30 22:03:59 -05:00			`SAVE_ARCHIVE_DOT_ORG,`
add config option PREVIEW_ORIGINALS to hide original iframes in snapshot detail pages 2022-05-09 19:31:41 -07:00			`PREVIEW_ORIGINALS,`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`)`

cleanup templates and views 2021-01-30 05:35:07 -05:00			`MAIN_INDEX_TEMPLATE = 'static_index.html'`
			`MINIMAL_INDEX_TEMPLATE = 'minimal_index.html'`
			`LINK_DETAILS_TEMPLATE = 'snapshot.html'`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`TITLE_LOADING_MSG = 'Not yet archived...'`


			`### Main Links Index`

add rudimentary method to parse back html index into urls 2019-04-24 11:37:51 -04:00			`@enforce_types`
first attempt to migrate to Pathlib 2020-09-03 18:26:49 -04:00			`def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:`
add rudimentary method to parse back html index into urls 2019-04-24 11:37:51 -04:00			`"""parse an archive index html file and return the list of urls"""`

Replaced os.path in html.py 2020-09-30 14:30:53 -05:00			`index_path = Path(out_dir) / HTML_INDEX_FILENAME`
			`if index_path.exists():`
add rudimentary method to parse back html index into urls 2019-04-24 11:37:51 -04:00			`with open(index_path, 'r', encoding='utf-8') as f:`
			`for line in f:`
			`if 'class="link-url"' in line:`
			`yield line.split('"')[1]`
			`return ()`

feat: move import 2020-11-28 13:11:15 -05:00			`@enforce_types`
refactor: Move indexing logic out of logging module 2020-11-28 12:28:39 -05:00			`def generate_index_from_links(links: List[Link], with_headers: bool):`
			`if with_headers:`
remove unused argument 2020-11-28 12:38:15 -05:00			`output = main_index_template(links)`
refactor: Move indexing logic out of logging module 2020-11-28 12:28:39 -05:00			`else:`
remove unused argument 2020-11-28 12:38:15 -05:00			`output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE)`
feat: move import 2020-11-28 13:11:15 -05:00			`return output`
refactor: Move indexing logic out of logging module 2020-11-28 12:28:39 -05:00
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`@enforce_types`
remove finished/not finished spinners 2020-11-28 01:07:02 -05:00			`def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`"""render the template for the entire main index"""`

Replace legacy templates for django templates 2020-12-02 16:56:16 -05:00			`return render_django_template(template, {`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'version': VERSION,`
remove unused GIT_SHA config option 2021-02-15 20:42:33 -05:00			`'git_sha': VERSION, # not used anymore, but kept for backwards compatibility`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'num_links': str(len(links)),`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 04:19:30 -04:00			`'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),`
			`'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),`
Replace legacy templates for django templates 2020-12-02 16:56:16 -05:00			`'links': [link._asdict(extended=True) for link in links],`
Use uppercase for constants 2020-12-03 15:44:59 -05:00			`'FOOTER_INFO': FOOTER_INFO,`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`})`


			`### Link Details Index`

			`@enforce_types`
			`def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:`
			`out_dir = out_dir or link.link_dir`

			`rendered_html = link_details_template(link)`
Replaced os.path in html.py 2020-09-30 14:30:53 -05:00			`atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00

			`@enforce_types`
			`def link_details_template(link: Link) -> str:`

split up utils into separate files 2019-04-30 23:13:04 -04:00			`from ..extractors.wget import wget_output_path`

better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`link_info = link._asdict(extended=True)`

Replace legacy templates for django templates 2020-12-02 16:56:16 -05:00			`return render_django_template(LINK_DETAILS_TEMPLATE, {`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`**link_info,`
			`**link_info['canonical'],`
fix: htmlencode titles before rendering the static html index and detail 2020-07-16 09:20:33 -05:00			`'title': htmlencode(`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`link.title`
			`or (link.base_url if link.is_archived else TITLE_LOADING_MSG)`
			`),`
			`'url_str': htmlencode(urldecode(link.base_url)),`
			`'archive_url': urlencode(`
			`wget_output_path(link)`
fix pending titles and favicons, improve add page, custom admin 2020-07-27 23:26:45 -04:00			`or (link.domain if link.is_archived else '')`
			`) or 'about:blank',`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'extension': link.extension or 'html',`
			`'tags': link.tags or 'untagged',`
tweak icons 2020-10-31 19:32:43 -04:00			`'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'status': 'archived' if link.is_archived else 'not yet archived',`
			`'status_color': 'success' if link.is_archived else 'danger',`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 04:19:30 -04:00			`'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),`
only show archive.org if enabled 2021-01-30 22:03:59 -05:00			`'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,`
add config option PREVIEW_ORIGINALS to hide original iframes in snapshot detail pages 2022-05-09 19:31:41 -07:00			`'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`})`
split up utils into separate files 2019-04-30 23:13:04 -04:00
			`@enforce_types`
Replace legacy templates for django templates 2020-12-02 16:56:16 -05:00			`def render_django_template(template: str, context: Mapping[str, str]) -> str:`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`"""render a given html template string with the given template content"""`
Replace legacy templates for django templates 2020-12-02 16:56:16 -05:00			`from django.template.loader import render_to_string`
split up utils into separate files 2019-04-30 23:13:04 -04:00
Replace legacy templates for django templates 2020-12-02 16:56:16 -05:00			`return render_to_string(template, context)`
remove redundant utils file 2020-11-28 02:12:27 -05:00

			`def snapshot_icons(snapshot) -> str:`
fix snapshot icon caching and ordering 2021-04-01 02:22:15 -04:00			`cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'`
cache dir size, snapshot icons, tags str, and title in django cache 2021-02-16 15:49:29 -05:00
			`def calc_snapshot_icons():`
			`from core.models import EXTRACTORS`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 04:19:30 -04:00			`# start = datetime.now(timezone.utc)`
cache dir size, snapshot icons, tags str, and title in django cache 2021-02-16 15:49:29 -05:00
			`archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)`
			`link = snapshot.as_link()`
			`path = link.archive_path`
			`canon = link.canonical_outputs()`
			`output = ""`
			`output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a>  '`
			`icons = {`
			`"singlefile": "❶",`
			`"wget": "🆆",`
			`"dom": "🅷",`
			`"pdf": "📄",`
			`"screenshot": "💻",`
			`"media": "📼",`
			`"git": "🅶",`
			`"archive_org": "🏛",`
			`"readability": "🆁",`
			`"mercury": "🅼",`
			`"warc": "📦"`
			`}`
			`exclude = ["favicon", "title", "headers", "archive_org"]`
			`# Missing specific entry for WARC`

			`extractor_outputs = defaultdict(lambda: None)`
			`for extractor, _ in EXTRACTORS:`
			`for result in archive_results:`
			`if result.extractor == extractor and result:`
			`extractor_outputs[extractor] = result`

			`for extractor, _ in EXTRACTORS:`
			`if extractor not in exclude:`
			`existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output`
			`# Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)`
			`# if existing:`
			`# existing = (Path(path) / existing)`
			`# if existing.is_file():`
			`# existing = True`
			`# elif existing.is_dir():`
			`# existing = any(existing.glob('.'))`
			`output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),`
			`extractor, icons.get(extractor, "?"))`
			`if extractor == "wget":`
			`# warc isn't technically it's own extractor, so we have to add it after wget`

			`# get from db (faster but less thurthful)`
			`exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output`
			`# get from filesystem (slower but more accurate)`
			`# exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))`
			`output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))`

			`if extractor == "archive_org":`
			`# The check for archive_org is different, so it has to be handled separately`

			`# get from db (faster)`
			`exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output`
			`# get from filesystem (slower)`
			`# target_path = Path(path) / "archive.org.txt"`
			`# exists = target_path.exists()`
			`output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),`
			`"archive_org", icons.get("archive_org", "?"))`

			`result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 04:19:30 -04:00			`# end = datetime.now(timezone.utc)`
cache dir size, snapshot icons, tags str, and title in django cache 2021-02-16 15:49:29 -05:00			`# print(((end - start).total_seconds()*1000) // 1, 'ms')`
			`return result`

			`return cache.get_or_set(cache_key, calc_snapshot_icons)`
			`# return calc_snapshot_icons()`
fix warc path in snapshot_icons 2021-02-16 06:18:05 -05:00