# ArchiveBox/archivebox/core/utils.py

from pathlib import Path

from django.utils.html import format_html
from django.utils.safestring import mark_safe

from core.models import Snapshot, EXTRACTORS


def get_icons(snapshot: Snapshot) -> str:
    """Render a row of icon links for the snapshot's succeeded archive outputs.

    One ``<a>`` element is emitted per extractor that has at least one
    ``ArchiveResult`` with ``status="succeeded"``, in the order the extractors
    appear in ``EXTRACTORS``. Two outputs are special-cased:

    * ``warc`` is not an extractor of its own, so its icon is added right
      after ``wget`` when a ``*.warc.gz`` file is found on disk.
    * ``archive_org`` links to the canonical archive.org URL instead of a
      local path, and only when ``archive.org.txt`` exists in the snapshot
      directory.

    Args:
        snapshot: The snapshot whose archive results should be rendered.

    Returns:
        An HTML-safe string: a ``<span class="files-icons">`` wrapping the
        icon links (empty span when nothing succeeded).
    """
    # Collapse the results to the set of extractor names: the link target
    # depends only on the extractor, so several succeeded results for the same
    # extractor should still produce a single icon.
    succeeded = {
        result.extractor
        for result in snapshot.archiveresult_set.filter(status="succeeded")
    }
    link = snapshot.as_link()
    path = link.archive_path
    canon = link.canonical_outputs()
    anchor_template = '<a href="/{}/{}" class="exists-True" title="{}">{} </a>'
    icons = {
        "singlefile": "❶",
        "wget": "🆆",
        "dom": "🅷",
        "pdf": "📄",
        "screenshot": "💻",
        "media": "📼",
        "git": "🅶",
        "archive_org": "🏛",
        "readability": "🆁",
        "mercury": "🅼",
        "warc": "📦",
    }
    # These extractors never get the generic local-path link below.
    exclude = ["favicon", "archive_org"]
    anchors = []

    for extractor, _ in EXTRACTORS:
        if extractor not in succeeded:
            continue
        # Best effort: a failure for one extractor (e.g. no canonical path
        # entry for it) is reported and must not break the whole icon row.
        try:
            if extractor not in exclude:
                # format_html escapes every interpolated value, so quotes or
                # angle brackets in a path cannot break out of the attribute.
                anchors.append(format_html(
                    anchor_template,
                    path, canon[f"{extractor}_path"],
                    extractor, icons.get(extractor, "?"),
                ))

            if extractor == "wget":
                # warc isn't technically its own extractor, so we have to add
                # it after wget.
                warc_file = next(
                    (Path(path) / canon["warc_path"]).glob("*.warc.gz"), None
                )
                if warc_file is not None:
                    # The glob match already carries the full path, so the
                    # second href slot is left empty.
                    anchors.append(format_html(
                        anchor_template,
                        warc_file, "",
                        "warc", icons.get("warc", "?"),
                    ))

            if extractor == "archive_org":
                # The check for archive_org is different, so it has to be
                # handled separately.
                if (Path(path) / "archive.org.txt").exists():
                    anchors.append(format_html(
                        '<a href="{}" class="exists-True" title="{}">{} </a>',
                        canon["archive_org_path"],
                        "archive_org", icons.get("archive_org", "?"),
                    ))

        except Exception as e:
            print(e)

    # Every anchor was produced by format_html, so the joined markup is safe.
    # It is passed as an argument rather than interpolated into the format
    # string, where a "{" or "}" in a path would be treated as a placeholder.
    return format_html(
        '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{}</span>',
        mark_safe("".join(anchors)),
    )