ArchiveBox/archivebox/index/html.py

__package__ = 'archivebox.index'

import os

from string import Template
from datetime import datetime
from typing import List, Optional, Iterator, Mapping
from pathlib import Path

from .schema import Link
from ..system import atomic_write, copy_and_overwrite
from ..logging_util import printable_filesize
from ..util import (
    enforce_types,
    ts_to_date,
    urlencode,
    htmlencode,
    urldecode,
)
from ..config import (
    OUTPUT_DIR,
    TEMPLATES_DIR,
    VERSION,
    GIT_SHA,
    FOOTER_INFO,
    ARCHIVE_DIR_NAME,
    HTML_INDEX_FILENAME,
    STATIC_DIR_NAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
)

def join(*paths):
    """Join the given path segments; module-level shorthand for os.path.join."""
    return os.path.join(*paths)


# Locations of the legacy string.Template HTML templates inside TEMPLATES_DIR
MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')
MINIMAL_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index_minimal.html')
MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html')
LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html')

# Title shown for links that have no title and are not archived yet
TITLE_LOADING_MSG = 'Not yet archived...'


### Main Links Index

@enforce_types
def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
    """lazily yield the urls found in an existing main index html file

    Reads the HTML_INDEX_FILENAME file inside out_dir line by line. For every
    line containing the marker class="link-url", yields the text between the
    first pair of double quotes on that line (presumably the link's href --
    this depends on the row template's attribute order).

    Yields nothing when the index file does not exist.
    """

    index_path = join(out_dir, HTML_INDEX_FILENAME)
    if not os.path.exists(index_path):
        # no index has been written yet: produce an empty iterator, not an error
        return

    with open(index_path, 'r', encoding='utf-8') as f:
        for line in f:
            if 'class="link-url"' in line:
                # the marker itself contains two quotes, so index 1 always exists
                yield line.split('"')[1]

@enforce_types
def write_html_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None:
    """copy the bundled assets into out_dir, then write the rendered main index there"""

    # favicon, robots.txt and the static dir are refreshed from TEMPLATES_DIR on every write
    for asset_name in (FAVICON_FILENAME, ROBOTS_TXT_FILENAME, STATIC_DIR_NAME):
        copy_and_overwrite(join(TEMPLATES_DIR, asset_name), join(out_dir, asset_name))

    index_html = main_index_template(links, finished=finished)
    atomic_write(join(out_dir, HTML_INDEX_FILENAME), index_html)


@enforce_types
def main_index_template(links: List[Link], finished: bool=True, template: str=MAIN_INDEX_TEMPLATE) -> str:
    """render the template for the entire main index

    links:    the links to render, one row each via main_index_row_template
    finished: selects the 'finished' or 'running' value for the status field
    template: path of the template file to fill in

    Returns the rendered html as a string.
    """

    # take one timestamp so date_updated and time_updated can never disagree
    # (two separate now() calls could land on either side of midnight)
    now = datetime.now()

    return render_legacy_template(template, {
        'version': VERSION,
        'git_sha': GIT_SHA,
        'num_links': str(len(links)),
        'status': 'finished' if finished else 'running',
        'date_updated': now.strftime('%Y-%m-%d'),
        'time_updated': now.strftime('%Y-%m-%d %H:%M'),
        'rows': '\n'.join(
            main_index_row_template(link)
            for link in links
        ),
        'footer_info': FOOTER_INFO,
    })


@enforce_types
def main_index_row_template(link: Link) -> str:
    """render the main index row for a single link"""

    from ..extractors.wget import wget_output_path

    # start from every field the link exposes, then override the display values
    row = {**link._asdict(extended=True)}

    # untitled links show their base_url once archived, and a loading msg until then
    if link.title:
        shown_title = link.title
    elif link.is_archived:
        shown_title = link.base_url
    else:
        shown_title = TITLE_LOADING_MSG
    row['title'] = htmlencode(shown_title)

    # always points at the favicon inside the link's own archive folder
    row['favicon_url'] = join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico')

    # fall back to index.html when there is no wget output path
    row['wget_url'] = urlencode(wget_output_path(link) or 'index.html')

    # static files get their file extension appended after the tags
    tag_text = link.tags or ''
    if link.is_static:
        tag_text = tag_text + f' {link.extension}'
    row['tags'] = tag_text

    return render_legacy_template(MAIN_INDEX_ROW_TEMPLATE, row)


### Link Details Index

@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """render the details page for a link and write it into its folder

    The page is written as HTML_INDEX_FILENAME inside out_dir; when out_dir
    is None or empty, the link's own link_dir is used instead.
    """
    target_dir = out_dir if out_dir else link.link_dir

    details_html = link_details_template(link)
    atomic_write(join(target_dir, HTML_INDEX_FILENAME), details_html)


@enforce_types
def link_details_template(link: Link) -> str:
    """render the details page html for a single link"""

    from ..extractors.wget import wget_output_path

    link_info = link._asdict(extended=True)

    # precedence (lowest to highest): link fields, canonical entries, values set below
    details = {**link_info, **link_info['canonical']}

    # untitled links show their base_url once archived, and a loading msg until then
    if link.title:
        shown_title = link.title
    elif link.is_archived:
        shown_title = link.base_url
    else:
        shown_title = TITLE_LOADING_MSG
    details['title'] = htmlencode(shown_title)

    details['url_str'] = htmlencode(urldecode(link.base_url))

    # prefer the wget output path, then the domain (archived links only),
    # and finally about:blank when the encoded result is empty
    archive_target = wget_output_path(link)
    if not archive_target:
        archive_target = link.domain if link.is_archived else ''
    details['archive_url'] = urlencode(archive_target) or 'about:blank'

    details['extension'] = link.extension or 'html'
    details['tags'] = link.tags or 'untagged'
    details['size'] = printable_filesize(link.archive_size) if link.archive_size else 'pending'
    details['status'] = 'archived' if link.is_archived else 'not yet archived'
    details['status_color'] = 'success' if link.is_archived else 'danger'
    details['oldest_archive_date'] = ts_to_date(link.oldest_archive_date)

    return render_legacy_template(LINK_DETAILS_TEMPLATE, details)


@enforce_types
def render_legacy_template(template_path: str, context: Mapping[str, str]) -> str:
    """read the template file at template_path and fill its placeholders from context

    The file is read as utf-8 and rendered with string.Template.substitute,
    with every context entry passed as a keyword argument.
    """

    # will be replaced by django templates in the future
    with open(template_path, 'r', encoding='utf-8') as template_file:
        html_template = Template(template_file.read())
    return html_template.substitute(**context)
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`__package__ = 'archivebox.index'`
add package headers 2019-04-17 22:00:54 -04:00
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`import os`

split up utils into separate files 2019-04-30 23:13:04 -04:00			`from string import Template`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`from datetime import datetime`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`from typing import List, Optional, Iterator, Mapping`
first attempt to migrate to Pathlib 2020-09-03 18:26:49 -04:00			`from pathlib import Path`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`from .schema import Link`
split up utils into separate files 2019-04-30 23:13:04 -04:00			`from ..system import atomic_write, copy_and_overwrite`
show archive dir size in link details page 2020-08-18 09:17:37 -04:00			`from ..logging_util import printable_filesize`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00			`from ..util import (`
			`enforce_types,`
			`ts_to_date,`
			`urlencode,`
			`htmlencode,`
			`urldecode,`
			`)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`from ..config import (`
			`OUTPUT_DIR,`
			`TEMPLATES_DIR,`
			`VERSION,`
			`GIT_SHA,`
			`FOOTER_INFO,`
			`ARCHIVE_DIR_NAME,`
add pipenv, schedule cmd, logs dir, and lots more 2019-04-18 21:09:54 -04:00			`HTML_INDEX_FILENAME,`
use new config vars for html index writing 2019-04-24 04:10:02 -04:00			`STATIC_DIR_NAME,`
			`ROBOTS_TXT_FILENAME,`
			`FAVICON_FILENAME,`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`)`

			`join = lambda paths: os.path.join(paths)`
			`MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')`
feat: Add html export to list command 2020-08-19 13:02:12 -05:00			`MINIMAL_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index_minimal.html')`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html')`
			`LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html')`
			`TITLE_LOADING_MSG = 'Not yet archived...'`


			`### Main Links Index`

add rudimentary method to parse back html index into urls 2019-04-24 11:37:51 -04:00			`@enforce_types`
first attempt to migrate to Pathlib 2020-09-03 18:26:49 -04:00			`def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:`
add rudimentary method to parse back html index into urls 2019-04-24 11:37:51 -04:00			`"""parse an archive index html file and return the list of urls"""`

Merge branch 'master' into django 2020-06-25 21:30:29 -04:00			`index_path = join(out_dir, HTML_INDEX_FILENAME)`
add rudimentary method to parse back html index into urls 2019-04-24 11:37:51 -04:00			`if os.path.exists(index_path):`
			`with open(index_path, 'r', encoding='utf-8') as f:`
			`for line in f:`
			`if 'class="link-url"' in line:`
			`yield line.split('"')[1]`
			`return ()`

better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`@enforce_types`
first attempt to migrate to Pathlib 2020-09-03 18:26:49 -04:00			`def write_html_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None:`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`"""write the html link index to a given path"""`

use new config vars for html index writing 2019-04-24 04:10:02 -04:00			`copy_and_overwrite(join(TEMPLATES_DIR, FAVICON_FILENAME), join(out_dir, FAVICON_FILENAME))`
			`copy_and_overwrite(join(TEMPLATES_DIR, ROBOTS_TXT_FILENAME), join(out_dir, ROBOTS_TXT_FILENAME))`
			`copy_and_overwrite(join(TEMPLATES_DIR, STATIC_DIR_NAME), join(out_dir, STATIC_DIR_NAME))`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00
			`rendered_html = main_index_template(links, finished=finished)`
Merge branch 'master' into django 2020-06-25 21:30:29 -04:00			`atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00

			`@enforce_types`
feat: Add html export to list command 2020-08-19 13:02:12 -05:00			`def main_index_template(links: List[Link], finished: bool=True, template: str=MAIN_INDEX_TEMPLATE) -> str:`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`"""render the template for the entire main index"""`

feat: Add html export to list command 2020-08-19 13:02:12 -05:00			`return render_legacy_template(template, {`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'version': VERSION,`
			`'git_sha': GIT_SHA,`
			`'num_links': str(len(links)),`
			`'status': 'finished' if finished else 'running',`
			`'date_updated': datetime.now().strftime('%Y-%m-%d'),`
			`'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),`
			`'rows': '\n'.join(`
			`main_index_row_template(link)`
			`for link in links`
			`),`
			`'footer_info': FOOTER_INFO,`
			`})`


			`@enforce_types`
			`def main_index_row_template(link: Link) -> str:`
			`"""render the template for an individual link row of the main index"""`

split up utils into separate files 2019-04-30 23:13:04 -04:00			`from ..extractors.wget import wget_output_path`

			`return render_legacy_template(MAIN_INDEX_ROW_TEMPLATE, {`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`**link._asdict(extended=True),`

			`# before pages are finished archiving, show loading msg instead of title`
fix: htmlencode titles before rendering the static html index and detail 2020-07-16 09:20:33 -05:00			`'title': htmlencode(`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`link.title`
			`or (link.base_url if link.is_archived else TITLE_LOADING_MSG)`
			`),`

			`# before pages are finished archiving, show fallback loading favicon`
			`'favicon_url': (`
			`join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico')`
			`# if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='`
			`),`

			`# before pages are finished archiving, show the details page instead`
			`'wget_url': urlencode(wget_output_path(link) or 'index.html'),`

			`# replace commas in tags with spaces, or file extension if it's static`
			`'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''),`
			`})`


			`### Link Details Index`

			`@enforce_types`
			`def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:`
			`out_dir = out_dir or link.link_dir`

			`rendered_html = link_details_template(link)`
Merge branch 'master' into django 2020-06-25 21:30:29 -04:00			`atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00

			`@enforce_types`
			`def link_details_template(link: Link) -> str:`

split up utils into separate files 2019-04-30 23:13:04 -04:00			`from ..extractors.wget import wget_output_path`

better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`link_info = link._asdict(extended=True)`

split up utils into separate files 2019-04-30 23:13:04 -04:00			`return render_legacy_template(LINK_DETAILS_TEMPLATE, {`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`**link_info,`
			`**link_info['canonical'],`
fix: htmlencode titles before rendering the static html index and detail 2020-07-16 09:20:33 -05:00			`'title': htmlencode(`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`link.title`
			`or (link.base_url if link.is_archived else TITLE_LOADING_MSG)`
			`),`
			`'url_str': htmlencode(urldecode(link.base_url)),`
			`'archive_url': urlencode(`
			`wget_output_path(link)`
fix pending titles and favicons, improve add page, custom admin 2020-07-27 23:26:45 -04:00			`or (link.domain if link.is_archived else '')`
			`) or 'about:blank',`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'extension': link.extension or 'html',`
			`'tags': link.tags or 'untagged',`
show archive dir size in link details page 2020-08-18 09:17:37 -04:00			`'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',`
better loading and saving storage mechanism 2019-04-17 02:25:28 -04:00			`'status': 'archived' if link.is_archived else 'not yet archived',`
			`'status_color': 'success' if link.is_archived else 'danger',`
			`'oldest_archive_date': ts_to_date(link.oldest_archive_date),`
			`})`
split up utils into separate files 2019-04-30 23:13:04 -04:00

			`@enforce_types`
			`def render_legacy_template(template_path: str, context: Mapping[str, str]) -> str:`
			`"""render a given html template string with the given template content"""`

			`# will be replaced by django templates in the future`
			`with open(template_path, 'r', encoding='utf-8') as template:`
			`template_str = template.read()`
			`return Template(template_str).substitute(**context)`