refactoring and fancy new link index

2025-02-22 08:18:28 +00:00 · 2017-10-23 04:58:41 -05:00 · 2017-10-23 04:58:41 -05:00 · a95912679e
commit a95912679e
parent 1249493fcd
7 changed files with 295 additions and 174 deletions
--- a/index.py
+++ b/index.py
@ -1,5 +1,4 @@
 import os
-import re
 import json

 from datetime import datetime
@ -14,20 +13,15 @@ from config import (
    ANSI,
    GIT_SHA,
 )
-from util import chmod_file
+from util import (
+    chmod_file,
+    html_appended_url,
+    derived_link_info,
+)


 ### Homepage index for all the links

-def parse_json_links_index(out_dir):
-    """load the index in a given directory and merge it with the given link"""
-    index_path = os.path.join(out_dir, 'index.json')
-    if os.path.exists(index_path):
-        with open(index_path, 'r', encoding='utf-8') as f:
-            return json.load(f)['links']
-
-    return []
-
 def write_links_index(out_dir, links):
    """create index.html file for a given list of links"""

@ -44,8 +38,6 @@ def write_links_index(out_dir, links):
    write_json_links_index(out_dir, links)
    write_html_links_index(out_dir, links)

-    chmod_file(out_dir, permissions=ARCHIVE_PERMISSIONS)
-
 def write_json_links_index(out_dir, links):
    """write the json link index to a given path"""

@ -65,6 +57,15 @@ def write_json_links_index(out_dir, links):

    chmod_file(path)

+def parse_json_links_index(out_dir):
+    """load the list of links from the index.json in a given directory, or [] if it doesn't exist"""
+    index_path = os.path.join(out_dir, 'index.json')
+    if os.path.exists(index_path):
+        with open(index_path, 'r', encoding='utf-8') as f:
+            return json.load(f)['links']
+
+    return []
+
 def write_html_links_index(out_dir, links):
    """write the html link index to a given path"""

@ -91,17 +92,11 @@ def write_html_links_index(out_dir, links):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(index_html).substitute(**template_vars))

+    chmod_file(path)
+

 ### Individual link index

-def parse_json_link_index(out_dir):
-    """load the index in a given directory and merge it with the given link"""
-    existing_index = os.path.join(out_dir, 'index.json')
-    if os.path.exists(existing_index):
-        with open(existing_index, 'r', encoding='utf-8') as f:
-            return json.load(f)
-    return {}
-
 def write_link_index(out_dir, link):
    link['updated'] = str(datetime.now().timestamp())
    write_json_link_index(out_dir, link)
@ -112,85 +107,39 @@ def write_json_link_index(out_dir, link):
    
    path = os.path.join(out_dir, 'index.json')

+    print('    √ Updating: index.json')
+
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(link, f, indent=4, default=str)

    chmod_file(path)

+def parse_json_link_index(out_dir):
+    """load the json link index from a given directory"""
+    existing_index = os.path.join(out_dir, 'index.json')
+    if os.path.exists(existing_index):
+        with open(existing_index, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    return {}
+
 def write_html_link_index(out_dir, link):
    with open(LINK_INDEX_TEMPLATE, 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

+    print('    √ Updating: index.html')
+
    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
-            **link['methods'],
+            **link['latest'],
            'type': link['type'] or 'website',
-            'tags': link['tags'] or '',
+            'tags': link['tags'] or 'untagged',
            'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
            'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'),
-            'archive_org': link['methods']['archive_org'] or 'https://web.archive.org/save/{}'.format(link['url']),
-            'wget': link['methods']['wget'] or link['domain'],
+            'archive_org': link['latest']['archive_org'] or 'https://web.archive.org/save/{}'.format(link['url']),
+            'wget': link['latest']['wget'] or link['domain'],
        }))

    chmod_file(path)
-
-
-
-def html_appended_url(link):
-    """calculate the path to the wgetted .html file, since wget may
-    adjust some paths to be different than the base_url path.
-
-    See docs on wget --adjust-extension."""
-
-    if link['type'] in ('PDF', 'image'):
-        return link['base_url']
-
-    split_url = link['url'].split('#', 1)
-    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-        # already ends in .html
-        return link['base_url']
-    else:
-        # .html needs to be appended
-        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-        if without_scheme.endswith('/'):
-            if query:
-                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
-            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
-        else:
-            if query:
-                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
-            elif '/' in without_scheme:
-                return '#'.join([without_scheme + '.html', *split_url[1:]])
-            return link['base_url'] + '/index.html'
-
-
-def derived_link_info(link):
-    """extend link info with the archive urls and other derived data"""
-
-    link_info = {
-        **link,
-        'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
-        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
-        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
-        'files_url': 'archive/{timestamp}/'.format(**link),
-        'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
-        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
-        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
-        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
-    }
-
-    # PDF and images are handled slightly differently
-    # wget, screenshot, & pdf urls all point to the same file
-    if link['type'] in ('PDF', 'image'):
-        link_info.update({
-            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
-            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'title': '{title} ({type})'.format(**link),
-        })
-    return link_info
--- a/links.py
+++ b/links.py
@ -1,18 +1,11 @@
-from util import (
-    domain,
-    base_url,
-    get_str_between,
-    get_link_type,
-)
-   
 """
 In Bookmark Archiver, a Link represents a single entry that we track in the 
 json index.  All links pass through all archiver functions and the latest,
-most up-to-date canonical output for each is stored in "latest_archives".
-.
+most up-to-date canonical output for each is stored in "latest".
+

 Link {
-    timestamp: float,   (how we uniquely id links)        _   _  _ _  ___
+    timestamp: str,     (how we uniquely id links)        _   _  _ _  ___
    url: str,                                            | \ / \ |\| ' |
    base_url: str,                                       |_/ \_/ | |   |
    domain: str,                                          _   _ _ _ _  _
@ -20,7 +13,7 @@ Link {
    type: str,                                           |  /"| | | | \_,
    title: str,                                              ,-'"`-.
    sources: [str],                                     /// /  @ @  \ \\\\
-    latest_archives: {                                    :=| ,._,. |=:  /
+    latest: {                                           \ :=| ,._,. |=:  /
        ...,                                            || ,\ \_../ /. ||
        pdf: 'output.pdf',                              ||','`-._))'`.`||
        wget: 'example.com/1234/index.html'             `-'     (/    `-'
@ -39,10 +32,18 @@ Link {

 """

+from util import (
+    domain,
+    base_url,
+    get_str_between,
+    get_link_type,
+)
+
+
 def validate_links(links):
-    links = valid_links(links)       # remove chrome://, about:, mailto: etc.
-    links = uniquefied_links(links)  # fix duplicate timestamps, returns sorted list
-    links = sorted_links(links)      # deterministically sort the links
+    links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
+    links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
+    links = sorted_links(links)      # deterministically sort the links based on timestamp, url
    
    if not links:
        print('[X] No links found :(')
@ -50,34 +51,14 @@ def validate_links(links):

    return list(links)

-def sorted_links(links):
-    return sorted(
-        links,
-        key=lambda link: (link['timestamp'], link['url']),
-        reverse=True,
-    )

-def merge_links(link1, link2):
-    """deterministially merge two links, favoring longer field values over shorter,
-    and "cleaner" values over worse ones.
-    """
-    longer = lambda a, b, key: a[key] if len(a[key]) > len(b[key]) else b[key]
-    earlier = lambda a, b, key: a[key] if a[key] < b[key] else b[key]
-    
-    url = longer(link1, link2, 'url')
-    longest_title = longer(link1, link2, 'title')
-    cleanest_title = link1['title'] if '://' not in link1['title'] else link2['title']
-    link = {
-        'url': url,
-        'domain': domain(url),
-        'base_url': base_url(url),
-        'timestamp': earlier(link1, link2, 'timestamp'),
-        'tags': longer(link1, link2, 'tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
-        'sources': list(set(link1['sources'] + link2['sources'])),
-    }
-    link['type'] = get_link_type(link)
-    return link
+def archivable_links(links):
+    """remove chrome://, about:// or other schemed links that can't be archived"""
+    return (
+        link
+        for link in links
+        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
+    )

 def uniquefied_links(sorted_links):
    """
@ -104,13 +85,33 @@ def uniquefied_links(sorted_links):

    return unique_timestamps.values()

-def valid_links(links):
-    """remove chrome://, about:// or other schemed links that cant be archived"""
-    return (
-        link
-        for link in links
-        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
-    )
+def sorted_links(links):
+    sort_func = lambda link: (link['timestamp'], link['url'])
+    return sorted(links, key=sort_func, reverse=True)
+
+
+
+def merge_links(a, b):
+    """deterministically merge two links, favoring longer field values over shorter,
+    and "cleaner" values over worse ones.
+    """
+    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
+    
+    url = longer('url')
+    longest_title = longer('title')
+    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    link = {
+        'timestamp': earlier('timestamp'),
+        'url': url,
+        'domain': domain(url),
+        'base_url': base_url(url),
+        'tags': longer('tags'),
+        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
+    }
+    link['type'] = get_link_type(link)
+    return link

 def links_after_timestamp(links, timestamp=None):
    if not timestamp:
--- a/parse.py
+++ b/parse.py
@ -1,32 +1,36 @@
+"""
+Everything related to parsing links from bookmark services.
+
+For a list of supported services, see the README.md.
+For examples of supported files see examples/.
+
+Parsed link schema: {
+    'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
+    'domain': 'example.com',
+    'base_url': 'example.com/example/',
+    'timestamp': '15442123124234',
+    'tags': 'abc,def',
+    'title': 'Example.com Page Title',
+    'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
+}
+"""
+
 import re
 import json
+
 from datetime import datetime

 from util import (
    domain,
    base_url,
-    get_str_between,
+    str_between,
    get_link_type,
 )


-def parse_export(path):
-    """parse a list of links dictionaries from a bookmark export file"""
-    
-    links = []
-    with open(path, 'r', encoding='utf-8') as file:
-        for service, parser_func in get_parsers().items():
-            # otherwise try all parsers until one works
-            try:
-                links += list(parser_func(file))
-                if links:
-                    break
-            except Exception as e:
-                pass
+def get_parsers(file):
+    """return all parsers that work on a given file, defaults to all of them"""

-    return links
-
-def get_parsers():
    return {
        'pocket': parse_pocket_export,
        'pinboard': parse_json_export,
@ -34,12 +38,32 @@ def get_parsers():
        'rss': parse_rss_export,
    }

+def parse_links(path):
+    """parse a list of links dictionaries from a bookmark export file"""
+    
+    links = []
+    with open(path, 'r', encoding='utf-8') as file:
+        for parser_func in get_parsers(file).values():
+            # try each parser in turn, stopping at the first one that returns links
+            try:
+                links += list(parser_func(file))
+                if links:
+                    break
+            except (ValueError, TypeError):
+                # parser not supported on this file
+                pass
+
+    return links
+
+
 def parse_pocket_export(html_file):
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

    html_file.seek(0)
-    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)   # see sample input in ./example_ril_export.html
+    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
    for line in html_file:
+        # example line
+        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
        match = pattern.search(line)
        if match:
            fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '')           # remove old readability prefixes to get original url
@ -62,6 +86,8 @@ def parse_json_export(json_file):
    json_file.seek(0)
    json_content = json.load(json_file)
    for line in json_content:
+        # example line
+        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}
        if line:
            erg = line
            time = datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')
@ -96,11 +122,12 @@ def parse_rss_export(rss_file):
        leading_removed = trailing_removed.split('<item>', 1)[-1]
        rows = leading_removed.split('\n')

-        row = lambda key: [r for r in rows if r.startswith('<{}>'.format(key))][0]
+        def get_row(key):
+            return [r for r in rows if r.startswith('<{}>'.format(key))][0]

-        title = get_str_between(row('title'), '<![CDATA[', ']]')
-        url = get_str_between(row('link'), '<link>', '</link>')
-        ts_str = get_str_between(row('pubDate'), '<pubDate>', '</pubDate>')
+        title = str_between(get_row('title'), '<![CDATA[', ']]')
+        url = str_between(get_row('link'), '<link>', '</link>')
+        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")

        info = {
@ -112,17 +139,20 @@ def parse_rss_export(rss_file):
            'title': title,
            'sources': [rss_file.name],
        }
-
        info['type'] = get_link_type(info)
-        # import ipdb; ipdb.set_trace()
+
        yield info

 def parse_bookmarks_export(html_file):
    """Parse netscape-format bookmarks export files (produced by all browsers)"""

+
    html_file.seek(0)
    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
    for line in html_file:
+        # example line
+        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
+        
        match = pattern.search(line)
        if match:
            url = match.group(1)
@ -137,6 +167,6 @@ def parse_bookmarks_export(html_file):
                'title': match.group(3),
                'sources': [html_file.name],
            }
-
            info['type'] = get_link_type(info)
+
            yield info
--- a/templates/index.html
+++ b/templates/index.html
@ -68,7 +68,7 @@
                <img src="https://nicksweeting.com/images/archive.png" height="36px">
                Archived Sites <img src="https://getpocket.com/favicon.ico" height="36px"> <br/>
                <small>
-                    Archived with: <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a> on $date_updated
+                    <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a>
                </small>
            </h1>
        </header>
--- a/templates/index_row.html
+++ b/templates/index_row.html
@ -4,7 +4,7 @@
        <img src="$favicon_url">
        $title <small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
    </td>
-    <td style="text-align:center"><a href="$files_url/index.html" title="Files">📂</a></td>
+    <td style="text-align:center"><a href="$files_url" title="Files">📂</a></td>
    <td style="text-align:center"><a href="$pdf_link" title="PDF">📄</a></td>
    <td style="text-align:center"><a href="$screenshot_link" title="Screenshot">🖼</a></td>
    <td style="text-align:center"><a href="$archive_org_url" title="Archive.org">🏛</a></td>
--- a/templates/link_index.html
+++ b/templates/link_index.html
@ -140,7 +140,7 @@
                <a href="#" class="collapse-icon" title="Collapse Navbar">
                    [-]
                </a>
-                <a href="../../../index.html" class="nav-icon" title="Archived Sites">
+                <a href="./../../index.html" class="nav-icon" title="Archived Sites">
                    <img src="https://nicksweeting.com/images/archive.png" alt="Archive Icon">
                </a>
                $title<br/>
@ -221,6 +221,7 @@
    </body>

    <script>
+        // show selected file in iframe when preview card is clicked
        jQuery('.card').on('click', function(e) {
            jQuery('.selected-card').removeClass('selected-card')
            jQuery(e.target).closest('.card').addClass('selected-card')
@ -233,12 +234,16 @@
            }
            return true
        })
+
+        // un-sandbox iframes showing pdfs (required to display pdf viewer)
        jQuery('iframe').map(function() {
            if (this.src.endsWith('.pdf')) {
                this.removeAttribute('sandbox')
                this.src = this.src
            }
        })
+
+        // hide header when collapse icon is clicked
        jQuery('.collapse-icon').on('click', function() {
            if (jQuery('.collapse-icon').text().includes('[-]')) {
                jQuery('.collapse-icon').text('[+]')
@ -251,6 +256,8 @@
            }
            return true
        })
+
+        // hide all preview iframes on small screens
        if (window.innerWidth < 1091) {
            jQuery('.card a[target=preview]').attr('target', '_self')
        }
--- a/util.py
+++ b/util.py
@ -1,6 +1,8 @@
 import os
+import re
 import sys
 import time
+import json
 import requests

 from datetime import datetime
@ -24,6 +26,17 @@ from config import (
    SUBMIT_ARCHIVE_DOT_ORG,
 )

+# URL helpers
+without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
+without_query = lambda url: url.split('?', 1)[0]
+without_hash = lambda url: url.split('#', 1)[0]
+without_path = lambda url: url.split('/', 1)[0]
+domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
+base_url = lambda url: without_query(without_scheme(url))
+
+short_ts = lambda ts: ts.split('.')[0]
+
+
 def check_dependencies():
    """Check that all necessary dependencies are installed, and have valid versions"""

@ -149,11 +162,15 @@ def progress(seconds=TIMEOUT, prefix=''):


 def download_url(url):
-    if not os.path.exists(os.path.join(ARCHIVE_DIR, 'downloads')):
-        os.makedirs(os.path.join(ARCHIVE_DIR, 'downloads'))
+    """download a given url's content into downloads/domain.txt"""
+
+    download_dir = os.path.join(ARCHIVE_DIR, 'downloads')
+
+    if not os.path.exists(download_dir):
+        os.makedirs(download_dir)

    url_domain = url.split('/', 3)[2]
-    output_path = os.path.join(ARCHIVE_DIR, 'downloads', '{}.txt'.format(url_domain))
+    output_path = os.path.join(download_dir, '{}.txt'.format(url_domain))
    
    print('[*] [{}] Downloading {} > {}'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@ -172,10 +189,10 @@ def download_url(url):

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(downloaded_xml)
+
    return output_path

-
-def get_str_between(string, start, end=None):
+def str_between(string, start, end=None):
    """(<abc>12345</def>, <abc>, </def>)  ->  12345"""

    content = string.split(start, 1)[-1]
@ -184,9 +201,6 @@ def get_str_between(string, start, end=None):

    return content

-
-
-
 def get_link_type(link):
    """Certain types of links need to be handled specially, this figures out when that's the case"""

@ -207,10 +221,130 @@ def get_link_type(link):
    return None


-# URL helpers
-without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
-without_query = lambda url: url.split('?', 1)[0]
-without_hash = lambda url: url.split('#', 1)[0] 
-without_path = lambda url: url.split('/', 1)[0]
-domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
-base_url = lambda url: without_query(without_scheme(url))
+def find_link(folder, links):
+    """for a given archive folder, find the corresponding link object in links"""
+    url = parse_url(folder)
+    if url:
+        for link in links:
+            if (link['base_url'] in url) or (url in link['url']):
+                return link
+
+    timestamp = folder.split('.')[0]
+    for link in links:
+        if link['timestamp'].startswith(timestamp):
+            if link['domain'] in os.listdir('./html/archive/' + folder):
+                return link      # careful now, this isn't safe for most ppl
+            if link['domain'] in parse_url(folder):
+                return link
+    return None
+
+
+def parse_url(folder):
+    """for a given archive folder, figure out what url it's for"""
+    link_json = os.path.join('./html/archive/' + folder, 'index.json')
+    if os.path.exists(link_json):
+        with open(link_json, 'r') as f:
+            link = json.load(f)
+            return link['base_url']
+
+    archive_org_txt = os.path.join('./html/archive/' + folder, 'archive.org.txt')
+    if os.path.exists(archive_org_txt):
+        with open(archive_org_txt, 'r') as f:
+            original_link = f.read().strip().split('/http', 1)[-1]
+            with_scheme = 'http{}'.format(original_link)
+            return with_scheme
+
+    return ''
+
+
+def merge_folders(folder, link):
+    """given a folder, merge it to the canonical 'correct' path for the given link object"""
+    base_url = parse_url(folder)
+    if not (base_url in link['base_url']
+            or link['base_url'] in base_url):
+        print(base_url, link['base_url'])
+        assert False
+    print('{} > {}'.format(folder, link['timestamp']))
+
+
+def cleanup_archive(path, links):
+    """move any incorrectly named folders to their canonical locations"""
+    
+    # for each folder that exists, see if we can match it up with a known good link
+    # if we can, then merge the two folders, if not, move it to lost & found
+
+    # for each timestamp, find similar timestamped folders
+    # check each folder for a "domain.com" folder or a url recorded in its index.json / archive.org.txt
+
+    unmatched = []
+
+    for folder in os.listdir(path):
+        link = find_link(folder, links)
+        if link is None:
+            unmatched.append(folder)
+            continue
+        
+        if folder != link['timestamp']:
+            merge_folders(folder, link)
+
+    if unmatched:
+        print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))
+        print('\n    '.join(unmatched))
+
+
+def html_appended_url(link):
+    """calculate the path to the wgetted .html file, since wget may
+    adjust some paths to be different than the base_url path.
+
+    See docs on wget --adjust-extension.
+    """
+
+    if link['type'] in ('PDF', 'image'):
+        return link['base_url']
+
+    split_url = link['url'].split('#', 1)
+    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
+
+    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
+        # already ends in .html
+        return link['base_url']
+    else:
+        # .html needs to be appended
+        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
+        if without_scheme.endswith('/'):
+            if query:
+                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
+            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
+        else:
+            if query:
+                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
+            elif '/' in without_scheme:
+                return '#'.join([without_scheme + '.html', *split_url[1:]])
+            return link['base_url'] + '/index.html'
+
+
+def derived_link_info(link):
+    """extend link info with the archive urls and other derived data"""
+
+    link_info = {
+        **link,
+        'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
+        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
+        'favicon_url': './archive/{timestamp}/favicon.ico'.format(**link),
+        'files_url': './archive/{timestamp}/index.html'.format(**link),
+        'archive_url': './archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
+        'pdf_link': './archive/{timestamp}/output.pdf'.format(**link),
+        'screenshot_link': './archive/{timestamp}/screenshot.png'.format(**link),
+        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
+    }
+
+    # PDF and images are handled slightly differently
+    # wget, screenshot, & pdf urls all point to the same file
+    if link['type'] in ('PDF', 'image'):
+        link_info.update({
+            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
+            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'title': '{title} ({type})'.format(**link),
+        })
+    return link_info