better link corruption guards, remove title prefetching, save index after run

Nick Sweeting 2019-02-21 17:45:28 -05:00
parent c95632883e
commit b03e9fade8
6 changed files with 165 additions and 93 deletions
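For orientation: the link records that the new corruption guards protect are plain Python dicts. The fields visible in the hunks below suggest roughly the following shape (a reconstruction for illustration only; the full schema lives in the links.py module docstring, which is truncated in this view):

    # approximate link record, reconstructed from the parser and index hunks below
    example_link = {
        'url': 'https://example.com/some/page',
        'base_url': 'example.com/some/page',      # result of base_url(url)
        'timestamp': '1550000000.0',
        'title': 'Some Page Title',                # may be None now that titles are no longer prefetched
        'tags': '',
        'sources': ['output/sources/bookmarks_export.html'],
        'type': None,                              # filled in by get_link_type(info)
        'latest': {},                              # per-method results, e.g. link['latest']['wget']
    }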

archive.py (View file)

@@ -7,34 +7,31 @@ import os
import sys
from datetime import datetime
from subprocess import run
from peekable import Peekable
from parse import parse_links
from links import validate_links
from archive_methods import archive_links, _RESULTS_TOTALS
from links import validate_links, links_after_timestamp
from archive_methods import archive_link, _RESULTS_TOTALS
from index import (
write_links_index,
write_link_index,
parse_json_links_index,
parse_json_link_index,
)
from config import (
ARCHIVE_DIR,
ONLY_NEW,
OUTPUT_PERMISSIONS,
OUTPUT_DIR,
REPO_DIR,
ANSI,
TIMEOUT,
SHOW_PROGRESS,
GIT_SHA,
)
from util import (
check_dependencies,
download_url,
save_source,
progress,
cleanup_archive,
pretty_path,
migrate_data,
check_links_structure,
)
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
@@ -42,6 +39,7 @@ __VERSION__ = GIT_SHA
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
def print_help():
print(__DESCRIPTION__)
print("Documentation: {}\n".format(__DOCUMENTATION__))
@@ -55,21 +53,22 @@ def print_help():
def load_links(archive_path=OUTPUT_DIR, import_path=None):
"""get new links from file and optionally append them to links in existing archive"""
existing_links = []
if archive_path:
existing_links = parse_json_links_index(archive_path)
check_links_structure(existing_links)
new_links = []
if import_path:
# parse and validate the import file
raw_links, parser_name = parse_links(import_path)
new_links = validate_links(raw_links)
if SHOW_PROGRESS:
print()
check_links_structure(new_links)
# merge existing links in archive_path and new links
all_links = validate_links(existing_links + new_links)
check_links_structure(all_links)
num_new_links = len(all_links) - len(existing_links)
if import_path and parser_name:
@@ -81,6 +80,7 @@ def load_links(archive_path=OUTPUT_DIR, import_path=None):
return all_links, new_links
def update_archive(archive_path, links, source=None, resume=None, append=True):
"""update or create index.html+json given a path to an export file containing new links"""
@@ -99,8 +99,38 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
**ANSI,
))
check_links_structure(links)
# prefetch the first link off the generator so that if we pause or fail
# immediately we can show that we paused on the first link and not just None
to_archive = Peekable(links_after_timestamp(links, resume))
idx, link = 0, to_archive.peek(0)
# loop over links and archive them
archive_links(archive_path, links, source=source, resume=resume)
try:
check_dependencies()
for idx, link in enumerate(to_archive):
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
archive_link(link_dir, link)
except (KeyboardInterrupt, SystemExit, Exception) as e:
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
**ANSI,
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=link['timestamp'],
total=len(links),
))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
print(' Continue where you left off by running:')
print(' {} {}'.format(
pretty_path(sys.argv[0]),
link['timestamp'],
))
if not isinstance(e, KeyboardInterrupt):
print()
raise e
raise SystemExit(1)
# print timing information & summary
end_ts = datetime.now().timestamp()
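Note: Peekable here comes from a small local peekable module that is not part of this diff; update_archive() only relies on iterating it and on peek(0) returning the first pending link (or the given default when the generator is empty). A minimal sketch of a wrapper with that interface, offered as an assumption rather than the project's actual implementation:

    class Peekable:
        """Wrap an iterator so the next item can be read without consuming it."""
        def __init__(self, iterable):
            self.iterator = iter(iterable)
            self.buffer = []  # holds at most one peeked-but-unconsumed item

        def peek(self, default=None):
            """Return the upcoming item without advancing, or default if exhausted."""
            if not self.buffer:
                try:
                    self.buffer.append(next(self.iterator))
                except StopIteration:
                    return default
            return self.buffer[0]

        def __iter__(self):
            return self

        def __next__(self):
            if self.buffer:
                return self.buffer.pop(0)
            return next(self.iterator)

Under this sketch, peek(0) yields the first pending link (or 0 when the resume point leaves nothing to archive), which is what lets the except block above report a timestamp even when a failure happens before the loop body runs.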
@@ -135,7 +165,7 @@ if __name__ == '__main__':
source = sys.argv[1] if argc > 1 else None # path of links file to import
resume = sys.argv[2] if argc > 2 else None # timestamp to resume downloading from
stdin_raw_text = []
stdin_raw_text = ''
if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read()
@@ -192,3 +222,7 @@ if __name__ == '__main__':
update_archive(out_dir, new_links, source=source, resume=resume, append=True)
else:
update_archive(out_dir, all_links, source=source, resume=resume, append=True)
# Step 5: Re-write links index with updated titles, icons, and resources
all_links, _ = load_links(archive_path=out_dir)
write_links_index(out_dir=out_dir, links=all_links)

archive_methods.py (View file)

@@ -1,16 +1,17 @@
import os
import re
import sys
from functools import wraps
from collections import defaultdict
from datetime import datetime
from peekable import Peekable
from index import wget_output_path, parse_json_link_index, write_link_index
from links import links_after_timestamp
from index import (
wget_output_path,
parse_json_link_index,
write_link_index,
patch_index_title_hack,
)
from config import (
OUTPUT_DIR,
CURL_BINARY,
GIT_BINARY,
WGET_BINARY,
@@ -42,12 +43,12 @@ from config import (
)
from util import (
without_fragment,
check_dependencies,
fetch_page_title,
progress,
chmod_file,
pretty_path,
run, PIPE, DEVNULL
check_link_structure,
run, PIPE, DEVNULL,
)
@@ -57,38 +58,12 @@ _RESULTS_TOTALS = { # globals are bad, mmkay
'failed': 0,
}
def archive_links(archive_path, links, source=None, resume=None):
check_dependencies()
to_archive = Peekable(links_after_timestamp(links, resume))
idx, link = 0, to_archive.peek(0)
try:
for idx, link in enumerate(to_archive):
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
archive_link(link_dir, link)
except (KeyboardInterrupt, SystemExit, Exception) as e:
print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
**ANSI,
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=link['timestamp'],
total=len(links),
))
print(' Continue where you left off by running:')
print(' {} {}'.format(
pretty_path(sys.argv[0]),
link['timestamp'],
))
if not isinstance(e, KeyboardInterrupt):
raise e
raise SystemExit(1)
def archive_link(link_dir, link, overwrite=True):
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
check_link_structure(link)
try:
update_existing = os.path.exists(link_dir)
if update_existing:
@@ -99,7 +74,7 @@ def archive_link(link_dir, link, overwrite=True):
else:
os.makedirs(link_dir)
log_link_archive(link_dir, link, update_existing)
print_link_status_line(link_dir, link, update_existing)
if FETCH_FAVICON:
link = fetch_favicon(link_dir, link, overwrite=overwrite)
@@ -135,7 +110,7 @@ def archive_link(link_dir, link, overwrite=True):
return link
def log_link_archive(link_dir, link, update_existing):
def print_link_status_line(link_dir, link, update_existing):
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
symbol='*' if update_existing else '+',
symbol_color=ANSI['black' if update_existing else 'green'],
@@ -518,7 +493,7 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
# if link already has valid title, skip it
if link['title'] and not link['title'].lower().startswith('http'):
return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
return {'output': link['title'], 'status': 'skipped'}
end = progress(timeout, prefix=' ')
try:
@@ -530,6 +505,13 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
output = e
# titles should show up in the global index immediately for better UX,
# do a hacky immediate replacement to add them in as we're archiving
# TODO: figure out how to do this without gnarly string replacement
if title:
link['title'] = title
patch_index_title_hack(link['url'], title)
return {
'cmd': 'fetch_page_title("{}")'.format(link['url']),
'output': output,

index.py (View file)

@@ -6,6 +6,7 @@ from string import Template
from distutils.dir_util import copy_tree
from config import (
OUTPUT_DIR,
TEMPLATES_DIR,
OUTPUT_PERMISSIONS,
ANSI,
@@ -17,6 +18,8 @@ from util import (
wget_output_path,
derived_link_info,
pretty_path,
check_link_structure,
check_links_structure,
)
@@ -25,6 +28,8 @@ from util import (
def write_links_index(out_dir, links):
"""create index.html file for a given list of links"""
check_links_structure(links)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
@@ -42,6 +47,8 @@ def write_links_index(out_dir, links):
def write_json_links_index(out_dir, links):
"""write the json link index to a given path"""
check_links_structure(links)
path = os.path.join(out_dir, 'index.json')
index_json = {
@@ -63,13 +70,17 @@ def parse_json_links_index(out_dir):
index_path = os.path.join(out_dir, 'index.json')
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
return json.load(f)['links']
links = json.load(f)['links']
check_links_structure(links)
return links
return []
def write_html_links_index(out_dir, links):
"""write the html link index to a given path"""
check_links_structure(links)
path = os.path.join(out_dir, 'index.html')
copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))
@@ -104,6 +115,25 @@ def write_html_links_index(out_dir, links):
chmod_file(path)
def patch_index_title_hack(link_url, new_title):
"""hack to update just one link's title in the link index json"""
json_path = os.path.join(OUTPUT_DIR, 'index.json')
links = parse_json_links_index(OUTPUT_DIR)
changed = False
for link in links:
if link['url'] == link_url:
link['title'] = new_title
changed = True
break
if changed:
write_json_links_index(OUTPUT_DIR, links)
### Individual link index
def write_link_index(out_dir, link):
@@ -114,6 +144,7 @@ def write_link_index(out_dir, link):
def write_json_link_index(out_dir, link):
"""write a json file with some info about the link"""
check_link_structure(link)
path = os.path.join(out_dir, 'index.json')
print(' √ index.json')
@@ -128,10 +159,13 @@ def parse_json_link_index(out_dir):
existing_index = os.path.join(out_dir, 'index.json')
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
return json.load(f)
link_json = json.load(f)
check_link_structure(link_json)
return link_json
return {}
def write_html_link_index(out_dir, link):
check_link_structure(link)
with open(os.path.join(TEMPLATES_DIR, 'link_index_fancy.html'), 'r', encoding='utf-8') as f:
link_html = f.read()

links.py (View file)

@@ -32,34 +32,33 @@ Link {
"""
import datetime
from html import unescape
from collections import OrderedDict
from util import (
domain,
base_url,
str_between,
get_link_type,
merge_links,
wget_output_path,
check_link_structure,
check_links_structure,
)
from config import ANSI
def validate_links(links):
check_links_structure(links)
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
links = sorted_links(links) # deterministically sort the links based on timestamp, url
if not links:
print('[X] No links found :(')
raise SystemExit(1)
for link in links:
check_link_structure(link)
link['title'] = unescape(link['title']) if link['title'] else None
link['latest'] = link.get('latest') or {}
latest = link['latest']
if not link['latest'].get('wget'):
link['latest']['wget'] = wget_output_path(link)
@@ -81,14 +80,16 @@ def validate_links(links):
return list(links)
def archivable_links(links):
"""remove chrome://, about:// or other schemed links that cant be archived"""
return (
link
for link in links
if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://'))
)
def uniquefied_links(sorted_links):
"""
ensures that all non-duplicate links have monotonically increasing timestamps
@@ -114,10 +115,12 @@ def uniquefied_links(sorted_links):
return unique_timestamps.values()
def sorted_links(links):
sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
return sorted(links, key=sort_func, reverse=True)
def links_after_timestamp(links, timestamp=None):
if not timestamp:
yield from links
@@ -130,6 +133,7 @@ def links_after_timestamp(links, timestamp=None):
except (ValueError, TypeError):
print('Resume value and all timestamp values must be valid numbers.')
def lowest_uniq_timestamp(used_timestamps, timestamp):
"""resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""

parse.py (View file)

@@ -20,7 +20,6 @@ Parsed link schema: {
import re
import sys
import json
import urllib
from collections import OrderedDict
import xml.etree.ElementTree as etree
@@ -32,7 +31,6 @@ from util import (
base_url,
str_between,
get_link_type,
fetch_page_title,
URL_REGEX,
)
@@ -56,13 +54,11 @@ def parse_links(path):
links = []
with open(path, 'r', encoding='utf-8') as file:
print('{green}[*] [{}] Parsing new links from output/sources/{} and fetching titles...{reset}'.format(
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
path.rsplit('/', 1)[-1],
**ANSI,
))
if SHOW_PROGRESS:
sys.stdout.write(' ')
for parser_name, parser_func in get_parsers(file).items():
# otherwise try all parsers until one works
@@ -98,7 +94,7 @@ def parse_pocket_html_export(html_file):
'base_url': base_url(fixed_url),
'timestamp': str(time.timestamp()),
'tags': match.group(3),
'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or fetch_page_title(fixed_url),
'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
'sources': [html_file.name],
}
info['type'] = get_link_type(info)
@@ -135,7 +131,7 @@ def parse_pinboard_json_export(json_file):
'base_url': base_url(url),
'timestamp': timestamp,
'tags': erg.get('tags') or '',
'title': title or fetch_page_title(url),
'title': title or None,
'sources': [json_file.name],
}
info['type'] = get_link_type(info)
@@ -174,7 +170,7 @@ def parse_rss_export(rss_file):
'base_url': base_url(url),
'timestamp': str(time.timestamp()),
'tags': '',
'title': title or fetch_page_title(url),
'title': title or None,
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
@@ -215,7 +211,7 @@ def parse_shaarli_rss_export(rss_file):
'base_url': base_url(url),
'timestamp': str(time.timestamp()),
'tags': '',
'title': title or fetch_page_title(url),
'title': title or None,
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
@@ -242,7 +238,7 @@ def parse_netscape_html_export(html_file):
'base_url': base_url(url),
'timestamp': str(time.timestamp()),
'tags': "",
'title': match.group(3).strip() or fetch_page_title(url),
'title': match.group(3).strip() or None,
'sources': [html_file.name],
}
info['type'] = get_link_type(info)
@@ -275,7 +271,7 @@ def parse_pinboard_rss_export(rss_file):
'base_url': base_url(url),
'timestamp': str(time.timestamp()),
'tags': tags,
'title': title or fetch_page_title(url),
'title': title or None,
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
@@ -300,7 +296,7 @@ def parse_medium_rss_export(rss_file):
'base_url': base_url(url),
'timestamp': str(time.timestamp()),
'tags': '',
'title': title or fetch_page_title(url),
'title': title or None,
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
@@ -324,7 +320,7 @@ def parse_plain_text_export(text_file):
'base_url': base_url(url),
'timestamp': str(datetime.now().timestamp()),
'tags': '',
'title': fetch_page_title(url),
'title': None,
'sources': [text_file.name],
}
info['type'] = get_link_type(info)

util.py (View file)

@@ -3,8 +3,7 @@ import re
import sys
import time
import json
import signal
from urllib.request import urlopen
from urllib.request import Request, urlopen
from urllib.parse import urlparse
from decimal import Decimal
@@ -25,6 +24,7 @@ from config import (
TIMEOUT,
SHOW_PROGRESS,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CURL_BINARY,
WGET_BINARY,
CHROME_BINARY,
@@ -219,7 +219,21 @@ def save_source(raw_text):
return source_path
def download_url(url):
def fetch_page_content(url, timeout=TIMEOUT):
req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
if CHECK_SSL_VALIDITY:
resp = urlopen(req, timeout=timeout)
else:
import ssl
insecure = ssl._create_unverified_context()
resp = urlopen(req, timeout=timeout, context=insecure)
encoding = resp.headers.get_content_charset() or 'utf-8'
return resp.read().decode(encoding)
def download_url(url, timeout=TIMEOUT):
"""download a given url's content into downloads/domain.txt"""
if not os.path.exists(SOURCES_DIR):
@@ -236,7 +250,7 @@ def download_url(url):
))
end = progress(TIMEOUT, prefix=' ')
try:
downloaded_xml = urlopen(url).read().decode('utf-8')
downloaded_xml = fetch_page_content(url, timeout=timeout)
end()
except Exception as e:
end()
@@ -260,19 +274,15 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
sys.stdout.write('.')
sys.stdout.flush()
if CHECK_SSL_VALIDITY:
html_content = urlopen(url, timeout=timeout)
else:
try:
import ssl
insecure = ssl._create_unverified_context()
html_content = urlopen(url, timeout=timeout, context=insecure)
except ImportError:
html_content = urlopen(url, timeout=timeout)
html = fetch_page_content(url, timeout=timeout)
match = re.search(HTML_TITLE_REGEX, html_content.read().decode('utf-8'))
match = re.search(HTML_TITLE_REGEX, html)
return match.group(1).strip() if match else None
except Exception:
except Exception as err:
# print('[!] Failed to fetch title because of {}: {}'.format(
# err.__class__.__name__,
# err,
# ))
return None
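Both download_url() and fetch_page_title() now route through the shared fetch_page_content() helper added above, so the WGET_USER_AGENT header and the CHECK_SSL_VALIDITY toggle apply to every raw fetch. A hedged usage sketch (the URL is illustrative):

    # fetch a page body with the archive's configured user agent;
    # an unverified SSL context is used only when CHECK_SSL_VALIDITY is disabled
    html = fetch_page_content('https://example.com', timeout=30)
    print(html[:80])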
@@ -603,3 +613,15 @@ def run(*popenargs, input=None, capture_output=False, timeout=None, check=False,
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
def check_link_structure(link):
assert isinstance(link, dict)
assert isinstance(link.get('url'), str)
assert len(link['url']) > 2
def check_links_structure(links):
assert isinstance(links, list)
if links:
check_link_structure(links[0])
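The guards above are deliberately cheap: they assert that an index is a list of dicts with a plausible 'url' string rather than validating the full schema, and check_links_structure only inspects the first entry, so the check stays constant-time on large indexes. A small usage sketch of what they catch (illustrative only):

    # a well-formed index passes silently
    check_links_structure([{'url': 'https://example.com', 'timestamp': '1550000000'}])

    # a corrupted index (a bare string where a link dict should be) fails fast
    try:
        check_links_structure(['https://example.com'])
    except AssertionError:
        print('refusing to write a malformed links index')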