working runtime type casting and enforcement for a wide range of types

Nick Sweeting 2019-03-26 22:26:21 -04:00
parent 0d8a076c1f
commit ab09560f14
3 changed files with 162 additions and 61 deletions

index.py

@@ -1,7 +1,6 @@
 import os
 import json
-from itertools import chain
 from datetime import datetime
 from string import Template
 from typing import List, Tuple, Iterator, Optional
@@ -20,13 +19,13 @@ from config import (
     FOOTER_INFO,
 )
 from util import (
+    merge_links,
     chmod_file,
     urlencode,
     derived_link_info,
     wget_output_path,
     ExtendedEncoder,
-    check_link_structure,
-    check_links_structure,
+    enforce_types,
 )
 from parse import parse_links
 from links import validate_links
@@ -43,6 +42,7 @@ TITLE_LOADING_MSG = 'Not yet archived...'

 ### Homepage index for all the links

+@enforce_types
 def write_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     """create index.html file for a given list of links"""
@@ -55,8 +55,9 @@ def write_links_index(out_dir: str, links: List[Link], finished: bool=False) ->
     log_indexing_started(out_dir, 'index.html')
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')

+@enforce_types
 def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
     """parse and load existing index with any new links from import_path merged in"""
@@ -81,6 +82,7 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[Li
     return all_links, new_links

+@enforce_types
 def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     """write the json link index to a given path"""
@@ -114,6 +116,7 @@ def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     chmod_file(path)

+@enforce_types
 def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     """parse an archive index json file and return the list of links"""
@@ -121,13 +124,13 @@ def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
             links = json.load(f)['links']
-            check_links_structure(links)
             for link in links:
                 yield Link(**link)

     return ()

+@enforce_types
 def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     """write the html link index to a given path"""
@@ -151,6 +154,7 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
             link.title
             or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
         ),
+        'tags': link.tags or '',
         'favicon_url': (
             os.path.join('archive', link.timestamp, 'favicon.ico')
             # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
@@ -179,6 +183,7 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
     chmod_file(path)

+@enforce_types
 def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
     """hack to in-place update one row's info in the generated index html"""
@@ -218,11 +223,13 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:

 ### Individual link index

+@enforce_types
 def write_link_index(out_dir: str, link: Link) -> None:
     write_json_link_index(out_dir, link)
     write_html_link_index(out_dir, link)

+@enforce_types
 def write_json_link_index(out_dir: str, link: Link) -> None:
     """write a json file with some info about the link"""
@@ -234,29 +241,29 @@ def write_json_link_index(out_dir: str, link: Link) -> None:
     chmod_file(path)

+@enforce_types
 def parse_json_link_index(out_dir: str) -> Optional[Link]:
     """load the json link index from a given directory"""
     existing_index = os.path.join(out_dir, 'index.json')
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
             link_json = json.load(f)
-            check_link_structure(link_json)
             return Link(**link_json)
     return None

+@enforce_types
 def load_json_link_index(out_dir: str, link: Link) -> Link:
     """check for an existing link archive in the given directory,
     and load+merge it into the given link dict
     """
     existing_link = parse_json_link_index(out_dir)
-    existing_link = existing_link._asdict() if existing_link else {}
-    new_link = link._asdict()
-    return Link(**{**existing_link, **new_link})
+    if existing_link:
+        return merge_links(existing_link, link)
+    return link

+@enforce_types
 def write_html_link_index(out_dir: str, link: Link) -> None:
     with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
         link_html = f.read()
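
Every write/load function in this file is now wrapped with the new @enforce_types decorator (defined in util.py further down in this commit), so a badly-typed argument fails fast at the call boundary instead of deep inside the archiving pipeline. A minimal sketch of the intended behavior, using a hypothetical toy function rather than one from this file:

    from typing import List
    from util import enforce_types  # as imported at the top of this file

    @enforce_types
    def write_example_index(out_dir: str, links: List[str]) -> None:
        print('writing', len(links), 'links to', out_dir)

    write_example_index('/tmp/archive', ['https://example.com'])  # passes
    write_example_index(42, ['https://example.com'])
    # TypeError: write_example_index(..., out_dir: str) got unexpected int argument out_dir=42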

schema.py

@@ -39,15 +39,24 @@ class Link:
     tags: Optional[str]
     sources: List[str]
     history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
-    updated: Optional[str] = None
+    updated: Optional[datetime] = None

-    def __hash__(self):
-        return self.urlhash
+    def __post_init__(self):
+        """fix any history result items to be type-checked ArchiveResults"""
+        cast_history = {}
+        for method, method_history in self.history.items():
+            cast_history[method] = []
+            for result in method_history:
+                if isinstance(result, dict):
+                    result = ArchiveResult(**result)
+                cast_history[method].append(result)
+
+        object.__setattr__(self, 'history', cast_history)

     def __eq__(self, other):
         if not isinstance(other, Link):
             return NotImplemented
-        return self.urlhash == other.urlhash
+        return self.url == other.url

     def __gt__(self, other):
         if not isinstance(other, Link):
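
Link's history is loaded from index.json as plain dicts, and the use of object.__setattr__ rather than normal assignment suggests the dataclass is declared immutable (frozen=True). __post_init__ therefore re-casts those raw dicts into ArchiveResult instances once, at construction time, so every consumer sees typed objects. A stripped-down sketch of the same pattern, using hypothetical stand-in classes:

    from dataclasses import dataclass, field
    from typing import Dict, List

    @dataclass(frozen=True)
    class Result:  # stand-in for ArchiveResult
        cmd: List[str]
        status: str

    @dataclass(frozen=True)
    class Page:  # stand-in for Link
        url: str
        history: Dict[str, List] = field(default_factory=dict)

        def __post_init__(self):
            # cast raw dicts (e.g. fresh from json.load) into Result objects
            cast = {
                method: [Result(**r) if isinstance(r, dict) else r for r in results]
                for method, results in self.history.items()
            }
            object.__setattr__(self, 'history', cast)  # bypass frozen=True

    page = Page('https://example.com', {'wget': [{'cmd': ['wget'], 'status': 'succeeded'}]})
    assert isinstance(page.history['wget'][0], Result)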

util.py

@@ -4,7 +4,9 @@ import sys
 import time
 from json import JSONEncoder
-from typing import List, Optional, Iterable
+from typing import List, Optional, Any
+from inspect import signature, _empty
+from functools import wraps
 from hashlib import sha256
 from urllib.request import Request, urlopen
 from urllib.parse import urlparse, quote, unquote
@@ -22,7 +24,7 @@ from subprocess import (
 from base32_crockford import encode as base32_encode
-from schema import Link, LinkDict, ArchiveResult
+from schema import Link
 from config import (
     ANSI,
     TERM_WIDTH,
@@ -55,26 +57,13 @@ fragment = lambda url: urlparse(url).fragment
 extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
 base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

 without_www = lambda url: url.replace('://www.', '://', 1)
 without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
 fuzzy_url = lambda url: without_trailing_slash(without_www(without_scheme(url.lower())))

-short_ts = lambda ts: (
-    str(ts.timestamp()).split('.')[0]
-    if isinstance(ts, datetime) else
-    str(ts).split('.')[0]
-)
-ts_to_date = lambda ts: (
-    ts.strftime('%Y-%m-%d %H:%M')
-    if isinstance(ts, datetime) else
-    datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d %H:%M')
-)
-ts_to_iso = lambda ts: (
-    ts.isoformat()
-    if isinstance(ts, datetime) else
-    datetime.fromtimestamp(float(ts)).isoformat()
-)
+short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
+ts_to_date = lambda ts: parse_date(ts).strftime('%Y-%m-%d %H:%M')
+ts_to_iso = lambda ts: parse_date(ts).isoformat()

 urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
 urldecode = lambda s: s and unquote(s)
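
The three timestamp helpers collapse into one-liners because all of the isinstance branching moves into the new parse_date function added later in this diff. Assuming those definitions, expected behavior looks like this (return values are illustrative; ts_to_date depends on the local timezone):

    short_ts('1553650000.123')     # -> '1553650000'
    ts_to_date(1553650000)         # -> e.g. '2019-03-26 21:26' (local timezone)
    ts_to_iso('2019-03-26 22:26')  # -> '2019-03-26T22:26:00'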
@@ -122,23 +111,46 @@ STATICFILE_EXTENSIONS = {

 ### Checks & Tests

-def check_link_structure(link: LinkDict) -> None:
-    """basic sanity check invariants to make sure the data is valid"""
-    assert isinstance(link, dict)
-    assert isinstance(link.get('url'), str)
-    assert len(link['url']) > 2
-    assert len(re.findall(URL_REGEX, link['url'])) == 1
-    if 'history' in link:
-        assert isinstance(link['history'], dict), 'history must be a Dict'
-        for key, val in link['history'].items():
-            assert isinstance(key, str)
-            assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])
-
-def check_links_structure(links: Iterable[LinkDict]) -> None:
-    """basic sanity check invariants to make sure the data is valid"""
-    assert isinstance(links, list)
-    if links:
-        check_link_structure(links[0])
+def enforce_types(func):
+    """
+    Checks parameters type signatures against arg and kwarg type hints.
+    """
+
+    @wraps(func)
+    def typechecked_function(*args, **kwargs):
+        sig = signature(func)
+
+        def check_argument_type(arg_key, arg_val):
+            try:
+                annotation = sig.parameters[arg_key].annotation
+            except KeyError:
+                annotation = _empty
+
+            if annotation is not _empty and annotation.__class__ is type:
+                if not isinstance(arg_val, annotation):
+                    raise TypeError(
+                        '{}(..., {}: {}) got unexpected {} argument {}={}'.format(
+                            func.__name__,
+                            arg_key,
+                            annotation.__name__,
+                            type(arg_val).__name__,
+                            arg_key,
+                            arg_val,
+                        )
+                    )
+
+        # check args
+        for arg_val, arg_key in zip(args, sig.parameters):
+            check_argument_type(arg_key, arg_val)
+
+        # check kwargs
+        for arg_key, arg_val in kwargs.items():
+            check_argument_type(arg_key, arg_val)
+
+        return func(*args, **kwargs)
+
+    return typechecked_function

 def check_url_parsing_invariants() -> None:
     """Check that plain text regex URL parsing works as expected"""
@@ -329,25 +341,98 @@ def str_between(string: str, start: str, end: str=None) -> str:

     return content

+def parse_date(date: Any) -> Optional[datetime]:
+    """Parse unix timestamps, iso format, and human-readable strings"""
+    if isinstance(date, datetime):
+        return date
+
+    if date is None:
+        return None
+
+    if isinstance(date, (float, int)):
+        date = str(date)
+
+    if isinstance(date, str):
+        if date.replace('.', '').isdigit():
+            timestamp = float(date)
+
+            EARLIEST_POSSIBLE = 473403600.0   # 1985
+            LATEST_POSSIBLE = 1735707600.0    # 2025
+
+            if EARLIEST_POSSIBLE < timestamp < LATEST_POSSIBLE:
+                # number is seconds
+                return datetime.fromtimestamp(timestamp)
+            elif EARLIEST_POSSIBLE * 1000 < timestamp < LATEST_POSSIBLE * 1000:
+                # number is milliseconds
+                return datetime.fromtimestamp(timestamp / 1000)
+            elif EARLIEST_POSSIBLE * 1000 * 1000 < timestamp < LATEST_POSSIBLE * 1000 * 1000:
+                # number is microseconds
+                return datetime.fromtimestamp(timestamp / (1000 * 1000))
+
+        if '-' in date:
+            try:
+                return datetime.fromisoformat(date)
+            except Exception:
+                try:
+                    return datetime.strptime(date, '%Y-%m-%d %H:%M')
+                except Exception:
+                    pass
+
+    raise ValueError('Tried to parse invalid date! {}'.format(date))

 ### Link Helpers

+@enforce_types
 def merge_links(a: Link, b: Link) -> Link:
     """deterministically merge two links, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
     """
-    a, b = a._asdict(), b._asdict()
-    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
-    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
-
-    url = longer('url')
-    longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
+    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
+
+    url = a.url if len(a.url) > len(b.url) else b.url
+
+    possible_titles = [
+        title
+        for title in (a.title, b.title)
+        if title and title.strip() and '://' not in title
+    ]
+    title = None
+    if len(possible_titles) == 2:
+        title = max(possible_titles, key=lambda t: len(t))
+    elif len(possible_titles) == 1:
+        title = possible_titles[0]
+
+    timestamp = (
+        a.timestamp
+        if float(a.timestamp or 0) < float(b.timestamp or 0) else
+        b.timestamp
+    )
+
+    tags_set = (
+        set(tag.strip() for tag in (a.tags or '').split(','))
+        | set(tag.strip() for tag in (b.tags or '').split(','))
+    )
+    tags = ','.join(tags_set) or None
+
+    sources = list(set(a.sources + b.sources))
+
+    all_methods = set(a.history.keys()) | set(b.history.keys())
+    history = {
+        method: (a.history.get(method) or []) + (b.history.get(method) or [])
+        for method in all_methods
+    }
+
     return Link(
         url=url,
-        timestamp=earlier('timestamp'),
-        title=longest_title if '://' not in (longest_title or '') else cleanest_title,
-        tags=longer('tags'),
-        sources=list(set(a.get('sources', []) + b.get('sources', []))),
+        timestamp=timestamp,
+        title=title,
+        tags=tags,
+        sources=sources,
+        history=history,
     )

 def is_static_file(url: str) -> bool:
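
parse_date centralizes every timestamp format the codebase passes around, using magnitude to guess whether a bare number is seconds, milliseconds, or microseconds (the 1985/2025 constants bound the plausible range for each unit). Assuming the definition above:

    from datetime import datetime

    assert parse_date(None) is None
    assert parse_date(datetime(2019, 3, 26)) == datetime(2019, 3, 26)        # passthrough
    assert parse_date('1553650000') == datetime.fromtimestamp(1553650000)    # unix seconds
    assert parse_date(1553650000000) == datetime.fromtimestamp(1553650000)   # milliseconds
    assert parse_date('2019-03-26 22:26') == datetime(2019, 3, 26, 22, 26)   # iso-ish string

merge_links likewise swaps the old dict-based merge for explicit field-by-field rules on typed Links: the longer url and longest clean title win, the earlier timestamp wins, tags are unioned, and per-method history lists are concatenated. A sketch of the effect, assuming the Link schema and merge_links from this commit are importable (field values are illustrative):

    a = Link(url='http://example.com/page', timestamp='1553650000.0',
             title='Example', tags='news', sources=['feed.rss'], history={})
    b = Link(url='https://example.com/page', timestamp='1553660000.0',
             title='Example Page - Full Title', tags='tech', sources=['bookmarks.html'], history={})

    merged = merge_links(a, b)
    assert merged.url == 'https://example.com/page'      # longer url wins
    assert merged.title == 'Example Page - Full Title'   # longest clean title wins
    assert merged.timestamp == '1553650000.0'            # earlier timestamp wins
    assert set(merged.tags.split(',')) == {'news', 'tech'}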