mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-26 14:10:20 +00:00
working runtime type casting and enforcement for a wide range of types
This commit is contained in:
parent
0d8a076c1f
commit
ab09560f14
3 changed files with 162 additions and 61 deletions
|
@ -1,7 +1,6 @@
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from itertools import chain
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from string import Template
|
from string import Template
|
||||||
from typing import List, Tuple, Iterator, Optional
|
from typing import List, Tuple, Iterator, Optional
|
||||||
|
@ -20,13 +19,13 @@ from config import (
|
||||||
FOOTER_INFO,
|
FOOTER_INFO,
|
||||||
)
|
)
|
||||||
from util import (
|
from util import (
|
||||||
|
merge_links,
|
||||||
chmod_file,
|
chmod_file,
|
||||||
urlencode,
|
urlencode,
|
||||||
derived_link_info,
|
derived_link_info,
|
||||||
wget_output_path,
|
wget_output_path,
|
||||||
ExtendedEncoder,
|
ExtendedEncoder,
|
||||||
check_link_structure,
|
enforce_types,
|
||||||
check_links_structure,
|
|
||||||
)
|
)
|
||||||
from parse import parse_links
|
from parse import parse_links
|
||||||
from links import validate_links
|
from links import validate_links
|
||||||
|
@ -43,6 +42,7 @@ TITLE_LOADING_MSG = 'Not yet archived...'
|
||||||
|
|
||||||
### Homepage index for all the links
|
### Homepage index for all the links
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
def write_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
|
def write_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
|
||||||
"""create index.html file for a given list of links"""
|
"""create index.html file for a given list of links"""
|
||||||
|
|
||||||
|
@ -55,8 +55,9 @@ def write_links_index(out_dir: str, links: List[Link], finished: bool=False) ->
|
||||||
log_indexing_started(out_dir, 'index.html')
|
log_indexing_started(out_dir, 'index.html')
|
||||||
write_html_links_index(out_dir, links, finished=finished)
|
write_html_links_index(out_dir, links, finished=finished)
|
||||||
log_indexing_finished(out_dir, 'index.html')
|
log_indexing_finished(out_dir, 'index.html')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
|
def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
|
||||||
"""parse and load existing index with any new links from import_path merged in"""
|
"""parse and load existing index with any new links from import_path merged in"""
|
||||||
|
|
||||||
|
@ -81,6 +82,7 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[Li
|
||||||
return all_links, new_links
|
return all_links, new_links
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
def write_json_links_index(out_dir: str, links: List[Link]) -> None:
|
def write_json_links_index(out_dir: str, links: List[Link]) -> None:
|
||||||
"""write the json link index to a given path"""
|
"""write the json link index to a given path"""
|
||||||
|
|
||||||
|
@ -114,6 +116,7 @@ def write_json_links_index(out_dir: str, links: List[Link]) -> None:
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
|
def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
|
||||||
"""parse a archive index json file and return the list of links"""
|
"""parse a archive index json file and return the list of links"""
|
||||||
|
|
||||||
|
@ -121,13 +124,13 @@ def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
|
||||||
if os.path.exists(index_path):
|
if os.path.exists(index_path):
|
||||||
with open(index_path, 'r', encoding='utf-8') as f:
|
with open(index_path, 'r', encoding='utf-8') as f:
|
||||||
links = json.load(f)['links']
|
links = json.load(f)['links']
|
||||||
check_links_structure(links)
|
|
||||||
for link in links:
|
for link in links:
|
||||||
yield Link(**link)
|
yield Link(**link)
|
||||||
|
|
||||||
return ()
|
return ()
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
|
def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
|
||||||
"""write the html link index to a given path"""
|
"""write the html link index to a given path"""
|
||||||
|
|
||||||
|
@ -151,6 +154,7 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
|
||||||
link.title
|
link.title
|
||||||
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
|
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
|
||||||
),
|
),
|
||||||
|
'tags': link.tags or '',
|
||||||
'favicon_url': (
|
'favicon_url': (
|
||||||
os.path.join('archive', link.timestamp, 'favicon.ico')
|
os.path.join('archive', link.timestamp, 'favicon.ico')
|
||||||
# if link['is_archived'] else ''
|
# if link['is_archived'] else ''
|
||||||
|
@ -179,6 +183,7 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
|
def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
|
||||||
"""hack to in-place update one row's info in the generated index html"""
|
"""hack to in-place update one row's info in the generated index html"""
|
||||||
|
|
||||||
|
@ -218,11 +223,13 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
|
||||||
|
|
||||||
### Individual link index
|
### Individual link index
|
||||||
|
|
||||||
|
@enforce_types
def write_link_index(out_dir: str, link: Link) -> None:
    """write both the json and the html index for a single link into out_dir"""

    # JSON first, then HTML: same order as calling the two writers back to back.
    for write_index in (write_json_link_index, write_html_link_index):
        write_index(out_dir, link)
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
def write_json_link_index(out_dir: str, link: Link) -> None:
|
def write_json_link_index(out_dir: str, link: Link) -> None:
|
||||||
"""write a json file with some info about the link"""
|
"""write a json file with some info about the link"""
|
||||||
|
|
||||||
|
@ -234,29 +241,29 @@ def write_json_link_index(out_dir: str, link: Link) -> None:
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
def parse_json_link_index(out_dir: str) -> Optional[Link]:
    """read out_dir/index.json and build a Link from it, or return None if the file is absent"""

    index_path = os.path.join(out_dir, 'index.json')

    # Guard clause: no index file on disk means there is nothing to load.
    if not os.path.exists(index_path):
        return None

    with open(index_path, 'r', encoding='utf-8') as index_file:
        # The decoded JSON object is passed to Link as keyword arguments.
        return Link(**json.load(index_file))
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
def load_json_link_index(out_dir: str, link: Link) -> Link:
    """look for a link index already saved in out_dir and, if one is found,
       return it merged with the given link; otherwise return the link as-is
    """

    saved_link = parse_json_link_index(out_dir)

    # parse_json_link_index returns None when out_dir has no index.json.
    return merge_links(saved_link, link) if saved_link else link
|
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
def write_html_link_index(out_dir: str, link: Link) -> None:
|
def write_html_link_index(out_dir: str, link: Link) -> None:
|
||||||
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
|
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
|
||||||
link_html = f.read()
|
link_html = f.read()
|
||||||
|
|
|
@ -39,15 +39,24 @@ class Link:
|
||||||
tags: Optional[str]
|
tags: Optional[str]
|
||||||
sources: List[str]
|
sources: List[str]
|
||||||
history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
|
history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
|
||||||
updated: Optional[str] = None
|
updated: Optional[datetime] = None
|
||||||
|
|
||||||
def __hash__(self):
|
def __post_init__(self):
    """convert any plain-dict history entries into ArchiveResult instances"""

    typed_history = {
        method: [
            # Only dicts are converted; every other entry is kept untouched.
            ArchiveResult(**entry) if isinstance(entry, dict) else entry
            for entry in entries
        ]
        for method, entries in self.history.items()
    }

    # NOTE(review): object.__setattr__ is presumably needed because the
    # dataclass is frozen -- confirm against the class decorator.
    object.__setattr__(self, 'history', typed_history)
|
||||||
|
|
||||||
def __eq__(self, other):
    """two Links compare equal when their url attributes are equal"""
    if isinstance(other, Link):
        return self.url == other.url

    # Let Python try the reflected comparison for non-Link operands.
    return NotImplemented
|
||||||
|
|
||||||
def __gt__(self, other):
|
def __gt__(self, other):
|
||||||
if not isinstance(other, Link):
|
if not isinstance(other, Link):
|
||||||
|
|
|
@ -4,7 +4,9 @@ import sys
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from json import JSONEncoder
|
from json import JSONEncoder
|
||||||
from typing import List, Optional, Iterable
|
from typing import List, Optional, Any
|
||||||
|
from inspect import signature, _empty
|
||||||
|
from functools import wraps
|
||||||
from hashlib import sha256
|
from hashlib import sha256
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
from urllib.parse import urlparse, quote, unquote
|
from urllib.parse import urlparse, quote, unquote
|
||||||
|
@ -22,7 +24,7 @@ from subprocess import (
|
||||||
|
|
||||||
from base32_crockford import encode as base32_encode
|
from base32_crockford import encode as base32_encode
|
||||||
|
|
||||||
from schema import Link, LinkDict, ArchiveResult
|
from schema import Link
|
||||||
from config import (
|
from config import (
|
||||||
ANSI,
|
ANSI,
|
||||||
TERM_WIDTH,
|
TERM_WIDTH,
|
||||||
|
@ -55,26 +57,13 @@ fragment = lambda url: urlparse(url).fragment
|
||||||
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
|
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
|
||||||
base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
|
base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
|
||||||
|
|
||||||
|
|
||||||
without_www = lambda url: url.replace('://www.', '://', 1)
|
without_www = lambda url: url.replace('://www.', '://', 1)
|
||||||
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
|
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
|
||||||
fuzzy_url = lambda url: without_trailing_slash(without_www(without_scheme(url.lower())))
|
fuzzy_url = lambda url: without_trailing_slash(without_www(without_scheme(url.lower())))
|
||||||
|
|
||||||
short_ts = lambda ts: (
|
short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
|
||||||
str(ts.timestamp()).split('.')[0]
|
ts_to_date = lambda ts: parse_date(ts).strftime('%Y-%m-%d %H:%M')
|
||||||
if isinstance(ts, datetime) else
|
ts_to_iso = lambda ts: parse_date(ts).isoformat()
|
||||||
str(ts).split('.')[0]
|
|
||||||
)
|
|
||||||
ts_to_date = lambda ts: (
|
|
||||||
ts.strftime('%Y-%m-%d %H:%M')
|
|
||||||
if isinstance(ts, datetime) else
|
|
||||||
datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d %H:%M')
|
|
||||||
)
|
|
||||||
ts_to_iso = lambda ts: (
|
|
||||||
ts.isoformat()
|
|
||||||
if isinstance(ts, datetime) else
|
|
||||||
datetime.fromtimestamp(float(ts)).isoformat()
|
|
||||||
)
|
|
||||||
|
|
||||||
urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
|
urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
|
||||||
urldecode = lambda s: s and unquote(s)
|
urldecode = lambda s: s and unquote(s)
|
||||||
|
@ -122,23 +111,46 @@ STATICFILE_EXTENSIONS = {
|
||||||
|
|
||||||
### Checks & Tests
|
### Checks & Tests
|
||||||
|
|
||||||
def check_link_structure(link: LinkDict) -> None:
|
def enforce_types(func):
    """
    Checks parameters type signatures against arg and kwarg type hints.

    Only annotations whose class is exactly `type` (e.g. `str`, `int`, or a
    plain user-defined class) are enforced; unannotated parameters and any
    other kind of annotation (such as `typing` generics) are not checked.

    A parameter declared with a default of None also accepts an explicit
    None, so `def f(path: str=None)` can be called as `f(path=None)` just
    like it can be called as `f()`.

    Raises TypeError from the wrapped call when an argument does not match.
    """

    # The signature never changes, so resolve it once at decoration time
    # instead of on every call.
    sig = signature(func)

    def check_argument_type(arg_key, arg_val):
        param = sig.parameters.get(arg_key)
        if param is None:
            # keyword not named in the signature (e.g. absorbed by **kwargs)
            return

        annotation = param.annotation
        if annotation is _empty or annotation.__class__ is not type:
            return

        if arg_val is None and param.default is None:
            # implicit Optional: passing the None default explicitly is fine
            return

        if not isinstance(arg_val, annotation):
            raise TypeError(
                '{}(..., {}: {}) got unexpected {} argument {}={}'.format(
                    func.__name__,
                    arg_key,
                    annotation.__name__,
                    type(arg_val).__name__,
                    arg_key,
                    arg_val,
                )
            )

    @wraps(func)
    def typechecked_function(*args, **kwargs):
        # check args (positional values are paired with parameters in order)
        for arg_val, arg_key in zip(args, sig.parameters):
            check_argument_type(arg_key, arg_val)

        # check kwargs
        for arg_key, arg_val in kwargs.items():
            check_argument_type(arg_key, arg_val)

        return func(*args, **kwargs)

    return typechecked_function
|
||||||
|
|
||||||
|
|
||||||
def check_url_parsing_invariants() -> None:
|
def check_url_parsing_invariants() -> None:
|
||||||
"""Check that plain text regex URL parsing works as expected"""
|
"""Check that plain text regex URL parsing works as expected"""
|
||||||
|
@ -329,25 +341,98 @@ def str_between(string: str, start: str, end: str=None) -> str:
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
# Window of plausible unix timestamps, expressed in seconds. The same window
# scaled by 1000 and 1000*1000 is used to recognise millisecond and
# microsecond values. The upper bound must stay below
# _EARLIEST_TIMESTAMP * 1000 so the three windows never overlap.
_EARLIEST_TIMESTAMP = 473403600.0     # 1985
_LATEST_TIMESTAMP = 4102444800.0      # 2100


def parse_date(date: Any) -> Optional[datetime]:
    """Parse unix timestamps, iso format, and human-readable strings

    Accepted inputs:
      - a datetime (returned unchanged) or None (returns None)
      - an int/float/numeric string holding a unix timestamp in seconds,
        milliseconds or microseconds (the unit is inferred from magnitude)
      - a string in ISO format, or in '%Y-%m-%d %H:%M' format

    Raises ValueError for anything that cannot be parsed.
    """

    if isinstance(date, datetime):
        return date

    if date is None:
        return None

    if isinstance(date, (float, int)):
        date = str(date)

    if isinstance(date, str):
        # allow at most one decimal point so e.g. '1.2.3' is not treated as a number
        if date.replace('.', '', 1).isdigit():
            timestamp = float(date)

            # try seconds, then milliseconds, then microseconds
            for scale in (1, 1000, 1000 * 1000):
                if _EARLIEST_TIMESTAMP * scale < timestamp < _LATEST_TIMESTAMP * scale:
                    return datetime.fromtimestamp(timestamp / scale)

        if '-' in date:
            try:
                return datetime.fromisoformat(date)
            except ValueError:
                pass

            try:
                return datetime.strptime(date, '%Y-%m-%d %H:%M')
            except ValueError:
                pass

    raise ValueError('Tried to parse invalid date! {}'.format(date))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Link Helpers
|
### Link Helpers
|
||||||
|
|
||||||
|
@enforce_types
def merge_links(a: Link, b: Link) -> Link:
    """deterministically merge two links, favoring longer field values over shorter,
       and "cleaner" values over worse ones.

       - url: the longer of the two urls (b wins ties)
       - title: the longest non-blank title that does not contain '://', else None
       - timestamp: the numerically earlier timestamp (b wins ties)
       - tags: sorted union of both comma-separated tag lists, or None if empty
       - sources: both source lists concatenated, duplicates removed, order kept
       - history: per-method results of a followed by those of b

       Both links must share the same base_url.
    """
    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'

    url = a.url if len(a.url) > len(b.url) else b.url

    # a title that still contains '://' is just a url placeholder, not a real title
    possible_titles = [
        title
        for title in (a.title, b.title)
        if title and title.strip() and '://' not in title
    ]
    title = max(possible_titles, key=len) if possible_titles else None

    timestamp = (
        a.timestamp
        if float(a.timestamp or 0) < float(b.timestamp or 0) else
        b.timestamp
    )

    # skip blank entries so a link without tags does not add an empty tag,
    # and sort so the joined string does not depend on set iteration order
    tags_set = {
        tag.strip()
        for tags in (a.tags, b.tags)
        for tag in (tags or '').split(',')
        if tag.strip()
    }
    tags = ','.join(sorted(tags_set)) or None

    # dict.fromkeys de-duplicates while preserving first-seen order
    sources = list(dict.fromkeys(a.sources + b.sources))

    # union of the methods recorded on either link (a's methods first)
    all_methods = dict.fromkeys((*a.history, *b.history))
    history = {
        method: (a.history.get(method) or []) + (b.history.get(method) or [])
        for method in all_methods
    }

    return Link(
        url=url,
        timestamp=timestamp,
        title=title,
        tags=tags,
        sources=sources,
        history=history,
    )
|
||||||
|
|
||||||
def is_static_file(url: str) -> bool:
|
def is_static_file(url: str) -> bool:
|
||||||
|
|
Loading…
Reference in a new issue