move latest to derived data using history

This commit is contained in:
Nick Sweeting 2019-03-22 21:38:08 -04:00
parent 69f837bbf6
commit d06775923b
4 changed files with 34 additions and 49 deletions

View file

@ -90,8 +90,6 @@ def archive_link(link_dir, link):
for method_name, should_run, method_function in ARCHIVE_METHODS:
if method_name not in link['history']:
link['history'][method_name] = []
if method_name not in link['latest']:
link['latest'][method_name] = None
if not should_run(link_dir, link):
continue
@ -105,8 +103,6 @@ def archive_link(link_dir, link):
log_archive_method_finished(result)
link['history'][method_name].append(result)
if result['status'] == 'succeeded':
link['latest'][method_name] = result['output']
write_link_index(link_dir, link)
patch_links_index(link)

View file

@ -22,6 +22,7 @@ from util import (
check_link_structure,
check_links_structure,
wget_output_path,
latest_output,
)
from parse import parse_links
from links import validate_links
@ -168,8 +169,8 @@ def write_html_links_index(out_dir, links, finished=False):
def patch_links_index(link, out_dir=OUTPUT_DIR):
"""hack to in-place update one row's info in the generated index html"""
title = link['latest']['title']
successful = len([entry for entry in link['latest'].values() if entry])
title = link['title'] or latest_output(link)['title']
successful = len(tuple(filter(None, latest_output(link).values())))
# Patch JSON index
changed = False
@ -177,7 +178,6 @@ def patch_links_index(link, out_dir=OUTPUT_DIR):
for saved_link in json_file_links:
if saved_link['url'] == link['url']:
saved_link['title'] = title
saved_link['latest'] = link['latest']
saved_link['history'] = link['history']
changed = True
break
@ -235,12 +235,10 @@ def load_json_link_index(out_dir, link):
**link,
}
link.update({
'latest': link.get('latest') or {},
'history': link.get('history') or {},
})
check_link_structure(link)
return link
def write_html_link_index(out_dir, link):

View file

@ -9,12 +9,6 @@ Link {
title: str,
tags: str,
sources: [str],
latest: {
...,
pdf: 'output.pdf',
wget: 'example.com/1234/index.html',
screenshot: null,
},
history: {
pdf: [
{start_ts, end_ts, duration, cmd, pwd, status, output},
@ -30,7 +24,6 @@ from collections import OrderedDict
from util import (
merge_links,
wget_output_path,
check_link_structure,
check_links_structure,
)
@ -47,30 +40,9 @@ def validate_links(links):
raise SystemExit(1)
for link in links:
link['title'] = unescape(link['title'].strip()) if link['title'].strip() else None
check_link_structure(link)
link['title'] = unescape(link['title']) if link['title'] else None
link['latest'] = link.get('latest') or {}
latest = link['latest']
if not link['latest'].get('wget'):
link['latest']['wget'] = wget_output_path(link)
if not link['latest'].get('pdf'):
link['latest']['pdf'] = None
if not link['latest'].get('screenshot'):
link['latest']['screenshot'] = None
if not link['latest'].get('dom'):
link['latest']['dom'] = None
if not latest.get('favicon'):
latest['favicon'] = None
if not link['latest'].get('title'):
link['latest']['title'] = link['title']
return list(links)

View file

@ -118,12 +118,6 @@ def check_link_structure(link):
assert isinstance(key, str)
assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])
if 'latest' in link:
assert isinstance(link['latest'], dict), 'latest must be a Dict'
for key, val in link['latest'].items():
assert isinstance(key, str)
assert (val is None) or isinstance(val, (str, Exception)), 'latest must be a Dict[str, Optional[str]], got: {}'.format(link['latest'])
def check_links_structure(links):
"""basic sanity check invariants to make sure the data is valid"""
assert isinstance(links, list)
@ -304,10 +298,6 @@ def wget_output_path(link):
See docs on wget --adjust-extension (-E)
"""
# if we have it stored, always prefer the actual output path to computed one
if link.get('latest', {}).get('wget'):
return link['latest']['wget']
if is_static_file(link['url']):
return without_scheme(without_fragment(link['url']))
@ -433,7 +423,7 @@ def derived_link_info(link):
link['timestamp'],
domain(url),
)),
'num_outputs': len([entry for entry in link['latest'].values() if entry]) if 'latest' in link else 0,
'num_outputs': len([entry for entry in latest_output(link).values() if entry]),
}
# Archive Method Output URLs
@ -465,6 +455,35 @@ def derived_link_info(link):
return extended_info
def latest_output(link, status=None):
    """Get the latest output that each archive method produced for link.

    Args:
        link: link dict; its optional 'history' entry maps an archive method
            name to a list of result dicts, each read here via its 'output'
            and 'status' keys.
        status: when not None, only results whose 'status' equals this value
            are considered (e.g. 'succeeded').

    Returns:
        A dict with one entry per known archive method, holding the 'output'
        of the most recent matching result that has a truthy output, or None
        when the method has no such result.
    """
    latest = {
        'title': None,
        'favicon': None,
        'wget': None,
        'warc': None,
        'pdf': None,
        'screenshot': None,
        'dom': None,
        'git': None,
        'media': None,
        'archive_org': None,
    }

    # `or {}` (rather than a .get() default) also covers a link whose
    # 'history' key is present but set to None.
    all_history = link.get('history') or {}

    for archive_method in latest:
        results = all_history.get(archive_method) or []
        # Results are appended in chronological order, so walk them
        # newest-first and stop at the first one that qualifies.
        latest[archive_method] = next(
            (
                result['output']
                for result in reversed(results)
                if result['output']
                and (status is None or result['status'] == status)
            ),
            None,
        )

    return latest
### Python / System Helpers
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):