mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 14:44:18 +00:00
move latest to derived data using history
This commit is contained in:
parent
69f837bbf6
commit
d06775923b
4 changed files with 34 additions and 49 deletions
|
@ -90,8 +90,6 @@ def archive_link(link_dir, link):
|
|||
for method_name, should_run, method_function in ARCHIVE_METHODS:
|
||||
if method_name not in link['history']:
|
||||
link['history'][method_name] = []
|
||||
if method_name not in link['latest']:
|
||||
link['latest'][method_name] = None
|
||||
|
||||
if not should_run(link_dir, link):
|
||||
continue
|
||||
|
@ -105,8 +103,6 @@ def archive_link(link_dir, link):
|
|||
log_archive_method_finished(result)
|
||||
|
||||
link['history'][method_name].append(result)
|
||||
if result['status'] == 'succeeded':
|
||||
link['latest'][method_name] = result['output']
|
||||
|
||||
write_link_index(link_dir, link)
|
||||
patch_links_index(link)
|
||||
|
|
|
@ -22,6 +22,7 @@ from util import (
|
|||
check_link_structure,
|
||||
check_links_structure,
|
||||
wget_output_path,
|
||||
latest_output,
|
||||
)
|
||||
from parse import parse_links
|
||||
from links import validate_links
|
||||
|
@ -168,8 +169,8 @@ def write_html_links_index(out_dir, links, finished=False):
|
|||
def patch_links_index(link, out_dir=OUTPUT_DIR):
|
||||
"""hack to in-place update one row's info in the generated index html"""
|
||||
|
||||
title = link['latest']['title']
|
||||
successful = len([entry for entry in link['latest'].values() if entry])
|
||||
title = link['title'] or latest_output(link)['title']
|
||||
successful = len(tuple(filter(None, latest_output(link).values())))
|
||||
|
||||
# Patch JSON index
|
||||
changed = False
|
||||
|
@ -177,7 +178,6 @@ def patch_links_index(link, out_dir=OUTPUT_DIR):
|
|||
for saved_link in json_file_links:
|
||||
if saved_link['url'] == link['url']:
|
||||
saved_link['title'] = title
|
||||
saved_link['latest'] = link['latest']
|
||||
saved_link['history'] = link['history']
|
||||
changed = True
|
||||
break
|
||||
|
@ -235,12 +235,10 @@ def load_json_link_index(out_dir, link):
|
|||
**link,
|
||||
}
|
||||
link.update({
|
||||
'latest': link.get('latest') or {},
|
||||
'history': link.get('history') or {},
|
||||
})
|
||||
|
||||
check_link_structure(link)
|
||||
|
||||
return link
|
||||
|
||||
def write_html_link_index(out_dir, link):
|
||||
|
|
|
@ -9,12 +9,6 @@ Link {
|
|||
title: str,
|
||||
tags: str,
|
||||
sources: [str],
|
||||
latest: {
|
||||
...,
|
||||
pdf: 'output.pdf',
|
||||
wget: 'example.com/1234/index.html',
|
||||
screenshot: null,
|
||||
},
|
||||
history: {
|
||||
pdf: [
|
||||
{start_ts, end_ts, duration, cmd, pwd, status, output},
|
||||
|
@ -30,7 +24,6 @@ from collections import OrderedDict
|
|||
|
||||
from util import (
|
||||
merge_links,
|
||||
wget_output_path,
|
||||
check_link_structure,
|
||||
check_links_structure,
|
||||
)
|
||||
|
@ -47,30 +40,9 @@ def validate_links(links):
|
|||
raise SystemExit(1)
|
||||
|
||||
for link in links:
|
||||
link['title'] = unescape(link['title'].strip()) if link['title'].strip() else None
|
||||
check_link_structure(link)
|
||||
|
||||
link['title'] = unescape(link['title']) if link['title'] else None
|
||||
link['latest'] = link.get('latest') or {}
|
||||
|
||||
latest = link['latest']
|
||||
if not link['latest'].get('wget'):
|
||||
link['latest']['wget'] = wget_output_path(link)
|
||||
|
||||
if not link['latest'].get('pdf'):
|
||||
link['latest']['pdf'] = None
|
||||
|
||||
if not link['latest'].get('screenshot'):
|
||||
link['latest']['screenshot'] = None
|
||||
|
||||
if not link['latest'].get('dom'):
|
||||
link['latest']['dom'] = None
|
||||
|
||||
if not latest.get('favicon'):
|
||||
latest['favicon'] = None
|
||||
|
||||
if not link['latest'].get('title'):
|
||||
link['latest']['title'] = link['title']
|
||||
|
||||
return list(links)
|
||||
|
||||
|
||||
|
|
|
@ -118,12 +118,6 @@ def check_link_structure(link):
|
|||
assert isinstance(key, str)
|
||||
assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])
|
||||
|
||||
if 'latest' in link:
|
||||
assert isinstance(link['latest'], dict), 'latest must be a Dict'
|
||||
for key, val in link['latest'].items():
|
||||
assert isinstance(key, str)
|
||||
assert (val is None) or isinstance(val, (str, Exception)), 'latest must be a Dict[str, Optional[str]], got: {}'.format(link['latest'])
|
||||
|
||||
def check_links_structure(links):
|
||||
"""basic sanity check invariants to make sure the data is valid"""
|
||||
assert isinstance(links, list)
|
||||
|
@ -304,10 +298,6 @@ def wget_output_path(link):
|
|||
See docs on wget --adjust-extension (-E)
|
||||
"""
|
||||
|
||||
# if we have it stored, always prefer the actual output path to computed one
|
||||
if link.get('latest', {}).get('wget'):
|
||||
return link['latest']['wget']
|
||||
|
||||
if is_static_file(link['url']):
|
||||
return without_scheme(without_fragment(link['url']))
|
||||
|
||||
|
@ -433,7 +423,7 @@ def derived_link_info(link):
|
|||
link['timestamp'],
|
||||
domain(url),
|
||||
)),
|
||||
'num_outputs': len([entry for entry in link['latest'].values() if entry]) if 'latest' in link else 0,
|
||||
'num_outputs': len([entry for entry in latest_output(link).values() if entry]),
|
||||
}
|
||||
|
||||
# Archive Method Output URLs
|
||||
|
@ -465,6 +455,35 @@ def derived_link_info(link):
|
|||
return extended_info
|
||||
|
||||
|
||||
def latest_output(link, status=None):
    """Get the latest output that each archive method produced for link.

    For every known archive method, the link's history for that method is
    walked from the last entry to the first and the ``output`` of the first
    entry with a truthy ``output`` is returned.

    Args:
        link: link dict; may contain a ``history`` mapping of
            ``{archive_method: [result, ...]}`` where each result is a dict
            with at least ``output`` and ``status`` keys.
        status: if given, only results whose ``status`` equals this value
            are considered (e.g. ``'succeeded'``). When ``None`` (default),
            results of any status are considered as long as they have a
            truthy output.

    Returns:
        A dict with one key per archive method (``title``, ``favicon``,
        ``wget``, ``warc``, ``pdf``, ``screenshot``, ``dom``, ``git``,
        ``media``, ``archive_org``) mapped to the selected output, or
        ``None`` when no matching result exists for that method.
    """

    latest = {
        'title': None,
        'favicon': None,
        'wget': None,
        'warc': None,
        'pdf': None,
        'screenshot': None,
        'dom': None,
        'git': None,
        'media': None,
        'archive_org': None,
    }

    # `or {}` (rather than a .get() default) also covers links whose
    # 'history' key is present but set to None.
    all_history = link.get('history') or {}

    for archive_method in latest:
        results = all_history.get(archive_method) or []

        # History entries are appended as they finish, so iterating in
        # reverse yields the most recent result first; stop at the first
        # entry that has an output (and matches `status`, if requested).
        latest[archive_method] = next(
            (
                result['output']
                for result in reversed(results)
                if result['output']
                and (status is None or result['status'] == status)
            ),
            None,
        )

    return latest
|
||||
|
||||
|
||||
### Python / System Helpers
|
||||
|
||||
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
|
||||
|
|
Loading…
Reference in a new issue