better logging during long output

Nick Sweeting 2019-03-22 15:09:39 -04:00
parent 1c5732d5c6
commit bd9f3e313f
6 changed files with 63 additions and 72 deletions

View file

@@ -94,7 +94,7 @@ def main(*args):
def update_archive_data(import_path=None, resume=None):
"""The main ArchiveBox entrancepoint. Everything starts here."""
"""The main ArchiveBox entrancepoint. Everything starts here."""
check_dependencies()
# Step 1: Load list of links from the existing index

View file

@@ -1,6 +1,5 @@
import os
from functools import wraps
from collections import defaultdict
from datetime import datetime
@@ -50,10 +49,9 @@ from util import (
run, PIPE, DEVNULL
)
from logs import (
_LAST_RUN_STATS,
log_link_archiving_started,
log_link_archiving_finished,
log_archive_method_starting,
log_archive_method_started,
log_archive_method_finished,
)
@@ -94,6 +92,7 @@ def archive_link(link_dir, link):
link['history'][method_name] = []
if method_name not in link['latest']:
link['latest'][method_name] = None
if not should_run(link_dir, link):
continue
@@ -101,7 +100,7 @@ def archive_link(link_dir, link):
skipped_entirely = False
print()
log_archive_method_starting(method_name)
log_archive_method_started(method_name)
result = method_function(link_dir, link)
log_archive_method_finished(result)
@@ -109,11 +108,6 @@ def archive_link(link_dir, link):
if result['status'] == 'succeeded':
link['latest'][method_name] = result['output']
if result['status'] != 'skipped':
made_changes = True
_LAST_RUN_STATS[result['status']] += 1
write_link_index(link_dir, link)
patch_links_index(link)
@@ -126,6 +120,7 @@ def archive_link(link_dir, link):
return link
### Archive Method Functions
def should_fetch_title(link_dir, link):
# if link already has valid title, skip it
@@ -428,8 +423,8 @@ def should_fetch_git(link_dir, link):
return False
is_clonable_url = (
domain(link['url']) in GIT_DOMAINS
or extension(link['url']) == 'git'
(domain(link['url']) in GIT_DOMAINS)
or (extension(link['url']) == 'git')
)
if not is_clonable_url:
return False
@@ -477,6 +472,7 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
**timer.stats,
}
def should_fetch_media(link_dir, link):
if is_static_file(link['url']):
return False
@@ -547,21 +543,6 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
**timer.stats,
}
def parse_archive_dot_org_response(response):
# Parse archive.org response headers
headers = defaultdict(list)
# lowercase all the header names and store in dict
for header in response.splitlines():
if b':' not in header or not header.strip():
continue
name, val = header.decode().split(':', 1)
headers[name.lower().strip()].append(val.strip())
# Get successful archive url in "content-location" header or any errors
content_location = headers['content-location']
errors = headers['x-archive-wayback-runtime-error']
return content_location, errors
def should_fetch_archive_dot_org(link_dir, link):
if is_static_file(link['url']):
@@ -627,4 +608,18 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
**timer.stats,
}
def parse_archive_dot_org_response(response):
# Parse archive.org response headers
headers = defaultdict(list)
# lowercase all the header names and store in dict
for header in response.splitlines():
if b':' not in header or not header.strip():
continue
name, val = header.decode().split(':', 1)
headers[name.lower().strip()].append(val.strip())
# Get successful archive url in "content-location" header or any errors
content_location = headers['content-location']
errors = headers['x-archive-wayback-runtime-error']
return content_location, errors
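A quick usage sketch (not part of this commit) to show what parse_archive_dot_org_response returns; the response bytes below are a made-up example:

# Hypothetical input, for illustration only
sample_response = (
    b'HTTP/1.1 302 FOUND\r\n'
    b'Content-Location: /web/20190322190939/https://example.com/\r\n'
    b'Server: nginx\r\n'
)
content_location, errors = parse_archive_dot_org_response(sample_response)
assert content_location == ['/web/20190322190939/https://example.com/']
assert errors == []  # defaultdict(list) yields an empty list when the error header is absent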

View file

@@ -26,6 +26,7 @@ from util import (
from parse import parse_links
from links import validate_links
from logs import (
log_indexing_process_started,
log_indexing_started,
log_indexing_finished,
log_parsing_started,
@@ -40,12 +41,14 @@ TITLE_LOADING_MSG = 'Not yet archived...'
def write_links_index(out_dir, links, finished=False):
"""create index.html file for a given list of links"""
log_indexing_started()
log_indexing_process_started()
check_links_structure(links)
log_indexing_started(out_dir, 'index.json')
write_json_links_index(out_dir, links)
log_indexing_finished(out_dir, 'index.json')
log_indexing_started(out_dir, 'index.html')
write_html_links_index(out_dir, links, finished=finished)
log_indexing_finished(out_dir, 'index.html')

View file

@@ -3,33 +3,26 @@ In ArchiveBox, a Link represents a single entry that we track in the
json index. All links pass through all archiver functions and the latest,
most up-to-date canonical output for each is stored in "latest".
Link {
timestamp: str, (how we uniquely id links) _ _ _ _ ___
url: str, | \ / \ |\| ' |
base_url: str, |_/ \_/ | | |
domain: str, _ _ _ _ _ _
tags: str, |_) /| |\| | / `
type: str, | /"| | | | \_,
title: str, ,-'"`-.
sources: [str], /// / @ @ \ \\\\
latest: { \ :=| ,._,. |=: /
..., || ,\ \_../ /. ||
pdf: 'output.pdf', ||','`-._))'`.`||
wget: 'example.com/1234/index.html' `-' (/ `-'
timestamp: str, (how we uniquely id links)
url: str,
title: str,
tags: str,
sources: [str],
latest: {
...,
pdf: 'output.pdf',
wget: 'example.com/1234/index.html',
screenshot: null,
},
history: {
...
pdf: [
{timestamp: 15444234325, status: 'skipped', result='output.pdf'},
{start_ts, end_ts, duration, cmd, pwd, status, output},
...
],
wget: [
{timestamp: 11534435345, status: 'succeeded', result='donuts.com/eat/them.html'}
]
...
},
}
"""
from html import unescape
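To make the schema above concrete, a single Link entry might look like the following Python dict (hypothetical values, not taken from the commit):

example_link = {
    'timestamp': '1553281779.0',
    'url': 'https://example.com/abc/',
    'title': 'Example Page',
    'tags': 'docs,examples',
    'sources': ['bookmarks_export.html'],
    'latest': {
        'pdf': 'output.pdf',
        'wget': 'example.com/abc/index.html',
        'screenshot': None,
    },
    'history': {
        'wget': [
            # one entry per archive attempt, using the fields shown above
            {'start_ts': '2019-03-22 15:09:39', 'end_ts': '2019-03-22 15:09:41',
             'duration': 2, 'cmd': ['wget', 'https://example.com/abc/'],
             'pwd': 'output/archive/1553281779.0', 'status': 'succeeded',
             'output': 'example.com/abc/index.html'},
        ],
    },
}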

View file

@@ -45,13 +45,21 @@ def log_link_archiving_started(link_dir, link, is_new):
))
def log_link_archiving_finished(link_dir, link, is_new, skipped_entirely):
if all(output == 'succeeded' for output in link['latest']):
_LAST_RUN_STATS['succeeded'] += 1
elif skipped_entirely or all(output == 'skipped' for output in link['latest']):
_LAST_RUN_STATS['skipped'] += 1
else:
_LAST_RUN_STATS['failed'] += 1
# import ipdb; ipdb.set_trace()
if skipped_entirely:
print('\r{}{}'.format(
pretty_path(link_dir),
' (new)' if is_new else '',
))
def log_archive_method_starting(method):
def log_archive_method_started(method):
print(' > {}'.format(method))
def log_archive_method_finished(result):
@@ -117,7 +125,7 @@ def log_parsing_finished(num_new_links, parser_name):
parser_name,
))
def log_indexing_started():
def log_indexing_process_started():
start_ts = datetime.now()
_LAST_RUN_STATS['index_start_ts'] = start_ts
print('{green}[*] [{}] Saving main index files...{reset}'.format(
@@ -125,10 +133,13 @@ def log_indexing_started():
**ANSI,
))
def log_indexing_started(out_dir, out_file):
sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file))
def log_indexing_finished(out_dir, out_file):
end_ts = datetime.now()
_LAST_RUN_STATS['index_end_ts'] = end_ts
print('{}/{}'.format(pretty_path(out_dir), out_file))
print('\r{}/{}'.format(pretty_path(out_dir), out_file))
def log_archiving_started(num_links, resume):
start_ts = datetime.now()
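The carriage-return trick above (a line started with sys.stdout.write and no newline, then overwritten by a final print beginning with '\r') is what keeps long-running steps on a single, live-updating line. A self-contained sketch of the same pattern, with hypothetical names (not part of the commit):

import sys
import time

def log_step_started(name):
    # write the pending line without a newline so it can be overwritten later
    sys.stdout.write('    > {} (running...)'.format(name))
    sys.stdout.flush()

def log_step_finished(name, status):
    # '\r' returns the cursor to the start of the line; trailing spaces clear
    # any leftover characters from the longer pending message
    print('\r    > {} ({})              '.format(name, status))

log_step_started('index.json')
time.sleep(1)  # stand-in for a slow write
log_step_finished('index.json', 'done')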

View file

@@ -314,10 +314,20 @@ def wget_output_path(link):
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
# > output/archive/<timestamp>/example.com/index.html
# https://example.com?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
# https://www.example.com/?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc
# > output/archive/<timestamp>/example.com/abc.html
# https://example.com/abc/
# > output/archive/<timestamp>/example.com/abc/index.html
# https://example.com/abc?v=zzVa_tX1OiI.html
# > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
# https://example.com/abc/?v=zzVa_tX1OiI.html
# > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc/test.html
# > output/archive/<timestamp>/example.com/abc/test.html
# https://example.com/abc/test?v=zzVa_tX1OiI
@@ -326,7 +336,7 @@ def wget_output_path(link):
# > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html
# There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm
# is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
# Since the wget algorithm for -E (appending .html) is incredibly complex
# and there's no way to get the computed output path from wget
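A rough, path-only illustration of the simple mappings listed in the comment above (hypothetical helper, not part of this commit; it deliberately skips the query/fragment handling that the comment calls out as too complex to reproduce):

from urllib.parse import urlparse

def naive_wget_output_path(url):
    # approximate only the straightforward cases from the examples above
    parsed = urlparse(url)
    path = parsed.path or '/'
    if path.endswith('/'):
        path += 'index.html'              # trailing slash -> directory index
    elif '.' not in path.rsplit('/', 1)[-1]:
        path += '.html'                   # wget -E appends .html to extensionless pages
    return parsed.netloc + path

# naive_wget_output_path('https://example.com/')              -> 'example.com/index.html'
# naive_wget_output_path('https://example.com/abc')           -> 'example.com/abc.html'
# naive_wget_output_path('https://example.com/abc/test.html') -> 'example.com/abc/test.html'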
@@ -359,27 +369,6 @@ def wget_output_path(link):
return None
# If finding the actual output file didn't work, fall back to the buggy
# implementation of the wget .html appending algorithm
# split_url = link['url'].split('#', 1)
# query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
# if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
# # already ends in .html
# return urlencode(base_url(link['url']))
# else:
# # .html needs to be appended
# without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
# if without_scheme.endswith('/'):
# if query:
# return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
# return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
# else:
# if query:
# return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
# elif '/' in without_scheme:
# return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
# return urlencode(base_url(link['url']) + '/index.html')
### String Manipulation & Logging Helpers