Mirror of https://github.com/ArchiveBox/ArchiveBox, synced 2024-11-22 20:23:12 +00:00
better logging during long output
parent 1c5732d5c6
commit bd9f3e313f
6 changed files with 63 additions and 72 deletions
@@ -94,7 +94,7 @@ def main(*args):


def update_archive_data(import_path=None, resume=None):
    """The main ArchiveBox entrancepoint. Everything starts here."""
    """The main ArchiveBox entrancepoint. Everything starts here."""
    check_dependencies()

    # Step 1: Load list of links from the existing index

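The hunk above touches the entrypoint that drives everything else in this diff. As a point of reference only, a minimal runnable sketch of the shape of that flow; the stub helpers and link layout below are invented for illustration, while the real check_dependencies, index loading, and archive_link live in the files changed further down:

def check_dependencies():
    pass  # stub: the real check verifies tools like wget, curl, and chrome are installed

def load_links(import_path=None):
    # stub standing in for "Step 1: Load list of links from the existing index" plus any new imports
    return [{'url': 'https://example.com', 'timestamp': '1544423432'}]

def archive_link(link_dir, link):
    print('[+] archiving {} into {}'.format(link['url'], link_dir))
    return link

def update_archive_data(import_path=None, resume=None):
    """Sketch of the entrypoint: check deps, load the index, then archive each link."""
    check_dependencies()
    links = load_links(import_path)
    for link in links:
        archive_link('output/archive/{}'.format(link['timestamp']), link)
    return links

update_archive_data()
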
@@ -1,6 +1,5 @@
import os

from functools import wraps
from collections import defaultdict
from datetime import datetime

@@ -50,10 +49,9 @@ from util import (
    run, PIPE, DEVNULL
)
from logs import (
    _LAST_RUN_STATS,
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_starting,
    log_archive_method_started,
    log_archive_method_finished,
)

@@ -94,6 +92,7 @@ def archive_link(link_dir, link):
            link['history'][method_name] = []
        if method_name not in link['latest']:
            link['latest'][method_name] = None

        if not should_run(link_dir, link):
            continue

@@ -101,7 +100,7 @@ def archive_link(link_dir, link):
        skipped_entirely = False
        print()

        log_archive_method_starting(method_name)
        log_archive_method_started(method_name)
        result = method_function(link_dir, link)
        log_archive_method_finished(result)

@@ -109,11 +108,6 @@ def archive_link(link_dir, link):
        if result['status'] == 'succeeded':
            link['latest'][method_name] = result['output']

        if result['status'] != 'skipped':
            made_changes = True

        _LAST_RUN_STATS[result['status']] += 1

        write_link_index(link_dir, link)
        patch_links_index(link)

@@ -126,6 +120,7 @@ def archive_link(link_dir, link):
    return link


### Archive Method Functions

def should_fetch_title(link_dir, link):
    # if link already has valid title, skip it

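The archive_link hunks above all edit the same per-method loop. A condensed, runnable sketch of how those pieces fit together; ARCHIVE_METHODS, the stub title method, and the simplified bookkeeping below are illustrative assumptions, and the real loop also writes the per-link index and logs start/finish for the whole link:

from collections import defaultdict

_LAST_RUN_STATS = defaultdict(int)   # stand-in for the counter imported from logs

def should_fetch_title(link_dir, link):
    # skip if a previous run already produced a title
    return link['latest'].get('title') is None

def fetch_title(link_dir, link):
    # stand-in archive method; real methods shell out to wget, chrome, youtube-dl, etc.
    return {'status': 'succeeded', 'output': 'Example Title'}

ARCHIVE_METHODS = [
    ('title', should_fetch_title, fetch_title),
]

def archive_link_sketch(link_dir, link):
    """Condensed version of the per-method loop shown in the hunks above."""
    for method_name, should_run, method_function in ARCHIVE_METHODS:
        link['history'].setdefault(method_name, [])
        link['latest'].setdefault(method_name, None)

        if not should_run(link_dir, link):
            continue

        print('      > {}'.format(method_name))        # log_archive_method_started
        result = method_function(link_dir, link)       # run the actual archive method

        if result['status'] == 'succeeded':
            link['latest'][method_name] = result['output']
        link['history'][method_name].append(result)
        _LAST_RUN_STATS[result['status']] += 1         # tallied later by the logs module
    return link

link = {'url': 'https://example.com', 'latest': {}, 'history': {}}
archive_link_sketch('output/archive/1544423432', link)
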
@@ -428,8 +423,8 @@ def should_fetch_git(link_dir, link):
        return False

    is_clonable_url = (
        domain(link['url']) in GIT_DOMAINS
        or extension(link['url']) == 'git'
        (domain(link['url']) in GIT_DOMAINS)
        or (extension(link['url']) == 'git')
    )
    if not is_clonable_url:
        return False

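The only change in the hunk above is the added parentheses in is_clonable_url; they are there for readability, since `in` and `==` already bind tighter than `or`, so both spellings evaluate identically. A quick self-contained check, where the toy domain/extension helpers are stand-ins for the ones imported from util:

GIT_DOMAINS = {'github.com', 'gitlab.com', 'bitbucket.org'}

def domain(url):
    # toy helper: hostname portion of the url
    return url.split('://', 1)[-1].split('/', 1)[0]

def extension(url):
    # toy helper: text after the final dot
    return url.rsplit('.', 1)[-1]

url = 'https://example.com/some/repo.git'
without_parens = domain(url) in GIT_DOMAINS or extension(url) == 'git'
with_parens = (domain(url) in GIT_DOMAINS) or (extension(url) == 'git')
assert without_parens == with_parens   # True either way
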
@@ -477,6 +472,7 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
        **timer.stats,
    }


def should_fetch_media(link_dir, link):
    if is_static_file(link['url']):
        return False

@@ -547,21 +543,6 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
        **timer.stats,
    }

def parse_archive_dot_org_response(response):
    # Parse archive.org response headers
    headers = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors

def should_fetch_archive_dot_org(link_dir, link):
    if is_static_file(link['url']):

@@ -627,4 +608,18 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
        **timer.stats,
    }

def parse_archive_dot_org_response(response):
    # Parse archive.org response headers
    headers = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors

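parse_archive_dot_org_response is only moved in this commit, not changed. For readers skimming the diff, a self-contained run of the same parsing logic against a fabricated response shows what it returns; the header values below are made up:

from collections import defaultdict

# Fabricated archive.org submission response, just to show what gets extracted.
response = (
    b'HTTP/1.1 200 OK\r\n'
    b'Content-Location: /web/20190101000000/https://example.com/\r\n'
)

headers = defaultdict(list)
for header in response.splitlines():
    if b':' not in header or not header.strip():
        continue                                     # skip the status line and blanks
    name, val = header.decode().split(':', 1)        # split on the first colon only
    headers[name.lower().strip()].append(val.strip())

print(headers['content-location'])                 # ['/web/20190101000000/https://example.com/']
print(headers['x-archive-wayback-runtime-error'])  # [] -- defaultdict gives an empty list
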
@@ -26,6 +26,7 @@ from util import (
from parse import parse_links
from links import validate_links
from logs import (
    log_indexing_process_started,
    log_indexing_started,
    log_indexing_finished,
    log_parsing_started,

@@ -40,12 +41,14 @@ TITLE_LOADING_MSG = 'Not yet archived...'
def write_links_index(out_dir, links, finished=False):
    """create index.html file for a given list of links"""

    log_indexing_started()
    log_indexing_process_started()
    check_links_structure(links)

    log_indexing_started(out_dir, 'index.json')
    write_json_links_index(out_dir, links)
    log_indexing_finished(out_dir, 'index.json')

    log_indexing_started(out_dir, 'index.html')
    write_html_links_index(out_dir, links, finished=finished)
    log_indexing_finished(out_dir, 'index.html')

@@ -3,33 +3,26 @@ In ArchiveBox, a Link represents a single entry that we track in the
json index. All links pass through all archiver functions and the latest,
most up-to-date canonical output for each is stored in "latest".


Link {
    timestamp: str,  (how we uniquely id links)     _ _  _ _  ___
    url: str,                                       | \ / \ |\| ' |
    base_url: str,                                  |_/ \_/ | | |
    domain: str,                                    _ _ _ _  _ _
    tags: str,                                      |_) /| |\| | / `
    type: str,                                      | /"| | | | \_,
    title: str,                                         ,-'"`-.
    sources: [str],                                 /// /  @ @  \ \\\\
    latest: {                                       \ :=| ,._,. |=: /
        ...,                                        || ,\ \_../ /. ||
        pdf: 'output.pdf',                          ||','`-._))'`.`||
        wget: 'example.com/1234/index.html'         `-' (/ `-'
    timestamp: str,  (how we uniquely id links)
    url: str,
    title: str,
    tags: str,
    sources: [str],
    latest: {
        ...,
        pdf: 'output.pdf',
        wget: 'example.com/1234/index.html',
        screenshot: null,
    },
    history: {
        ...
        pdf: [
            {timestamp: 15444234325, status: 'skipped', result='output.pdf'},
            {start_ts, end_ts, duration, cmd, pwd, status, output},
            ...
        ],
        wget: [
            {timestamp: 11534435345, status: 'succeded', result='donuts.com/eat/them.html'}
        ]
        ...
    },
}

"""

from html import unescape

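To make the schema above concrete, here is a hypothetical Link written out as a Python dict with made-up values. The docstring shows both a short result= form and a fuller start_ts/end_ts/cmd/pwd form for history entries, so the simplified keys below are illustrative rather than the exact ones the code emits:

example_link = {
    'timestamp': '1544423432',                 # unique id for the link
    'url': 'https://example.com/abc',
    'title': 'Example page',
    'tags': 'docs,example',
    'sources': ['bookmarks_export.html'],      # where this link was imported from
    'latest': {                                # newest canonical output per method
        'pdf': 'output.pdf',
        'wget': 'example.com/abc/index.html',
        'screenshot': None,                    # null: method has not produced output yet
    },
    'history': {                               # every attempt ever made, per method
        'pdf': [
            {'timestamp': '1544423432', 'status': 'skipped', 'output': 'output.pdf'},
        ],
        'wget': [
            {'timestamp': '1544423500', 'status': 'succeeded', 'output': 'example.com/abc/index.html'},
        ],
    },
}
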
@@ -45,13 +45,21 @@ def log_link_archiving_started(link_dir, link, is_new):
    ))

def log_link_archiving_finished(link_dir, link, is_new, skipped_entirely):
    if all(output == 'succeeded' for output in link['latest']):
        _LAST_RUN_STATS['succeeded'] += 1
    elif skipped_entirely or all(output == 'skipped' for output in link['latest']):
        _LAST_RUN_STATS['skipped'] += 1
    else:
        _LAST_RUN_STATS['failed'] += 1
        # import ipdb; ipdb.set_trace()

    if skipped_entirely:
        print('\r √ {}{}'.format(
            pretty_path(link_dir),
            ' (new)' if is_new else '',
        ))

def log_archive_method_starting(method):
def log_archive_method_started(method):
    print(' > {}'.format(method))

def log_archive_method_finished(result):

@@ -117,7 +125,7 @@ def log_parsing_finished(num_new_links, parser_name):
        parser_name,
    ))

def log_indexing_started():
def log_indexing_process_started():
    start_ts = datetime.now()
    _LAST_RUN_STATS['index_start_ts'] = start_ts
    print('{green}[*] [{}] Saving main index files...{reset}'.format(

@@ -125,10 +133,13 @@ def log_indexing_started():
        **ANSI,
    ))

def log_indexing_started(out_dir, out_file):
    sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file))

def log_indexing_finished(out_dir, out_file):
    end_ts = datetime.now()
    _LAST_RUN_STATS['index_end_ts'] = end_ts
    print(' √ {}/{}'.format(pretty_path(out_dir), out_file))
    print('\r √ {}/{}'.format(pretty_path(out_dir), out_file))

def log_archiving_started(num_links, resume):
    start_ts = datetime.now()

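The pattern introduced here, and in log_link_archiving_finished above, is what gives the commit its "better logging during long output": the start line is written without a newline, and the finish line begins with '\r' so it overwrites the in-progress line in place. A tiny self-contained demo of the idea, with an invented file name and delay:

import sys
import time

def log_started(path):
    # no trailing newline, so the line can be rewritten when the step finishes
    sys.stdout.write('    > {}'.format(path))
    sys.stdout.flush()

def log_finished(path):
    # '\r' moves the cursor back to column 0, replacing "> path" with "√ path"
    print('\r    √ {}'.format(path))

log_started('output/index.json')
time.sleep(0.5)              # stand-in for the actual index write
log_finished('output/index.json')
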
@@ -314,10 +314,20 @@ def wget_output_path(link):
    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > output/archive/<timestamp>/example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html

    #    https://example.com/abc
    #       > output/archive/<timestamp>/example.com/abc.html
    #    https://example.com/abc/
    #       > output/archive/<timestamp>/example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html

    #    https://example.com/abc/test.html
    #       > output/archive/<timestamp>/example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
@@ -326,7 +336,7 @@ def wget_output_path(link):
    #       > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc

    # Since the wget algorithm for -E (appending .html) is incredibly complex
    # and there's no way to get the computed output path from wget

@@ -359,27 +369,6 @@ def wget_output_path(link):

    return None

    # If finding the actual output file didn't work, fall back to the buggy
    # implementation of the wget .html appending algorithm
    # split_url = link['url'].split('#', 1)
    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''

    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
    #     # already ends in .html
    #     return urlencode(base_url(link['url']))
    # else:
    #     # .html needs to be appended
    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
    #     if without_scheme.endswith('/'):
    #         if query:
    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
    #     else:
    #         if query:
    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
    #         elif '/' in without_scheme:
    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
    #         return urlencode(base_url(link['url']) + '/index.html')

### String Manipulation & Logging Helpers

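The removed comment block was the last trace of trying to re-derive wget's -E naming by hand. The approach the surviving comments argue for is to look at what wget actually wrote under the link directory instead of recomputing the name. A rough sketch of that idea; the directory layout, helper name, and matching rules here are assumptions for illustration, not the project's implementation:

import os

def find_wget_output(link_dir, url):
    """Search the domain folder inside link_dir for an html file wget produced."""
    domain_name = url.split('://', 1)[-1].split('/', 1)[0].split('?', 1)[0]
    domain_dir = os.path.join(link_dir, domain_name)
    if not os.path.isdir(domain_dir):
        return None

    for root, _dirs, files in os.walk(domain_dir):
        for filename in files:
            if filename.endswith('.html') or filename.endswith('.htm'):
                # return the path relative to link_dir, like the examples in the comments above
                return os.path.relpath(os.path.join(root, filename), link_dir)
    return None

# e.g. find_wget_output('output/archive/1544423432', 'https://example.com/abc/')
# might return 'example.com/abc/index.html' if that is what wget saved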