Mirror of https://github.com/ArchiveBox/ArchiveBox, synced 2024-11-22 20:23:12 +00:00
better progress output
This commit is contained in:
parent 33ba29ea90
commit 56d382235f

5 changed files with 28 additions and 29 deletions
@@ -25,8 +25,10 @@ from config import (
     ONLY_NEW,
     OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     REPO_DIR,
     ANSI,
     TIMEOUT,
+    SHOW_PROGRESS,
+    GIT_SHA,
 )
 from util import (
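For context, SHOW_PROGRESS is now imported from config alongside the other settings. Its definition is not part of this diff; a minimal sketch of how a boolean flag like this could be derived from the environment (the env lookup and default are assumed, not taken from the repo):

# Sketch only: how a SHOW_PROGRESS-style flag could be read from the environment.
# The variable name matches the import above; everything else is an assumption.
import os

SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', 'True').lower() == 'true'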
@@ -69,21 +71,13 @@ def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
     all_links = validate_links(existing_links + all_links)

     num_new_links = len(all_links) - len(existing_links)
-    if num_new_links and not only_new:
-        print('{green}[+] [{}] Adding {} new links to index from {} ({} format){reset}'.format(
-            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            num_new_links,
-            pretty_path(import_path),
-            parser_name,
-            **ANSI,
-        ))
-    # else:
-    #     print('[*] [{}] No new links added to {}/index.json{}'.format(
-    #         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-    #         archive_path,
-    #         ' from {}'.format(import_path) if import_path else '',
-    #         **ANSI,
-    #     ))
+    if SHOW_PROGRESS:
+        print()
+    print('    > Adding {} new links to index from {} (parsed as {} format)'.format(
+        num_new_links,
+        pretty_path(import_path),
+        parser_name,
+    ))

     if only_new:
         return new_links(all_links, existing_links)
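The net change in merge_links: the multi-line colored announcement and the commented-out else branch are dropped in favor of a short indented status line, with a leading blank line emitted only when progress output is on. A self-contained sketch of that pattern; SHOW_PROGRESS, pretty_path, and the example arguments are stand-ins for illustration, not the module's real state:

# Standalone sketch of the new status line; pretty_path is a hypothetical helper
# and the example values below are invented for illustration.
import os

SHOW_PROGRESS = True

def pretty_path(path):
    # hypothetical helper: shorten a path for display
    return path.replace(os.path.expanduser('~'), '~')

def report_new_links(num_new_links, import_path, parser_name):
    if SHOW_PROGRESS:
        print()
    print('    > Adding {} new links to index from {} (parsed as {} format)'.format(
        num_new_links,
        pretty_path(import_path),
        parser_name,
    ))

report_new_links(12, os.path.expanduser('~/Downloads/bookmarks.html'), 'Netscape HTML')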
@@ -102,7 +96,7 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
             **ANSI,
         ))
     else:
-        print('{green}[▶] [{}] Downloading content for {} pages in archive...{reset}'.format(
+        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             len(links),
             **ANSI,
@@ -119,7 +113,7 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
     else:
         duration = '{0:.2f} sec'.format(seconds, 2)

-    print('{}[√] [{}] Update of {} links complete ({}){}'.format(
+    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
         ANSI['green'],
         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
         len(links),
@@ -129,6 +123,7 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
     print('    - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
     print('    - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
     print('    - {} errors'.format(_RESULTS_TOTALS['failed']))
+    print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))


 if __name__ == '__main__':
@@ -134,8 +134,8 @@ def log_link_archive(link_dir, link, update_existing):
     ))

     print('    > {}{}'.format(pretty_path(link_dir), '' if update_existing else ' (new)'))
-    if link['type']:
-        print('      i {}'.format(link['type']))
+    # if link['type']:
+    #     print('      i {}'.format(link['type']))



@@ -28,14 +28,16 @@ def write_links_index(out_dir, links):
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)

-    write_json_links_index(out_dir, links)
-    write_html_links_index(out_dir, links)
-
-    print('{green}[√] [{}] Updated main index files:{reset}'.format(
+    print('{green}[*] [{}] Updating main index files...{reset}'.format(
         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **ANSI))
+        **ANSI,
+    ))
+    write_json_links_index(out_dir, links)
+    print('    > {}/index.json'.format(pretty_path(out_dir)))

+    write_html_links_index(out_dir, links)
+    print('    > {}/index.html'.format(pretty_path(out_dir)))


 def write_json_links_index(out_dir, links):
     """write the json link index to a given path"""
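write_links_index now announces the update before doing any work and reports each index path right after its writer runs, rather than printing a single "Updated" line at the end. A rough sketch of that announce-then-report ordering; the writer bodies, the dropped ANSI coloring, and the example path are placeholders for illustration:

# Sketch of the announce-then-report ordering; the writer bodies are placeholders.
from datetime import datetime

def write_json_links_index(out_dir, links):
    pass  # placeholder: would write out_dir/index.json

def write_html_links_index(out_dir, links):
    pass  # placeholder: would write out_dir/index.html

def write_links_index(out_dir, links):
    print('[*] [{}] Updating main index files...'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    ))
    write_json_links_index(out_dir, links)
    print('    > {}/index.json'.format(out_dir))

    write_html_links_index(out_dir, links)
    print('    > {}/index.html'.format(out_dir))

write_links_index('output/archive', [])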
@@ -18,6 +18,7 @@ Parsed link schema: {
 """

 import re
+import sys
 import json
 import urllib
 from collections import OrderedDict
@@ -25,7 +26,7 @@ import xml.etree.ElementTree as etree

 from datetime import datetime

-from config import ANSI
+from config import ANSI, SHOW_PROGRESS
 from util import (
     domain,
     base_url,
@@ -60,6 +61,8 @@ def parse_links(path):
         path.rsplit('/', 1)[-1],
         **ANSI,
     ))
+    if SHOW_PROGRESS:
+        sys.stdout.write('    ')

     for parser_name, parser_func in get_parsers(file).items():
         # otherwise try all parsers until one works
@@ -72,8 +75,6 @@ def parse_links(path):
             # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
             pass
-
-    print()

     return links, parser_name


@@ -233,8 +233,9 @@ def fetch_page_title(url, default=True):
         default = url

     try:
-        sys.stdout.write('.')
-        sys.stdout.flush()
+        if SHOW_PROGRESS:
+            sys.stdout.write('.')
+            sys.stdout.flush()
         html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
         match = re.search('<title>(.*?)</title>', html_content)
         return match.group(1) if match else default or None
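Taken together, the parse_links and fetch_page_title changes gate the dot-style progress output on SHOW_PROGRESS: a leading indent is written once, then each title fetch appends a flushed '.' to the same line. A minimal self-contained sketch of the pattern; the URL list and the simulated fetch below are invented for illustration:

# Minimal sketch of SHOW_PROGRESS-gated dot progress on one output line;
# the URL list and the simulated fetch are invented for illustration.
import sys
import time

SHOW_PROGRESS = True

def fetch_page_title(url):
    if SHOW_PROGRESS:
        sys.stdout.write('.')
        sys.stdout.flush()
    time.sleep(0.05)  # stand-in for the real urllib request and title parse
    return url

if SHOW_PROGRESS:
    sys.stdout.write('    ')  # leading indent, as parse_links now does
for url in ('https://example.com/{}'.format(i) for i in range(5)):
    fetch_page_title(url)
print()  # finish the dot line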