2019-03-27 19:32:39 +00:00
|
|
|
import os
|
2019-03-21 05:28:12 +00:00
|
|
|
import sys
|
|
|
|
|
2019-03-27 19:32:39 +00:00
|
|
|
from datetime import datetime
|
|
|
|
from dataclasses import dataclass
|
2019-04-11 12:11:32 +00:00
|
|
|
from typing import Optional, List
|
2019-03-21 05:28:12 +00:00
|
|
|
|
2019-04-02 22:53:21 +00:00
|
|
|
from .schema import Link, ArchiveResult
|
|
|
|
from .config import ANSI, OUTPUT_DIR
|
2019-03-27 19:32:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class RuntimeStats:
    """Mutable counters and timestamps used when logging archiving progress to the CLI."""

    # tallies of link outcomes
    skipped: int = 0
    succeeded: int = 0
    failed: int = 0

    # parsing stage boundaries (None until recorded)
    parse_start_ts: Optional[datetime] = None
    parse_end_ts: Optional[datetime] = None

    # indexing stage boundaries (None until recorded)
    index_start_ts: Optional[datetime] = None
    index_end_ts: Optional[datetime] = None

    # archiving stage boundaries (None until recorded)
    archiving_start_ts: Optional[datetime] = None
    archiving_end_ts: Optional[datetime] = None


# Module-level stats instance shared by the logging functions in this file
# (a global, kept deliberately despite globals being discouraged).
_LAST_RUN_STATS = RuntimeStats()
|
2019-03-21 05:28:12 +00:00
|
|
|
|
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
def pretty_path(path: str) -> str:
    """Shorten a path located under the current working directory to ./relative form.

    For example, with a working directory of /home/me/ArchiveBox the path
    /home/me/ArchiveBox/output/abc becomes ./output/abc.

    Paths that do not start with the working directory are returned unchanged.
    """
    cwd_prefix = os.path.abspath('.') + '/'

    # Only strip a leading occurrence: a blanket str.replace() would also
    # rewrite paths that merely contain the working directory further along.
    if path.startswith(cwd_prefix):
        return './' + path[len(cwd_prefix):]
    return path
|
2019-03-21 05:28:12 +00:00
|
|
|
|
|
|
|
|
2019-03-23 01:38:24 +00:00
|
|
|
### Parsing Stage
|
2019-03-21 05:28:12 +00:00
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
def log_parsing_started(source_file: str):
    """Record the parse start time and print which source file is being parsed."""
    began_at = datetime.now()
    _LAST_RUN_STATS.parse_start_ts = began_at

    # only the last path component of the source file is displayed
    file_name = source_file.rsplit('/', 1)[-1]
    banner = '{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'
    print(banner.format(began_at.strftime('%Y-%m-%d %H:%M:%S'), file_name, **ANSI))
|
|
|
|
|
2019-03-27 19:16:53 +00:00
|
|
|
def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
    """Record the parse end time and print the parsed/new link counts and parser name."""
    _LAST_RUN_STATS.parse_end_ts = datetime.now()

    summary = ' > Parsed {} links as {} ({} new links added)'
    print(summary.format(num_parsed, parser_name, num_new_links))
|
2019-03-21 05:28:12 +00:00
|
|
|
|
2019-03-23 01:38:24 +00:00
|
|
|
|
|
|
|
### Indexing Stage
|
|
|
|
|
2019-03-22 19:09:39 +00:00
|
|
|
def log_indexing_process_started():
    """Record the index start time and print the "saving main index" banner."""
    began_at = datetime.now()
    _LAST_RUN_STATS.index_start_ts = began_at

    stamp = began_at.strftime('%Y-%m-%d %H:%M:%S')
    print()
    print('{green}[*] [{}] Saving main index files...{reset}'.format(stamp, **ANSI))
|
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
def log_indexing_started(out_dir: str, out_file: str):
    """Write an in-progress marker line (without a newline) for an index file.

    The line is left unterminated so that log_indexing_finished() can return
    to the start of the line with '\\r' and overwrite it.
    """
    sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file))
    # No newline was written, so a line-buffered stdout would hold the text
    # back until the next newline; flush so the marker is actually visible.
    sys.stdout.flush()
|
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
def log_indexing_finished(out_dir: str, out_file: str):
    """Record the index end time and print a completed-marker line for an index file."""
    _LAST_RUN_STATS.index_end_ts = datetime.now()

    # leading '\r' moves the cursor back to the start of the current line
    shown_dir = pretty_path(out_dir)
    print('\r √ {}/{}'.format(shown_dir, out_file))
|
2019-03-21 05:28:12 +00:00
|
|
|
|
2019-03-23 01:38:24 +00:00
|
|
|
|
|
|
|
### Archiving Stage
|
|
|
|
|
2019-03-26 23:21:34 +00:00
|
|
|
def log_archiving_started(num_links: int, resume: Optional[float]):
    """Record the archiving start time and print the start (or resume) banner.

    When `resume` is truthy the banner mentions the point being resumed from;
    otherwise a plain "updating" banner is printed.
    """
    began_at = datetime.now()
    _LAST_RUN_STATS.archiving_start_ts = began_at
    stamp = began_at.strftime('%Y-%m-%d %H:%M:%S')

    print()
    if not resume:
        fresh_banner = '{green}[▶] [{}] Updating content for {} pages in archive...{reset}'
        print(fresh_banner.format(stamp, num_links, **ANSI))
        return

    resume_banner = '{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'
    print(resume_banner.format(stamp, num_links, resume, **ANSI))
|
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
def log_archiving_paused(num_links: int, idx: int, timestamp: str):
    """Record the archiving end time and print a "paused" notice with resume instructions.

    `idx` is displayed as idx + 1 out of `num_links`; `timestamp` is shown both
    in the notice and in the suggested command for continuing.
    """
    paused_at = datetime.now()
    _LAST_RUN_STATS.archiving_end_ts = paused_at

    print()
    notice = '\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'
    print(notice.format(
        now=paused_at.strftime('%Y-%m-%d %H:%M:%S'),
        timestamp=timestamp,
        idx=idx + 1,
        total=num_links,
        **ANSI,
    ))

    follow_up = (
        ' To view your archive, open:',
        ' {}/index.html'.format(OUTPUT_DIR),
        ' Continue archiving where you left off by running:',
        ' archivebox {}'.format(timestamp),
    )
    for text in follow_up:
        print(text)
|
2019-03-21 05:28:12 +00:00
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
def log_archiving_finished(num_links: int):
    """Record the archiving end time and print the final run summary.

    The summary shows the elapsed time since the recorded archiving start
    (in minutes when longer than 60 seconds, otherwise in seconds), the
    skipped / updated / errored link counts, and where to open the archive.

    Raises:
        AssertionError: if no archiving start time has been recorded.
    """
    end_ts = datetime.now()
    _LAST_RUN_STATS.archiving_end_ts = end_ts

    # Internal invariant (also narrows the Optional for type checkers):
    # log_archiving_started() is what records archiving_start_ts.
    assert _LAST_RUN_STATS.archiving_start_ts is not None
    seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()

    # The format spec already rounds to two decimals; no extra argument needed.
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)

    print()
    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        end_ts.strftime('%Y-%m-%d %H:%M:%S'),
        num_links,
        duration,
        ANSI['reset'],
    ))
    print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped))
    print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded))
    print(' - {} links had errors'.format(_LAST_RUN_STATS.failed))
    print(' To view your archive, open:')
    print(' {}/index.html'.format(OUTPUT_DIR))
|
2019-03-23 01:38:24 +00:00
|
|
|
|
|
|
|
|
2019-03-27 22:24:30 +00:00
|
|
|
def log_link_archiving_started(link: Link, link_dir: str, is_new: bool):
    """Print a three-line header (title, URL, folder) for the link being processed.

    New links get a green '+' symbol and a '>' folder marker; other links get
    a black '√' symbol and a '√' folder marker.
    """
    # Rough shape of the output for a new link:
    #   [+] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
    #    http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
    #    > output/archive/1478739709
    if is_new:
        color_name, symbol, dir_marker = 'green', '+', '>'
    else:
        color_name, symbol, dir_marker = 'black', '√', '√'

    header = '\n[{symbol_color}{symbol}{reset}] [{symbol_color}{now}{reset}] "{title}"'
    print(header.format(
        symbol_color=ANSI[color_name],
        symbol=symbol,
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        title=link.title or link.base_url,  # fall back to the base URL when untitled
        **ANSI,
    ))
    print(' {blue}{url}{reset}'.format(url=link.url, **ANSI))
    print(' {} {}'.format(dir_marker, pretty_path(link_dir)))
|
|
|
|
|
2019-03-27 22:24:30 +00:00
|
|
|
def log_link_archiving_finished(link: Link, link_dir: str, is_new: bool, stats: dict):
    """Classify one link's result from `stats` and bump the matching run counter.

    A link counts as failed if stats['failed'] > 0, as skipped if
    stats['skipped'] equals the sum of all values in `stats`, and as
    succeeded otherwise. `link`, `link_dir` and `is_new` are not used here.
    """
    overall = sum(stats.values())

    if stats['failed'] > 0:
        _LAST_RUN_STATS.failed += 1
        return

    if stats['skipped'] == overall:
        _LAST_RUN_STATS.skipped += 1
        return

    _LAST_RUN_STATS.succeeded += 1
|
2019-03-23 01:38:24 +00:00
|
|
|
|
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
def log_archive_method_started(method: str):
    """Print a marker line naming the given archive method."""
    print(f' > {method}')
|
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
|
|
|
|
def log_archive_method_finished(result: ArchiveResult):
    """Print troubleshooting details when an archive method result has failed.

    Nothing is printed unless result.status == 'failed'. For a failure the
    output lists the error (class name and message), at most five hint lines
    taken from result.output.hints if present, and the command that was run,
    shell-quoted so it can be copy-pasted into a terminal.
    """
    if result.status != 'failed':
        return

    # Local import keeps this block self-contained.
    import shlex

    # shlex.quote protects every shell metacharacter (spaces, &, ;, ?, quotes...),
    # not only spaces, so URLs with query strings survive a copy-paste.
    quoted_cmd = ' '.join(shlex.quote(str(arg)) for arg in result.cmd)

    # Prettify error output hints and limit them to five lines
    hints = getattr(result.output, 'hints', None) or ()
    if hints:
        # hints may be a list/tuple of lines or one newline-separated string
        hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
        hints = [
            ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
            for line in hints[:5] if line.strip()
        ]

    # Collect the output lines; empty entries are dropped when printing
    output_lines = [
        '{lightred}Failed:{reset}'.format(**ANSI),
        ' {reset}{} {red}{}{reset}'.format(
            result.output.__class__.__name__.replace('ArchiveError', ''),
            result.output,
            **ANSI,
        ),
        *hints,
        '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
        # quote the directory too, so a path containing spaces still works
        *([' cd {};'.format(shlex.quote(str(result.pwd)))] if result.pwd else []),
        ' {}'.format(quoted_cmd),
    ]
    print('\n'.join(
        ' {}'.format(line)
        for line in output_lines
        if line
    ))
    print()
|
2019-04-11 12:11:32 +00:00
|
|
|
|
|
|
|
|
|
|
|
def log_list_started(filter_patterns: List[str], filter_type: str):
    """Print the filter type and the space-joined patterns being searched for."""
    heading = '{green}[*] Finding links in the archive index matching these {} patterns:{reset}'
    print(heading.format(filter_type, **ANSI))

    joined_patterns = ' '.join(filter_patterns)
    print(' ' + joined_patterns)
|
|
|
|
|
|
|
|
def log_list_finished(links):
    """Print the given links as a ' | '-separated table framed by divider lines."""
    from .util import to_csv

    divider = '---------------------------------------------------------------------------------------------------'
    columns = ['timestamp', 'is_archived', 'num_outputs', 'url']

    print()
    print(divider)
    print(to_csv(links, csv_cols=columns, header=True, ljust=16, separator=' | '))
    print(divider)
    print()
|
|
|
|
|
|
|
|
|
|
|
|
def log_removal_started(links: List[Link], yes: bool, delete: bool):
    """Describe a pending removal and, unless `yes` is set, ask for confirmation.

    Args:
        links: the links matched for removal.
        yes: when true, skip the interactive confirmation prompt.
        delete: when true, the message says the data folders will be deleted
            and reports how many existing folders/files are affected.

    Raises:
        SystemExit: with code 0 if the user does not answer 'y' (any case),
            or interrupts / closes the prompt (Ctrl-C, EOF).
    """
    print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI))
    if delete:
        # only links whose folder currently exists on disk are counted
        file_counts = [link.num_outputs for link in links if os.path.exists(link.link_dir)]
        print(
            f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
            f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
        )
    else:
        print(
            ' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
            ' (Pass --delete if you also want to permanently delete the data folders)'
        )

    if not yes:
        print()
        print('{lightyellow}[?] Do you want to proceed with removing these {} links?{reset}'.format(len(links), **ANSI))
        # An explicit check is used instead of `assert`: asserts are stripped
        # under `python -O`, which would silently skip the confirmation.
        try:
            confirmed = input(' y/[n]: ').lower() == 'y'
        except (KeyboardInterrupt, EOFError):
            confirmed = False

        if not confirmed:
            raise SystemExit(0)
|
|
|
|
|
|
|
|
def log_removal_finished(all_links: int, to_keep: int):
    """Print the outcome of a removal: nothing matched, or how many links were removed.

    `all_links` is the count before removal and `to_keep` the count remaining;
    the number removed is reported as their difference.
    """
    if all_links == 0:
        print()
        print('{red}[X] No matching links found.{reset}'.format(**ANSI))
        return

    removed_count = all_links - to_keep
    print()
    report = '{red}[√] Removed {} out of {} links from the archive index.{reset}'
    print(report.format(removed_count, all_links, **ANSI))
    print(' Index now contains {} links.'.format(to_keep))
|