major codebase-wide code cleanups

Nick Sweeting 2019-03-21 01:28:12 -04:00
parent c806068683
commit e6bd1f8ca8
8 changed files with 825 additions and 743 deletions

View file

@ -1,225 +1,132 @@
#!/usr/bin/env python3
# ArchiveBox
# Nick Sweeting 2017 | MIT License
# https://github.com/pirate/ArchiveBox
"""
ArchiveBox command line application.
./archive and ./bin/archivebox both point to this file,
but you can also run it directly using `python3 archive.py`
Usage & Documentation:
https://github.com/pirate/ArchiveBox/Wiki
"""
import os
import sys
from datetime import datetime
from peekable import Peekable
from parse import parse_links
from links import validate_links, links_after_timestamp
from archive_methods import archive_link, _RESULTS_TOTALS
from index import (
write_links_index,
parse_json_links_index,
)
from links import links_after_timestamp
from index import write_links_index, load_links_index
from archive_methods import archive_link
from config import (
ARCHIVE_DIR,
ONLY_NEW,
OUTPUT_DIR,
REPO_DIR,
ANSI,
GIT_SHA,
)
from util import (
check_dependencies,
save_remote_source,
save_stdin_source,
pretty_path,
check_links_structure,
)
from logs import (
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
)
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = GIT_SHA
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
def print_help():
print(__DESCRIPTION__)
print("Documentation: {}\n".format(__DOCUMENTATION__))
print('ArchiveBox: The self-hosted internet archive.\n')
print("Documentation:")
print(" https://github.com/pirate/ArchiveBox/wiki\n")
print("Usage:")
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
print("")
print(" ./bin/archivebox https://example.com/feed.rss\n")
print("")
print(" echo 'https://examplecom' | ./bin/archivebox\n")
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
print(" ./bin/archivebox https://example.com/feed.rss\n")
print(" ./bin/archivebox 15109948213.123\n")
def load_links(archive_path=OUTPUT_DIR, import_path=None):
"""get new links from file and optionally append them to links in existing archive"""
existing_links = []
if archive_path:
existing_links = parse_json_links_index(archive_path)
check_links_structure(existing_links)
new_links = []
if import_path:
# parse and validate the import file
raw_links, parser_name = parse_links(import_path)
new_links = validate_links(raw_links)
check_links_structure(new_links)
# merge existing links in archive_path and new links
all_links = validate_links(existing_links + new_links)
check_links_structure(all_links)
num_new_links = len(all_links) - len(existing_links)
if import_path and parser_name:
print(' > Adding {} new links to index (parsed import as {})'.format(
num_new_links,
parser_name,
))
return all_links, new_links
def update_archive(archive_path, links, source=None, resume=None, append=True):
"""update or create index.html+json given a path to an export file containing new links"""
start_ts = datetime.now().timestamp()
if resume:
print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
resume,
**ANSI,
))
else:
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
len(links),
**ANSI,
))
check_links_structure(links)
# prefetch the first link off the generator so that if we pause or fail
# immediately we can show that we paused on the first link and not just None
to_archive = Peekable(links_after_timestamp(links, resume))
idx, link = 0, to_archive.peek(0)
# loop over links and archive them
try:
check_dependencies()
for idx, link in enumerate(to_archive):
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
archive_link(link_dir, link)
except (KeyboardInterrupt, SystemExit, Exception) as e:
# if isinstance(e, KeyboardInterrupt):
# # Step 4: Re-write links index with updated titles, icons, and resources
# all_links, _ = load_links(archive_path=out_dir)
# write_links_index(out_dir=out_dir, links=all_links, finished=True)
print()
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
**ANSI,
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=link['timestamp'],
total=len(links),
))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
print(' Continue where you left off by running:')
print(' {} {}'.format(
pretty_path(sys.argv[0]),
link['timestamp'],
))
if not isinstance(e, KeyboardInterrupt):
print()
raise e
raise SystemExit(1)
# print timing information & summary
end_ts = datetime.now().timestamp()
seconds = end_ts - start_ts
if seconds > 60:
duration = '{0:.2f} min'.format(seconds / 60, 2)
else:
duration = '{0:.2f} sec'.format(seconds, 2)
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
ANSI['green'],
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
len(links),
duration,
ANSI['reset'],
))
print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
if __name__ == '__main__':
argc = len(sys.argv)
if set(sys.argv).intersection(('-h', '--help', 'help')):
def main(*args):
if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
print_help()
raise SystemExit(0)
source = sys.argv[1] if argc > 1 else None # path of links file to import
resume = sys.argv[2] if argc > 2 else None # timestamp to resume downloading from
stdin_raw_text = ''
### Handle CLI arguments
# ./archive bookmarks.html
# ./archive 1523422111.234
import_path, resume = None, None
if len(args) == 2:
# if the argument is a string, it's an import_path file to import
# if it's a number, it's a timestamp to resume archiving from
if args[1].replace('.', '').isdigit():
import_path, resume = None, args[1]
else:
import_path, resume = args[1], None
### Set up output folder
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
### Handle ingesting urls piped in through stdin
# (e.g. if user does cat example_urls.txt | ./archive)
if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read()
if stdin_raw_text and import_path:
print(
'[X] You should pass either a path as an argument, '
'or pass a list of links via stdin, but not both.\n'
)
print_help()
raise SystemExit(1)
if source and stdin_raw_text:
print(
'[X] You should pass either a path as an argument, '
'or pass a list of links via stdin, but not both.\n'
)
print_help()
raise SystemExit(1)
import_path = save_stdin_source(stdin_raw_text)
### Handle ingesting urls from a remote file/feed
# (e.g. if an RSS feed URL is used as the import path)
if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
import_path = save_remote_source(import_path)
### Run the main archive update process
update_archive_data(import_path=import_path, resume=resume)
if argc == 1:
source, resume = None, None
elif argc == 2:
if all(d.isdigit() for d in sys.argv[1].split('.')):
# argv[1] is a resume timestamp
source, resume = None, sys.argv[1]
else:
# argv[1] is a path to a file to import
source, resume = sys.argv[1].strip(), None
elif argc == 3:
source, resume = sys.argv[1].strip(), sys.argv[2]
else:
print_help()
raise SystemExit(1)
def update_archive_data(import_path=None, resume=None):
"""The main ArchiveBox entrancepoint. Everything starts here."""
check_dependencies()
# See if archive folder already exists
for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
if os.path.exists(out_dir):
break
else:
out_dir = OUTPUT_DIR
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
# Step 0: Download url to local file (only happens if a URL is specified instead of local path)
if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
source = save_remote_source(source)
elif stdin_raw_text:
source = save_stdin_source(stdin_raw_text)
# Step 1: Parse the links and dedupe them with existing archive
all_links, new_links = load_links(archive_path=out_dir, import_path=source)
# Step 2: Write new index
write_links_index(out_dir=out_dir, links=all_links)
# Step 2: Write updated index with deduped old and new links back to disk
write_links_index(out_dir=OUTPUT_DIR, links=all_links)
# Step 3: Run the archive methods for each link
if ONLY_NEW:
update_archive(out_dir, new_links, source=source, resume=resume, append=True)
else:
update_archive(out_dir, all_links, source=source, resume=resume, append=True)
links = new_links if ONLY_NEW else all_links
log_archiving_started(len(links), resume)
idx, link = 0, 0
try:
for idx, link in enumerate(links_after_timestamp(links, resume)):
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
archive_link(link_dir, link)
except KeyboardInterrupt:
log_archiving_paused(len(links), idx, link and link['timestamp'])
raise SystemExit(0)
except:
print()
raise
log_archiving_finished(len(links))
# Step 4: Re-write links index with updated titles, icons, and resources
all_links, _ = load_links(archive_path=out_dir)
write_links_index(out_dir=out_dir, links=all_links, finished=True)
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
if __name__ == '__main__':
main(*sys.argv)
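For context, links_after_timestamp (imported at the top but defined in links.py, which is not part of this diff) is what makes the resume flow above work: it yields only the links whose timestamp sorts at or after the resume value. A minimal sketch of the assumed behavior:

def links_after_timestamp(links, resume=None):
    """yield links whose timestamp is >= the resume value (sketch; the real helper lives in links.py)"""
    if not resume:
        yield from links
        return
    for link in links:
        try:
            if float(link['timestamp']) >= float(resume):
                yield link
        except (ValueError, TypeError):
            # non-numeric resume or timestamp values can't be compared
            print('Resume value and all timestamp values must be valid numbers.')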

View file

@ -3,18 +3,18 @@ import os
from functools import wraps
from collections import defaultdict
from datetime import datetime
from stdlib_patches import run, PIPE, DEVNULL
from index import (
parse_json_link_index,
write_link_index,
update_main_index,
patch_links_index,
load_json_link_index,
)
from config import (
CURL_BINARY,
GIT_BINARY,
WGET_BINARY,
YOUTUBEDL_BINARY,
CHROME_BINARY,
FETCH_FAVICON,
FETCH_TITLE,
FETCH_WGET,
@ -25,62 +25,37 @@ from config import (
FETCH_WARC,
FETCH_GIT,
FETCH_MEDIA,
RESOLUTION,
CHECK_SSL_VALIDITY,
SUBMIT_ARCHIVE_DOT_ORG,
COOKIES_FILE,
WGET_USER_AGENT,
CHROME_USER_AGENT,
CHROME_USER_DATA_DIR,
CHROME_HEADLESS,
CHROME_SANDBOX,
TIMEOUT,
MEDIA_TIMEOUT,
ANSI,
ARCHIVE_DIR,
OUTPUT_DIR,
GIT_DOMAINS,
GIT_SHA,
WGET_USER_AGENT,
CHECK_SSL_VALIDITY,
COOKIES_FILE,
)
from util import (
domain,
extension,
without_query,
without_fragment,
fetch_page_title,
is_static_file,
progress,
chmod_file,
pretty_path,
print_error_hints,
check_link_structure,
wget_output_path,
run, PIPE, DEVNULL,
chrome_args,
)
from logs import (
_LAST_RUN_STATS,
log_link_archiving_started,
log_link_archiving_failed,
)
_RESULTS_TOTALS = { # globals are bad, mmkay
'skipped': 0,
'succeded': 0,
'failed': 0,
}
def load_link_index(link_dir, link):
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
is_new = not os.path.exists(link_dir)
if is_new:
os.makedirs(link_dir)
else:
link = {
**parse_json_link_index(link_dir),
**link,
}
check_link_structure(link)
print_link_status_line(link_dir, link, is_new)
return link
class ArchiveError(Exception):
def __init__(self, message, hints=None):
@ -105,32 +80,24 @@ def archive_link(link_dir, link, overwrite=True):
active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]
try:
link = load_link_index(link_dir, link)
is_new = not os.path.exists(link_dir)
if is_new:
os.makedirs(link_dir)
link = load_json_link_index(link_dir, link)
log_link_archiving_started(link_dir, link, is_new)
for archive_method in active_methods:
archive_method(link_dir, link, overwrite=overwrite)
write_link_index(link_dir, link)
update_main_index(link)
patch_links_index(link)
except Exception as err:
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
return link
def print_link_status_line(link_dir, link, is_new):
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
symbol='+' if is_new else '*',
symbol_color=ANSI['green' if is_new else 'black'],
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
**{**link, 'title': link['title'] or link['url']},
**ANSI,
))
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
def attach_result_to_link(method):
"""
@ -178,15 +145,75 @@ def attach_result_to_link(method):
link['history'][method].append(history_entry)
link['latest'][method] = result['output']
_RESULTS_TOTALS[history_entry['status']] += 1
_LAST_RUN_STATS[history_entry['status']] += 1
return link
return timed_fetch_func
return decorator
@attach_result_to_link('title')
def fetch_title(link_dir, link, timeout=TIMEOUT):
"""try to guess the page's title from its content"""
# if link already has valid title, skip it
if link['title'] and not link['title'].lower().startswith('http'):
return {'output': link['title'], 'status': 'skipped'}
if is_static_file(link['url']):
return {'output': None, 'status': 'skipped'}
end = progress(timeout, prefix=' ')
try:
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
end()
output = title
except Exception as e:
end()
output = e
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
if title and title.strip():
link['title'] = title
output = title
return {
'cmd': 'fetch_page_title("{}")'.format(link['url']),
'output': output,
}
@attach_result_to_link('favicon')
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
"""download site favicon from google's favicon api"""
output = 'favicon.ico'
if os.path.exists(os.path.join(link_dir, output)):
return {'output': output, 'status': 'skipped'}
CMD = [
CURL_BINARY,
'--max-time', str(timeout),
'--location',
'--output', output,
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
]
end = progress(timeout, prefix=' ')
try:
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
end()
chmod_file(output, cwd=link_dir)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('wget')
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
def fetch_wget(link_dir, link, timeout=TIMEOUT):
"""download full site using wget"""
domain_dir = os.path.join(link_dir, domain(link['url']))
@ -194,7 +221,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
if os.path.exists(domain_dir) and existing_file:
return {'output': existing_file, 'status': 'skipped'}
if warc:
if FETCH_WARC:
warc_dir = os.path.join(link_dir, 'warc')
os.makedirs(warc_dir, exist_ok=True)
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
@ -213,8 +240,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
'-e', 'robots=off',
'--restrict-file-names=unix',
'--timeout={}'.format(timeout),
*(() if warc else ('--timestamping',)),
*(('--warc-file={}'.format(warc_path),) if warc else ()),
*(() if FETCH_WARC else ('--timestamping',)),
*(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
*(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
*(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
@ -233,7 +260,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
if line.strip()
]
# parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
# parse out number of files downloaded from last line of stderr:
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
files_downloaded = (
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
if 'Downloaded:' in output_tail[-1]
@ -263,20 +291,19 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
'output': output,
}
@attach_result_to_link('pdf')
def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
def fetch_pdf(link_dir, link, timeout=TIMEOUT):
"""print PDF of site to file using chrome --headless"""
if is_static_file(link['url']):
return {'output': wget_output_path(link), 'status': 'skipped'}
return {'output': None, 'status': 'skipped'}
output = 'output.pdf'
if os.path.exists(os.path.join(link_dir, output)):
return {'output': output, 'status': 'skipped'}
CMD = [
*chrome_headless(timeout=timeout, **chrome_kwargs),
*chrome_args(timeout=timeout),
'--print-to-pdf',
link['url']
]
@ -302,18 +329,18 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
}
@attach_result_to_link('screenshot')
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
"""take screenshot of site using chrome --headless"""
if is_static_file(link['url']):
return {'output': wget_output_path(link), 'status': 'skipped'}
return {'output': None, 'status': 'skipped'}
output = 'screenshot.png'
if os.path.exists(os.path.join(link_dir, output)):
return {'output': output, 'status': 'skipped'}
CMD = [
*chrome_headless(timeout=timeout, **chrome_kwargs),
*chrome_args(timeout=timeout),
'--screenshot',
link['url'],
]
@ -337,18 +364,19 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
}
@attach_result_to_link('dom')
def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
def fetch_dom(link_dir, link, timeout=TIMEOUT):
"""print HTML of site to file using chrome --dump-html"""
if is_static_file(link['url']):
return {'output': wget_output_path(link), 'status': 'skipped'}
return {'output': None, 'status': 'skipped'}
output = 'output.html'
if os.path.exists(os.path.join(link_dir, output)):
output_path = os.path.join(link_dir, output)
if os.path.exists(output_path):
return {'output': output, 'status': 'skipped'}
CMD = [
*chrome_headless(timeout=timeout, **chrome_kwargs),
*chrome_args(timeout=timeout),
'--dump-dom',
link['url']
]
@ -372,6 +400,116 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
'output': output,
}
@attach_result_to_link('git')
def fetch_git(link_dir, link, timeout=TIMEOUT):
"""download full site using git"""
is_clonable_url = (
domain(link['url']) in GIT_DOMAINS
or extension(link['url']) == 'git'
)
if is_static_file(link['url']) or not is_clonable_url:
return {'output': None, 'status': 'skipped'}
output = 'git'
output_path = os.path.join(link_dir, 'git')
if os.path.exists(output_path):
return {'output': output, 'status': 'skipped'}
os.makedirs(output_path, exist_ok=True)
CMD = [
GIT_BINARY,
'clone',
'--mirror',
'--recursive',
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
without_query(without_fragment(link['url'])),
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
end()
if result.returncode == 128:
# ignore failed re-download when the folder already exists
pass
elif result.returncode > 0:
hints = 'got git response code {}:'.format(result.returncode)
raise ArchiveError('Failed git download', hints)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('media')
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
output = 'media'
output_path = os.path.join(link_dir, 'media')
if os.path.exists(output_path) and not overwrite:
return {'output': output, 'status': 'skipped'}
os.makedirs(output_path, exist_ok=True)
CMD = [
YOUTUBEDL_BINARY,
'--write-description',
'--write-info-json',
'--write-annotations',
'--yes-playlist',
'--write-thumbnail',
'--no-call-home',
'--no-check-certificate',
'--user-agent',
'--all-subs',
'--extract-audio',
'--keep-video',
'--ignore-errors',
'--geo-bypass',
'--audio-format', 'mp3',
'--audio-quality', '320K',
'--embed-thumbnail',
'--add-metadata',
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
link['url'],
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=link_dir)
end()
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'got youtubedl response code {}:'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to download media', hints)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
def parse_archive_dot_org_response(response):
# Parse archive.org response headers
headers = defaultdict(list)
@ -445,226 +583,4 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
'output': output,
}
@attach_result_to_link('favicon')
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
"""download site favicon from google's favicon api"""
output = 'favicon.ico'
if os.path.exists(os.path.join(link_dir, output)):
return {'output': output, 'status': 'skipped'}
CMD = [
CURL_BINARY,
'--max-time', str(timeout),
'--location',
'--output', output,
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
]
end = progress(timeout, prefix=' ')
try:
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
end()
chmod_file(output, cwd=link_dir)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('title')
def fetch_title(link_dir, link, timeout=TIMEOUT):
"""try to guess the page's title from its content"""
# if link already has valid title, skip it
if link['title'] and not link['title'].lower().startswith('http'):
return {'output': link['title'], 'status': 'skipped'}
if is_static_file(link['url']):
return {'output': None, 'status': 'skipped'}
end = progress(timeout, prefix=' ')
try:
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
end()
output = title
except Exception as e:
end()
output = e
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
if title and title.strip():
link['title'] = title
output = title
return {
'cmd': 'fetch_page_title("{}")'.format(link['url']),
'output': output,
}
@attach_result_to_link('media')
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
output = 'media'
output_path = os.path.join(link_dir, 'media')
if os.path.exists(output_path) and not overwrite:
return {'output': output, 'status': 'skipped'}
os.makedirs(output_path, exist_ok=True)
CMD = [
YOUTUBEDL_BINARY,
'--write-description',
'--write-info-json',
'--write-annotations',
'--yes-playlist',
'--write-thumbnail',
'--no-call-home',
'--no-check-certificate',
'--user-agent',
'--all-subs',
'--extract-audio',
'--keep-video',
'--ignore-errors',
'--geo-bypass',
'--audio-format', 'mp3',
'--audio-quality', '320K',
'--embed-thumbnail',
'--add-metadata',
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
link['url'],
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=link_dir)
end()
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'got youtubedl response code {}:'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to download media', hints)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('git')
def fetch_git(link_dir, link, timeout=TIMEOUT):
"""download full site using git"""
url_is_clonable = (
domain(link['url']) in GIT_DOMAINS
or link['url'].endswith('.git')
)
if not url_is_clonable or is_static_file(link['url']):
return {'output': None, 'status': 'skipped'}
output = 'git'
output_path = os.path.join(link_dir, 'git')
if os.path.exists(output_path):
return {'output': output, 'status': 'skipped'}
os.makedirs(output_path, exist_ok=True)
CMD = [
GIT_BINARY,
'clone',
'--mirror',
'--recursive',
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
without_query(without_fragment(link['url'])),
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
end()
if result.returncode == 128:
# ignore failed re-download when the folder already exists
pass
elif result.returncode > 0:
hints = 'got git response code {}:'.format(result.returncode)
raise ArchiveError('Failed git download', hints)
except Exception as e:
end()
output = e
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
return {
'cmd': CMD,
'output': output,
}
def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX, check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT, resolution=RESOLUTION, timeout=TIMEOUT):
global CACHED_USER_DATA_DIR
user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
cmd_args = [binary]
if headless:
cmd_args += ('--headless',)
if not sandbox:
# dont use GPU or sandbox when running inside docker container
cmd_args += ('--no-sandbox', '--disable-gpu')
if not check_ssl_validity:
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
if user_agent:
cmd_args += ('--user-agent={}'.format(user_agent),)
if resolution:
cmd_args += ('--window-size={}'.format(RESOLUTION),)
if timeout:
cmd_args += ('--timeout={}'.format((timeout) * 1000),)
# Find chrome user data directory
default_profile_paths = (
'~/.config/chromium',
'~/.config/google-chrome',
'~/.config/google-chrome-beta',
'~/.config/google-chrome-unstable',
'~/Library/Application Support/Chromium',
'~/Library/Application Support/Google/Chrome',
'~/Library/Application Support/Google/Chrome Canary',
'~/AppData/Local/Chromium/User Data',
'~/AppData/Local/Google/Chrome/User Data',
'~/AppData/Local/Google/Chrome SxS/User Data',
)
if user_data_dir:
cmd_args.append('--user-data-dir={}'.format(user_data_dir))
else:
for path in default_profile_paths:
full_path = os.path.expanduser(path)
if os.path.exists(full_path):
CACHED_USER_DATA_DIR = full_path
cmd_args.append('--user-data-dir={}'.format(full_path))
break
return cmd_args
CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR

View file

@ -12,18 +12,24 @@ except ImportError:
from config import (
OUTPUT_DIR,
TEMPLATES_DIR,
ANSI,
GIT_SHA,
FOOTER_INFO,
)
from util import (
chmod_file,
derived_link_info,
pretty_path,
check_link_structure,
check_links_structure,
wget_output_path,
)
from parse import parse_links
from links import validate_links
from logs import (
log_indexing_started,
log_indexing_finished,
log_parsing_started,
log_parsing_finished,
)
TITLE_LOADING_MSG = 'Not yet archived...'
@ -33,21 +39,40 @@ TITLE_LOADING_MSG = 'Not yet archived...'
def write_links_index(out_dir, links, finished=False):
"""create index.html file for a given list of links"""
log_indexing_started()
check_links_structure(links)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
print('{green}[*] [{}] Saving main index files...{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
**ANSI,
))
write_json_links_index(out_dir, links)
print(' > {}/index.json'.format(pretty_path(out_dir)))
log_indexing_finished(out_dir, 'index.json')
write_html_links_index(out_dir, links, finished=finished)
print(' > {}/index.html'.format(pretty_path(out_dir)))
log_indexing_finished(out_dir, 'index.html')
def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
"""parse and load existing index with any new links from import_path merged in"""
existing_links = []
if out_dir:
existing_links = parse_json_links_index(out_dir)
check_links_structure(existing_links)
new_links = []
if import_path:
# parse and validate the import file
log_parsing_started(import_path)
raw_links, parser_name = parse_links(import_path)
new_links = validate_links(raw_links)
check_links_structure(new_links)
# merge existing links in out_dir and new links
all_links = validate_links(existing_links + new_links)
check_links_structure(all_links)
num_new_links = len(all_links) - len(existing_links)
if import_path and parser_name:
log_parsing_finished(num_new_links, parser_name)
return all_links, new_links
def write_json_links_index(out_dir, links):
"""write the json link index to a given path"""
@ -70,8 +95,8 @@ def write_json_links_index(out_dir, links):
chmod_file(path)
def parse_json_links_index(out_dir):
"""load the index in a given directory and merge it with the given link"""
def parse_json_links_index(out_dir=OUTPUT_DIR):
"""parse a archive index json file and return the list of links"""
index_path = os.path.join(out_dir, 'index.json')
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
@ -136,31 +161,26 @@ def write_html_links_index(out_dir, links, finished=False):
chmod_file(path)
def update_main_index(link):
def patch_links_index(link, out_dir=OUTPUT_DIR):
"""hack to in-place update one row's info in the generated index html"""
title = link['latest']['title']
successful = len([entry for entry in link['latest'].values() if entry])
# Patch JSON index
json_path = os.path.join(OUTPUT_DIR, 'index.json')
links = parse_json_links_index(OUTPUT_DIR)
changed = False
for json_link in links:
if json_link['url'] == link['url']:
json_link['title'] = title
json_link['latest'] = link['latest']
json_file_links = parse_json_links_index(out_dir)
for saved_link in json_file_links:
if saved_link['url'] == link['url']:
saved_link['title'] = title
saved_link['latest'] = link['latest']
changed = True
break
if changed:
write_json_links_index(OUTPUT_DIR, links)
write_json_links_index(out_dir, json_file_links)
# Patch HTML index
html_path = os.path.join(OUTPUT_DIR, 'index.html')
html_path = os.path.join(out_dir, 'index.html')
html = open(html_path, 'r').read().split('\n')
for idx, line in enumerate(html):
if title and ('<span data-title-for="{}"'.format(link['url']) in line):
@ -172,6 +192,7 @@ def update_main_index(link):
with open(html_path, 'w') as f:
f.write('\n'.join(html))
### Individual link index
def write_link_index(out_dir, link):
@ -202,6 +223,18 @@ def parse_json_link_index(out_dir):
return link_json
return {}
def load_json_link_index(out_dir, link):
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
link = {
**parse_json_link_index(out_dir),
**link,
}
check_link_structure(link)
return link
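Note on merge order in load_json_link_index above: because the in-memory link is unpacked last, its values win over whatever was previously saved on disk, while saved-only keys survive. A tiny illustration with made-up values:

saved = {'url': 'https://example.com', 'title': 'Old saved title', 'latest': {'wget': 'example.com/index.html'}}
incoming = {'url': 'https://example.com', 'title': 'Example.com Page Title'}

merged = {**saved, **incoming}
assert merged['title'] == 'Example.com Page Title'             # keys present in both take the incoming value
assert merged['latest'] == {'wget': 'example.com/index.html'}  # keys only present on disk are preserved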
def write_html_link_index(out_dir, link):
check_link_structure(link)
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
@ -224,7 +257,10 @@ def write_html_link_index(out_dir, link):
wget_output_path(link)
or (link['domain'] if link['is_archived'] else 'about:blank')
),
'extension': link['extension'] or 'HTML',
'extension': link['extension'] or 'html',
'tags': link['tags'].strip() or 'untagged',
'status': 'Archived' if link['is_archived'] else 'Not yet archived',
'status_color': 'success' if link['is_archived'] else 'danger',
}))
chmod_file(path)
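The tags/status/status_color fields added to the substitution dict above line up with the new $-placeholders in templates/link_index.html further down in this commit; given the $-style syntax, the rendering is presumably stdlib string.Template substitution, roughly like this sketch (values are illustrative):

from string import Template

link_info = {
    'extension': 'html',
    'tags': 'abc,def',
    'status': 'Archived',
    'status_color': 'success',
}
row = Template('Type: $extension | Tags: $tags | Status: <span class="badge badge-$status_color">$status</span>')
print(row.substitute(link_info))
# Type: html | Tags: abc,def | Status: <span class="badge badge-success">Archived</span>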

archivebox/logs.py (new file, 161 lines)
View file

@ -0,0 +1,161 @@
import sys
from datetime import datetime
from config import ANSI, REPO_DIR, OUTPUT_DIR
# globals are bad, mmkay
_LAST_RUN_STATS = {
'skipped': 0,
'succeded': 0,
'failed': 0,
'parsing_start_ts': 0,
'parsing_end_ts': 0,
'indexing_start_ts': 0,
'indexing_end_ts': 0,
'archiving_start_ts': 0,
'archiving_end_ts': 0,
'links': {},
}
def pretty_path(path):
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
return path.replace(REPO_DIR + '/', '')
def log_link_archiving_started(link_dir, link, is_new):
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
symbol='+' if is_new else '*',
symbol_color=ANSI['green' if is_new else 'black'],
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
**{**link, 'title': link['title'] or link['url']},
**ANSI,
))
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
def log_link_archiving_failed(cmd, pwd, err=None, hints=None, prefix=' '):
"""quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd
"""
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if ' ' in arg else arg
for arg in cmd
)
# Prettify error output hints string and limit to five lines
hints = hints or getattr(err, 'hints', None)
if hints:
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
)
else:
hints = ()
output_lines = [
'{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
*hints,
'Run to see full output:',
' cd {};'.format(pwd),
' {}'.format(quoted_cmd),
]
return '\n'.join(
'{}{}'.format(prefix, line)
for line in output_lines
if line
)
### Logging Helpers
def log_parsing_started(source_file):
start_ts = datetime.now()
_LAST_RUN_STATS['parse_start_ts'] = start_ts
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
source_file.rsplit('/', 1)[-1],
**ANSI,
))
def log_parsing_finished(num_new_links, parser_name):
print(' > Adding {} new links to index (parsed import as {})'.format(
num_new_links,
parser_name,
))
def log_indexing_started():
start_ts = datetime.now()
_LAST_RUN_STATS['index_start_ts'] = start_ts
print('{green}[*] [{}] Saving main index files...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
**ANSI,
))
def log_indexing_finished(out_dir, out_file):
end_ts = datetime.now()
_LAST_RUN_STATS['index_end_ts'] = end_ts
print(' > {}/{}'.format(pretty_path(out_dir), out_file))
def log_archiving_started(num_links, resume):
start_ts = datetime.now()
_LAST_RUN_STATS['start_ts'] = start_ts
if resume:
print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
resume,
**ANSI,
))
else:
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
**ANSI,
))
def log_archiving_paused(num_links, idx, timestamp):
end_ts = datetime.now()
_LAST_RUN_STATS['end_ts'] = end_ts
print()
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
**ANSI,
now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=timestamp,
total=num_links,
))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
print(' Continue where you left off by running:')
print(' {} {}'.format(
pretty_path(sys.argv[0]),
timestamp,
))
def log_archiving_finished(num_links):
end_ts = datetime.now()
_LAST_RUN_STATS['end_ts'] = end_ts
seconds = end_ts.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp()
if seconds > 60:
duration = '{0:.2f} min'.format(seconds / 60, 2)
else:
duration = '{0:.2f} sec'.format(seconds, 2)
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
ANSI['green'],
end_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
duration,
ANSI['reset'],
))
print(' - {} entries skipped'.format(_LAST_RUN_STATS['skipped']))
print(' - {} entries updated'.format(_LAST_RUN_STATS['succeded']))
print(' - {} errors'.format(_LAST_RUN_STATS['failed']))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
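The helpers in this new logs.py are meant to bracket a whole archiving run; a minimal usage sketch of the intended call order (the driver function and loop body are hypothetical, mirroring update_archive_data in archive.py):

def run_with_logging(links, resume=None):
    log_archiving_started(len(links), resume)
    idx, link = 0, {'timestamp': None}
    try:
        for idx, link in enumerate(links):
            pass  # archive_link(link_dir, link) would go here
    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link['timestamp'])
        raise SystemExit(0)
    log_archiving_finished(len(links))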

View file

@ -1,17 +1,19 @@
# coding: utf-8
"""
Everything related to parsing links from bookmark services.
Everything related to parsing links from input sources.
For a list of supported services, see the README.md.
For examples of supported files see examples/.
For examples of supported import formats see tests/.
Parsed link schema: {
Link: {
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
'timestamp': '15442123124234',
'timestamp': '1544212312.4234',
'title': 'Example.com Page Title',
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
'tags': 'abc,def',
'sources': [
'output/sources/ril_export.html',
'output/sources/getpocket.com-1523422111.txt',
'output/sources/stdin-234234112312.txt'
]
}
"""
@ -19,45 +21,59 @@ import re
import json
from datetime import datetime
from collections import OrderedDict
import xml.etree.ElementTree as etree
from config import ANSI
from config import TIMEOUT
from util import (
str_between,
URL_REGEX,
check_url_parsing,
check_url_parsing_invariants,
progress,
)
def parse_links(path):
"""parse a list of links dictionaries from a bookmark export file"""
check_url_parsing()
def parse_links(source_file):
"""parse a list of URLs with their metadata from an
RSS feed, bookmarks export, or text file
"""
links = []
with open(path, 'r', encoding='utf-8') as file:
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
path.rsplit('/', 1)[-1],
**ANSI,
))
check_url_parsing_invariants()
PARSERS = (
# Specialized parsers
('Pocket HTML', parse_pocket_html_export),
('Pinboard RSS', parse_pinboard_rss_export),
('Shaarli RSS', parse_shaarli_rss_export),
('Medium RSS', parse_medium_rss_export),
# General parsers
('Netscape HTML', parse_netscape_html_export),
('Generic RSS', parse_rss_export),
('Generic JSON', parse_json_export),
for parser_name, parser_func in PARSERS.items():
# Fallback parser
('Plain Text', parse_plain_text_export),
)
end = progress(TIMEOUT * 4, prefix=' ')
with open(source_file, 'r', encoding='utf-8') as file:
for parser_name, parser_func in PARSERS:
try:
links += list(parser_func(file))
links = list(parser_func(file))
if links:
break
end()
return links, parser_name
except Exception as err:
# we try each parser one by one, wrong parsers will throw exceptions
# if unsupported and we accept the first one that passes
# uncomment the following line to see why the parser was unsupported for each attempted format
# Parsers are tried one by one down the list, and the first one
# that succeeds is used. To see why a certain parser was not used
# due to error or format incompatibility, uncomment this line:
# print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
pass
return links, parser_name
end()
return [], 'Plain Text'
### Import Parser Functions
def parse_pocket_html_export(html_file):
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
@ -81,40 +97,57 @@ def parse_pocket_html_export(html_file):
'sources': [html_file.name],
}
def parse_pinboard_json_export(json_file):
def parse_json_export(json_file):
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
json_file.seek(0)
json_content = json.load(json_file)
for line in json_content:
links = json.load(json_file)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
for link in links:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if line:
erg = line
if erg.get('timestamp'):
timestamp = str(erg['timestamp']/10000000) # chrome/ff histories use a very precise timestamp
elif erg.get('time'):
timestamp = str(datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ').timestamp())
elif erg.get('created_at'):
timestamp = str(datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z').timestamp())
else:
timestamp = str(datetime.now().timestamp())
if erg.get('href'):
url = erg['href']
else:
url = erg['url']
if erg.get('description'):
title = (erg.get('description') or '').replace(' — Readability', '')
else:
title = erg['title'].strip()
if link:
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
info = {
# Parse the timestamp
ts_str = str(datetime.now().timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip() or None
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip() or None
elif link.get('name'):
title = link['name'].strip() or None
yield {
'url': url,
'timestamp': timestamp,
'title': title or None,
'tags': erg.get('tags') or '',
'timestamp': ts_str,
'title': title,
'tags': link.get('tags') or '',
'sources': [json_file.name],
}
yield info
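A worked example of the timestamp normalization above (numbers are illustrative): browser-history style timestamps are scaled down to epoch seconds by dividing by 10,000,000, while the ISO-style string fields fall through to the json_date strptime helper.

from datetime import datetime

ts = 15442123124234567                 # very precise browser-history style value
print(str(ts / 10000000))              # roughly 1544212312.42, a Unix timestamp in seconds

json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
print(str(json_date('2018-12-07T19:51:52+0000').timestamp()))   # '1544212312.0'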
def parse_rss_export(rss_file):
@ -139,15 +172,15 @@ def parse_rss_export(rss_file):
def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip() or None
yield {
'url': url,
'timestamp': str(time.timestamp()),
'title': title or None,
'title': title,
'tags': '',
'sources': [rss_file.name],
}
@ -224,9 +257,6 @@ def parse_pinboard_rss_export(rss_file):
tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text if item.find("{http://purl.org/dc/elements/1.1/}subject") else None
title = item.find("{http://purl.org/rss/1.0/}title").text.strip() if item.find("{http://purl.org/rss/1.0/}title").text.strip() else None
ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text if item.find("{http://purl.org/dc/elements/1.1/}date").text else None
# = 🌈🌈🌈🌈
# = 🌈🌈🌈🌈
# = 🏆🏆🏆🏆
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
@ -254,8 +284,6 @@ def parse_medium_rss_export(rss_file):
root = etree.parse(rss_file).getroot()
items = root.find("channel").findall("item")
for item in items:
# for child in item:
# print(child.tag, child.text)
url = item.find("link").text
title = item.find("title").text.strip()
ts_str = item.find("pubDate").text
@ -274,31 +302,13 @@ def parse_plain_text_export(text_file):
"""Parse raw links from each line in a text file"""
text_file.seek(0)
text_content = text_file.readlines()
for line in text_content:
if line:
urls = re.findall(URL_REGEX, line)
for url in urls:
url = url.strip()
time = datetime.now()
yield {
'url': url,
'timestamp': str(time.timestamp()),
'title': None,
'tags': '',
'sources': [text_file.name],
}
PARSERS = OrderedDict([
('Pocket HTML', parse_pocket_html_export),
('Pinboard JSON', parse_pinboard_json_export),
('Netscape HTML', parse_netscape_html_export),
('RSS', parse_rss_export),
('Pinboard RSS', parse_pinboard_rss_export),
('Shaarli RSS', parse_shaarli_rss_export),
('Medium RSS', parse_medium_rss_export),
('Plain Text', parse_plain_text_export),
])
for line in text_file.readlines():
urls = re.findall(URL_REGEX, line) if line.strip() else ()
for url in urls:
yield {
'url': url,
'timestamp': str(datetime.now().timestamp()),
'title': None,
'tags': '',
'sources': [text_file.name],
}
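A short usage sketch of the fallback chain described above: parse_links() returns whatever the first successful parser produced, plus that parser's name (the source file path here is hypothetical):

links, parser_name = parse_links('output/sources/bookmarks-1523422111.txt')
print('Parsed {} links using the {} parser'.format(len(links), parser_name))
for link in links[:3]:
    print(link['timestamp'], link['url'])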

View file

@ -1,10 +1,64 @@
"""
Patches, additions, and shortcuts for Python standard library functions.
"""
### subprocess
from subprocess import (
Popen,
PIPE,
DEVNULL,
CompletedProcess,
TimeoutExpired,
CalledProcessError,
)
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched of subprocess.run to fix blocking io making timeout=innefective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
with Popen(*popenargs, **kwargs) as process:
try:
stdout, stderr = process.communicate(input, timeout=timeout)
except TimeoutExpired:
process.kill()
try:
stdout, stderr = process.communicate(input, timeout=2)
except:
pass
raise TimeoutExpired(popenargs[0][0], timeout)
except BaseException as err:
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
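A small usage sketch of the patched run() above: unlike a naive subprocess call, a hung child is killed once the timeout expires instead of blocking forever on its pipes (the sleep command is just an illustration):

from stdlib_patches import run, PIPE, TimeoutExpired

try:
    result = run(['sleep', '30'], stdout=PIPE, stderr=PIPE, timeout=2)
    print(result.returncode, result.stdout)
except TimeoutExpired:
    print('command was killed after exceeding the 2s timeout')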
### collections
from sys import maxsize
from itertools import islice
from collections import deque
_marker = object()
class Peekable(object):
class PeekableGenerator:
"""Peekable version of a normal python generator.
Useful when you don't want to evaluate the entire iterable to look at
a specific item at a given idx.
@ -74,8 +128,6 @@ class Peekable(object):
return next(self._it)
next = __next__ # For Python 2 compatibility
def _get_slice(self, index):
# Normalize the slice's arguments
step = 1 if (index.step is None) else index.step

View file

@ -192,22 +192,27 @@
Bookmarked: <small title="Timestamp: $timestamp">$bookmarked_date</small>
&nbsp; | &nbsp;
Last updated: <small title="Timestamp: $updated">$updated_date</small>
&nbsp; | &nbsp;
Total files: <small title="Archive methods">🗃 $num_outputs</small>
</div>
<div class="col-lg-4 alert well">
Type:
<span class="badge badge-default">$extension</span>
&nbsp; | &nbsp;
Tags:
<span class="badge badge-success">$tags</span>
<span class="badge badge-warning">$tags</span>
&nbsp; | &nbsp;
Status:
<span class="badge badge-$status_color">$status</span>
</div>
<div class="col-lg-4 alert well">
Download:
Archive Methods:
<a href="index.json" title="JSON summary of archived link.">JSON</a> |
<a href="warc/" title="Any WARC archives for the page">WARC</a> |
<a href="media/" title="Audio, Video, and Subtitle files.">Media</a> |
<a href="git/" title="Any git repos at the url">Git Repos</a> |
<a href="favicon.ico" title="Any git repos at the url">Favicon</a> |
<a href="." title="Webserver-provided index of files directory.">More files...</a>
<a href="." title="Webserver-provided index of files directory.">See all files...</a>
</div>
<hr/>
<div class="col-lg-2">

View file

@ -8,8 +8,8 @@ from urllib.parse import urlparse, quote
from decimal import Decimal
from datetime import datetime
from multiprocessing import Process
from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
from stdlib_patches import run, PIPE, DEVNULL
from config import (
ANSI,
TERM_WIDTH,
@ -19,8 +19,6 @@ from config import (
OUTPUT_PERMISSIONS,
TIMEOUT,
SHOW_PROGRESS,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CURL_BINARY,
WGET_BINARY,
CHROME_BINARY,
@ -37,6 +35,13 @@ from config import (
FETCH_MEDIA,
SUBMIT_ARCHIVE_DOT_ORG,
ARCHIVE_DIR_NAME,
RESOLUTION,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CHROME_USER_AGENT,
CHROME_USER_DATA_DIR,
CHROME_HEADLESS,
CHROME_SANDBOX,
)
### Parsing Helpers
@ -56,6 +61,7 @@ extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basen
base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
short_ts = lambda ts: ts.split('.')[0]
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
URL_REGEX = re.compile(
r'http[s]?://' # start matching from allowed schemes
@ -109,66 +115,74 @@ def check_links_structure(links):
def check_dependencies():
"""Check that all necessary dependencies are installed, and have valid versions"""
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
if python_vers < 3.5:
print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
print(' See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.')
try:
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
if python_vers < 3.5:
print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(1)
if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
if FETCH_WGET or FETCH_WARC:
if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
# parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
try:
result = run([CHROME_BINARY, '--version'], stdout=PIPE)
version_str = result.stdout.decode('utf-8')
version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
version = [l for l in version_lines if l.isdigit()][-1]
if int(version) < 59:
print(version_lines)
print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
except (IndexError, TypeError, OSError):
print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
if FETCH_GIT:
if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
if FETCH_MEDIA:
if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
except (KeyboardInterrupt, Exception):
raise SystemExit(1)
if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
if FETCH_WGET or FETCH_WARC:
if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
# parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
try:
result = run([CHROME_BINARY, '--version'], stdout=PIPE)
version_str = result.stdout.decode('utf-8')
version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
version = [l for l in version_lines if l.isdigit()][-1]
if int(version) < 59:
print(version_lines)
print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
except (IndexError, TypeError, OSError):
print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
if FETCH_GIT:
if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
if FETCH_MEDIA:
if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
print(' See https://github.com/pirate/ArchiveBox for help.')
raise SystemExit(1)
def check_url_parsing():
def check_url_parsing_invariants():
"""Check that plain text regex URL parsing works as expected"""
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving, as the consequences could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
@ -276,22 +290,9 @@ def wget_output_path(link):
if link.get('latest', {}).get('wget'):
return link['latest']['wget']
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
if is_static_file(link['url']):
return urlencode(without_scheme(without_fragment(link['url'])))
# Since the wget algorithm for -E (appending .html) is incredibly complex
# instead of trying to emulate it here, we just look in the output folder
# to see what html file wget actually created as the output
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
full_path = without_fragment(without_query(path(link['url']))).strip('/')
search_dir = os.path.join(
link_dir,
domain(link['url']),
full_path,
)
# Wget downloads can save in a number of different ways depending on the url
# https://example.com
# > output/archive/<timestamp>/example.com/index.html
@ -304,6 +305,19 @@ def wget_output_path(link):
# There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm
# Since the wget algorithm for -E (appending .html) is incredibly complex
# and there's no way to get the computed output path from wget
# in order to avoid having to reverse-engineer how they calculate it,
# we just look in the output folder and read the filename wget used from the filesystem
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
full_path = without_fragment(without_query(path(link['url']))).strip('/')
search_dir = os.path.join(
link_dir,
domain(link['url']),
full_path,
)
for _ in range(4):
if os.path.exists(search_dir):
if os.path.isdir(search_dir):
@ -356,47 +370,6 @@ def str_between(string, start, end=None):
return content
def pretty_path(path):
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
return path.replace(REPO_DIR + '/', '')
def print_error_hints(cmd, pwd, err=None, hints=None, prefix=' '):
"""quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd
"""
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if ' ' in arg else arg
for arg in cmd
)
# Prettify error output hints string and limit to five lines
hints = hints or getattr(err, 'hints', None)
if hints:
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
)
else:
hints = ()
output_lines = [
'{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
*hints,
'Run to see full output:',
' cd {};'.format(pwd),
' {}'.format(quoted_cmd),
]
return '\n'.join(
'{}{}'.format(prefix, line)
for line in output_lines
if line
)
### Link Helpers
@ -571,37 +544,59 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
print(' ', chmod_result.stderr.decode())
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched of subprocess.run to fix blocking io making timeout=innefective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
resolution=RESOLUTION, timeout=TIMEOUT):
"""helper to build up a chrome shell command with arguments"""
with Popen(*popenargs, **kwargs) as process:
try:
stdout, stderr = process.communicate(input, timeout=timeout)
except TimeoutExpired:
process.kill()
try:
stdout, stderr = process.communicate(input, timeout=2)
except:
pass
raise TimeoutExpired(popenargs[0][0], timeout)
except BaseException as err:
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
global CACHED_USER_DATA_DIR
user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
cmd_args = [binary]
if headless:
cmd_args += ('--headless',)
if not sandbox:
# dont use GPU or sandbox when running inside docker container
cmd_args += ('--no-sandbox', '--disable-gpu')
if not check_ssl_validity:
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
if user_agent:
cmd_args += ('--user-agent={}'.format(user_agent),)
if resolution:
cmd_args += ('--window-size={}'.format(RESOLUTION),)
if timeout:
cmd_args += ('--timeout={}'.format((timeout) * 1000),)
# Find chrome user data directory
default_profile_paths = (
'~/.config/chromium',
'~/.config/google-chrome',
'~/.config/google-chrome-beta',
'~/.config/google-chrome-unstable',
'~/Library/Application Support/Chromium',
'~/Library/Application Support/Google/Chrome',
'~/Library/Application Support/Google/Chrome Canary',
'~/AppData/Local/Chromium/User Data',
'~/AppData/Local/Google/Chrome/User Data',
'~/AppData/Local/Google/Chrome SxS/User Data',
)
if user_data_dir:
cmd_args.append('--user-data-dir={}'.format(user_data_dir))
else:
for path in default_profile_paths:
full_path = os.path.expanduser(path)
if os.path.exists(full_path):
CACHED_USER_DATA_DIR = full_path
cmd_args.append('--user-data-dir={}'.format(full_path))
break
return cmd_args
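A usage sketch tying chrome_args() back to the archive methods earlier in this commit: the same argument list backs fetch_pdf, fetch_screenshot, and fetch_dom, with only the output flag differing (the link folder and URL below are illustrative):

link_dir = 'output/archive/1544212312.4234'   # hypothetical link folder
cmd = [
    *chrome_args(timeout=60),
    '--screenshot',
    'https://example.com',
]
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=60 + 1)
print('screenshot.png written' if result.returncode == 0 else result.stderr.decode())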