Mirror of https://github.com/ArchiveBox/ArchiveBox
Synced 2024-11-22 20:23:12 +00:00

Commit e6bd1f8ca8: major codebase-wide code cleanups
Parent commit: c806068683

8 changed files with 825 additions and 743 deletions
|
@ -1,225 +1,132 @@
|
|||
#!/usr/bin/env python3
|
||||
# ArchiveBox
|
||||
# Nick Sweeting 2017 | MIT License
|
||||
# https://github.com/pirate/ArchiveBox
|
||||
"""
|
||||
ArchiveBox command line application.
|
||||
|
||||
./archive and ./bin/archivebox both point to this file,
|
||||
but you can also run it directly using `python3 archive.py`
|
||||
|
||||
Usage & Documentation:
|
||||
https://github.com/pirate/ArchiveBox/Wiki
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
from datetime import datetime
|
||||
from peekable import Peekable
|
||||
|
||||
|
||||
from parse import parse_links
|
||||
from links import validate_links, links_after_timestamp
|
||||
from archive_methods import archive_link, _RESULTS_TOTALS
|
||||
from index import (
|
||||
write_links_index,
|
||||
parse_json_links_index,
|
||||
)
|
||||
from links import links_after_timestamp
|
||||
from index import write_links_index, load_links_index
|
||||
from archive_methods import archive_link
|
||||
from config import (
|
||||
ARCHIVE_DIR,
|
||||
ONLY_NEW,
|
||||
OUTPUT_DIR,
|
||||
REPO_DIR,
|
||||
ANSI,
|
||||
GIT_SHA,
|
||||
)
|
||||
from util import (
|
||||
check_dependencies,
|
||||
save_remote_source,
|
||||
save_stdin_source,
|
||||
pretty_path,
|
||||
check_links_structure,
|
||||
)
|
||||
from logs import (
|
||||
log_archiving_started,
|
||||
log_archiving_paused,
|
||||
log_archiving_finished,
|
||||
)
|
||||
|
||||
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
|
||||
__VERSION__ = GIT_SHA
|
||||
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
|
||||
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
|
||||
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
|
||||
|
||||
|
||||
def print_help():
|
||||
print(__DESCRIPTION__)
|
||||
print("Documentation: {}\n".format(__DOCUMENTATION__))
|
||||
print('ArchiveBox: The self-hosted internet archive.\n')
|
||||
print("Documentation:")
|
||||
print(" https://github.com/pirate/ArchiveBox/wiki\n")
|
||||
print("Usage:")
|
||||
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
|
||||
print("")
|
||||
print(" ./bin/archivebox https://example.com/feed.rss\n")
|
||||
print("")
|
||||
print(" echo 'https://examplecom' | ./bin/archivebox\n")
|
||||
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
|
||||
print(" ./bin/archivebox https://example.com/feed.rss\n")
|
||||
print(" ./bin/archivebox 15109948213.123\n")
|
||||
|
||||
|
||||
def load_links(archive_path=OUTPUT_DIR, import_path=None):
|
||||
"""get new links from file and optionally append them to links in existing archive"""
|
||||
|
||||
existing_links = []
|
||||
if archive_path:
|
||||
existing_links = parse_json_links_index(archive_path)
|
||||
check_links_structure(existing_links)
|
||||
|
||||
new_links = []
|
||||
if import_path:
|
||||
# parse and validate the import file
|
||||
raw_links, parser_name = parse_links(import_path)
|
||||
new_links = validate_links(raw_links)
|
||||
check_links_structure(new_links)
|
||||
|
||||
# merge existing links in archive_path and new links
|
||||
all_links = validate_links(existing_links + new_links)
|
||||
check_links_structure(all_links)
|
||||
num_new_links = len(all_links) - len(existing_links)
|
||||
|
||||
if import_path and parser_name:
|
||||
print(' > Adding {} new links to index (parsed import as {})'.format(
|
||||
num_new_links,
|
||||
parser_name,
|
||||
))
|
||||
|
||||
return all_links, new_links
|
||||
|
||||
|
||||
def update_archive(archive_path, links, source=None, resume=None, append=True):
|
||||
"""update or create index.html+json given a path to an export file containing new links"""
|
||||
|
||||
start_ts = datetime.now().timestamp()
|
||||
|
||||
if resume:
|
||||
print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
|
||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
resume,
|
||||
**ANSI,
|
||||
))
|
||||
else:
|
||||
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
|
||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
len(links),
|
||||
**ANSI,
|
||||
))
|
||||
|
||||
check_links_structure(links)
|
||||
|
||||
# prefetch the first link off the generator so that if we pause or fail
|
||||
# immediately we can show that we paused on the first link and not just None
|
||||
to_archive = Peekable(links_after_timestamp(links, resume))
|
||||
idx, link = 0, to_archive.peek(0)
|
||||
|
||||
# loop over links and archive them
|
||||
try:
|
||||
check_dependencies()
|
||||
for idx, link in enumerate(to_archive):
|
||||
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
|
||||
archive_link(link_dir, link)
|
||||
|
||||
except (KeyboardInterrupt, SystemExit, Exception) as e:
|
||||
# if isinstance(e, KeyboardInterrupt):
|
||||
# # Step 4: Re-write links index with updated titles, icons, and resources
|
||||
# all_links, _ = load_links(archive_path=out_dir)
|
||||
# write_links_index(out_dir=out_dir, links=all_links, finished=True)
|
||||
print()
|
||||
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
|
||||
**ANSI,
|
||||
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
idx=idx+1,
|
||||
timestamp=link['timestamp'],
|
||||
total=len(links),
|
||||
))
|
||||
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
||||
print(' Continue where you left off by running:')
|
||||
print(' {} {}'.format(
|
||||
pretty_path(sys.argv[0]),
|
||||
link['timestamp'],
|
||||
))
|
||||
if not isinstance(e, KeyboardInterrupt):
|
||||
print()
|
||||
raise e
|
||||
raise SystemExit(1)
|
||||
|
||||
# print timing information & summary
|
||||
end_ts = datetime.now().timestamp()
|
||||
seconds = end_ts - start_ts
|
||||
if seconds > 60:
|
||||
duration = '{0:.2f} min'.format(seconds / 60, 2)
|
||||
else:
|
||||
duration = '{0:.2f} sec'.format(seconds, 2)
|
||||
|
||||
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
|
||||
ANSI['green'],
|
||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
len(links),
|
||||
duration,
|
||||
ANSI['reset'],
|
||||
))
|
||||
print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
|
||||
print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
|
||||
print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
|
||||
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
argc = len(sys.argv)
|
||||
|
||||
if set(sys.argv).intersection(('-h', '--help', 'help')):
|
||||
def main(*args):
|
||||
if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
|
||||
print_help()
|
||||
raise SystemExit(0)
|
||||
|
||||
source = sys.argv[1] if argc > 1 else None # path of links file to import
|
||||
resume = sys.argv[2] if argc > 2 else None # timestamp to resume downloading from
|
||||
|
||||
stdin_raw_text = ''
|
||||
### Handle CLI arguments
|
||||
# ./archive bookmarks.html
|
||||
# ./archive 1523422111.234
|
||||
import_path, resume = None, None
|
||||
if len(args) == 2:
|
||||
# if the argument is a string, it's a import_path file to import
|
||||
# if it's a number, it's a timestamp to resume archiving from
|
||||
if args[1].replace('.', '').isdigit():
|
||||
import_path, resume = None, args[1]
|
||||
else:
|
||||
import_path, resume = args[1], None
|
||||
|
||||
### Set up output folder
|
||||
if not os.path.exists(OUTPUT_DIR):
|
||||
os.makedirs(OUTPUT_DIR)
|
||||
|
||||
### Handle ingesting urls piped in through stdin
|
||||
# (e.g. if the user does cat example_urls.txt | ./archive)
|
||||
if not sys.stdin.isatty():
|
||||
stdin_raw_text = sys.stdin.read()
|
||||
if stdin_raw_text and import_path:
|
||||
print(
|
||||
'[X] You should pass either a path as an argument, '
|
||||
'or pass a list of links via stdin, but not both.\n'
|
||||
)
|
||||
print_help()
|
||||
raise SystemExit(1)
|
||||
|
||||
if source and stdin_raw_text:
|
||||
print(
|
||||
'[X] You should pass either a path as an argument, '
|
||||
'or pass a list of links via stdin, but not both.\n'
|
||||
)
|
||||
print_help()
|
||||
raise SystemExit(1)
|
||||
import_path = save_stdin_source(stdin_raw_text)
|
||||
|
||||
### Handle ingesting urls from a remote file/feed
|
||||
# (e.g. if an RSS feed URL is used as the import path)
|
||||
if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
|
||||
import_path = save_remote_source(import_path)
|
||||
|
||||
### Run the main archive update process
|
||||
update_archive_data(import_path=import_path, resume=resume)
|
||||
|
||||
|
||||
if argc == 1:
|
||||
source, resume = None, None
|
||||
elif argc == 2:
|
||||
if all(d.isdigit() for d in sys.argv[1].split('.')):
|
||||
# argv[1] is a resume timestamp
|
||||
source, resume = None, sys.argv[1]
|
||||
else:
|
||||
# argv[1] is a path to a file to import
|
||||
source, resume = sys.argv[1].strip(), None
|
||||
elif argc == 3:
|
||||
source, resume = sys.argv[1].strip(), sys.argv[2]
|
||||
else:
|
||||
print_help()
|
||||
raise SystemExit(1)
|
||||
def update_archive_data(import_path=None, resume=None):
|
||||
"""The main ArchiveBox entrancepoint. Everything starts here."""
|
||||
check_dependencies()
|
||||
|
||||
# See if archive folder already exists
|
||||
for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
|
||||
if os.path.exists(out_dir):
|
||||
break
|
||||
else:
|
||||
out_dir = OUTPUT_DIR
|
||||
# Step 1: Load list of links from the existing index
|
||||
# merge in and dedupe new links from import_path
|
||||
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
|
||||
|
||||
# Step 0: Download url to local file (only happens if a URL is specified instead of local path)
|
||||
if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
|
||||
source = save_remote_source(source)
|
||||
elif stdin_raw_text:
|
||||
source = save_stdin_source(stdin_raw_text)
|
||||
|
||||
# Step 1: Parse the links and dedupe them with existing archive
|
||||
all_links, new_links = load_links(archive_path=out_dir, import_path=source)
|
||||
|
||||
# Step 2: Write new index
|
||||
write_links_index(out_dir=out_dir, links=all_links)
|
||||
# Step 2: Write updated index with deduped old and new links back to disk
|
||||
write_links_index(out_dir=OUTPUT_DIR, links=all_links)
|
||||
|
||||
# Step 3: Run the archive methods for each link
|
||||
if ONLY_NEW:
|
||||
update_archive(out_dir, new_links, source=source, resume=resume, append=True)
|
||||
else:
|
||||
update_archive(out_dir, all_links, source=source, resume=resume, append=True)
|
||||
links = new_links if ONLY_NEW else all_links
|
||||
log_archiving_started(len(links), resume)
|
||||
idx, link = 0, 0
|
||||
try:
|
||||
for idx, link in enumerate(links_after_timestamp(links, resume)):
|
||||
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
|
||||
archive_link(link_dir, link)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
log_archiving_paused(len(links), idx, link and link['timestamp'])
|
||||
raise SystemExit(0)
|
||||
|
||||
except:
|
||||
print()
|
||||
raise
|
||||
|
||||
log_archiving_finished(len(links))
|
||||
|
||||
# Step 4: Re-write links index with updated titles, icons, and resources
|
||||
all_links, _ = load_links(archive_path=out_dir)
|
||||
write_links_index(out_dir=out_dir, links=all_links, finished=True)
|
||||
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
|
||||
write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(*sys.argv)
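Editor's note: a minimal standalone sketch (not part of this commit) of the dispatch rule main() uses above to tell a resume timestamp apart from an import path; the sample values are hypothetical.

# Editor's sketch: mirrors the `args[1].replace('.', '').isdigit()` check above.
def classify_cli_arg(arg):
    """Return ('resume', arg) for float-style timestamps, ('import', arg) for paths/URLs."""
    if arg.replace('.', '').isdigit():
        return 'resume', arg     # e.g. "1523422111.234" resumes archiving from that timestamp
    return 'import', arg         # e.g. "bookmarks.html" or "https://example.com/feed.rss"

assert classify_cli_arg('1523422111.234') == ('resume', '1523422111.234')
assert classify_cli_arg('bookmarks.html') == ('import', 'bookmarks.html')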
|
||||
|
|
|
@ -3,18 +3,18 @@ import os
|
|||
from functools import wraps
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from stdlib_patches import run, PIPE, DEVNULL
|
||||
|
||||
from index import (
|
||||
parse_json_link_index,
|
||||
write_link_index,
|
||||
update_main_index,
|
||||
patch_links_index,
|
||||
load_json_link_index,
|
||||
)
|
||||
from config import (
|
||||
CURL_BINARY,
|
||||
GIT_BINARY,
|
||||
WGET_BINARY,
|
||||
YOUTUBEDL_BINARY,
|
||||
CHROME_BINARY,
|
||||
FETCH_FAVICON,
|
||||
FETCH_TITLE,
|
||||
FETCH_WGET,
|
||||
|
@ -25,62 +25,37 @@ from config import (
|
|||
FETCH_WARC,
|
||||
FETCH_GIT,
|
||||
FETCH_MEDIA,
|
||||
RESOLUTION,
|
||||
CHECK_SSL_VALIDITY,
|
||||
SUBMIT_ARCHIVE_DOT_ORG,
|
||||
COOKIES_FILE,
|
||||
WGET_USER_AGENT,
|
||||
CHROME_USER_AGENT,
|
||||
CHROME_USER_DATA_DIR,
|
||||
CHROME_HEADLESS,
|
||||
CHROME_SANDBOX,
|
||||
TIMEOUT,
|
||||
MEDIA_TIMEOUT,
|
||||
ANSI,
|
||||
ARCHIVE_DIR,
|
||||
OUTPUT_DIR,
|
||||
GIT_DOMAINS,
|
||||
GIT_SHA,
|
||||
WGET_USER_AGENT,
|
||||
CHECK_SSL_VALIDITY,
|
||||
COOKIES_FILE,
|
||||
)
|
||||
from util import (
|
||||
domain,
|
||||
extension,
|
||||
without_query,
|
||||
without_fragment,
|
||||
fetch_page_title,
|
||||
is_static_file,
|
||||
progress,
|
||||
chmod_file,
|
||||
pretty_path,
|
||||
print_error_hints,
|
||||
check_link_structure,
|
||||
wget_output_path,
|
||||
run, PIPE, DEVNULL,
|
||||
chrome_args,
|
||||
)
|
||||
from logs import (
|
||||
_LAST_RUN_STATS,
|
||||
log_link_archiving_started,
|
||||
log_link_archiving_failed,
|
||||
)
|
||||
|
||||
|
||||
_RESULTS_TOTALS = { # globals are bad, mmkay
|
||||
'skipped': 0,
|
||||
'succeded': 0,
|
||||
'failed': 0,
|
||||
}
|
||||
|
||||
def load_link_index(link_dir, link):
|
||||
"""check for an existing link archive in the given directory,
|
||||
and load+merge it into the given link dict
|
||||
"""
|
||||
is_new = not os.path.exists(link_dir)
|
||||
if is_new:
|
||||
os.makedirs(link_dir)
|
||||
else:
|
||||
link = {
|
||||
**parse_json_link_index(link_dir),
|
||||
**link,
|
||||
}
|
||||
|
||||
check_link_structure(link)
|
||||
print_link_status_line(link_dir, link, is_new)
|
||||
|
||||
return link
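Editor's note: a quick illustration (not in the commit) of why the merge order in load_link_index / load_json_link_index matters: keys from the freshly parsed link override the saved index entry, while fields only present on disk are preserved. The sample dicts are hypothetical.

saved = {'url': 'https://example.com', 'title': 'Old title', 'tags': 'docs,python'}
incoming = {'url': 'https://example.com', 'title': 'New title'}
merged = {**saved, **incoming}          # same pattern as {**parse_json_link_index(link_dir), **link}
assert merged['title'] == 'New title'   # the later dict wins on conflicts
assert merged['tags'] == 'docs,python'  # index-only fields survive the merge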
|
||||
|
||||
|
||||
class ArchiveError(Exception):
|
||||
def __init__(self, message, hints=None):
|
||||
|
@ -105,32 +80,24 @@ def archive_link(link_dir, link, overwrite=True):
|
|||
active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]
|
||||
|
||||
try:
|
||||
link = load_link_index(link_dir, link)
|
||||
is_new = not os.path.exists(link_dir)
|
||||
if is_new:
|
||||
os.makedirs(link_dir)
|
||||
|
||||
link = load_json_link_index(link_dir, link)
|
||||
log_link_archiving_started(link_dir, link, is_new)
|
||||
|
||||
for archive_method in active_methods:
|
||||
archive_method(link_dir, link, overwrite=overwrite)
|
||||
|
||||
|
||||
write_link_index(link_dir, link)
|
||||
update_main_index(link)
|
||||
patch_links_index(link)
|
||||
|
||||
except Exception as err:
|
||||
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
|
||||
|
||||
return link
|
||||
|
||||
def print_link_status_line(link_dir, link, is_new):
|
||||
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
|
||||
symbol='+' if is_new else '*',
|
||||
symbol_color=ANSI['green' if is_new else 'black'],
|
||||
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
**{**link, 'title': link['title'] or link['url']},
|
||||
**ANSI,
|
||||
))
|
||||
|
||||
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
|
||||
|
||||
|
||||
|
||||
def attach_result_to_link(method):
|
||||
"""
|
||||
|
@ -178,15 +145,75 @@ def attach_result_to_link(method):
|
|||
link['history'][method].append(history_entry)
|
||||
link['latest'][method] = result['output']
|
||||
|
||||
_RESULTS_TOTALS[history_entry['status']] += 1
|
||||
_LAST_RUN_STATS[history_entry['status']] += 1
|
||||
|
||||
return link
|
||||
return timed_fetch_func
|
||||
return decorator
|
||||
|
||||
@attach_result_to_link('title')
|
||||
def fetch_title(link_dir, link, timeout=TIMEOUT):
|
||||
"""try to guess the page's title from its content"""
|
||||
|
||||
# if link already has valid title, skip it
|
||||
if link['title'] and not link['title'].lower().startswith('http'):
|
||||
return {'output': link['title'], 'status': 'skipped'}
|
||||
|
||||
if is_static_file(link['url']):
|
||||
return {'output': None, 'status': 'skipped'}
|
||||
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
|
||||
end()
|
||||
output = title
|
||||
except Exception as e:
|
||||
end()
|
||||
output = e
|
||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||
|
||||
if title and title.strip():
|
||||
link['title'] = title
|
||||
output = title
|
||||
|
||||
return {
|
||||
'cmd': 'fetch_page_title("{}")'.format(link['url']),
|
||||
'output': output,
|
||||
}
|
||||
|
||||
@attach_result_to_link('favicon')
|
||||
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
|
||||
"""download site favicon from google's favicon api"""
|
||||
|
||||
output = 'favicon.ico'
|
||||
if os.path.exists(os.path.join(link_dir, output)):
|
||||
return {'output': output, 'status': 'skipped'}
|
||||
|
||||
CMD = [
|
||||
CURL_BINARY,
|
||||
'--max-time', str(timeout),
|
||||
'--location',
|
||||
'--output', output,
|
||||
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
||||
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
|
||||
]
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
|
||||
end()
|
||||
chmod_file(output, cwd=link_dir)
|
||||
except Exception as e:
|
||||
end()
|
||||
output = e
|
||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||
|
||||
return {
|
||||
'cmd': CMD,
|
||||
'output': output,
|
||||
}
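Editor's note: a hedged sketch (not in the commit) of the result-dict contract the fetch_* methods above return to the attach_result_to_link decorator. The 'cmd', 'output', and 'status' keys come straight from the diff; how the decorator derives a missing status is an assumption here.

example_result = {
    'cmd': ['curl', '--max-time', '60', '--location', '--output', 'favicon.ico',
            'https://www.google.com/s2/favicons?domain=example.com'],  # 'curl' stands in for CURL_BINARY
    'output': 'favicon.ico',   # a filename on success, or the raised Exception on failure
    'status': 'skipped',       # only set explicitly by the early-skip returns
}

stats = {'skipped': 0, 'succeded': 0, 'failed': 0}     # same keys as _LAST_RUN_STATS above
stats[example_result.get('status', 'succeded')] += 1   # assumption: a missing status counts as success
assert stats['skipped'] == 1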
|
||||
|
||||
@attach_result_to_link('wget')
|
||||
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
|
||||
def fetch_wget(link_dir, link, timeout=TIMEOUT):
|
||||
"""download full site using wget"""
|
||||
|
||||
domain_dir = os.path.join(link_dir, domain(link['url']))
|
||||
|
@ -194,7 +221,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
|||
if os.path.exists(domain_dir) and existing_file:
|
||||
return {'output': existing_file, 'status': 'skipped'}
|
||||
|
||||
if warc:
|
||||
if FETCH_WARC:
|
||||
warc_dir = os.path.join(link_dir, 'warc')
|
||||
os.makedirs(warc_dir, exist_ok=True)
|
||||
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
|
||||
|
@ -213,8 +240,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
|||
'-e', 'robots=off',
|
||||
'--restrict-file-names=unix',
|
||||
'--timeout={}'.format(timeout),
|
||||
*(() if warc else ('--timestamping',)),
|
||||
*(('--warc-file={}'.format(warc_path),) if warc else ()),
|
||||
*(() if FETCH_WARC else ('--timestamping',)),
|
||||
*(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
|
||||
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
|
||||
*(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
|
||||
*(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
|
||||
|
@ -233,7 +260,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
|||
if line.strip()
|
||||
]
|
||||
|
||||
# parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||
# parse out number of files downloaded from last line of stderr:
|
||||
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||
files_downloaded = (
|
||||
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
|
||||
if 'Downloaded:' in output_tail[-1]
|
||||
|
@ -263,20 +291,19 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
|||
'output': output,
|
||||
}
|
||||
|
||||
|
||||
@attach_result_to_link('pdf')
|
||||
def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
||||
def fetch_pdf(link_dir, link, timeout=TIMEOUT):
|
||||
"""print PDF of site to file using chrome --headless"""
|
||||
|
||||
if is_static_file(link['url']):
|
||||
return {'output': wget_output_path(link), 'status': 'skipped'}
|
||||
return {'output': None, 'status': 'skipped'}
|
||||
|
||||
output = 'output.pdf'
|
||||
if os.path.exists(os.path.join(link_dir, output)):
|
||||
return {'output': output, 'status': 'skipped'}
|
||||
|
||||
CMD = [
|
||||
*chrome_headless(timeout=timeout, **chrome_kwargs),
|
||||
*chrome_args(timeout=timeout),
|
||||
'--print-to-pdf',
|
||||
link['url']
|
||||
]
|
||||
|
@ -302,18 +329,18 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
|||
}
|
||||
|
||||
@attach_result_to_link('screenshot')
|
||||
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
||||
def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
|
||||
"""take screenshot of site using chrome --headless"""
|
||||
|
||||
if is_static_file(link['url']):
|
||||
return {'output': wget_output_path(link), 'status': 'skipped'}
|
||||
return {'output': None, 'status': 'skipped'}
|
||||
|
||||
output = 'screenshot.png'
|
||||
if os.path.exists(os.path.join(link_dir, output)):
|
||||
return {'output': output, 'status': 'skipped'}
|
||||
|
||||
CMD = [
|
||||
*chrome_headless(timeout=timeout, **chrome_kwargs),
|
||||
*chrome_args(timeout=timeout),
|
||||
'--screenshot',
|
||||
link['url'],
|
||||
]
|
||||
|
@ -337,18 +364,19 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
|||
}
|
||||
|
||||
@attach_result_to_link('dom')
|
||||
def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
||||
def fetch_dom(link_dir, link, timeout=TIMEOUT):
|
||||
"""print HTML of site to file using chrome --dump-html"""
|
||||
|
||||
if is_static_file(link['url']):
|
||||
return {'output': wget_output_path(link), 'status': 'skipped'}
|
||||
return {'output': None, 'status': 'skipped'}
|
||||
|
||||
output = 'output.html'
|
||||
if os.path.exists(os.path.join(link_dir, output)):
|
||||
output_path = os.path.join(link_dir, output)
|
||||
if os.path.exists(output_path):
|
||||
return {'output': output, 'status': 'skipped'}
|
||||
|
||||
CMD = [
|
||||
*chrome_headless(timeout=timeout, **chrome_kwargs),
|
||||
*chrome_args(timeout=timeout),
|
||||
'--dump-dom',
|
||||
link['url']
|
||||
]
|
||||
|
@ -372,6 +400,116 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
|||
'output': output,
|
||||
}
|
||||
|
||||
@attach_result_to_link('git')
|
||||
def fetch_git(link_dir, link, timeout=TIMEOUT):
|
||||
"""download full site using git"""
|
||||
|
||||
is_clonable_url = (
|
||||
domain(link['url']) in GIT_DOMAINS
|
||||
or extension(link['url']) == 'git'
|
||||
)
|
||||
if is_static_file(link['url']) or not is_clonable_url:
|
||||
return {'output': None, 'status': 'skipped'}
|
||||
|
||||
output = 'git'
|
||||
output_path = os.path.join(link_dir, 'git')
|
||||
|
||||
if os.path.exists(output_path):
|
||||
return {'output': output, 'status': 'skipped'}
|
||||
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
CMD = [
|
||||
GIT_BINARY,
|
||||
'clone',
|
||||
'--mirror',
|
||||
'--recursive',
|
||||
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
|
||||
without_query(without_fragment(link['url'])),
|
||||
]
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||
end()
|
||||
|
||||
if result.returncode == 128:
|
||||
# ignore failed re-download when the folder already exists
|
||||
pass
|
||||
elif result.returncode > 0:
|
||||
hints = 'got git response code {}:'.format(result.returncode)
|
||||
raise ArchiveError('Failed git download', hints)
|
||||
except Exception as e:
|
||||
end()
|
||||
output = e
|
||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||
|
||||
return {
|
||||
'cmd': CMD,
|
||||
'output': output,
|
||||
}
|
||||
|
||||
@attach_result_to_link('media')
|
||||
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
|
||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
|
||||
|
||||
output = 'media'
|
||||
output_path = os.path.join(link_dir, 'media')
|
||||
|
||||
if os.path.exists(output_path) and not overwrite:
|
||||
return {'output': output, 'status': 'skipped'}
|
||||
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
CMD = [
|
||||
YOUTUBEDL_BINARY,
|
||||
'--write-description',
|
||||
'--write-info-json',
|
||||
'--write-annotations',
|
||||
'--yes-playlist',
|
||||
'--write-thumbnail',
|
||||
'--no-call-home',
|
||||
'--no-check-certificate',
|
||||
'--user-agent',
|
||||
'--all-subs',
|
||||
'--extract-audio',
|
||||
'--keep-video',
|
||||
'--ignore-errors',
|
||||
'--geo-bypass',
|
||||
'--audio-format', 'mp3',
|
||||
'--audio-quality', '320K',
|
||||
'--embed-thumbnail',
|
||||
'--add-metadata',
|
||||
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
|
||||
link['url'],
|
||||
]
|
||||
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||
chmod_file(output, cwd=link_dir)
|
||||
end()
|
||||
if result.returncode:
|
||||
if (b'ERROR: Unsupported URL' in result.stderr
|
||||
or b'HTTP Error 404' in result.stderr
|
||||
or b'HTTP Error 403' in result.stderr
|
||||
or b'URL could be a direct video link' in result.stderr
|
||||
or b'Unable to extract container ID' in result.stderr):
|
||||
# These happen too frequently on non-media pages to warrant printing to console
|
||||
pass
|
||||
else:
|
||||
hints = (
|
||||
'got youtubedl response code {}:'.format(result.returncode),
|
||||
*result.stderr.decode().split('\n'),
|
||||
)
|
||||
raise ArchiveError('Failed to download media', hints)
|
||||
except Exception as e:
|
||||
end()
|
||||
output = e
|
||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||
|
||||
return {
|
||||
'cmd': CMD,
|
||||
'output': output,
|
||||
}
|
||||
|
||||
def parse_archive_dot_org_response(response):
|
||||
# Parse archive.org response headers
|
||||
headers = defaultdict(list)
|
||||
|
@ -445,226 +583,4 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
|||
'output': output,
|
||||
}
|
||||
|
||||
@attach_result_to_link('favicon')
|
||||
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
|
||||
"""download site favicon from google's favicon api"""
|
||||
|
||||
output = 'favicon.ico'
|
||||
if os.path.exists(os.path.join(link_dir, output)):
|
||||
return {'output': output, 'status': 'skipped'}
|
||||
|
||||
CMD = [
|
||||
CURL_BINARY,
|
||||
'--max-time', str(timeout),
|
||||
'--location',
|
||||
'--output', output,
|
||||
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
||||
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
|
||||
]
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
|
||||
end()
|
||||
chmod_file(output, cwd=link_dir)
|
||||
except Exception as e:
|
||||
end()
|
||||
output = e
|
||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||
|
||||
return {
|
||||
'cmd': CMD,
|
||||
'output': output,
|
||||
}
|
||||
|
||||
@attach_result_to_link('title')
|
||||
def fetch_title(link_dir, link, timeout=TIMEOUT):
|
||||
"""try to guess the page's title from its content"""
|
||||
|
||||
# if link already has valid title, skip it
|
||||
if link['title'] and not link['title'].lower().startswith('http'):
|
||||
return {'output': link['title'], 'status': 'skipped'}
|
||||
|
||||
if is_static_file(link['url']):
|
||||
return {'output': None, 'status': 'skipped'}
|
||||
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
|
||||
end()
|
||||
output = title
|
||||
except Exception as e:
|
||||
end()
|
||||
output = e
|
||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||
|
||||
if title and title.strip():
|
||||
link['title'] = title
|
||||
output = title
|
||||
|
||||
return {
|
||||
'cmd': 'fetch_page_title("{}")'.format(link['url']),
|
||||
'output': output,
|
||||
}
|
||||
|
||||
@attach_result_to_link('media')
|
||||
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
|
||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
|
||||
|
||||
output = 'media'
|
||||
output_path = os.path.join(link_dir, 'media')
|
||||
|
||||
if os.path.exists(output_path) and not overwrite:
|
||||
return {'output': output, 'status': 'skipped'}
|
||||
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
CMD = [
|
||||
YOUTUBEDL_BINARY,
|
||||
'--write-description',
|
||||
'--write-info-json',
|
||||
'--write-annotations',
|
||||
'--yes-playlist',
|
||||
'--write-thumbnail',
|
||||
'--no-call-home',
|
||||
'--no-check-certificate',
|
||||
'--user-agent',
|
||||
'--all-subs',
|
||||
'--extract-audio',
|
||||
'--keep-video',
|
||||
'--ignore-errors',
|
||||
'--geo-bypass',
|
||||
'--audio-format', 'mp3',
|
||||
'--audio-quality', '320K',
|
||||
'--embed-thumbnail',
|
||||
'--add-metadata',
|
||||
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
|
||||
link['url'],
|
||||
]
|
||||
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||
chmod_file(output, cwd=link_dir)
|
||||
end()
|
||||
if result.returncode:
|
||||
if (b'ERROR: Unsupported URL' in result.stderr
|
||||
or b'HTTP Error 404' in result.stderr
|
||||
or b'HTTP Error 403' in result.stderr
|
||||
or b'URL could be a direct video link' in result.stderr
|
||||
or b'Unable to extract container ID' in result.stderr):
|
||||
# These happen too frequently on non-media pages to warrant printing to console
|
||||
pass
|
||||
else:
|
||||
hints = (
|
||||
'got youtubedl response code {}:'.format(result.returncode),
|
||||
*result.stderr.decode().split('\n'),
|
||||
)
|
||||
raise ArchiveError('Failed to download media', hints)
|
||||
except Exception as e:
|
||||
end()
|
||||
output = e
|
||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||
|
||||
return {
|
||||
'cmd': CMD,
|
||||
'output': output,
|
||||
}
|
||||
|
||||
|
||||
@attach_result_to_link('git')
|
||||
def fetch_git(link_dir, link, timeout=TIMEOUT):
|
||||
"""download full site using git"""
|
||||
|
||||
url_is_clonable = (
|
||||
domain(link['url']) in GIT_DOMAINS
|
||||
or link['url'].endswith('.git')
|
||||
)
|
||||
if not url_is_clonable or is_static_file(link['url']):
|
||||
return {'output': None, 'status': 'skipped'}
|
||||
|
||||
output = 'git'
|
||||
output_path = os.path.join(link_dir, 'git')
|
||||
|
||||
if os.path.exists(output_path):
|
||||
return {'output': output, 'status': 'skipped'}
|
||||
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
CMD = [
|
||||
GIT_BINARY,
|
||||
'clone',
|
||||
'--mirror',
|
||||
'--recursive',
|
||||
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
|
||||
without_query(without_fragment(link['url'])),
|
||||
]
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||
end()
|
||||
|
||||
if result.returncode == 128:
|
||||
# ignore failed re-download when the folder already exists
|
||||
pass
|
||||
elif result.returncode > 0:
|
||||
hints = 'got git response code {}:'.format(result.returncode)
|
||||
raise ArchiveError('Failed git download', hints)
|
||||
except Exception as e:
|
||||
end()
|
||||
output = e
|
||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||
|
||||
return {
|
||||
'cmd': CMD,
|
||||
'output': output,
|
||||
}
|
||||
|
||||
def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX, check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT, resolution=RESOLUTION, timeout=TIMEOUT):
|
||||
global CACHED_USER_DATA_DIR
|
||||
user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
|
||||
cmd_args = [binary]
|
||||
|
||||
if headless:
|
||||
cmd_args += ('--headless',)
|
||||
|
||||
if not sandbox:
|
||||
# don't use GPU or sandbox when running inside a docker container
|
||||
cmd_args += ('--no-sandbox', '--disable-gpu')
|
||||
|
||||
if not check_ssl_validity:
|
||||
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
|
||||
|
||||
if user_agent:
|
||||
cmd_args += ('--user-agent={}'.format(user_agent),)
|
||||
|
||||
if resolution:
|
||||
cmd_args += ('--window-size={}'.format(RESOLUTION),)
|
||||
|
||||
if timeout:
|
||||
cmd_args += ('--timeout={}'.format((timeout) * 1000),)
|
||||
|
||||
# Find chrome user data directory
|
||||
default_profile_paths = (
|
||||
'~/.config/chromium',
|
||||
'~/.config/google-chrome',
|
||||
'~/.config/google-chrome-beta',
|
||||
'~/.config/google-chrome-unstable',
|
||||
'~/Library/Application Support/Chromium',
|
||||
'~/Library/Application Support/Google/Chrome',
|
||||
'~/Library/Application Support/Google/Chrome Canary',
|
||||
'~/AppData/Local/Chromium/User Data',
|
||||
'~/AppData/Local/Google/Chrome/User Data',
|
||||
'~/AppData/Local/Google/Chrome SxS/User Data',
|
||||
)
|
||||
if user_data_dir:
|
||||
cmd_args.append('--user-data-dir={}'.format(user_data_dir))
|
||||
else:
|
||||
for path in default_profile_paths:
|
||||
full_path = os.path.expanduser(path)
|
||||
if os.path.exists(full_path):
|
||||
CACHED_USER_DATA_DIR = full_path
|
||||
cmd_args.append('--user-data-dir={}'.format(full_path))
|
||||
break
|
||||
|
||||
return cmd_args
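Editor's note: for reference, a rough illustration (not in the commit, values hypothetical) of the flag list chrome_headless()/chrome_args() builds with typical defaults; only flags actually constructed above are shown.

example_chrome_args = [
    'chromium-browser',                              # CHROME_BINARY
    '--headless',                                    # CHROME_HEADLESS enabled
    '--window-size=1440,2000',                       # RESOLUTION
    '--timeout=60000',                               # TIMEOUT * 1000, i.e. milliseconds
    '--user-data-dir=/home/user/.config/chromium',   # first existing default profile path found
]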
|
||||
|
||||
|
||||
CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR
|
||||
|
|
|
@ -12,18 +12,24 @@ except ImportError:
|
|||
from config import (
|
||||
OUTPUT_DIR,
|
||||
TEMPLATES_DIR,
|
||||
ANSI,
|
||||
GIT_SHA,
|
||||
FOOTER_INFO,
|
||||
)
|
||||
from util import (
|
||||
chmod_file,
|
||||
derived_link_info,
|
||||
pretty_path,
|
||||
check_link_structure,
|
||||
check_links_structure,
|
||||
wget_output_path,
|
||||
)
|
||||
from parse import parse_links
|
||||
from links import validate_links
|
||||
from logs import (
|
||||
log_indexing_started,
|
||||
log_indexing_finished,
|
||||
log_parsing_started,
|
||||
log_parsing_finished,
|
||||
)
|
||||
|
||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||
|
||||
|
@ -33,21 +39,40 @@ TITLE_LOADING_MSG = 'Not yet archived...'
|
|||
def write_links_index(out_dir, links, finished=False):
|
||||
"""create index.html file for a given list of links"""
|
||||
|
||||
log_indexing_started()
|
||||
check_links_structure(links)
|
||||
|
||||
if not os.path.exists(out_dir):
|
||||
os.makedirs(out_dir)
|
||||
|
||||
print('{green}[*] [{}] Saving main index files...{reset}'.format(
|
||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
**ANSI,
|
||||
))
|
||||
write_json_links_index(out_dir, links)
|
||||
print(' > {}/index.json'.format(pretty_path(out_dir)))
|
||||
log_indexing_finished(out_dir, 'index.json')
|
||||
|
||||
write_html_links_index(out_dir, links, finished=finished)
|
||||
print(' > {}/index.html'.format(pretty_path(out_dir)))
|
||||
log_indexing_finished(out_dir, 'index.html')
|
||||
|
||||
def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
|
||||
"""parse and load existing index with any new links from import_path merged in"""
|
||||
|
||||
existing_links = []
|
||||
if out_dir:
|
||||
existing_links = parse_json_links_index(out_dir)
|
||||
check_links_structure(existing_links)
|
||||
|
||||
new_links = []
|
||||
if import_path:
|
||||
# parse and validate the import file
|
||||
log_parsing_started(import_path)
|
||||
raw_links, parser_name = parse_links(import_path)
|
||||
new_links = validate_links(raw_links)
|
||||
check_links_structure(new_links)
|
||||
|
||||
# merge existing links in out_dir and new links
|
||||
all_links = validate_links(existing_links + new_links)
|
||||
check_links_structure(all_links)
|
||||
num_new_links = len(all_links) - len(existing_links)
|
||||
|
||||
if import_path and parser_name:
|
||||
log_parsing_finished(num_new_links, parser_name)
|
||||
|
||||
return all_links, new_links
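Editor's note: a short usage sketch (not in the commit) of the index round trip these two functions are designed for, assuming archivebox/ is on PYTHONPATH; the paths are hypothetical.

from index import load_links_index, write_links_index

all_links, new_links = load_links_index(out_dir='output', import_path='output/sources/bookmarks.html')
write_links_index(out_dir='output', links=all_links)                 # write the index before archiving
# ... archive each link ...
write_links_index(out_dir='output', links=all_links, finished=True)  # final pass with updated titles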
|
||||
|
||||
def write_json_links_index(out_dir, links):
|
||||
"""write the json link index to a given path"""
|
||||
|
@ -70,8 +95,8 @@ def write_json_links_index(out_dir, links):
|
|||
|
||||
chmod_file(path)
|
||||
|
||||
def parse_json_links_index(out_dir):
|
||||
"""load the index in a given directory and merge it with the given link"""
|
||||
def parse_json_links_index(out_dir=OUTPUT_DIR):
|
||||
"""parse a archive index json file and return the list of links"""
|
||||
index_path = os.path.join(out_dir, 'index.json')
|
||||
if os.path.exists(index_path):
|
||||
with open(index_path, 'r', encoding='utf-8') as f:
|
||||
|
@ -136,31 +161,26 @@ def write_html_links_index(out_dir, links, finished=False):
|
|||
chmod_file(path)
|
||||
|
||||
|
||||
def update_main_index(link):
|
||||
def patch_links_index(link, out_dir=OUTPUT_DIR):
|
||||
"""hack to in-place update one row's info in the generated index html"""
|
||||
|
||||
title = link['latest']['title']
|
||||
successful = len([entry for entry in link['latest'].values() if entry])
|
||||
|
||||
# Patch JSON index
|
||||
json_path = os.path.join(OUTPUT_DIR, 'index.json')
|
||||
|
||||
links = parse_json_links_index(OUTPUT_DIR)
|
||||
|
||||
changed = False
|
||||
for json_link in links:
|
||||
if json_link['url'] == link['url']:
|
||||
json_link['title'] = title
|
||||
json_link['latest'] = link['latest']
|
||||
json_file_links = parse_json_links_index(out_dir)
|
||||
for saved_link in json_file_links:
|
||||
if saved_link['url'] == link['url']:
|
||||
saved_link['title'] = title
|
||||
saved_link['latest'] = link['latest']
|
||||
changed = True
|
||||
break
|
||||
|
||||
if changed:
|
||||
write_json_links_index(OUTPUT_DIR, links)
|
||||
write_json_links_index(out_dir, json_file_links)
|
||||
|
||||
# Patch HTML index
|
||||
html_path = os.path.join(OUTPUT_DIR, 'index.html')
|
||||
|
||||
html_path = os.path.join(out_dir, 'index.html')
|
||||
html = open(html_path, 'r').read().split('\n')
|
||||
for idx, line in enumerate(html):
|
||||
if title and ('<span data-title-for="{}"'.format(link['url']) in line):
|
||||
|
@ -172,6 +192,7 @@ def update_main_index(link):
|
|||
with open(html_path, 'w') as f:
|
||||
f.write('\n'.join(html))
|
||||
|
||||
|
||||
### Individual link index
|
||||
|
||||
def write_link_index(out_dir, link):
|
||||
|
@ -202,6 +223,18 @@ def parse_json_link_index(out_dir):
|
|||
return link_json
|
||||
return {}
|
||||
|
||||
def load_json_link_index(out_dir, link):
|
||||
"""check for an existing link archive in the given directory,
|
||||
and load+merge it into the given link dict
|
||||
"""
|
||||
link = {
|
||||
**parse_json_link_index(out_dir),
|
||||
**link,
|
||||
}
|
||||
|
||||
check_link_structure(link)
|
||||
return link
|
||||
|
||||
def write_html_link_index(out_dir, link):
|
||||
check_link_structure(link)
|
||||
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
|
||||
|
@ -224,7 +257,10 @@ def write_html_link_index(out_dir, link):
|
|||
wget_output_path(link)
|
||||
or (link['domain'] if link['is_archived'] else 'about:blank')
|
||||
),
|
||||
'extension': link['extension'] or 'HTML',
|
||||
'extension': link['extension'] or 'html',
|
||||
'tags': link['tags'].strip() or 'untagged',
|
||||
'status': 'Archived' if link['is_archived'] else 'Not yet archived',
|
||||
'status_color': 'success' if link['is_archived'] else 'danger',
|
||||
}))
|
||||
|
||||
chmod_file(path)
|
||||
|
|
161
archivebox/logs.py
Normal file
|
@ -0,0 +1,161 @@
|
|||
import sys
|
||||
from datetime import datetime
|
||||
from config import ANSI, REPO_DIR, OUTPUT_DIR
|
||||
|
||||
|
||||
# globals are bad, mmkay
|
||||
_LAST_RUN_STATS = {
|
||||
'skipped': 0,
|
||||
'succeded': 0,
|
||||
'failed': 0,
|
||||
|
||||
'parsing_start_ts': 0,
|
||||
'parsing_end_ts': 0,
|
||||
|
||||
'indexing_start_ts': 0,
|
||||
'indexing_end_ts': 0,
|
||||
|
||||
'archiving_start_ts': 0,
|
||||
'archiving_end_ts': 0,
|
||||
|
||||
'links': {},
|
||||
}
|
||||
|
||||
def pretty_path(path):
|
||||
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
|
||||
return path.replace(REPO_DIR + '/', '')
|
||||
|
||||
|
||||
def log_link_archiving_started(link_dir, link, is_new):
|
||||
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
|
||||
symbol='+' if is_new else '*',
|
||||
symbol_color=ANSI['green' if is_new else 'black'],
|
||||
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
**{**link, 'title': link['title'] or link['url']},
|
||||
**ANSI,
|
||||
))
|
||||
|
||||
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
|
||||
|
||||
|
||||
def log_link_archiving_failed(cmd, pwd, err=None, hints=None, prefix=' '):
|
||||
"""quote the argument with whitespace in a command so the user can
|
||||
copy-paste the printed string directly to re-run the cmd
|
||||
"""
|
||||
|
||||
# Prettify CMD string and make it safe to copy-paste by quoting arguments
|
||||
quoted_cmd = ' '.join(
|
||||
'"{}"'.format(arg) if ' ' in arg else arg
|
||||
for arg in cmd
|
||||
)
|
||||
|
||||
# Prettify error output hints string and limit to five lines
|
||||
hints = hints or getattr(err, 'hints', None)
|
||||
if hints:
|
||||
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
|
||||
hints = (
|
||||
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
|
||||
for line in hints[:5] if line.strip()
|
||||
)
|
||||
else:
|
||||
hints = ()
|
||||
|
||||
output_lines = [
|
||||
'{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
|
||||
*hints,
|
||||
'Run to see full output:',
|
||||
' cd {};'.format(pwd),
|
||||
' {}'.format(quoted_cmd),
|
||||
]
|
||||
|
||||
return '\n'.join(
|
||||
'{}{}'.format(prefix, line)
|
||||
for line in output_lines
|
||||
if line
|
||||
)
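Editor's note: a tiny worked example (not in the commit) of the quoting rule above, showing that only arguments containing whitespace get wrapped in quotes.

cmd = ['wget', '--user-agent=ArchiveBox Bot', 'https://example.com']
quoted_cmd = ' '.join('"{}"'.format(arg) if ' ' in arg else arg for arg in cmd)
assert quoted_cmd == 'wget "--user-agent=ArchiveBox Bot" https://example.com'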
|
||||
|
||||
### Logging Helpers
|
||||
|
||||
def log_parsing_started(source_file):
|
||||
start_ts = datetime.now()
|
||||
_LAST_RUN_STATS['parse_start_ts'] = start_ts
|
||||
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
|
||||
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
source_file.rsplit('/', 1)[-1],
|
||||
**ANSI,
|
||||
))
|
||||
|
||||
def log_parsing_finished(num_new_links, parser_name):
|
||||
print(' > Adding {} new links to index (parsed import as {})'.format(
|
||||
num_new_links,
|
||||
parser_name,
|
||||
))
|
||||
|
||||
def log_indexing_started():
|
||||
start_ts = datetime.now()
|
||||
_LAST_RUN_STATS['index_start_ts'] = start_ts
|
||||
print('{green}[*] [{}] Saving main index files...{reset}'.format(
|
||||
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
**ANSI,
|
||||
))
|
||||
|
||||
def log_indexing_finished(out_dir, out_file):
|
||||
end_ts = datetime.now()
|
||||
_LAST_RUN_STATS['index_end_ts'] = end_ts
|
||||
print(' > {}/{}'.format(pretty_path(out_dir), out_file))
|
||||
|
||||
def log_archiving_started(num_links, resume):
|
||||
start_ts = datetime.now()
|
||||
_LAST_RUN_STATS['start_ts'] = start_ts
|
||||
if resume:
|
||||
print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
|
||||
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
num_links,
|
||||
resume,
|
||||
**ANSI,
|
||||
))
|
||||
else:
|
||||
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
|
||||
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
num_links,
|
||||
**ANSI,
|
||||
))
|
||||
|
||||
def log_archiving_paused(num_links, idx, timestamp):
|
||||
end_ts = datetime.now()
|
||||
_LAST_RUN_STATS['end_ts'] = end_ts
|
||||
print()
|
||||
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
|
||||
**ANSI,
|
||||
now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
idx=idx+1,
|
||||
timestamp=timestamp,
|
||||
total=num_links,
|
||||
))
|
||||
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
||||
print(' Continue where you left off by running:')
|
||||
print(' {} {}'.format(
|
||||
pretty_path(sys.argv[0]),
|
||||
timestamp,
|
||||
))
|
||||
|
||||
def log_archiving_finished(num_links):
|
||||
end_ts = datetime.now()
|
||||
_LAST_RUN_STATS['end_ts'] = end_ts
|
||||
seconds = end_ts.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp()
|
||||
if seconds > 60:
|
||||
duration = '{0:.2f} min'.format(seconds / 60, 2)
|
||||
else:
|
||||
duration = '{0:.2f} sec'.format(seconds, 2)
|
||||
|
||||
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
|
||||
ANSI['green'],
|
||||
end_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
num_links,
|
||||
duration,
|
||||
ANSI['reset'],
|
||||
))
|
||||
print(' - {} entries skipped'.format(_LAST_RUN_STATS['skipped']))
|
||||
print(' - {} entries updated'.format(_LAST_RUN_STATS['succeded']))
|
||||
print(' - {} errors'.format(_LAST_RUN_STATS['failed']))
|
||||
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
|
@ -1,17 +1,19 @@
|
|||
# coding: utf-8
|
||||
|
||||
"""
|
||||
Everything related to parsing links from bookmark services.
|
||||
Everything related to parsing links from input sources.
|
||||
|
||||
For a list of supported services, see the README.md.
|
||||
For examples of supported files see examples/.
|
||||
For examples of supported import formats see tests/.
|
||||
|
||||
Parsed link schema: {
|
||||
Link: {
|
||||
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
|
||||
'timestamp': '15442123124234',
|
||||
'timestamp': '1544212312.4234',
|
||||
'title': 'Example.com Page Title',
|
||||
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
|
||||
'tags': 'abc,def',
|
||||
'sources': [
|
||||
'output/sources/ril_export.html',
|
||||
'output/sources/getpocket.com-1523422111.txt',
|
||||
'output/sources/stdin-234234112312.txt'
|
||||
]
|
||||
}
|
||||
"""
|
||||
|
||||
|
@ -19,45 +21,59 @@ import re
|
|||
import json
|
||||
|
||||
from datetime import datetime
|
||||
from collections import OrderedDict
|
||||
import xml.etree.ElementTree as etree
|
||||
|
||||
from config import ANSI
|
||||
from config import TIMEOUT
|
||||
from util import (
|
||||
str_between,
|
||||
URL_REGEX,
|
||||
check_url_parsing,
|
||||
check_url_parsing_invariants,
|
||||
progress,
|
||||
)
|
||||
|
||||
|
||||
def parse_links(path):
|
||||
"""parse a list of links dictionaries from a bookmark export file"""
|
||||
|
||||
check_url_parsing()
|
||||
def parse_links(source_file):
|
||||
"""parse a list of URLs with their metadata from an
|
||||
RSS feed, bookmarks export, or text file
|
||||
"""
|
||||
|
||||
links = []
|
||||
with open(path, 'r', encoding='utf-8') as file:
|
||||
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
|
||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
path.rsplit('/', 1)[-1],
|
||||
**ANSI,
|
||||
))
|
||||
check_url_parsing_invariants()
|
||||
PARSERS = (
|
||||
# Specialized parsers
|
||||
('Pocket HTML', parse_pocket_html_export),
|
||||
('Pinboard RSS', parse_pinboard_rss_export),
|
||||
('Shaarli RSS', parse_shaarli_rss_export),
|
||||
('Medium RSS', parse_medium_rss_export),
|
||||
|
||||
# General parsers
|
||||
('Netscape HTML', parse_netscape_html_export),
|
||||
('Generic RSS', parse_rss_export),
|
||||
('Generic JSON', parse_json_export),
|
||||
|
||||
for parser_name, parser_func in PARSERS.items():
|
||||
# Fallback parser
|
||||
('Plain Text', parse_plain_text_export),
|
||||
)
|
||||
end = progress(TIMEOUT * 4, prefix=' ')
|
||||
with open(source_file, 'r', encoding='utf-8') as file:
|
||||
for parser_name, parser_func in PARSERS:
|
||||
try:
|
||||
links += list(parser_func(file))
|
||||
links = list(parser_func(file))
|
||||
if links:
|
||||
break
|
||||
end()
|
||||
return links, parser_name
|
||||
except Exception as err:
|
||||
# we try each parser one by one, wrong parsers will throw exceptions
|
||||
# if unsupported and we accept the first one that passes
|
||||
# uncomment the following line to see why the parser was unsupported for each attempted format
|
||||
# Parsers are tried one by one down the list, and the first one
|
||||
# that succeeds is used. To see why a certain parser was not used
|
||||
# due to error or format incompatibility, uncomment this line:
|
||||
# print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
|
||||
pass
|
||||
|
||||
return links, parser_name
|
||||
end()
|
||||
return [], 'Plain Text'
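Editor's note: a minimal usage sketch (not in the commit) of the parser entry point above; the source filename is hypothetical.

links, parser_name = parse_links('output/sources/bookmarks-1523422111.html')
print('Parsed {} links using the {} parser'.format(len(links), parser_name))
# each item in `links` follows the Link schema described in the module docstring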
|
||||
|
||||
|
||||
### Import Parser Functions
|
||||
|
||||
def parse_pocket_html_export(html_file):
|
||||
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
|
||||
|
||||
|
@ -81,40 +97,57 @@ def parse_pocket_html_export(html_file):
|
|||
'sources': [html_file.name],
|
||||
}
|
||||
|
||||
def parse_pinboard_json_export(json_file):
|
||||
|
||||
def parse_json_export(json_file):
|
||||
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
|
||||
|
||||
json_file.seek(0)
|
||||
json_content = json.load(json_file)
|
||||
for line in json_content:
|
||||
links = json.load(json_file)
|
||||
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
|
||||
|
||||
for link in links:
|
||||
# example line
|
||||
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
|
||||
if line:
|
||||
erg = line
|
||||
if erg.get('timestamp'):
|
||||
timestamp = str(erg['timestamp']/10000000) # chrome/ff histories use a very precise timestamp
|
||||
elif erg.get('time'):
|
||||
timestamp = str(datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ').timestamp())
|
||||
elif erg.get('created_at'):
|
||||
timestamp = str(datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z').timestamp())
|
||||
else:
|
||||
timestamp = str(datetime.now().timestamp())
|
||||
if erg.get('href'):
|
||||
url = erg['href']
|
||||
else:
|
||||
url = erg['url']
|
||||
if erg.get('description'):
|
||||
title = (erg.get('description') or '').replace(' — Readability', '')
|
||||
else:
|
||||
title = erg['title'].strip()
|
||||
if link:
|
||||
# Parse URL
|
||||
url = link.get('href') or link.get('url') or link.get('URL')
|
||||
if not url:
|
||||
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
|
||||
|
||||
info = {
|
||||
# Parse the timestamp
|
||||
ts_str = str(datetime.now().timestamp())
|
||||
if link.get('timestamp'):
|
||||
# chrome/ff histories use a very precise timestamp
|
||||
ts_str = str(link['timestamp'] / 10000000)
|
||||
elif link.get('time'):
|
||||
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
|
||||
elif link.get('created_at'):
|
||||
ts_str = str(json_date(link['created_at']).timestamp())
|
||||
elif link.get('created'):
|
||||
ts_str = str(json_date(link['created']).timestamp())
|
||||
elif link.get('date'):
|
||||
ts_str = str(json_date(link['date']).timestamp())
|
||||
elif link.get('bookmarked'):
|
||||
ts_str = str(json_date(link['bookmarked']).timestamp())
|
||||
elif link.get('saved'):
|
||||
ts_str = str(json_date(link['saved']).timestamp())
|
||||
|
||||
# Parse the title
|
||||
title = None
|
||||
if link.get('title'):
|
||||
title = link['title'].strip() or None
|
||||
elif link.get('description'):
|
||||
title = link['description'].replace(' — Readability', '').strip() or None
|
||||
elif link.get('name'):
|
||||
title = link['name'].strip() or None
|
||||
|
||||
yield {
|
||||
'url': url,
|
||||
'timestamp': timestamp,
|
||||
'title': title or None,
|
||||
'tags': erg.get('tags') or '',
|
||||
'timestamp': ts_str,
|
||||
'title': title,
|
||||
'tags': link.get('tags') or '',
|
||||
'sources': [json_file.name],
|
||||
}
|
||||
yield info
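Editor's note: an illustrative input/output pair (not in the commit) for the JSON parser above, using a Pinboard-style entry; the timestamp value is approximate and the source filename is hypothetical.

pinboard_style_entry = {
    'href': 'http://www.reddit.com/r/example',
    'description': 'title here',
    'time': '2014-06-14T15:51:42Z',
    'tags': 'reddit android',
}
# parse_json_export would yield roughly:
# {
#     'url': 'http://www.reddit.com/r/example',
#     'timestamp': '1402761102.0',                       # parsed from the ISO 'time' field
#     'title': 'title here',                             # falls back to 'description'
#     'tags': 'reddit android',
#     'sources': ['output/sources/pinboard_export.json'],
# }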
|
||||
|
||||
|
||||
def parse_rss_export(rss_file):
|
||||
|
@ -139,15 +172,15 @@ def parse_rss_export(rss_file):
|
|||
def get_row(key):
|
||||
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
|
||||
|
||||
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
|
||||
url = str_between(get_row('link'), '<link>', '</link>')
|
||||
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
|
||||
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
|
||||
title = str_between(get_row('title'), '<![CDATA[', ']]').strip() or None
|
||||
|
||||
yield {
|
||||
'url': url,
|
||||
'timestamp': str(time.timestamp()),
|
||||
'title': title or None,
|
||||
'title': title,
|
||||
'tags': '',
|
||||
'sources': [rss_file.name],
|
||||
}
|
||||
|
@ -224,9 +257,6 @@ def parse_pinboard_rss_export(rss_file):
|
|||
tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text if item.find("{http://purl.org/dc/elements/1.1/}subject") else None
|
||||
title = item.find("{http://purl.org/rss/1.0/}title").text.strip() if item.find("{http://purl.org/rss/1.0/}title").text.strip() else None
|
||||
ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text if item.find("{http://purl.org/dc/elements/1.1/}date").text else None
|
||||
# = 🌈🌈🌈🌈
|
||||
# = 🌈🌈🌈🌈
|
||||
# = 🏆🏆🏆🏆
|
||||
|
||||
# Pinboard includes a colon in its date stamp timezone offsets, which
|
||||
# Python can't parse. Remove it:
|
||||
|
@ -254,8 +284,6 @@ def parse_medium_rss_export(rss_file):
|
|||
root = etree.parse(rss_file).getroot()
|
||||
items = root.find("channel").findall("item")
|
||||
for item in items:
|
||||
# for child in item:
|
||||
# print(child.tag, child.text)
|
||||
url = item.find("link").text
|
||||
title = item.find("title").text.strip()
|
||||
ts_str = item.find("pubDate").text
|
||||
|
@ -274,31 +302,13 @@ def parse_plain_text_export(text_file):
|
|||
"""Parse raw links from each line in a text file"""
|
||||
|
||||
text_file.seek(0)
|
||||
text_content = text_file.readlines()
|
||||
for line in text_content:
|
||||
if line:
|
||||
urls = re.findall(URL_REGEX, line)
|
||||
|
||||
for url in urls:
|
||||
url = url.strip()
|
||||
time = datetime.now()
|
||||
|
||||
yield {
|
||||
'url': url,
|
||||
'timestamp': str(time.timestamp()),
|
||||
'title': None,
|
||||
'tags': '',
|
||||
'sources': [text_file.name],
|
||||
}
|
||||
|
||||
|
||||
PARSERS = OrderedDict([
|
||||
('Pocket HTML', parse_pocket_html_export),
|
||||
('Pinboard JSON', parse_pinboard_json_export),
|
||||
('Netscape HTML', parse_netscape_html_export),
|
||||
('RSS', parse_rss_export),
|
||||
('Pinboard RSS', parse_pinboard_rss_export),
|
||||
('Shaarli RSS', parse_shaarli_rss_export),
|
||||
('Medium RSS', parse_medium_rss_export),
|
||||
('Plain Text', parse_plain_text_export),
|
||||
])
|
||||
for line in text_file.readlines():
|
||||
urls = re.findall(URL_REGEX, line) if line.strip() else ()
|
||||
for url in urls:
|
||||
yield {
|
||||
'url': url,
|
||||
'timestamp': str(datetime.now().timestamp()),
|
||||
'title': None,
|
||||
'tags': '',
|
||||
'sources': [text_file.name],
|
||||
}
|
||||
|
|
|
@ -1,10 +1,64 @@

"""
Patches, additions, and shortcuts for Python standard library functions.
"""


### subprocess

from subprocess import (
    Popen,
    PIPE,
    DEVNULL,
    CompletedProcess,
    TimeoutExpired,
    CalledProcessError,
)


def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
    """Patched version of subprocess.run that fixes blocking io, which made timeout= ineffective"""

    if input is not None:
        if 'stdin' in kwargs:
            raise ValueError('stdin and input arguments may not both be used.')
        kwargs['stdin'] = PIPE

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used '
                             'with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE

    with Popen(*popenargs, **kwargs) as process:
        try:
            stdout, stderr = process.communicate(input, timeout=timeout)
        except TimeoutExpired:
            process.kill()
            try:
                stdout, stderr = process.communicate(input, timeout=2)
            except:
                pass
            raise TimeoutExpired(popenargs[0][0], timeout)
        except BaseException as err:
            process.kill()
            # We don't call process.wait() as .__exit__ does that for us.
            raise
        retcode = process.poll()
        if check and retcode:
            raise CalledProcessError(retcode, process.args,
                                     output=stdout, stderr=stderr)
        return CompletedProcess(process.args, retcode, stdout, stderr)
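
A quick usage sketch of the patched run() above, importing it the same way archive_methods does later in this diff; the command and 5-second timeout are placeholder values:

from stdlib_patches import run, PIPE

# raises TimeoutExpired (and kills the child) if the command takes longer than 5 seconds
result = run(['echo', 'hello'], stdout=PIPE, stderr=PIPE, timeout=5)
print(result.returncode, result.stdout.decode())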
### collections

from sys import maxsize
from itertools import islice
from collections import deque

_marker = object()

class Peekable(object):
class PeekableGenerator:
    """Peekable version of a normal python generator.
    Useful when you don't want to evaluate the entire iterable to look at
    a specific item at a given idx.
@ -74,8 +128,6 @@ class Peekable(object):

        return next(self._it)

    next = __next__  # For Python 2 compatibility

    def _get_slice(self, index):
        # Normalize the slice's arguments
        step = 1 if (index.step is None) else index.step
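
Only fragments of the Peekable class appear in this diff, so the sketch below is an assumption about its interface: it mirrors more_itertools.peekable, which this class closely resembles (peek() returns the next item without consuming it, and truthiness reports whether anything is left). The import path and sample data are placeholders.

from stdlib_patches import Peekable

all_links = [{'url': 'https://example.com'}, {'url': 'https://example.org'}]
links = Peekable(iter(all_links))

if links:                      # True while items remain, without consuming one
    upcoming = links.peek()    # look at the next link without advancing the iterator
    print('next up:', upcoming['url'])

for link in links:             # iteration still starts at the peeked item
    print('archiving', link['url'])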
@ -192,22 +192,27 @@

                Bookmarked: <small title="Timestamp: $timestamp">$bookmarked_date</small>
                |
                Last updated: <small title="Timestamp: $updated">$updated_date</small>
                |
                Total files: <small title="Archive methods">🗃 $num_outputs</small>
            </div>
            <div class="col-lg-4 alert well">
                Type:
                <span class="badge badge-default">$extension</span>
                |
                Tags:
                <span class="badge badge-success">$tags</span>
                <span class="badge badge-warning">$tags</span>
                |
                Status:
                <span class="badge badge-$status_color">$status</span>
            </div>
            <div class="col-lg-4 alert well">
                Download:
                Archive Methods:
                <a href="index.json" title="JSON summary of archived link.">JSON</a> |
                <a href="warc/" title="Any WARC archives for the page">WARC</a> |
                <a href="media/" title="Audio, Video, and Subtitle files.">Media</a> |
                <a href="git/" title="Any git repos at the url">Git Repos</a> |
                <a href="favicon.ico" title="Favicon saved for the page">Favicon</a> |
                <a href="." title="Webserver-provided index of files directory.">More files...</a>
                <a href="." title="Webserver-provided index of files directory.">See all files...</a>
            </div>
            <hr/>
            <div class="col-lg-2">
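
The $-style placeholders in this template suggest it is rendered with Python's string.Template; the snippet below is a guess at how a render might look, with the template path and every value made up for illustration:

from string import Template

with open('templates/link_details.html', 'r', encoding='utf-8') as f:
    template = Template(f.read())

html = template.safe_substitute(
    timestamp='1551000000.0',
    bookmarked_date='2019-02-24 10:00',
    updated='1551000500.0',
    updated_date='2019-02-24 10:08',
    num_outputs='7',
    extension='html',
    tags='bookmarks',
    status='archived',
    status_color='success',
)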
@ -8,8 +8,8 @@ from urllib.parse import urlparse, quote

from decimal import Decimal
from datetime import datetime
from multiprocessing import Process
from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError

from stdlib_patches import run, PIPE, DEVNULL
from config import (
    ANSI,
    TERM_WIDTH,
@ -19,8 +19,6 @@ from config import (

    OUTPUT_PERMISSIONS,
    TIMEOUT,
    SHOW_PROGRESS,
    CHECK_SSL_VALIDITY,
    WGET_USER_AGENT,
    CURL_BINARY,
    WGET_BINARY,
    CHROME_BINARY,
@ -37,6 +35,13 @@ from config import (

    FETCH_MEDIA,
    SUBMIT_ARCHIVE_DOT_ORG,
    ARCHIVE_DIR_NAME,
    RESOLUTION,
    CHECK_SSL_VALIDITY,
    WGET_USER_AGENT,
    CHROME_USER_AGENT,
    CHROME_USER_DATA_DIR,
    CHROME_HEADLESS,
    CHROME_SANDBOX,
)

### Parsing Helpers
@ -56,6 +61,7 @@ extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basen

base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

short_ts = lambda ts: ts.split('.')[0]
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')

URL_REGEX = re.compile(
    r'http[s]?://'          # start matching from allowed schemes
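
The helper lambdas are easiest to read against a concrete URL; the annotated values below show what the expressions defined here (together with the without_scheme/basename helpers defined just above this hunk) should produce. The sample URL is made up for illustration:

url = 'https://example.com/path/page.html?what=1#how-about-this=1'

# base_url(url)   -> 'example.com/path/page.html?what=1#how-about-this=1'  (scheme stripped, used to dedupe)
# extension(url)  -> 'html'
# short_ts('1551000000.123') -> '1551000000'
# urlencode('some page.html') -> 'some%20page.html'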
@ -109,66 +115,74 @@ def check_links_structure(links):

def check_dependencies():
    """Check that all necessary dependencies are installed, and have valid versions"""

    python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
    if python_vers < 3.5:
        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
        print(' See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.')
    try:
        python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
        if python_vers < 3.5:
            print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
            print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
            raise SystemExit(1)

        if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
            if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
                print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_WGET or FETCH_WARC:
            if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
                print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
            if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
                print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
                print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

            # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
            try:
                result = run([CHROME_BINARY, '--version'], stdout=PIPE)
                version_str = result.stdout.decode('utf-8')
                version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
                version = [l for l in version_lines if l.isdigit()][-1]
                if int(version) < 59:
                    print(version_lines)
                    print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
                    print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                    raise SystemExit(1)
            except (IndexError, TypeError, OSError):
                print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
                print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_GIT:
            if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
                print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_MEDIA:
            if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
                print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)
    except (KeyboardInterrupt, Exception):
        raise SystemExit(1)
    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
        if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_WGET or FETCH_WARC:
        if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
        if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
        try:
            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
            version_str = result.stdout.decode('utf-8')
            version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
            version = [l for l in version_lines if l.isdigit()][-1]
            if int(version) < 59:
                print(version_lines)
                print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
                print(' See https://github.com/pirate/ArchiveBox for help.')
                raise SystemExit(1)
        except (IndexError, TypeError, OSError):
            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_GIT:
        if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_MEDIA:
        if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)
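
Both versions of check_dependencies repeat the same probe for every tool: `which <binary>` followed by `<binary> --version`. A small helper in the same style shows the pattern once (this refactor is illustrative, not part of the commit):

def binary_works(binary):
    """Return True if the binary is on PATH and responds to --version (illustrative helper)."""
    missing = run(['which', binary], stdout=DEVNULL, stderr=DEVNULL).returncode
    broken = missing or run([binary, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode
    return not broken

# e.g. the wget block above could collapse to:
#   if (FETCH_WGET or FETCH_WARC) and not binary_works(WGET_BINARY):
#       print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
#       raise SystemExit(1)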
def check_url_parsing():
def check_url_parsing_invariants():
    """Check that plain text regex URL parsing works as expected"""

    # this is a last line of defense to make sure the URL_REGEX isn't
    # misbehaving, as the consequences could be disastrous and lead to many
    # incorrect/badly parsed links being added to the archive

    test_urls = '''
    https://example1.com/what/is/happening.html?what=1#how-about-this=1
    https://example2.com/what/is/happening/?what=1#how-about-this=1
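
A sketch of how an invariant check like this typically asserts the regex behaviour. The stand-in pattern below is only for illustration; the real URL_REGEX is the multi-part pattern defined earlier in util.py, and the expected counts are assumptions based on the two test URLs shown:

import re

URL_REGEX_STANDIN = re.compile(r'http[s]?://[^\s<>"\']+')

test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
'''

found = re.findall(URL_REGEX_STANDIN, test_urls)
assert len(found) == 2, 'expected exactly one URL per line'
assert found[0].endswith('how-about-this=1')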
@ -276,22 +290,9 @@ def wget_output_path(link):

    if link.get('latest', {}).get('wget'):
        return link['latest']['wget']

    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')

    if is_static_file(link['url']):
        return urlencode(without_scheme(without_fragment(link['url'])))

    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # instead of trying to emulate it here, we just look in the output folder
    # to see what html file wget actually created as the output
    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
    full_path = without_fragment(without_query(path(link['url']))).strip('/')
    search_dir = os.path.join(
        link_dir,
        domain(link['url']),
        full_path,
    )

    # Wget downloads can save in a number of different ways depending on the url
    #    https://example.com
    #    > output/archive/<timestamp>/example.com/index.html
|
@ -304,6 +305,19 @@ def wget_output_path(link):
|
|||
|
||||
# There's also lots of complexity around how the urlencoding and renaming
|
||||
# is done for pages with query and hash fragments or extensions like shtml / htm
|
||||
|
||||
# Since the wget algorithm for -E (appending .html) is incredibly complex
|
||||
# and there's no way to get the computed output path from wget
|
||||
# in order to avoid having to reverse-engineer how they calculate it,
|
||||
# we just look in the output folder read the filename wget used from the filesystem
|
||||
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
|
||||
full_path = without_fragment(without_query(path(link['url']))).strip('/')
|
||||
search_dir = os.path.join(
|
||||
link_dir,
|
||||
domain(link['url']),
|
||||
full_path,
|
||||
)
|
||||
|
||||
for _ in range(4):
|
||||
if os.path.exists(search_dir):
|
||||
if os.path.isdir(search_dir):
|
||||
|
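
The loop that follows (only its first lines appear in this hunk) walks the search_dir looking for the .html file wget actually wrote. A simplified stand-alone sketch of that idea, with the traversal rules approximated rather than copied from the real function:

import os

def find_wget_html_output(search_dir, link_dir):
    """Walk up from the deepest candidate dir, returning the first .html file found (approximation)."""
    current = search_dir
    for _ in range(4):                      # don't climb more than a few levels above the candidate dir
        if os.path.isdir(current):
            html_files = [f for f in os.listdir(current) if f.endswith('.html')]
            if html_files:
                return os.path.relpath(os.path.join(current, html_files[0]), link_dir)
        if current == link_dir:
            break
        current = os.path.dirname(current)
    return None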
@ -356,47 +370,6 @@ def str_between(string, start, end=None):

    return content


def pretty_path(path):
    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
    return path.replace(REPO_DIR + '/', '')


def print_error_hints(cmd, pwd, err=None, hints=None, prefix=' '):
    """quote the arguments with whitespace in a command so the user can
       copy-paste the output string directly to run the cmd
    """

    # Prettify CMD string and make it safe to copy-paste by quoting arguments
    quoted_cmd = ' '.join(
        '"{}"'.format(arg) if ' ' in arg else arg
        for arg in cmd
    )

    # Prettify error output hints string and limit to five lines
    hints = hints or getattr(err, 'hints', None)
    if hints:
        hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
        hints = (
            ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
            for line in hints[:5] if line.strip()
        )
    else:
        hints = ()

    output_lines = [
        '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
        *hints,
        'Run to see full output:',
        ' cd {};'.format(pwd),
        ' {}'.format(quoted_cmd),
    ]

    return '\n'.join(
        '{}{}'.format(prefix, line)
        for line in output_lines
        if line
    )


### Link Helpers
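
The quoting logic in print_error_hints is easiest to see with a concrete command; the values below are made up for the example:

cmd = ['wget', '--warc-file=archive', 'https://example.com/some page.html']
quoted_cmd = ' '.join('"{}"'.format(arg) if ' ' in arg else arg for arg in cmd)
print(quoted_cmd)
# -> wget --warc-file=archive "https://example.com/some page.html"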
@ -571,37 +544,59 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):

        print(' ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))


def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
    """Patched version of subprocess.run that fixes blocking io, which made timeout= ineffective"""

    if input is not None:
        if 'stdin' in kwargs:
            raise ValueError('stdin and input arguments may not both be used.')
        kwargs['stdin'] = PIPE

CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used '
                             'with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE

def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
                headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
                check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
                resolution=RESOLUTION, timeout=TIMEOUT):
    """helper to build up a chrome shell command with arguments"""

    with Popen(*popenargs, **kwargs) as process:
        try:
            stdout, stderr = process.communicate(input, timeout=timeout)
        except TimeoutExpired:
            process.kill()
            try:
                stdout, stderr = process.communicate(input, timeout=2)
            except:
                pass
            raise TimeoutExpired(popenargs[0][0], timeout)
        except BaseException as err:
            process.kill()
            # We don't call process.wait() as .__exit__ does that for us.
            raise
        retcode = process.poll()
        if check and retcode:
            raise CalledProcessError(retcode, process.args,
                                     output=stdout, stderr=stderr)
        return CompletedProcess(process.args, retcode, stdout, stderr)

    global CACHED_USER_DATA_DIR
    user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
    cmd_args = [binary]

    if headless:
        cmd_args += ('--headless',)

    if not sandbox:
        # don't use GPU or sandbox when running inside docker container
        cmd_args += ('--no-sandbox', '--disable-gpu')

    if not check_ssl_validity:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if user_agent:
        cmd_args += ('--user-agent={}'.format(user_agent),)

    if resolution:
        cmd_args += ('--window-size={}'.format(resolution),)

    if timeout:
        cmd_args += ('--timeout={}'.format(timeout * 1000),)

    # Find chrome user data directory
    default_profile_paths = (
        '~/.config/chromium',
        '~/.config/google-chrome',
        '~/.config/google-chrome-beta',
        '~/.config/google-chrome-unstable',
        '~/Library/Application Support/Chromium',
        '~/Library/Application Support/Google/Chrome',
        '~/Library/Application Support/Google/Chrome Canary',
        '~/AppData/Local/Chromium/User Data',
        '~/AppData/Local/Google/Chrome/User Data',
        '~/AppData/Local/Google/Chrome SxS/User Data',
    )
    if user_data_dir:
        cmd_args.append('--user-data-dir={}'.format(user_data_dir))
    else:
        for path in default_profile_paths:
            full_path = os.path.expanduser(path)
            if os.path.exists(full_path):
                CACHED_USER_DATA_DIR = full_path
                cmd_args.append('--user-data-dir={}'.format(full_path))
                break

    return cmd_args
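
A usage sketch of chrome_args(): build the shared flag list once, then append the action-specific flags and the URL. The --screenshot invocation and the 60-second timeout are illustrative, not taken from this commit:

cmd = [
    *chrome_args(timeout=60),
    '--screenshot',
    'https://example.com',
]
result = run(cmd, stdout=PIPE, stderr=PIPE, timeout=60)
print(result.returncode)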