2017-10-18 22:38:17 +00:00
|
|
|
import os
|
2017-10-23 09:58:41 +00:00
|
|
|
import re
|
2017-10-18 22:38:17 +00:00
|
|
|
import sys
|
2019-03-27 22:24:30 +00:00
|
|
|
import json
|
2017-10-18 22:38:17 +00:00
|
|
|
import time
|
2019-03-27 15:39:51 +00:00
|
|
|
import shutil
|
2017-10-18 22:38:17 +00:00
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
from json import JSONEncoder
|
2019-04-11 11:00:26 +00:00
|
|
|
from typing import List, Optional, Any, Union, IO
|
2019-03-31 01:29:16 +00:00
|
|
|
from inspect import signature
|
2019-03-27 02:26:21 +00:00
|
|
|
from functools import wraps
|
2019-03-26 23:21:34 +00:00
|
|
|
from hashlib import sha256
|
2019-03-08 22:01:15 +00:00
|
|
|
from urllib.request import Request, urlopen
|
2019-03-26 09:33:34 +00:00
|
|
|
from urllib.parse import urlparse, quote, unquote
|
|
|
|
from html import escape, unescape
|
2017-10-18 22:38:17 +00:00
|
|
|
from datetime import datetime
|
|
|
|
from multiprocessing import Process
|
2019-03-21 09:35:41 +00:00
|
|
|
from subprocess import (
|
|
|
|
Popen,
|
|
|
|
PIPE,
|
|
|
|
DEVNULL,
|
|
|
|
CompletedProcess,
|
|
|
|
TimeoutExpired,
|
|
|
|
CalledProcessError,
|
|
|
|
)
|
2017-10-18 22:38:17 +00:00
|
|
|
|
2019-03-31 01:29:16 +00:00
|
|
|
from base32_crockford import encode as base32_encode # type: ignore
|
2019-03-26 23:21:34 +00:00
|
|
|
|
2019-04-02 22:53:21 +00:00
|
|
|
from .schema import Link
|
|
|
|
from .config import (
|
2019-02-21 20:47:15 +00:00
|
|
|
ANSI,
|
|
|
|
TERM_WIDTH,
|
|
|
|
SOURCES_DIR,
|
|
|
|
OUTPUT_PERMISSIONS,
|
2017-10-18 22:38:17 +00:00
|
|
|
TIMEOUT,
|
|
|
|
SHOW_PROGRESS,
|
2019-02-21 20:47:15 +00:00
|
|
|
FETCH_TITLE,
|
2019-03-21 05:28:12 +00:00
|
|
|
CHECK_SSL_VALIDITY,
|
|
|
|
WGET_USER_AGENT,
|
2019-03-23 03:00:53 +00:00
|
|
|
CHROME_OPTIONS,
|
2017-10-18 22:38:17 +00:00
|
|
|
)
|
2019-04-02 22:53:21 +00:00
|
|
|
from .logs import pretty_path
|
2017-10-18 22:38:17 +00:00
|
|
|
|
2019-03-08 22:01:15 +00:00
|
|
|
### Parsing Helpers
|
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
# All of these are (str) -> str
|
|
|
|
# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
|
2019-03-26 23:21:34 +00:00
|
|
|
def scheme(url): return urlparse(url).scheme.lower()
def without_scheme(url): return urlparse(url)._replace(scheme='').geturl().strip('//')
def without_query(url): return urlparse(url)._replace(query='').geturl().strip('//')
def without_fragment(url): return urlparse(url)._replace(fragment='').geturl().strip('//')
def without_path(url): return urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
def path(url): return urlparse(url).path
def basename(url): return urlparse(url).path.rsplit('/', 1)[-1]
def domain(url): return urlparse(url).netloc
def query(url): return urlparse(url).query
def fragment(url): return urlparse(url).fragment
def extension(url): return basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
def base_url(url): return without_scheme(url)  # uniq base url used to dedupe links

def without_www(url): return url.replace('://www.', '://', 1)
def without_trailing_slash(url): return url[:-1] if url[-1] == '/' else url.replace('/?', '?')
def hashurl(url): return base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]

# percent-encoding / html-entity helpers; each passes falsy values through unchanged
def urlencode(s): return s and quote(s, encoding='utf-8', errors='replace')
def urldecode(s): return s and unquote(s)
def htmlencode(s): return s and escape(s, quote=True)
def htmldecode(s): return s and unescape(s)

# timestamp string helpers (parse_date is defined further down in this file)
def short_ts(ts): return str(parse_date(ts).timestamp()).split('.')[0]
def ts_to_date(ts): return ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')
def ts_to_iso(ts): return ts and parse_date(ts).isoformat()
|
|
|
|
|
2017-10-23 09:58:41 +00:00
|
|
|
|
2019-02-27 09:49:25 +00:00
|
|
|
# Matches URLs embedded in arbitrary plain text (used by the text importers).
# NOTE(review): the final character class contains a doubled quote (\"") —
# harmless inside a class, kept as-is since editing it would alter matching.
URL_REGEX = re.compile(
    r'http[s]?://'          # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'    # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'         # or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed unicode bytes
    r'[^\]\[\(\)<>\""\'\s]+',   # stop parsing at these symbols
    re.IGNORECASE,
)

# Captures the text content of an html <title> tag (DOTALL lets titles span lines)
HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'                      # start matching text after <title> tag
    r'(.[^<>]+)',                      # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)

STATICFILE_EXTENSIONS = {
    # 99.999% of the time, URLs ending in these extensions are static files
    # that can be downloaded as-is, not html pages that need to be rendered
    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 
    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
    'atom', 'rss', 'css', 'js', 'json',
    'dmg', 'iso', 'img',
    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

    # Less common extensions to consider adding later
    # jar, swf, bin, com, exe, dll, deb
    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, 
    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # These are always treated as pages, not as static files, never add them:
    # html, htm, shtml, xhtml, xml, aspx, php, cgi
}
|
2019-01-11 09:09:39 +00:00
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
|
2019-03-26 07:20:41 +00:00
|
|
|
|
2019-03-08 22:01:15 +00:00
|
|
|
### Checks & Tests
|
|
|
|
|
2019-03-27 02:26:21 +00:00
|
|
|
def enforce_types(func):
    """
    Enforce function arg and kwarg types at runtime using its python3 type hints

    Only plain classes (int, str, ...) are enforced; typing constructs like
    Optional[str] are skipped because they are not usable with isinstance().
    """
    # TODO: check return type as well

    # perf fix: compute the signature once at decoration time instead of
    # re-introspecting the function on every single call
    sig = signature(func)

    @wraps(func)
    def typechecked_function(*args, **kwargs):

        def check_argument_type(arg_key, arg_val):
            try:
                annotation = sig.parameters[arg_key].annotation
            except KeyError:
                # extra positional args beyond the declared params are not checked
                annotation = None

            # annotation.__class__ is type <=> the hint is a plain class
            if annotation is not None and annotation.__class__ is type:
                if not isinstance(arg_val, annotation):
                    raise TypeError(
                        '{}(..., {}: {}) got unexpected {} argument {}={}'.format(
                            func.__name__,
                            arg_key,
                            annotation.__name__,
                            type(arg_val).__name__,
                            arg_key,
                            str(arg_val)[:64],
                        )
                    )

        # check args
        for arg_val, arg_key in zip(args, sig.parameters):
            check_argument_type(arg_key, arg_val)

        # check kwargs
        for arg_key, arg_val in kwargs.items():
            check_argument_type(arg_key, arg_val)

        return func(*args, **kwargs)

    return typechecked_function
|
|
|
|
|
2017-10-23 09:58:41 +00:00
|
|
|
|
2019-03-26 07:20:41 +00:00
|
|
|
def check_url_parsing_invariants() -> None:
|
2019-02-27 09:49:25 +00:00
|
|
|
"""Check that plain text regex URL parsing works as expected"""
|
2019-03-21 05:28:12 +00:00
|
|
|
|
|
|
|
# this is last-line-of-defense to make sure the URL_REGEX isn't
|
|
|
|
# misbehaving, as the consequences could be disastrous and lead to many
|
|
|
|
# incorrect/badly parsed links being added to the archive
|
|
|
|
|
2019-02-27 09:49:25 +00:00
|
|
|
test_urls = '''
|
|
|
|
https://example1.com/what/is/happening.html?what=1#how-about-this=1
|
|
|
|
https://example2.com/what/is/happening/?what=1#how-about-this=1
|
|
|
|
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
|
|
|
|
https://example4.com/what/is/happening.html
|
|
|
|
https://example5.com/
|
|
|
|
https://example6.com
|
|
|
|
|
|
|
|
<test>http://example7.com</test>
|
|
|
|
[https://example8.com/what/is/this.php?what=1]
|
|
|
|
[and http://example9.com?what=1&other=3#and-thing=2]
|
|
|
|
<what>https://example10.com#and-thing=2 "</about>
|
|
|
|
abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
|
|
|
|
sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
|
|
|
|
example13.bada
|
|
|
|
and example14.badb
|
|
|
|
<or>htt://example15.badc</that>
|
|
|
|
'''
|
|
|
|
# print('\n'.join(re.findall(URL_REGEX, test_urls)))
|
|
|
|
assert len(re.findall(URL_REGEX, test_urls)) == 12
|
|
|
|
|
|
|
|
|
2019-03-08 22:01:15 +00:00
|
|
|
### Random Helpers
|
2017-10-18 22:38:17 +00:00
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
@enforce_types
|
2019-03-28 00:48:41 +00:00
|
|
|
def handle_stdin_import(raw_text: str) -> str:
    """Save text piped in via stdin to output/sources/stdin-<timestamp>.txt
    and return the path of the saved file."""

    # exist_ok avoids a TOCTOU race if two imports create the dir concurrently
    os.makedirs(SOURCES_DIR, exist_ok=True)

    # whole-second unix timestamp used to make the filename unique
    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))

    atomic_write(raw_text, source_path)
    return source_path
|
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
|
|
|
|
@enforce_types
|
2019-03-28 00:48:41 +00:00
|
|
|
def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
    """download a given url's content into output/sources/domain-<timestamp>.txt

    Local paths are copied; http(s)/ftp urls are downloaded (exits with
    status 1 if the download fails). Returns the path of the saved file.
    """

    # exist_ok avoids a TOCTOU race if two imports create the dir concurrently
    os.makedirs(SOURCES_DIR, exist_ok=True)

    # whole-second unix timestamp used to make the filename unique
    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        # remote url: name the source file after the domain and download it
        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
        print('{}[*] [{}] Downloading {}{}'.format(
            ANSI['green'],
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            path,
            ANSI['reset'],
        ))
        timer = TimedProgress(timeout, prefix=' ')
        try:
            raw_source_text = download_url(path, timeout=timeout)
            timer.end()
        except Exception as e:
            timer.end()
            print('{}[!] Failed to download {}{}\n'.format(
                ANSI['red'],
                path,
                ANSI['reset'],
            ))
            print(' ', e)
            raise SystemExit(1)

    else:
        # local file: just read its contents directly
        with open(path, 'r') as f:
            raw_source_text = f.read()

    atomic_write(raw_source_text, source_path)

    print(' > {}'.format(pretty_path(source_path)))

    return source_path
|
2017-10-18 22:38:17 +00:00
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
|
|
|
|
@enforce_types
|
2019-03-26 07:20:41 +00:00
|
|
|
def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) -> Optional[str]:
    """Attempt to guess a page's title by downloading the html"""

    if not FETCH_TITLE:
        return None

    try:
        html = download_url(url, timeout=timeout)
        match = re.search(HTML_TITLE_REGEX, html)
        if match:
            return htmldecode(match.group(1).strip())
        return None
    except Exception:  # noqa
        # title fetching is best-effort: any network or parse error yields None
        return None
|
2019-01-11 09:09:39 +00:00
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
|
|
|
|
@enforce_types
|
2019-03-26 07:20:41 +00:00
|
|
|
def wget_output_path(link: Link) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    Returns a path relative to link.link_dir, or None if no saved html
    file could be found on disk.

    See docs on wget --adjust-extension (-E)
    """

    # static files (pdf, png, ...) are saved under their url path as-is
    if is_static_file(link.url):
        return without_scheme(without_fragment(link.url))

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > output/archive/<timestamp>/example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html

    #    https://example.com/abc
    #       > output/archive/<timestamp>/example.com/abc.html
    #    https://example.com/abc/
    #       > output/archive/<timestamp>/example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html

    #    https://example.com/abc/test.html
    #       > output/archive/<timestamp>/example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc

    # Since the wget algorithm for -E (appending .html) is incredibly complex
    # and there's no way to get the computed output path from wget
    # in order to avoid having to reverse-engineer how they calculate it,
    # we just look in the output folder read the filename wget used from the filesystem

    # deepest directory wget could have saved into, based on the url path
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = os.path.join(
        link.link_dir,
        domain(link.url),
        urldecode(full_path),
    )

    # walk up to 4 directory levels looking for the first saved .htm/.html file
    for _ in range(4):
        if os.path.exists(search_dir):
            if os.path.isdir(search_dir):
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                ]
                if html_files:
                    # return the match relative to the link's archive folder
                    path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
                    return os.path.join(path_from_link_dir, html_files[0])

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        if search_dir == link.link_dir:
            break

    # no html output found on disk for this link
    return None
|
2018-04-17 13:13:38 +00:00
|
|
|
|
2017-10-23 09:58:41 +00:00
|
|
|
|
2019-03-08 22:01:15 +00:00
|
|
|
### String Manipulation & Logging Helpers
|
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
@enforce_types
|
2019-03-26 07:20:41 +00:00
|
|
|
def str_between(string: str, start: str, end: Optional[str]=None) -> str:
    """(<abc>12345</def>, <abc>, </def>) -> 12345

    Returns the substring after the first occurrence of `start`, trimmed at
    the last occurrence of `end` (if given). Annotation fix: `end` defaults
    to None so it is Optional[str], not str.
    """

    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content
|
|
|
|
|
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
@enforce_types
|
2019-03-27 02:26:21 +00:00
|
|
|
def parse_date(date: Any) -> Optional[datetime]:
    """Parse unix timestamps, iso format, and human-readable strings"""

    if date is None:
        return None

    if isinstance(date, datetime):
        return date

    if isinstance(date, (float, int)):
        date = str(date)

    if isinstance(date, str):
        if date.replace('.', '').isdigit():
            # this is a brittle attempt at unix timestamp parsing (which is
            # notoriously hard to do). It may lead to dates being off by
            # anything from hours to decades, depending on which app, OS,
            # and sytem time configuration was used for the original timestamp
            # more info: https://github.com/pirate/ArchiveBox/issues/119

            # Note: always always always store the original timestamp string
            # somewhere indepentendly of the parsed datetime, so that later
            # bugs dont repeatedly misparse and rewrite increasingly worse dates.
            # the correct date can always be re-derived from the timestamp str
            timestamp = float(date)

            EARLIEST_POSSIBLE = 473403600.0   # 1985
            LATEST_POSSIBLE = 1735707600.0    # 2025

            # try interpreting the number as seconds, milliseconds, then
            # microseconds, accepting whichever lands in the sane date range
            for divisor in (1, 1000, 1000 * 1000):
                if EARLIEST_POSSIBLE * divisor < timestamp < LATEST_POSSIBLE * divisor:
                    return datetime.fromtimestamp(timestamp / divisor)

            # numbers outside all three ranges fall through to the
            # parsing-failed ValueError below on purpose

        if '-' in date:
            # e.g. 2019-04-07T05:44:39.227520
            try:
                return datetime.fromisoformat(date)
            except Exception:
                try:
                    return datetime.strptime(date, '%Y-%m-%d %H:%M')
                except Exception:
                    pass

    raise ValueError('Tried to parse invalid date! {}'.format(date))
|
|
|
|
|
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
@enforce_types
|
2019-03-26 07:20:41 +00:00
|
|
|
def is_static_file(url: str) -> bool:
    """Certain URLs just point to a single static file, and
    don't need to be re-archived in many formats
    """

    # TODO: the proper way is with MIME type detection, not using extension
    url_extension = extension(url)
    return url_extension in STATICFILE_EXTENSIONS
|
2017-10-23 09:58:41 +00:00
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
|
2019-03-23 01:38:08 +00:00
|
|
|
|
2019-03-08 22:01:15 +00:00
|
|
|
### Python / System Helpers
|
|
|
|
|
2019-03-21 09:35:41 +00:00
|
|
|
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
    """Patched version of subprocess.run that kills the process on timeout
    (fixes blocking io making timeout ineffective)"""

    if input is not None:
        if 'stdin' in kwargs:
            raise ValueError('stdin and input arguments may not both be used.')
        kwargs['stdin'] = PIPE

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used '
                             'with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE

    with Popen(*popenargs, **kwargs) as process:
        try:
            stdout, stderr = process.communicate(input, timeout=timeout)
        except TimeoutExpired:
            process.kill()
            try:
                # best-effort grab of any remaining buffered output after the kill
                stdout, stderr = process.communicate(input, timeout=2)
            except Exception:
                # bugfix: was a bare `except:` which would also swallow
                # KeyboardInterrupt/SystemExit raised during the final read
                pass
            raise TimeoutExpired(popenargs[0][0], timeout)
        except BaseException:
            process.kill()
            # We don't call process.wait() as .__exit__ does that for us.
            raise
        retcode = process.poll()
        if check and retcode:
            raise CalledProcessError(retcode, process.args,
                                     output=stdout, stderr=stderr)
        return CompletedProcess(process.args, retcode, stdout, stderr)
|
|
|
|
|
|
|
|
|
2019-03-26 07:20:41 +00:00
|
|
|
class TimedProgress:
    """Show a progress bar and measure elapsed time until .end() is called"""

    def __init__(self, seconds, prefix=''):
        # bugfix: always bind self.p so end() can never hit an AttributeError,
        # even if SHOW_PROGRESS changes between __init__ and end()
        self.p = None
        if SHOW_PROGRESS:
            # render the bar in a separate process so it animates while we work
            self.p = Process(target=progress_bar, args=(seconds, prefix))
            self.p.start()

        self.stats = {'start_ts': datetime.now(), 'end_ts': None}

    def end(self):
        """immediately end progress, clear the progressbar line, and save end_ts"""

        end_ts = datetime.now()
        self.stats['end_ts'] = end_ts
        if SHOW_PROGRESS:
            # protect from double termination
            if self.p is not None:
                self.p.terminate()

            self.p = None

            sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))  # clear whole terminal line
|
2019-03-26 07:20:41 +00:00
|
|
|
|
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
@enforce_types
|
2019-03-26 07:20:41 +00:00
|
|
|
def progress_bar(seconds: int, prefix: str='') -> None:
    """show timer in the form of progress bar, with percentage and seconds remaining"""

    chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
    chunks = TERM_WIDTH() - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
    try:
        for tick in range(seconds * chunks):
            # re-read the width each tick so the bar tracks terminal resizes
            chunks = TERM_WIDTH() - len(prefix) - 20
            progress = tick / chunks / seconds * 100
            bar_width = round(progress / (100 / chunks))
            filled = (chunk * bar_width).ljust(chunks)

            # ████████████████████           0.9% (1/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                prefix,
                ANSI['green'],
                filled,
                ANSI['reset'],
                round(progress, 1),
                round(tick / chunks),
                seconds,
            ))
            sys.stdout.flush()
            time.sleep(1 / chunks)

        # ██████████████████████████████████ 100.0% (60/60sec)
        sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
            prefix,
            ANSI['red'],
            chunk * chunks,
            ANSI['reset'],
            100.0,
            seconds,
            seconds,
        ))
        sys.stdout.flush()
    except KeyboardInterrupt:
        # user cancelled: drop to a fresh line and return quietly
        print()
|
|
|
|
|
2019-03-08 22:01:15 +00:00
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
@enforce_types
|
2019-03-26 07:20:41 +00:00
|
|
|
def download_url(url: str, timeout: int=TIMEOUT) -> str:
    """Download the contents of a remote url and return the text"""

    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})

    if CHECK_SSL_VALIDITY:
        context = None  # default ssl context with full certificate verification
    else:
        import ssl
        # user explicitly disabled ssl checks, so skip certificate verification
        context = ssl._create_unverified_context()

    resp = urlopen(req, timeout=timeout, context=context)

    encoding = resp.headers.get_content_charset() or 'utf-8'  # type: ignore
    return resp.read().decode(encoding)
|
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
|
|
|
|
@enforce_types
|
2019-03-26 07:20:41 +00:00
|
|
|
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
    """chmod -R <permissions> <cwd>/<path>

    Raises Exception if the path does not exist or chmod exits nonzero.
    """

    if not os.path.exists(os.path.join(cwd, path)):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
    # bugfix: treat any nonzero exit code as failure, not just exit code 1
    if chmod_result.returncode != 0:
        print(' ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
|
|
|
|
|
2019-01-20 19:07:28 +00:00
|
|
|
|
2019-03-27 15:39:51 +00:00
|
|
|
@enforce_types
|
|
|
|
def copy_and_overwrite(from_path: str, to_path: str):
    """Recursively copy from_path to to_path, replacing any existing tree there."""

    # copytree requires the destination not to exist, so clear it first
    destination_exists = os.path.exists(to_path)
    if destination_exists:
        shutil.rmtree(to_path)

    shutil.copytree(from_path, to_path)
|
|
|
|
|
2019-03-27 03:25:07 +00:00
|
|
|
@enforce_types
|
2019-03-26 07:20:41 +00:00
|
|
|
def chrome_args(**options) -> List[str]:
    """helper to build up a chrome shell command with arguments"""

    # caller-supplied options override the configured defaults
    options = {**CHROME_OPTIONS, **options}

    cmd_args = [options['CHROME_BINARY']]

    if options['CHROME_HEADLESS']:
        cmd_args.append('--headless')

    if not options['CHROME_SANDBOX']:
        # dont use GPU or sandbox when running inside docker container
        cmd_args.extend(('--no-sandbox', '--disable-gpu'))

    if not options['CHECK_SSL_VALIDITY']:
        cmd_args.extend(('--disable-web-security', '--ignore-certificate-errors'))

    if options['CHROME_USER_AGENT']:
        cmd_args.append('--user-agent={}'.format(options['CHROME_USER_AGENT']))

    if options['RESOLUTION']:
        cmd_args.append('--window-size={}'.format(options['RESOLUTION']))

    if options['TIMEOUT']:
        cmd_args.append('--timeout={}'.format((options['TIMEOUT']) * 1000))

    if options['CHROME_USER_DATA_DIR']:
        cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))

    return cmd_args
|
2019-03-26 09:33:34 +00:00
|
|
|
|
|
|
|
|
|
|
|
class ExtendedEncoder(JSONEncoder):
    """
    Extended json serializer that supports serializing several model
    fields and objects
    """

    def default(self, obj):
        # namedtuple-style objects serialize via their dict representation
        if hasattr(obj, '_asdict'):
            return obj._asdict()

        if isinstance(obj, bytes):
            return obj.decode()

        if isinstance(obj, datetime):
            return obj.isoformat()

        if isinstance(obj, Exception):
            return '{}: {}'.format(obj.__class__.__name__, obj)

        # dict view objects serialize as plain tuples
        if obj.__class__.__name__ in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        return JSONEncoder.default(self, obj)
|
2019-03-27 22:24:57 +00:00
|
|
|
|
|
|
|
|
2019-04-11 11:00:26 +00:00
|
|
|
def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> Optional[str]:
    """Serialize obj to a JSON string; if file is given, also atomically
    write the JSON to file's path. Returns the JSON string either way."""

    # bugfix: honor the cls argument instead of always using ExtendedEncoder
    contents = json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)
    if file:
        path = os.path.realpath(file.name)
        atomic_write(contents, path)
    return contents
|
|
|
|
|
|
|
|
|
2019-04-11 12:11:32 +00:00
|
|
|
def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
           header: bool=True, ljust: int=0, separator: str=',') -> str:
    """Render links as CSV text: one row per link, each column ljust-padded,
    optionally preceded by a header row naming the columns."""

    csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']

    rows = [
        link.to_csv(csv_cols=csv_cols, ljust=ljust, separator=separator)
        for link in links
    ]

    if header:
        # bugfix: previously header=False still emitted an empty first line
        rows.insert(0, separator.join(col.ljust(ljust) for col in csv_cols))

    return '\n'.join(rows)
|
2019-04-11 11:00:26 +00:00
|
|
|
|
|
|
|
|
2019-03-30 19:03:46 +00:00
|
|
|
def atomic_write(contents: Union[dict, str], path: str) -> None:
    """Safe atomic write to filesystem by writing to temp file + atomic rename"""

    # bind before the try so the finally clause can always reference it
    tmp_file = '{}.tmp'.format(path)
    try:
        with open(tmp_file, 'w+', encoding='utf-8') as f:
            if isinstance(contents, dict):
                to_json(contents, file=f)
            else:
                f.write(contents)

            # flush OS buffers so the temp file is durable before the rename
            os.fsync(f.fileno())

        # bugfix: os.replace is atomic and overwrites on all platforms,
        # whereas os.rename raises on Windows when the destination exists
        os.replace(tmp_file, path)
        chmod_file(path)
    finally:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
|
2019-04-03 04:27:37 +00:00
|
|
|
|
|
|
|
|
|
|
|
def reject_stdin(caller: str) -> None:
    """Tell the user they passed stdin to a command that doesn't accept it"""

    # interactive terminal: nothing was piped in, nothing to reject
    if sys.stdin.isatty():
        return

    stdin_raw_text = sys.stdin.read().strip()
    if not stdin_raw_text:
        return

    print(
        '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
            caller,
            **ANSI,
        )
    )
    print(' Run archivebox "{} --help" to see usage and examples.'.format(
        caller,
    ))
    print()
    raise SystemExit(1)
|