ArchiveBox/archivebox/util.py

__package__ = 'archivebox'

import re
from pathlib import Path
import json as pyjson


from typing import List, Optional, Any
from inspect import signature
from functools import wraps
from hashlib import sha256
from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime
from dateparser import parse as dateparser

import requests
from requests.exceptions import RequestException, ReadTimeout
from .base32_crockford import encode as base32_encode                            # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding

try:
    import chardet
except ImportError:
    def detect_encoding(rawdata):
        """Fallback used when chardet is not installed: always report utf-8."""
        return "utf-8"
else:
    def detect_encoding(rawdata):
        """Return the encoding name chardet detects for the given raw data."""
        return chardet.detect(rawdata)["encoding"]

### Parsing Helpers

# URL component helpers, every one of them is (str) -> str.
# They are thin wrappers around urlparse, see:
# https://docs.python.org/3/library/urllib.parse.html#url-parsing
#
# NOTE: .strip('//') treats its argument as a set of characters, so the
# without_* helpers remove every leading AND trailing '/' from their result.

def scheme(url):
    """Return the lowercased scheme of the url."""
    return urlparse(url).scheme.lower()


def without_scheme(url):
    """Return the url minus its scheme, with '/' trimmed from both ends."""
    parts = urlparse(url)
    return parts._replace(scheme='').geturl().strip('//')


def without_query(url):
    """Return the url minus its query string, with '/' trimmed from both ends."""
    parts = urlparse(url)
    return parts._replace(query='').geturl().strip('//')


def without_fragment(url):
    """Return the url minus its fragment, with '/' trimmed from both ends."""
    parts = urlparse(url)
    return parts._replace(fragment='').geturl().strip('//')


def without_path(url):
    """Return the url minus path, query and fragment, with '/' trimmed from both ends."""
    parts = urlparse(url)
    return parts._replace(path='', fragment='', query='').geturl().strip('//')


def path(url):
    """Return the path component of the url."""
    return urlparse(url).path


def basename(url):
    """Return whatever follows the last '/' of the url's path."""
    return urlparse(url).path.rsplit('/', 1)[-1]


def domain(url):
    """Return the netloc component of the url."""
    return urlparse(url).netloc


def query(url):
    """Return the query component of the url."""
    return urlparse(url).query


def fragment(url):
    """Return the fragment component of the url."""
    return urlparse(url).fragment


def extension(url):
    """Return the lowercased text after the last '.' of the basename, or ''."""
    name = basename(url)
    if '.' not in name:
        return ''
    return name.rsplit('.', 1)[-1].lower()


def base_url(url):
    """Return the uniq base url used to dedupe links (the url without its scheme)."""
    return without_scheme(url)

def without_www(url):
    """Return the url with the first '://www.' replaced by '://'."""
    return url.replace('://www.', '://', 1)


def without_trailing_slash(url):
    """
    Return the url without its final '/'.

    When the url does not end in '/', every '/?' is collapsed to '?' instead
    (i.e. the slash right before a query string is dropped).
    An empty string is returned unchanged.
    """
    # endswith() is safe on '' whereas indexing url[-1] raises IndexError
    if url.endswith('/'):
        return url[:-1]
    return url.replace('/?', '?')


def hashurl(url):
    """Return the first 20 characters of the base32-encoded sha256 of base_url(url)."""
    digest = sha256(base_url(url).encode('utf-8')).hexdigest()
    return base32_encode(int(digest, 16))[:20]

def urlencode(s):
    """Percent-encode s (utf-8, undecodable chars replaced); falsy values pass through."""
    if not s:
        return s
    return quote(s, encoding='utf-8', errors='replace')


def urldecode(s):
    """Reverse percent-encoding of s; falsy values pass through unchanged."""
    if not s:
        return s
    return unquote(s)


def htmlencode(s):
    """Escape html special characters (quotes included); falsy values pass through."""
    if not s:
        return s
    return escape(s, quote=True)


def htmldecode(s):
    """Turn html entities back into characters; falsy values pass through unchanged."""
    if not s:
        return s
    return unescape(s)

def short_ts(ts):
    """Return the unix timestamp of ts as a string, cut off at the decimal point."""
    unix_time = parse_date(ts).timestamp()
    return str(unix_time).split('.')[0]


def ts_to_date(ts):
    """Format ts as 'YYYY-MM-DD HH:MM'; falsy values are returned unchanged."""
    if not ts:
        return ts
    return parse_date(ts).strftime('%Y-%m-%d %H:%M')


def ts_to_iso(ts):
    """Format ts as an ISO 8601 string; falsy values are returned unchanged."""
    if not ts:
        return ts
    return parse_date(ts).isoformat()


# Matches an http:// or https:// URL (case-insensitively): the scheme, then one
# character from the "allowed" alternatives, then one or more characters up to
# the first bracket, paren, angle bracket, quote or whitespace.
# NOTE: inside '[$-_@.&+]' the '$-_' is a character RANGE ('$' through '_'),
# so that class accepts far more than the handful of symbols it spells out.
URL_REGEX = re.compile(
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by one allowed alphanum character
    r'|[$-_@.&+]|[!*\(\),]'           #    or one allowed symbol (see range note above)
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  #    or one percent-encoded byte (%XX)
    r'[^\]\[\(\)<>"\'\s]+',         # then keep matching until one of these symbols
    re.IGNORECASE,
)

# Matches '[' + one to three ';'-separated decimal arguments + 'm', capturing
# the arguments as the named groups arg_1, arg_2 and arg_3 (unused ones are None).
# NOTE: the pattern starts at the literal '[' and does not include an ESC byte.
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')

def is_static_file(url: str):
    """Tell whether the url's file extension is listed in STATICFILE_EXTENSIONS"""
    # TODO: the proper way is with MIME type detection + ext, not only extension
    from .config import STATICFILE_EXTENSIONS

    file_ext = extension(url).lower()
    return file_ext in STATICFILE_EXTENSIONS


def enforce_types(func):
    """
    Enforce function arg and kwarg types at runtime using its python3 type hints

    Only annotations that are plain classes (e.g. str, int) are enforced, via
    isinstance(). Unannotated parameters and annotations that are not plain
    classes (e.g. Optional[str], List[str]) are accepted without any check.
    Extra positional arguments are checked against the *args annotation.

    The returned wrapper raises TypeError when a checked argument has the
    wrong type, otherwise it calls func and returns its result.
    """
    # TODO: check return type as well

    # the signature never changes, so inspect it once at decoration time
    sig = signature(func)

    positional_names = []       # params that can be filled positionally, in order
    var_positional_name = None  # name of the *args param, if the function has one
    checkable = {}              # param name -> class to isinstance() against

    for param in sig.parameters.values():
        if param.kind in (param.POSITIONAL_ONLY, param.POSITIONAL_OR_KEYWORD):
            positional_names.append(param.name)
        elif param.kind == param.VAR_POSITIONAL:
            var_positional_name = param.name

        annotation = param.annotation
        # Parameter.empty (the "no annotation" marker) is itself a class, so it
        # must be excluded explicitly or every unannotated argument is rejected
        if annotation is not param.empty and annotation.__class__ is type:
            checkable[param.name] = annotation

    def check_argument_type(arg_key, arg_val):
        annotation = checkable.get(arg_key)
        if annotation is not None and not isinstance(arg_val, annotation):
            raise TypeError(
                '{}(..., {}: {}) got unexpected {} argument {}={}'.format(
                    func.__name__,
                    arg_key,
                    annotation.__name__,
                    type(arg_val).__name__,
                    arg_key,
                    str(arg_val)[:64],
                )
            )

    @wraps(func)
    def typechecked_function(*args, **kwargs):
        # check args
        for index, arg_val in enumerate(args):
            if index < len(positional_names):
                check_argument_type(positional_names[index], arg_val)
            elif var_positional_name is not None:
                # overflow positionals all belong to *args
                check_argument_type(var_positional_name, arg_val)
            else:
                # too many positionals: let func raise its own TypeError
                break

        # check kwargs
        for arg_key, arg_val in kwargs.items():
            check_argument_type(arg_key, arg_val)

        return func(*args, **kwargs)

    return typechecked_function


def docstring(text: Optional[str]):
    """decorator factory: set the decorated function's __doc__ to text (if text is truthy)"""
    def attach(target):
        # a falsy text leaves whatever docstring the function already has
        if not text:
            return target
        target.__doc__ = text
        return target
    return attach


@enforce_types
def str_between(string: str, start: str, end: str=None) -> str:
    """
    Return what follows the first `start` and, if `end` is given, precedes the last `end`

    (<abc>12345</def>, <abc>, </def>)  ->  12345

    A marker that does not occur in the text leaves that side untrimmed.
    """

    _, start_found, after_start = string.partition(start)
    content = after_start if start_found else string

    if end is None:
        return content

    before_end, end_found, _ = content.rpartition(end)
    return before_end if end_found else content


@enforce_types
def parse_date(date: Any) -> Optional[datetime]:
    """
    Parse unix timestamps, iso format, and human-readable strings

    None and datetime objects are returned as-is; ints, floats and strings are
    handed to dateparser (numbers in their str() form). Any other type raises
    ValueError.
    """

    if date is None or isinstance(date, datetime):
        return date

    if isinstance(date, (float, int)):
        return dateparser(str(date))

    if isinstance(date, str):
        return dateparser(date)

    raise ValueError('Tried to parse invalid date! {}'.format(date))


@enforce_types
def download_url(url: str, timeout: int=None) -> str:
    """Fetch a remote url with a GET request and return the decoded response text"""
    from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT

    response = requests.get(
        url,
        headers={'User-Agent': WGET_USER_AGENT},
        verify=CHECK_SSL_VALIDITY,
        timeout=timeout or TIMEOUT,   # falsy timeout falls back to the config value
    )

    # prefer the charset from the Content-Type header, and only look for one
    # declared inside the body when the header does not provide any
    declared_encoding = http_content_type_encoding(response.headers.get('Content-Type', ''))
    if not declared_encoding:
        declared_encoding = html_body_declared_encoding(response.text)

    if declared_encoding is not None:
        response.encoding = declared_encoding

    return response.text

@enforce_types
def get_headers(url: str, timeout: int=None) -> str:
    """
    Fetch the response headers of a remote url and return them as a JSON string

    A HEAD request (following redirects) is tried first. If it fails with a
    RequestException or answers with a status >= 400, a streamed GET request is
    made instead and its headers are used. A ReadTimeout on the HEAD request is
    re-raised rather than triggering the fallback; errors raised by the GET
    fallback propagate to the caller.
    """
    from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
    timeout = timeout or TIMEOUT

    try:
        response = requests.head(
            url,
            headers={'User-Agent': WGET_USER_AGENT},
            verify=CHECK_SSL_VALIDITY,
            timeout=timeout,
            allow_redirects=True,
        )
        if response.status_code >= 400:
            # treat an error status like a failed request so the GET fallback runs
            raise RequestException
    except ReadTimeout:
        raise
    except RequestException:
        response = requests.get(
            url,
            headers={'User-Agent': WGET_USER_AGENT},
            verify=CHECK_SSL_VALIDITY,
            timeout=timeout,
            stream=True
        )

    try:
        return pyjson.dumps(dict(response.headers), indent=4)
    finally:
        # the streamed GET body is never read, so the connection has to be
        # released explicitly
        response.close()


@enforce_types
def chrome_args(**options) -> List[str]:
    """build the chrome command (binary followed by flags) from CHROME_OPTIONS plus overrides"""

    from .config import CHROME_OPTIONS

    # keyword overrides win over the configured defaults
    opts = {**CHROME_OPTIONS, **options}

    cmd_args = [opts['CHROME_BINARY']]

    if opts['CHROME_HEADLESS']:
        cmd_args.append('--headless')

    if not opts['CHROME_SANDBOX']:
        # assume this means we are running inside a docker container
        # in docker, GPU support is limited, sandboxing is unnecessary,
        # and SHM is limited to 64MB by default (which is too low to be usable).
        cmd_args.extend([
            '--no-sandbox',
            '--disable-gpu',
            '--disable-dev-shm-usage',
            '--disable-software-rasterizer',
        ])

    if not opts['CHECK_SSL_VALIDITY']:
        cmd_args.extend(['--disable-web-security', '--ignore-certificate-errors'])

    user_agent = opts['CHROME_USER_AGENT']
    if user_agent:
        cmd_args.append('--user-agent={}'.format(user_agent))

    resolution = opts['RESOLUTION']
    if resolution:
        cmd_args.append('--window-size={}'.format(resolution))

    timeout = opts['TIMEOUT']
    if timeout:
        # the flag receives the configured value multiplied by 1000
        cmd_args.append('--timeout={}'.format(timeout * 1000))

    user_data_dir = opts['CHROME_USER_DATA_DIR']
    if user_data_dir:
        cmd_args.append('--user-data-dir={}'.format(user_data_dir))

    return cmd_args


def ansi_to_html(text):
    """
    Swap the color codes matched by COLOR_REGEX for html <span> tags colored via COLOR_DICT

    Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
    """
    from .config import COLOR_DICT

    span_template = '<span style="color: rgb{}"><br>'

    def to_span(match):
        first, second, third = match.group('arg_1', 'arg_2', 'arg_3')
        # a lone argument is the color code itself, otherwise it is the second one
        color = first if (second is None and third is None) else second
        return span_template.format(COLOR_DICT[color][0])

    # reset codes close the currently open span
    closed = text.replace('[m', '</span>')
    return COLOR_REGEX.sub(to_span, closed)


class _MissingAttributeError(AttributeError, KeyError):
    """Raised by AttributeDict for a missing key; catchable as either base type."""


class AttributeDict(dict):
    """
    Helper to allow accessing dict values via Example.key or Example['key']

    Attribute reads fall back to the dict items only when normal attribute
    lookup fails, and attribute writes store a dict item. Nested dicts are
    NOT converted, they stay plain dicts.

    Reading a missing attribute raises an error that is both an AttributeError
    (so hasattr(), getattr(obj, name, default) and copy.deepcopy() work) and a
    KeyError (so existing `except KeyError` handlers keep working).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __getattr__(self, attr: str) -> Any:
        try:
            return dict.__getitem__(self, attr)
        except KeyError:
            # python's attribute protocol expects AttributeError for a miss
            raise _MissingAttributeError(attr) from None

    def __setattr__(self, attr: str, value: Any) -> None:
        return dict.__setitem__(self, attr, value)


class ExtendedEncoder(pyjson.JSONEncoder):
    """
    JSON encoder that additionally knows how to serialize objects exposing
    _asdict(), bytes, datetimes, exceptions, Paths and dict views
    """

    def default(self, obj):
        # anything exposing _asdict() is serialized as that dict
        if hasattr(obj, '_asdict'):
            return obj._asdict()

        if isinstance(obj, bytes):
            return obj.decode()

        if isinstance(obj, datetime):
            return obj.isoformat()

        type_name = obj.__class__.__name__

        if isinstance(obj, Exception):
            return '{}: {}'.format(type_name, obj)

        if isinstance(obj, Path):
            return str(obj)

        # dict views are recognized by their class name
        if type_name in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        # everything else: defer to the stock encoder
        return super().default(obj)
fix: Change import that was not working 2020-07-31 08:32:29 -05:00			`__package__ = 'archivebox'`

refactoring and fancy new link index 2017-10-23 04:58:41 -05:00			`import re`
test: Fix tests post-rebase 2020-09-15 14:05:48 -05:00			`from pathlib import Path`
Merge branch 'master' into django 2020-06-25 21:30:29 -04:00			`import json as pyjson`
split up utils into separate files 2019-04-30 23:13:04 -04:00

			`from typing import List, Optional, Any`
0 mypy errors 2019-03-30 21:29:16 -04:00			`from inspect import signature`
working runtime type casting and enforcement for a wide range of types 2019-03-26 22:26:21 -04:00			`from functools import wraps`
switch to dataclasses, working Link type hints everywhere 2019-03-26 19:21:34 -04:00			`from hashlib import sha256`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 05:33:34 -04:00			`from urllib.parse import urlparse, quote, unquote`
			`from html import escape, unescape`
major refactor + ability to handle http downloads 2017-10-18 17:38:17 -05:00			`from datetime import datetime`
use dateparser for parsing, let it handle error 2020-07-16 19:35:13 -04:00			`from dateparser import parse as dateparser`
major refactor + ability to handle http downloads 2017-10-18 17:38:17 -05:00
use requests.get to fetch and decode instead of urllib 2020-06-30 05:55:54 -04:00			`import requests`
nicer timeout hints 2020-10-31 07:56:51 -04:00			`from requests.exceptions import RequestException, ReadTimeout`
add packaging setup with stdeb for debian and apt vendor the base32_crockford lib add build script for debain packages 2020-11-23 16:52:15 -05:00			`from .base32_crockford import encode as base32_encode # type: ignore`
fix: Use w3lib to improve the encoding extraction 2020-07-22 10:24:08 -05:00			`from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding`
switch to dataclasses, working Link type hints everywhere 2019-03-26 19:21:34 -04:00
guess encoding via chardet if available 2020-02-15 13:31:27 +01:00			`try:`
			`import chardet`
			`detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]`
			`except ImportError:`
			`detect_encoding = lambda rawdata: "utf-8"`

remove dead code and cleanup utils file 2019-03-08 17:01:15 -05:00			`### Parsing Helpers`

switch to strict type hints with NamedTuples instead of dicts 2019-03-26 05:33:34 -04:00			`# All of these are (str) -> str`
			`# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing`
switch to dataclasses, working Link type hints everywhere 2019-03-26 19:21:34 -04:00			`scheme = lambda url: urlparse(url).scheme.lower()`
use urllib for url parsing instead of hand written string commands 2019-02-19 01:45:19 -05:00			`without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')`
			`without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')`
			`without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')`
			`without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')`
			`path = lambda url: urlparse(url).path`
			`basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]`
			`domain = lambda url: urlparse(url).netloc`
			`query = lambda url: urlparse(url).query`
			`fragment = lambda url: urlparse(url).fragment`
			`extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''`
dont remove query when uniquifying links 2018-04-17 03:54:59 -04:00			`base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links`
refactoring and fancy new link index 2017-10-23 04:58:41 -05:00
switch to dataclasses, working Link type hints everywhere 2019-03-26 19:21:34 -04:00			`without_www = lambda url: url.replace('://www.', '://', 1)`
			`without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')`
wip initial django setup 2019-04-02 16:36:41 -04:00			`hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]`
switch to dataclasses, working Link type hints everywhere 2019-03-26 19:21:34 -04:00
			`urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')`
			`urldecode = lambda s: s and unquote(s)`
			`htmlencode = lambda s: s and escape(s, quote=True)`
			`htmldecode = lambda s: s and unescape(s)`

wip initial django setup 2019-04-02 16:36:41 -04:00			`short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]`
			`ts_to_date = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')`
			`ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()`

refactoring and fancy new link index 2017-10-23 04:58:41 -05:00
new compiled URL regex with better markdown support 2019-02-27 04:49:25 -05:00			`URL_REGEX = re.compile(`
			`r'http[s]?://' # start matching from allowed schemes`
			`r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters`
			`r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols`
			`r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes`
fix url parsing through quotes 2020-08-18 08:04:57 -04:00			`r'[^\]\[\(\)<>"\'\s]+', # stop parsing at these symbols`
new compiled URL regex with better markdown support 2019-02-27 04:49:25 -05:00			`re.IGNORECASE,`
			`)`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 05:33:34 -04:00
feat: Add stdout from process to the template 2020-07-01 12:23:59 -05:00			`COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')`
fix git conflict commited by accident 2020-07-02 03:22:37 -04:00
remove circular import possibilities 2020-07-02 03:12:30 -04:00			`def is_static_file(url: str):`
			`# TODO: the proper way is with MIME type detection + ext, not only extension`
			`from .config import STATICFILE_EXTENSIONS`
			`return extension(url).lower() in STATICFILE_EXTENSIONS`
feat: Add stdout from process to the template 2020-07-01 12:23:59 -05:00
add mypy type hints 2019-03-26 03:20:41 -04:00
working runtime type casting and enforcement for a wide range of types 2019-03-26 22:26:21 -04:00			`def enforce_types(func):`
			`"""`
full type-hinting coverage 2019-03-26 23:25:07 -04:00			`Enforce function arg and kwarg types at runtime using its python3 type hints`
working runtime type casting and enforcement for a wide range of types 2019-03-26 22:26:21 -04:00			`"""`
full type-hinting coverage 2019-03-26 23:25:07 -04:00			`# TODO: check return type as well`
working runtime type casting and enforcement for a wide range of types 2019-03-26 22:26:21 -04:00
			`@wraps(func)`
			`def typechecked_function(*args, **kwargs):`
			`sig = signature(func)`

			`def check_argument_type(arg_key, arg_val):`
			`try:`
			`annotation = sig.parameters[arg_key].annotation`
			`except KeyError:`
0 mypy errors 2019-03-30 21:29:16 -04:00			`annotation = None`
working runtime type casting and enforcement for a wide range of types 2019-03-26 22:26:21 -04:00
0 mypy errors 2019-03-30 21:29:16 -04:00			`if annotation is not None and annotation.__class__ is type:`
working runtime type casting and enforcement for a wide range of types 2019-03-26 22:26:21 -04:00			`if not isinstance(arg_val, annotation):`
			`raise TypeError(`
			`'{}(..., {}: {}) got unexpected {} argument {}={}'.format(`
			`func.__name__,`
			`arg_key,`
			`annotation.__name__,`
			`type(arg_val).__name__,`
			`arg_key,`
limit length of stringified arg_vals in exceptions 2019-03-27 18:26:22 -04:00			`str(arg_val)[:64],`
working runtime type casting and enforcement for a wide range of types 2019-03-26 22:26:21 -04:00			`)`
			`)`

			`# check args`
			`for arg_val, arg_key in zip(args, sig.parameters):`
			`check_argument_type(arg_key, arg_val)`

			`# check kwargs`
			`for arg_key, arg_val in kwargs.items():`
			`check_argument_type(arg_key, arg_val)`

			`return func(*args, **kwargs)`

			`return typechecked_function`

refactoring and fancy new link index 2017-10-23 04:58:41 -05:00
split up utils into separate files 2019-04-30 23:13:04 -04:00			`def docstring(text: Optional[str]):`
			`"""attach the given docstring to the decorated function"""`
			`def decorator(func):`
			`if text:`
			`func.__doc__ = text`
			`return func`
			`return decorator`
refactoring and fancy new link index 2017-10-23 04:58:41 -05:00
remove dead code and cleanup utils file 2019-03-08 17:01:15 -05:00
full type-hinting coverage 2019-03-26 23:25:07 -04:00			`@enforce_types`
add mypy type hints 2019-03-26 03:20:41 -04:00			`def str_between(string: str, start: str, end: str=None) -> str:`
remove dead code and cleanup utils file 2019-03-08 17:01:15 -05:00			`"""(<abc>12345</def>, <abc>, </def>) -> 12345"""`

			`content = string.split(start, 1)[-1]`
			`if end is not None:`
			`content = content.rsplit(end, 1)[0]`

			`return content`


full type-hinting coverage 2019-03-26 23:25:07 -04:00			`@enforce_types`
working runtime type casting and enforcement for a wide range of types 2019-03-26 22:26:21 -04:00			`def parse_date(date: Any) -> Optional[datetime]:`
			`"""Parse unix timestamps, iso format, and human-readable strings"""`

			`if date is None:`
			`return None`
wip initial django setup 2019-04-02 16:36:41 -04:00
			`if isinstance(date, datetime):`
			`return date`
working runtime type casting and enforcement for a wide range of types 2019-03-26 22:26:21 -04:00
			`if isinstance(date, (float, int)):`
			`date = str(date)`

			`if isinstance(date, str):`
use dateparser for parsing, let it handle error 2020-07-16 19:35:13 -04:00			`return dateparser(date)`
util.py: Use dateparser to parse date strings. 2019-08-26 17:25:22 -04:00
working runtime type casting and enforcement for a wide range of types 2019-03-26 22:26:21 -04:00			`raise ValueError('Tried to parse invalid date! {}'.format(date))`


full type-hinting coverage 2019-03-26 23:25:07 -04:00			`@enforce_types`
remove circular import possibilities 2020-07-02 03:12:30 -04:00			`def download_url(url: str, timeout: int=None) -> str:`
move dependency checking into config file 2019-03-22 22:05:45 -04:00			`"""Download the contents of a remote url and return the text"""`
remove circular import possibilities 2020-07-02 03:12:30 -04:00			`from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT`
			`timeout = timeout or TIMEOUT`
use requests.get to fetch and decode instead of urllib 2020-06-30 05:55:54 -04:00			`response = requests.get(`
			`url,`
			`headers={'User-Agent': WGET_USER_AGENT},`
			`verify=CHECK_SSL_VALIDITY,`
			`timeout=timeout,`
			`)`
fix: Use w3lib to improve the encoding extraction 2020-07-22 10:24:08 -05:00
			`content_type = response.headers.get('Content-Type', '')`
			`encoding = http_content_type_encoding(content_type) or html_body_declared_encoding(response.text)`

			`if encoding is not None:`
			`response.encoding = encoding`

use requests.get to fetch and decode instead of urllib 2020-06-30 05:55:54 -04:00			`return response.text`
remove dead code and cleanup utils file 2019-03-08 17:01:15 -05:00
Added headers extractor 2020-09-11 09:06:52 -05:00			`@enforce_types`
			`def get_headers(url: str, timeout: int=None) -> str:`
			`"""Download the contents of a remote url and return the headers"""`
			`from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT`
			`timeout = timeout or TIMEOUT`
Replaced get method 2020-09-23 13:14:49 -05:00
			`try:`
			`response = requests.head(`
			`url,`
			`headers={'User-Agent': WGET_USER_AGENT},`
			`verify=CHECK_SSL_VALIDITY,`
			`timeout=timeout,`
nicer timeout hints 2020-10-31 07:56:51 -04:00			`allow_redirects=True,`
Replaced get method 2020-09-23 13:14:49 -05:00			`)`
fix: Improve headers handling 2020-09-24 08:37:27 -05:00			`if response.status_code >= 400:`
			`raise RequestException`
nicer timeout hints 2020-10-31 07:56:51 -04:00			`except ReadTimeout:`
			`raise`
Replaced get method 2020-09-23 13:14:49 -05:00			`except RequestException:`
			`response = requests.get(`
			`url,`
			`headers={'User-Agent': WGET_USER_AGENT},`
			`verify=CHECK_SSL_VALIDITY,`
			`timeout=timeout,`
fix: Improve headers handling 2020-09-24 08:37:27 -05:00			`stream=True`
Replaced get method 2020-09-23 13:14:49 -05:00			`)`
Added headers extractor 2020-09-11 09:06:52 -05:00
Fixed indent headers.json 2020-09-11 14:19:06 -05:00			`return pyjson.dumps(dict(response.headers), indent=4)`
Added headers extractor 2020-09-11 09:06:52 -05:00
full type-hinting coverage 2019-03-26 23:25:07 -04:00
			`@enforce_types`
add mypy type hints 2019-03-26 03:20:41 -04:00			`def chrome_args(**options) -> List[str]:`
major codebase-wide code cleanups 2019-03-21 01:28:12 -04:00			`"""helper to build up a chrome shell command with arguments"""`
patch subprocess.run to have better timeout handling 2019-01-20 14:07:28 -05:00
remove circular import possibilities 2020-07-02 03:12:30 -04:00			`from .config import CHROME_OPTIONS`

better chrome options loading 2019-03-22 23:00:53 -04:00			`options = {**CHROME_OPTIONS, **options}`

move dependency checking into config file 2019-03-22 22:05:45 -04:00			`cmd_args = [options['CHROME_BINARY']]`
major codebase-wide code cleanups 2019-03-21 01:28:12 -04:00
better chrome options loading 2019-03-22 23:00:53 -04:00			`if options['CHROME_HEADLESS']:`
major codebase-wide code cleanups 2019-03-21 01:28:12 -04:00			`cmd_args += ('--headless',)`

move dependency checking into config file 2019-03-22 22:05:45 -04:00			`if not options['CHROME_SANDBOX']:`
fix docker SHM limited to 64mb chrome crash 2020-07-21 23:39:21 -04:00			`# assume this means we are running inside a docker container`
			`# in docker, GPU support is limited, sandboxing is unecessary,`
			`# and SHM is limited to 64MB by default (which is too low to be usable).`
			`cmd_args += (`
			`'--no-sandbox',`
			`'--disable-gpu',`
			`'--disable-dev-shm-usage',`
			`'--disable-software-rasterizer',`
			`)`

major codebase-wide code cleanups 2019-03-21 01:28:12 -04:00
move dependency checking into config file 2019-03-22 22:05:45 -04:00			`if not options['CHECK_SSL_VALIDITY']:`
major codebase-wide code cleanups 2019-03-21 01:28:12 -04:00			`cmd_args += ('--disable-web-security', '--ignore-certificate-errors')`

move dependency checking into config file 2019-03-22 22:05:45 -04:00			`if options['CHROME_USER_AGENT']:`
			`cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)`
major codebase-wide code cleanups 2019-03-21 01:28:12 -04:00
move dependency checking into config file 2019-03-22 22:05:45 -04:00			`if options['RESOLUTION']:`
			`cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)`
major codebase-wide code cleanups 2019-03-21 01:28:12 -04:00
move dependency checking into config file 2019-03-22 22:05:45 -04:00			`if options['TIMEOUT']:`
			`cmd_args += ('--timeout={}'.format((options['TIMEOUT']) * 1000),)`
major codebase-wide code cleanups 2019-03-21 01:28:12 -04:00
move dependency checking into config file 2019-03-22 22:05:45 -04:00			`if options['CHROME_USER_DATA_DIR']:`
			`cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))`
major codebase-wide code cleanups 2019-03-21 01:28:12 -04:00
			`return cmd_args`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 05:33:34 -04:00
remove redundant utils file 2020-11-28 02:12:27 -05:00
feat: Add stdout from process to the template 2020-07-01 12:23:59 -05:00			`def ansi_to_html(text):`
			`"""`
			`Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html`
			`"""`
remove circular import possibilities 2020-07-02 03:12:30 -04:00			`from .config import COLOR_DICT`
fix git conflict commited by accident 2020-07-02 03:22:37 -04:00
feat: Add stdout from process to the template 2020-07-01 12:23:59 -05:00			`TEMPLATE = '<span style="color: rgb{}"><br>'`
			`text = text.replace('[m', '</span>')`

			`def single_sub(match):`
			`argsdict = match.groupdict()`
			`if argsdict['arg_3'] is None:`
			`if argsdict['arg_2'] is None:`
refactor: Organize code to remove flake8 issues 2020-07-24 12:25:25 -05:00			`_, color = 0, argsdict['arg_1']`
feat: Add stdout from process to the template 2020-07-01 12:23:59 -05:00			`else:`
refactor: Organize code to remove flake8 issues 2020-07-24 12:25:25 -05:00			`_, color = argsdict['arg_1'], argsdict['arg_2']`
feat: Add stdout from process to the template 2020-07-01 12:23:59 -05:00			`else:`
refactor: Organize code to remove flake8 issues 2020-07-24 12:25:25 -05:00			`_, color = argsdict['arg_3'], argsdict['arg_2']`
feat: Add stdout from process to the template 2020-07-01 12:23:59 -05:00
			`return TEMPLATE.format(COLOR_DICT[color][0])`

			`return COLOR_REGEX.sub(single_sub, text)`

switch to strict type hints with NamedTuples instead of dicts 2019-03-26 05:33:34 -04:00
add AttributeDict 2020-07-13 11:24:49 -04:00			`class AttributeDict(dict):`
			`"""Helper to allow accessing dict values via Example.key or Example['key']"""`

			`def __init__(self, *args, **kwargs):`
			`super().__init__(*args, **kwargs)`
			`# Recursively convert nested dicts to AttributeDicts (optional):`
			`# for key, val in self.items():`
			`# if isinstance(val, dict) and type(val) is not AttributeDict:`
			`# self[key] = AttributeDict(val)`

			`def __getattr__(self, attr: str) -> Any:`
			`return dict.__getitem__(self, attr)`

			`def __setattr__(self, attr: str, value: Any) -> None:`
			`return dict.__setitem__(self, attr, value)`


split up utils into separate files 2019-04-30 23:13:04 -04:00			`class ExtendedEncoder(pyjson.JSONEncoder):`
switch to strict type hints with NamedTuples instead of dicts 2019-03-26 05:33:34 -04:00			`"""`
			`Extended json serializer that supports serializing several model`
			`fields and objects`
			`"""`

			`def default(self, obj):`
			`cls_name = obj.__class__.__name__`

			`if hasattr(obj, '_asdict'):`
			`return obj._asdict()`

			`elif isinstance(obj, bytes):`
			`return obj.decode()`

			`elif isinstance(obj, datetime):`
			`return obj.isoformat()`

			`elif isinstance(obj, Exception):`
			`return '{}: {}'.format(obj.__class__.__name__, obj)`
add support for Paths in json encoder 2020-09-08 17:29:43 -04:00
			`elif isinstance(obj, Path):`
			`return str(obj)`

switch to strict type hints with NamedTuples instead of dicts 2019-03-26 05:33:34 -04:00			`elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):`
			`return tuple(obj)`

split up utils into separate files 2019-04-30 23:13:04 -04:00			`return pyjson.JSONEncoder.default(self, obj)`
move everything out of legacy folder 2019-04-27 17:26:24 -04:00