diff --git a/archivebox/base32_crockford.py b/archivebox/base32_crockford.py new file mode 100644 index 00000000..bafb69b4 --- /dev/null +++ b/archivebox/base32_crockford.py @@ -0,0 +1,172 @@ +""" +base32-crockford +================ + +A Python module implementing the alternate base32 encoding as described +by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html. + +He designed the encoding to: + + * Be human and machine readable + * Be compact + * Be error resistant + * Be pronounceable + +It uses a symbol set of 10 digits and 22 letters, excluding I, L O and +U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1' +and 'o' is converted to '0'. Encoding uses only upper-case characters. + +Hyphens may be present in symbol strings to improve readability, and +are removed when decoding. + +A check symbol can be appended to a symbol string to detect errors +within the string. + +""" + +import re +import sys + +PY3 = sys.version_info[0] == 3 + +if not PY3: + import string as str + + +__all__ = ["encode", "decode", "normalize"] + + +if PY3: + string_types = str, +else: + string_types = basestring, + +# The encoded symbol space does not include I, L, O or U +symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ' +# These five symbols are exclusively for checksum values +check_symbols = '*~$=U' + +encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols)) +decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols)) +normalize_symbols = str.maketrans('IiLlOo', '111100') +valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols, + re.escape(check_symbols))) + +base = len(symbols) +check_base = len(symbols + check_symbols) + + +def encode(number, checksum=False, split=0): + """Encode an integer into a symbol string. + + A ValueError is raised on invalid input. + + If checksum is set to True, a check symbol will be + calculated and appended to the string. + + If split is specified, the string will be divided into + clusters of that size separated by hyphens. + + The encoded string is returned. + """ + number = int(number) + if number < 0: + raise ValueError("number '%d' is not a positive integer" % number) + + split = int(split) + if split < 0: + raise ValueError("split '%d' is not a positive integer" % split) + + check_symbol = '' + if checksum: + check_symbol = encode_symbols[number % check_base] + + if number == 0: + return '0' + check_symbol + + symbol_string = '' + while number > 0: + remainder = number % base + number //= base + symbol_string = encode_symbols[remainder] + symbol_string + symbol_string = symbol_string + check_symbol + + if split: + chunks = [] + for pos in range(0, len(symbol_string), split): + chunks.append(symbol_string[pos:pos + split]) + symbol_string = '-'.join(chunks) + + return symbol_string + + +def decode(symbol_string, checksum=False, strict=False): + """Decode an encoded symbol string. + + If checksum is set to True, the string is assumed to have a + trailing check symbol which will be validated. If the + checksum validation fails, a ValueError is raised. + + If strict is set to True, a ValueError is raised if the + normalization step requires changes to the string. + + The decoded string is returned. + """ + symbol_string = normalize(symbol_string, strict=strict) + if checksum: + symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1] + + number = 0 + for symbol in symbol_string: + number = number * base + decode_symbols[symbol] + + if checksum: + check_value = decode_symbols[check_symbol] + modulo = number % check_base + if check_value != modulo: + raise ValueError("invalid check symbol '%s' for string '%s'" % + (check_symbol, symbol_string)) + + return number + + +def normalize(symbol_string, strict=False): + """Normalize an encoded symbol string. + + Normalization provides error correction and prepares the + string for decoding. These transformations are applied: + + 1. Hyphens are removed + 2. 'I', 'i', 'L' or 'l' are converted to '1' + 3. 'O' or 'o' are converted to '0' + 4. All characters are converted to uppercase + + A TypeError is raised if an invalid string type is provided. + + A ValueError is raised if the normalized string contains + invalid characters. + + If the strict parameter is set to True, a ValueError is raised + if any of the above transformations are applied. + + The normalized string is returned. + """ + if isinstance(symbol_string, string_types): + if not PY3: + try: + symbol_string = symbol_string.encode('ascii') + except UnicodeEncodeError: + raise ValueError("string should only contain ASCII characters") + else: + raise TypeError("string is of invalid type %s" % + symbol_string.__class__.__name__) + + norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper() + + if not valid_symbols.match(norm_string): + raise ValueError("string '%s' contains invalid characters" % norm_string) + + if strict and norm_string != symbol_string: + raise ValueError("string '%s' requires normalization" % symbol_string) + + return norm_string