ArchiveBox/archivebox/base32_crockford.py

173 lines
5.1 KiB
Python
Raw Normal View History

"""
base32-crockford
================
A Python module implementing the alternate base32 encoding as described
by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html.
He designed the encoding to:
* Be human and machine readable
* Be compact
* Be error resistant
* Be pronounceable
It uses a symbol set of 10 digits and 22 letters, excluding I, L, O and
U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1'
and 'o' is converted to '0'. Encoding uses only upper-case characters.
Hyphens may be present in symbol strings to improve readability, and
are removed when decoding.
A check symbol can be appended to a symbol string to detect errors
within the string.
"""
import re
import sys
# Major-version switch that selects between the Python 2 and Python 3 shims.
PY3 = sys.version_info[0] == 3

# Public API of this module.
__all__ = ["encode", "decode", "normalize"]

if PY3:
    # Only the built-in text type counts as a string here.
    string_types = (str,)
else:
    # Rebinds the name `str` to the `string` module for the rest of this
    # file -- presumably so that a later `str.maketrans(...)` call resolves
    # to `string.maketrans` on this interpreter.
    import string as str
    string_types = (basestring,)  # noqa
# The encoded symbol space: 10 digits plus 22 letters (no I, L, O or U).
symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ'
# These five extra symbols are used exclusively as check symbol values.
check_symbols = '*~$=U'

# Lookup tables over the combined alphabet (data symbols first, then the
# check symbols): numeric value -> symbol, and symbol -> numeric value.
encode_symbols = dict(enumerate(symbols + check_symbols))
decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols))

# Translation table mapping the look-alike letters I/i/L/l to '1' and
# O/o to '0'.
normalize_symbols = str.maketrans('IiLlOo', '111100')

# One or more data symbols, optionally followed by a single check symbol.
# The pattern is anchored with \Z rather than $ because $ also matches just
# before a trailing newline, which would let 'ABC\n' through validation.
valid_symbols = re.compile(
    r'^[%s]+[%s]?\Z' % (symbols, re.escape(check_symbols))
)

# Radix of the data digits, and modulus used for the check symbol.
base = len(symbols)
check_base = len(symbols + check_symbols)
def encode(number, checksum=False, split=0):
    """Encode a non-negative integer into a symbol string.

    number is coerced with int(); any error raised by that conversion
    propagates to the caller.

    If checksum is set to True, a check symbol (the number modulo
    check_base) is calculated and appended to the string.

    If split is non-zero, the string (including any check symbol) is
    divided into clusters of that size separated by hyphens. This also
    applies when the number is zero.

    A ValueError is raised if number or split is negative.

    The encoded string is returned.
    """
    number = int(number)
    if number < 0:
        raise ValueError("number '%d' is not a non-negative integer" % number)
    split = int(split)
    if split < 0:
        raise ValueError("split '%d' is not a non-negative integer" % split)

    # The check symbol must be derived before the loop consumes `number`.
    check_symbol = encode_symbols[number % check_base] if checksum else ''

    # Collect digits least-significant first, then reverse once; this avoids
    # repeatedly prepending to a growing string.
    digits = []
    while number > 0:
        number, remainder = divmod(number, base)
        digits.append(encode_symbols[remainder])
    # Zero produces no digits in the loop above, so fall back to '0'.
    symbol_string = (''.join(reversed(digits)) or '0') + check_symbol

    if split:
        symbol_string = '-'.join(
            symbol_string[pos:pos + split]
            for pos in range(0, len(symbol_string), split)
        )
    return symbol_string
def decode(symbol_string, checksum=False, strict=False):
    """Decode an encoded symbol string into an integer.

    The string is first passed through normalize(); strict is forwarded
    to it, so with strict set to True a ValueError is raised if the
    normalization step requires changes to the string.

    If checksum is set to True, the string is assumed to have a
    trailing check symbol which will be validated. A ValueError is
    raised if the check symbol does not match, or if nothing precedes
    the check symbol.

    If checksum is False, a ValueError is raised if the string contains
    a check symbol, since such a symbol is not a valid data digit.

    The decoded integer is returned.
    """
    symbol_string = normalize(symbol_string, strict=strict)

    check_symbol = None
    if checksum:
        symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1]
        if not symbol_string:
            # A lone character cannot be both the payload and its checksum.
            raise ValueError("string '%s' has no symbols before the check symbol"
                             % check_symbol)

    number = 0
    for symbol in symbol_string:
        value = decode_symbols[symbol]
        if value >= base:
            # Check-only symbols map to values at or above the radix;
            # folding one in as a digit would silently corrupt the result.
            raise ValueError("unexpected check symbol '%s' in string '%s'" %
                             (symbol, symbol_string))
        number = number * base + value

    if checksum:
        check_value = decode_symbols[check_symbol]
        modulo = number % check_base
        if check_value != modulo:
            raise ValueError("invalid check symbol '%s' for string '%s'" %
                             (check_symbol, symbol_string))
    return number
def normalize(symbol_string, strict=False):
    """Normalize an encoded symbol string.

    Normalization provides error correction and prepares the
    string for decoding. These transformations are applied:

    1. Hyphens are removed
    2. 'I', 'i', 'L' or 'l' are converted to '1'
    3. 'O' or 'o' are converted to '0'
    4. All characters are converted to uppercase

    A TypeError is raised if an invalid string type is provided.

    A ValueError is raised if the string contains non-ASCII
    characters, or if the normalized string contains invalid
    characters.

    If the strict parameter is set to True, a ValueError is raised
    if any of the above transformations are applied.

    The normalized string is returned.
    """
    if not isinstance(symbol_string, string_types):
        raise TypeError("string is of invalid type %s" %
                        symbol_string.__class__.__name__)

    # Enforce ASCII on every interpreter. Without this, some non-ASCII
    # characters upper-case into valid symbols (e.g. a sharp s becomes
    # 'SS') and would be accepted as if they were real input.
    try:
        ascii_string = symbol_string.encode('ascii')
    except UnicodeError:
        raise ValueError("string should only contain ASCII characters")
    if not PY3:
        # On Python 2 continue with the encoded byte string.
        symbol_string = ascii_string

    norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper()

    # The match must cover the whole string; this also rejects a trailing
    # newline, which a '$'-anchored pattern would let through.
    match = valid_symbols.match(norm_string)
    if match is None or match.end() != len(norm_string):
        raise ValueError("string '%s' contains invalid characters" % norm_string)

    if strict and norm_string != symbol_string:
        raise ValueError("string '%s' requires normalization" % symbol_string)
    return norm_string