add base32 crockford dependency

2024-11-22 20:23:12 +00:00 · 2019-03-26 19:21:48 -04:00 · 2019-03-26 19:21:48 -04:00 · 0d8a076c1f
commit 0d8a076c1f
parent 25a107df43
1 changed files with 172 additions and 0 deletions
--- a/archivebox/base32_crockford.py
+++ b/archivebox/base32_crockford.py
@ -0,0 +1,172 @@
+"""
+base32-crockford
+================
+
+A Python module implementing the alternate base32 encoding as described
+by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html.
+
+He designed the encoding to:
+
+   * Be human and machine readable
+   * Be compact
+   * Be error resistant
+   * Be pronounceable
+
+It uses a symbol set of 10 digits and 22 letters, excluding I, L O and
+U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1'
+and 'o' is converted to '0'. Encoding uses only upper-case characters.
+
+Hyphens may be present in symbol strings to improve readability, and
+are removed when decoding.
+
+A check symbol can be appended to a symbol string to detect errors
+within the string.
+
+"""
+
+import re
+import sys
+
+PY3 = sys.version_info[0] == 3
+
+if not PY3:
+    import string as str
+
+
+__all__ = ["encode", "decode", "normalize"]
+
+
+if PY3:
+    string_types = str,
+else:
+    string_types = basestring,
+
+# The encoded symbol space does not include I, L, O or U
+symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ'
+# These five symbols are exclusively for checksum values
+check_symbols = '*~$=U'
+
+encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols))
+decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols))
+normalize_symbols = str.maketrans('IiLlOo', '111100')
+valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols,
+                                             re.escape(check_symbols)))
+
+base = len(symbols)
+check_base = len(symbols + check_symbols)
+
+
+def encode(number, checksum=False, split=0):
+    """Encode an integer into a symbol string.
+
+    A ValueError is raised on invalid input.
+
+    If checksum is set to True, a check symbol will be
+    calculated and appended to the string.
+
+    If split is specified, the string will be divided into
+    clusters of that size separated by hyphens.
+
+    The encoded string is returned.
+    """
+    number = int(number)
+    if number < 0:
+        raise ValueError("number '%d' is not a positive integer" % number)
+
+    split = int(split)
+    if split < 0:
+        raise ValueError("split '%d' is not a positive integer" % split)
+
+    check_symbol = ''
+    if checksum:
+        check_symbol = encode_symbols[number % check_base]
+
+    if number == 0:
+        return '0' + check_symbol
+
+    symbol_string = ''
+    while number > 0:
+        remainder = number % base
+        number //= base
+        symbol_string = encode_symbols[remainder] + symbol_string
+    symbol_string = symbol_string + check_symbol
+
+    if split:
+        chunks = []
+        for pos in range(0, len(symbol_string), split):
+            chunks.append(symbol_string[pos:pos + split])
+        symbol_string = '-'.join(chunks)
+
+    return symbol_string
+
+
+def decode(symbol_string, checksum=False, strict=False):
+    """Decode an encoded symbol string.
+
+    If checksum is set to True, the string is assumed to have a
+    trailing check symbol which will be validated. If the
+    checksum validation fails, a ValueError is raised.
+
+    If strict is set to True, a ValueError is raised if the
+    normalization step requires changes to the string.
+
+    The decoded string is returned.
+    """
+    symbol_string = normalize(symbol_string, strict=strict)
+    if checksum:
+        symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1]
+
+    number = 0
+    for symbol in symbol_string:
+        number = number * base + decode_symbols[symbol]
+
+    if checksum:
+        check_value = decode_symbols[check_symbol]
+        modulo = number % check_base
+        if check_value != modulo:
+            raise ValueError("invalid check symbol '%s' for string '%s'" %
+                             (check_symbol, symbol_string))
+
+    return number
+
+
+def normalize(symbol_string, strict=False):
+    """Normalize an encoded symbol string.
+
+    Normalization provides error correction and prepares the
+    string for decoding. These transformations are applied:
+
+       1. Hyphens are removed
+       2. 'I', 'i', 'L' or 'l' are converted to '1'
+       3. 'O' or 'o' are converted to '0'
+       4. All characters are converted to uppercase
+
+    A TypeError is raised if an invalid string type is provided.
+
+    A ValueError is raised if the normalized string contains
+    invalid characters.
+
+    If the strict parameter is set to True, a ValueError is raised
+    if any of the above transformations are applied.
+
+    The normalized string is returned.
+    """
+    if isinstance(symbol_string, string_types):
+        if not PY3:
+            try:
+                symbol_string = symbol_string.encode('ascii')
+            except UnicodeEncodeError:
+                raise ValueError("string should only contain ASCII characters")
+    else:
+        raise TypeError("string is of invalid type %s" %
+                        symbol_string.__class__.__name__)
+
+    norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper()
+
+    if not valid_symbols.match(norm_string):
+        raise ValueError("string '%s' contains invalid characters" % norm_string)
+
+    if strict and norm_string != symbol_string:
+        raise ValueError("string '%s' requires normalization" % symbol_string)
+
+    return norm_string