src/allmydata/util/base62.py

   1 #!/usr/bin/env python
   2
   3 # from the Python Standard Library
   4 import string
   5
   6 from allmydata.util.mathutil import log_ceil, log_floor
   7
   8 chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
   9
  10 BASE62CHAR = '[' + chars + ']'
  11
  12 vals = ''.join([chr(i) for i in range(62)])
  13 c2vtranstable = string.maketrans(chars, vals)
  14 v2ctranstable = string.maketrans(vals, chars)
  15 identitytranstable = string.maketrans(chars, chars)
  16
  17 def b2a(os):
  18     """
  19     @param os the data to be encoded (a string)
  20
  21     @return the contents of os in base-62 encoded form
  22     """
  23     cs = b2a_l(os, len(os)*8)
  24     assert num_octets_that_encode_to_this_many_chars(len(cs)) == len(os), "%s != %s, numchars: %s" % (num_octets_that_encode_to_this_many_chars(len(cs)), len(os), len(cs))
  25     return cs
  26
  27 def b2a_l(os, lengthinbits):
  28     """
  29     @param os the data to be encoded (a string)
  30     @param lengthinbits the number of bits of data in os to be encoded
  31
  32     b2a_l() will generate a base-62 encoded string big enough to encode
  33     lengthinbits bits.  So for example if os is 3 bytes long and lengthinbits is
  34     17, then b2a_l() will generate a 3-character- long base-62 encoded string
  35     (since 3 chars is sufficient to encode more than 2^17 values).  If os is 3
  36     bytes long and lengthinbits is 18 (or None), then b2a_l() will generate a
  37     4-character string (since 4 chars are required to hold 2^18 values).  Note
  38     that if os is 3 bytes long and lengthinbits is 17, the least significant 7
  39     bits of os are ignored.
  40
  41     Warning: if you generate a base-62 encoded string with b2a_l(), and then someone else tries to
  42     decode it by calling a2b() instead of  a2b_l(), then they will (potentially) get a different
  43     string than the one you encoded!  So use b2a_l() only when you are sure that the encoding and
  44     decoding sides know exactly which lengthinbits to use.  If you do not have a way for the
  45     encoder and the decoder to agree upon the lengthinbits, then it is best to use b2a() and
  46     a2b().  The only drawback to using b2a() over b2a_l() is that when you have a number of
  47     bits to encode that is not a multiple of 8, b2a() can sometimes generate a base-62 encoded
  48     string that is one or two characters longer than necessary.
  49
  50     @return the contents of os in base-62 encoded form
  51     """
  52     os = [ord(o) for o in reversed(os)] # treat os as big-endian -- and we want to process the least-significant o first
  53
  54     value = 0
  55     numvalues = 1 # the number of possible values that value could be
  56     for o in os:
  57         o *= numvalues
  58         value += o
  59         numvalues *= 256
  60
  61     chars = []
  62     while numvalues > 0:
  63         chars.append(value % 62)
  64         value //= 62
  65         numvalues //= 62
  66
  67     return string.translate(''.join([chr(c) for c in reversed(chars)]), v2ctranstable) # make it big-endian
  68
  69 def num_octets_that_encode_to_this_many_chars(numcs):
  70     return log_floor(62**numcs, 256)
  71
  72 def num_chars_that_this_many_octets_encode_to(numos):
  73     return log_ceil(256**numos, 62)
  74
  75 def a2b(cs):
  76     """
  77     @param cs the base-62 encoded data (a string)
  78     """
  79     return a2b_l(cs, num_octets_that_encode_to_this_many_chars(len(cs))*8)
  80
  81 def a2b_l(cs, lengthinbits):
  82     """
  83     @param lengthinbits the number of bits of data in encoded into cs
  84
  85     a2b_l() will return a result just big enough to hold lengthinbits bits.  So
  86     for example if cs is 2 characters long (encoding between 5 and 12 bits worth
  87     of data) and lengthinbits is 8, then a2b_l() will return a string of length
  88     1 (since 1 byte is sufficient to store 8 bits), but if lengthinbits is 9,
  89     then a2b_l() will return a string of length 2.
  90
  91     Please see the warning in the docstring of b2a_l() regarding the use of
  92     b2a() versus b2a_l().
  93
  94     @return the data encoded in cs
  95     """
  96     cs = [ord(c) for c in reversed(string.translate(cs, c2vtranstable))] # treat cs as big-endian -- and we want to process the least-significant c first
  97
  98     value = 0
  99     numvalues = 1 # the number of possible values that value could be
 100     for c in cs:
 101         c *= numvalues
 102         value += c
 103         numvalues *= 62
 104
 105     numvalues = 2**lengthinbits
 106     bytes = []
 107     while numvalues > 1:
 108         bytes.append(value % 256)
 109         value //= 256
 110         numvalues //= 256
 111
 112     return ''.join([chr(b) for b in reversed(bytes)]) # make it big-endian