src/allmydata/util/encodingutil.py

   1 """
   2 Functions used to convert inputs from whatever encoding used in the system to
   3 unicode and back.
   4 """
   5
   6 import sys
   7 import os
   8 import re
   9 from allmydata.util.assertutil import precondition
  10 from twisted.python import usage
  11 import locale
  12 from allmydata.util import log
  13 from allmydata.util.fileutil import abspath_expanduser_unicode
  14
  15
  16 def canonical_encoding(encoding):
  17     if encoding is None:
  18         log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
  19         encoding = 'utf-8'
  20     encoding = encoding.lower()
  21     if encoding == "cp65001":
  22         encoding = 'utf-8'
  23     elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
  24         encoding = 'ascii'
  25
  26     return encoding
  27
  28 def check_encoding(encoding):
  29     # sometimes Python returns an encoding name that it doesn't support for conversion
  30     # fail early if this happens
  31     try:
  32         u"test".encode(encoding)
  33     except (LookupError, AttributeError):
  34         raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
  35
  36 filesystem_encoding = None
  37 io_encoding = None
  38 is_unicode_platform = False
  39
  40 def _reload():
  41     global filesystem_encoding, io_encoding, is_unicode_platform
  42
  43     filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
  44     check_encoding(filesystem_encoding)
  45
  46     if sys.platform == 'win32':
  47         # On Windows we install UTF-8 stream wrappers for sys.stdout and
  48         # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
  49         io_encoding = 'utf-8'
  50     else:
  51         ioenc = None
  52         if hasattr(sys.stdout, 'encoding'):
  53             ioenc = sys.stdout.encoding
  54         if ioenc is None:
  55             try:
  56                 ioenc = locale.getpreferredencoding()
  57             except Exception:
  58                 pass  # work around <http://bugs.python.org/issue1443504>
  59         io_encoding = canonical_encoding(ioenc)
  60
  61     check_encoding(io_encoding)
  62
  63     is_unicode_platform = sys.platform in ["win32", "darwin"]
  64
  65 _reload()
  66
  67
  68 def get_filesystem_encoding():
  69     """
  70     Returns expected encoding for local filenames.
  71     """
  72     return filesystem_encoding
  73
  74 def get_io_encoding():
  75     """
  76     Returns expected encoding for writing to stdout or stderr, and for arguments in sys.argv.
  77     """
  78     return io_encoding
  79
  80 def argv_to_unicode(s):
  81     """
  82     Decode given argv element to unicode. If this fails, raise a UsageError.
  83     """
  84     precondition(isinstance(s, str), s)
  85
  86     try:
  87         return unicode(s, io_encoding)
  88     except UnicodeDecodeError:
  89         raise usage.UsageError("Argument %s cannot be decoded as %s." %
  90                                (quote_output(s), io_encoding))
  91
  92 def argv_to_abspath(s):
  93     """
  94     Convenience function to decode an argv element to an absolute path, with ~ expanded.
  95     If this fails, raise a UsageError.
  96     """
  97     return abspath_expanduser_unicode(argv_to_unicode(s))
  98
  99 def unicode_to_argv(s, mangle=False):
 100     """
 101     Encode the given Unicode argument as a bytestring.
 102     If the argument is to be passed to a different process, then the 'mangle' argument
 103     should be true; on Windows, this uses a mangled encoding that will be reversed by
 104     code in runner.py.
 105     """
 106     precondition(isinstance(s, unicode), s)
 107
 108     if mangle and sys.platform == "win32":
 109         # This must be the same as 'mangle' in bin/tahoe-script.template.
 110         return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
 111     else:
 112         return s.encode(io_encoding)
 113
 114 def unicode_to_url(s):
 115     """
 116     Encode an unicode object used in an URL.
 117     """
 118     # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
 119
 120     # FIXME
 121     return to_str(s)
 122     #precondition(isinstance(s, unicode), s)
 123     #return s.encode('utf-8')
 124
 125 def to_str(s):
 126     if s is None or isinstance(s, str):
 127         return s
 128     return s.encode('utf-8')
 129
 130 PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$',          re.DOTALL)
 131 PRINTABLE_8BIT  = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
 132
 133 def is_printable_ascii(s):
 134     return PRINTABLE_ASCII.search(s) is not None
 135
 136 def unicode_to_output(s):
 137     """
 138     Encode an unicode object for representation on stdout or stderr.
 139     """
 140     precondition(isinstance(s, unicode), s)
 141
 142     try:
 143         out = s.encode(io_encoding)
 144     except (UnicodeEncodeError, UnicodeDecodeError):
 145         raise UnicodeEncodeError(io_encoding, s, 0, 0,
 146                                  "A string could not be encoded as %s for output to the terminal:\n%r" %
 147                                  (io_encoding, repr(s)))
 148
 149     if PRINTABLE_8BIT.search(out) is None:
 150         raise UnicodeEncodeError(io_encoding, s, 0, 0,
 151                                  "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
 152                                  (io_encoding, repr(s)))
 153     return out
 154
 155
 156 def _unicode_escape(m, quote_newlines):
 157     u = m.group(0)
 158     if u == u'"' or u == u'$' or u == u'`' or u == u'\\':
 159         return u'\\' + u
 160     elif u == u'\n' and not quote_newlines:
 161         return u
 162     if len(u) == 2:
 163         codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
 164     else:
 165         codepoint = ord(u)
 166     if codepoint > 0xFFFF:
 167         return u'\\U%08x' % (codepoint,)
 168     elif codepoint > 0xFF:
 169         return u'\\u%04x' % (codepoint,)
 170     else:
 171         return u'\\x%02x' % (codepoint,)
 172
 173 def _str_escape(m, quote_newlines):
 174     c = m.group(0)
 175     if c == '"' or c == '$' or c == '`' or c == '\\':
 176         return '\\' + c
 177     elif c == '\n' and not quote_newlines:
 178         return c
 179     else:
 180         return '\\x%02x' % (ord(c),)
 181
 182 MUST_DOUBLE_QUOTE_NL = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
 183 MUST_DOUBLE_QUOTE    = re.compile(ur'[^\n\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
 184
 185 # if we must double-quote, then we have to escape ", $ and `, but need not escape '
 186 ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
 187                                ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
 188                                re.DOTALL)
 189
 190 ESCAPABLE_8BIT    = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
 191
 192 def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
 193     """
 194     Encode either a Unicode string or a UTF-8-encoded bytestring for representation
 195     on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
 196     always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
 197     control bytes in the output. (Newlines are counted as control bytes iff
 198     quote_newlines is True.)
 199
 200     Quoting may use either single or double quotes. Within single quotes, all
 201     characters stand for themselves, and ' will not appear. Within double quotes,
 202     Python-compatible backslash escaping is used.
 203
 204     If not explicitly given, quote_newlines is True when quotemarks is True.
 205     """
 206     precondition(isinstance(s, (str, unicode)), s)
 207     if quote_newlines is None:
 208         quote_newlines = quotemarks
 209
 210     if isinstance(s, str):
 211         try:
 212             s = s.decode('utf-8')
 213         except UnicodeDecodeError:
 214             return 'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),)
 215
 216     must_double_quote = quote_newlines and MUST_DOUBLE_QUOTE_NL or MUST_DOUBLE_QUOTE
 217     if must_double_quote.search(s) is None:
 218         try:
 219             out = s.encode(encoding or io_encoding)
 220             if quotemarks or out.startswith('"'):
 221                 return "'%s'" % (out,)
 222             else:
 223                 return out
 224         except (UnicodeDecodeError, UnicodeEncodeError):
 225             pass
 226
 227     escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s)
 228     return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
 229
 230 def quote_path(path, quotemarks=True):
 231     return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True)
 232
 233 def quote_local_unicode_path(path, quotemarks=True):
 234     precondition(isinstance(path, unicode), path)
 235
 236     if sys.platform == "win32" and path.startswith(u"\\\\?\\"):
 237         path = path[4 :]
 238         if path.startswith(u"UNC\\"):
 239             path = u"\\\\" + path[4 :]
 240
 241     return quote_output(path, quotemarks=quotemarks, quote_newlines=True)
 242
 243
 244 def unicode_platform():
 245     """
 246     Does the current platform handle Unicode filenames natively?
 247     """
 248     return is_unicode_platform
 249
 250 class FilenameEncodingError(Exception):
 251     """
 252     Filename cannot be encoded using the current encoding of your filesystem
 253     (%s). Please configure your locale correctly or rename this file.
 254     """
 255     pass
 256
 257 def listdir_unicode_fallback(path):
 258     """
 259     This function emulates a fallback Unicode API similar to one available
 260     under Windows or MacOS X.
 261
 262     If badly encoded filenames are encountered, an exception is raised.
 263     """
 264     precondition(isinstance(path, unicode), path)
 265
 266     try:
 267         byte_path = path.encode(filesystem_encoding)
 268     except (UnicodeEncodeError, UnicodeDecodeError):
 269         raise FilenameEncodingError(path)
 270
 271     try:
 272         return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)]
 273     except UnicodeDecodeError:
 274         raise FilenameEncodingError(fn)
 275
 276 def listdir_unicode(path):
 277     """
 278     Wrapper around listdir() which provides safe access to the convenient
 279     Unicode API even under platforms that don't provide one natively.
 280     """
 281     precondition(isinstance(path, unicode), path)
 282
 283     # On Windows and MacOS X, the Unicode API is used
 284     # On other platforms (ie. Unix systems), the byte-level API is used
 285
 286     if is_unicode_platform:
 287         return os.listdir(path)
 288     else:
 289         return listdir_unicode_fallback(path)