src/allmydata/util/encodingutil.py

   1 """
   2 Functions used to convert inputs from whatever encoding used in the system to
   3 unicode and back.
   4 """
   5
   6 import sys
   7 import os
   8 import re
   9 from allmydata.util.assertutil import precondition
  10 from twisted.python import usage
  11 import locale
  12 from allmydata.util import log
  13 from allmydata.util.fileutil import abspath_expanduser_unicode
  14
  15
  16 def canonical_encoding(encoding):
  17     if encoding is None:
  18         log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
  19         encoding = 'utf-8'
  20     encoding = encoding.lower()
  21     if encoding == "cp65001":
  22         encoding = 'utf-8'
  23     elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
  24         encoding = 'ascii'
  25
  26     return encoding
  27
  28 def check_encoding(encoding):
  29     # sometimes Python returns an encoding name that it doesn't support for conversion
  30     # fail early if this happens
  31     try:
  32         u"test".encode(encoding)
  33     except (LookupError, AttributeError):
  34         raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
  35
# Module-level encoding settings. These placeholder values are overwritten by
# _reload(), which is invoked once when this module is imported.
filesystem_encoding = None   # canonical name of the encoding for local filenames
io_encoding = None           # canonical name of the encoding for stdout/stderr and argv
is_unicode_platform = False  # set to True by _reload() on win32 and darwin
  39
  40 def _reload():
  41     global filesystem_encoding, io_encoding, is_unicode_platform
  42
  43     filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
  44     check_encoding(filesystem_encoding)
  45
  46     if sys.platform == 'win32':
  47         # On Windows we install UTF-8 stream wrappers for sys.stdout and
  48         # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
  49         io_encoding = 'utf-8'
  50     else:
  51         ioenc = None
  52         if hasattr(sys.stdout, 'encoding'):
  53             ioenc = sys.stdout.encoding
  54         if ioenc is None:
  55             try:
  56                 ioenc = locale.getpreferredencoding()
  57             except Exception:
  58                 pass  # work around <http://bugs.python.org/issue1443504>
  59         io_encoding = canonical_encoding(ioenc)
  60
  61     check_encoding(io_encoding)
  62
  63     is_unicode_platform = sys.platform in ["win32", "darwin"]
  64
# Populate filesystem_encoding, io_encoding and is_unicode_platform at import time.
_reload()
  66
  67
  68 def get_filesystem_encoding():
  69     """
  70     Returns expected encoding for local filenames.
  71     """
  72     return filesystem_encoding
  73
  74 def get_io_encoding():
  75     """
  76     Returns expected encoding for writing to stdout or stderr, and for arguments in sys.argv.
  77     """
  78     return io_encoding
  79
  80 def argv_to_unicode(s):
  81     """
  82     Decode given argv element to unicode. If this fails, raise a UsageError.
  83     """
  84     precondition(isinstance(s, str), s)
  85
  86     try:
  87         return unicode(s, io_encoding)
  88     except UnicodeDecodeError:
  89         raise usage.UsageError("Argument %s cannot be decoded as %s." %
  90                                (quote_output(s), io_encoding))
  91
  92 def argv_to_abspath(s):
  93     """
  94     Convenience function to decode an argv element to an absolute path, with ~ expanded.
  95     If this fails, raise a UsageError.
  96     """
  97     return abspath_expanduser_unicode(argv_to_unicode(s))
  98
  99 def unicode_to_argv(s, mangle=False):
 100     """
 101     Encode the given Unicode argument as a bytestring.
 102     If the argument is to be passed to a different process, then the 'mangle' argument
 103     should be true; on Windows, this uses a mangled encoding that will be reversed by
 104     code in runner.py.
 105     """
 106     precondition(isinstance(s, unicode), s)
 107
 108     if mangle and sys.platform == "win32":
 109         # This must be the same as 'mangle' in bin/tahoe-script.template.
 110         return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
 111     else:
 112         return s.encode(io_encoding)
 113
 114 def unicode_to_url(s):
 115     """
 116     Encode an unicode object used in an URL.
 117     """
 118     # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
 119
 120     # FIXME
 121     return to_str(s)
 122     #precondition(isinstance(s, unicode), s)
 123     #return s.encode('utf-8')
 124
 125 def to_str(s):
 126     if s is None or isinstance(s, str):
 127         return s
 128     return s.encode('utf-8')
 129
 130 PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$',          re.DOTALL)
 131 PRINTABLE_8BIT  = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
 132
 133 def is_printable_ascii(s):
 134     return PRINTABLE_ASCII.search(s) is not None
 135
 136 def unicode_to_output(s):
 137     """
 138     Encode an unicode object for representation on stdout or stderr.
 139     """
 140     precondition(isinstance(s, unicode), s)
 141
 142     try:
 143         out = s.encode(io_encoding)
 144     except (UnicodeEncodeError, UnicodeDecodeError):
 145         raise UnicodeEncodeError(io_encoding, s, 0, 0,
 146                                  "A string could not be encoded as %s for output to the terminal:\n%r" %
 147                                  (io_encoding, repr(s)))
 148
 149     if PRINTABLE_8BIT.search(out) is None:
 150         raise UnicodeEncodeError(io_encoding, s, 0, 0,
 151                                  "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
 152                                  (io_encoding, repr(s)))
 153     return out
 154
 155
 156 def _unicode_escape(m):
 157     u = m.group(0)
 158     if u == '"' or u == '$' or u == '`' or u == '\\':
 159         return u'\\' + u
 160     if len(u) == 2:
 161         codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
 162     else:
 163         codepoint = ord(u)
 164     if codepoint > 0xFFFF:
 165         return u'\\U%08x' % (codepoint,)
 166     elif codepoint > 0xFF:
 167         return u'\\u%04x' % (codepoint,)
 168     else:
 169         return u'\\x%02x' % (codepoint,)
 170
 171 def _str_escape(m):
 172     c = m.group(0)
 173     if c == '"' or c == '$' or c == '`' or c == '\\':
 174         return '\\' + c
 175     else:
 176         return '\\x%02x' % (ord(c),)
 177
# Matches any character that quote_output() will not emit inside single quotes,
# i.e. anything other than printable ASCII excluding "'" (\x20-\x26, \x28-\x7E)
# and the ranges U+00A0-U+D7FF, U+E000-U+FDCF and U+FDF0-U+FFFC.
MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# if we must double-quote, then we have to escape ", $, ` and \, but need not escape '
# ESCAPABLE_UNICODE matches either a surrogate pair (as one two-unit match) or any
# single character outside the set that may appear literally inside double quotes.
ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
                               ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Byte-string counterpart of ESCAPABLE_UNICODE: matches every byte other than the
# printable ASCII characters that may appear literally inside double quotes.
ESCAPABLE_8BIT    = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
 186
 187 def quote_output(s, quotemarks=True, encoding=None):
 188     """
 189     Encode either a Unicode string or a UTF-8-encoded bytestring for representation
 190     on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
 191     always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
 192     control bytes in the output.
 193     Quoting may use either single or double quotes. Within single quotes, all
 194     characters stand for themselves, and ' will not appear. Within double quotes,
 195     Python-compatible backslash escaping is used.
 196     """
 197     precondition(isinstance(s, (str, unicode)), s)
 198
 199     if isinstance(s, str):
 200         try:
 201             s = s.decode('utf-8')
 202         except UnicodeDecodeError:
 203             return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)
 204
 205     if MUST_DOUBLE_QUOTE.search(s) is None:
 206         try:
 207             out = s.encode(encoding or io_encoding)
 208             if quotemarks or out.startswith('"'):
 209                 return "'%s'" % (out,)
 210             else:
 211                 return out
 212         except (UnicodeDecodeError, UnicodeEncodeError):
 213             pass
 214
 215     escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
 216     return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
 217
 218 def quote_path(path, quotemarks=True):
 219     return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)
 220
 221
 222 def unicode_platform():
 223     """
 224     Does the current platform handle Unicode filenames natively?
 225     """
 226     return is_unicode_platform
 227
# Raised by listdir_unicode_fallback() when a path or a directory entry cannot
# be converted with the filesystem encoding.
# NOTE(review): the docstring contains a '%s' placeholder, so it is presumably
# used as a message template by callers -- confirm before rewording it.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    pass
 234
 235 def listdir_unicode_fallback(path):
 236     """
 237     This function emulates a fallback Unicode API similar to one available
 238     under Windows or MacOS X.
 239
 240     If badly encoded filenames are encountered, an exception is raised.
 241     """
 242     precondition(isinstance(path, unicode), path)
 243
 244     try:
 245         byte_path = path.encode(filesystem_encoding)
 246     except (UnicodeEncodeError, UnicodeDecodeError):
 247         raise FilenameEncodingError(path)
 248
 249     try:
 250         return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)]
 251     except UnicodeDecodeError:
 252         raise FilenameEncodingError(fn)
 253
 254 def listdir_unicode(path):
 255     """
 256     Wrapper around listdir() which provides safe access to the convenient
 257     Unicode API even under platforms that don't provide one natively.
 258     """
 259     precondition(isinstance(path, unicode), path)
 260
 261     # On Windows and MacOS X, the Unicode API is used
 262     # On other platforms (ie. Unix systems), the byte-level API is used
 263
 264     if is_unicode_platform:
 265         return os.listdir(path)
 266     else:
 267         return listdir_unicode_fallback(path)