src/allmydata/util/encodingutil.py

   1 """
   2 Functions used to convert inputs from whatever encoding used in the system to
   3 unicode and back.
   4 """
   5
   6 import sys
   7 import os
   8 import re
   9 from allmydata.util.assertutil import precondition
  10 from twisted.python import usage
  11 import locale
  12 from allmydata.util import log
  13 from allmydata.util.fileutil import abspath_expanduser_unicode
  14
  15
  16 def canonical_encoding(encoding):
  17     if encoding is None:
  18         log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
  19         encoding = 'utf-8'
  20     encoding = encoding.lower()
  21     if encoding == "cp65001":
  22         encoding = 'utf-8'
  23     elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
  24         encoding = 'ascii'
  25
  26     return encoding
  27
  28 def check_encoding(encoding):
  29     # sometimes Python returns an encoding name that it doesn't support for conversion
  30     # fail early if this happens
  31     try:
  32         u"test".encode(encoding)
  33     except (LookupError, AttributeError):
  34         raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
  35
# Module-level encoding settings. These are only placeholders; _reload()
# assigns the real values.
filesystem_encoding = None
output_encoding = None
argv_encoding = None
is_unicode_platform = False
  40
  41 def _reload():
  42     global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform
  43
  44     filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
  45     check_encoding(filesystem_encoding)
  46
  47     if sys.platform == 'win32':
  48         # On Windows we install UTF-8 stream wrappers for sys.stdout and
  49         # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
  50         output_encoding = 'utf-8'
  51     else:
  52         outenc = None
  53         if hasattr(sys.stdout, 'encoding'):
  54             outenc = sys.stdout.encoding
  55         if outenc is None:
  56             try:
  57                 outenc = locale.getpreferredencoding()
  58             except Exception:
  59                 pass  # work around <http://bugs.python.org/issue1443504>
  60         output_encoding = canonical_encoding(outenc)
  61
  62     check_encoding(output_encoding)
  63     argv_encoding = output_encoding
  64
  65     is_unicode_platform = sys.platform in ["win32", "darwin"]
  66
# Initialise the module-level encoding settings at import time.
_reload()
  68
  69
  70 def get_filesystem_encoding():
  71     """
  72     Returns expected encoding for local filenames.
  73     """
  74     return filesystem_encoding
  75
  76 def get_output_encoding():
  77     """
  78     Returns expected encoding for writing to stdout or stderr.
  79     """
  80     return output_encoding
  81
  82 def get_argv_encoding():
  83     """
  84     Returns expected encoding for command-line arguments.
  85     """
  86     return argv_encoding
  87
  88 def argv_to_unicode(s):
  89     """
  90     Decode given argv element to unicode. If this fails, raise a UsageError.
  91     """
  92     precondition(isinstance(s, str), s)
  93
  94     try:
  95         return unicode(s, argv_encoding)
  96     except UnicodeDecodeError:
  97         raise usage.UsageError("Argument %s cannot be decoded as %s." %
  98                                (quote_output(s), argv_encoding))
  99
 100 def argv_to_abspath(s):
 101     """
 102     Convenience function to decode an argv element to an absolute path, with ~ expanded.
 103     If this fails, raise a UsageError.
 104     """
 105     return abspath_expanduser_unicode(argv_to_unicode(s))
 106
 107 def unicode_to_argv(s, mangle=False):
 108     """
 109     Encode the given Unicode argument as a bytestring.
 110     If the argument is to be passed to a different process, then the 'mangle' argument
 111     should be true; on Windows, this uses a mangled encoding that will be reversed by
 112     code in runner.py.
 113     """
 114     precondition(isinstance(s, unicode), s)
 115
 116     if mangle and sys.platform == "win32":
 117         # This must be the same as 'mangle' in bin/tahoe-script.template.
 118         return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
 119     else:
 120         return s.encode(argv_encoding)
 121
 122 def unicode_to_url(s):
 123     """
 124     Encode an unicode object used in an URL.
 125     """
 126     # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
 127
 128     # FIXME
 129     return to_str(s)
 130     #precondition(isinstance(s, unicode), s)
 131     #return s.encode('utf-8')
 132
 133 def to_str(s):
 134     if s is None or isinstance(s, str):
 135         return s
 136     return s.encode('utf-8')
 137
# Both patterns match only when the whole string consists of allowed
# characters: LF, CR and the printable ASCII range 0x20-0x7E. PRINTABLE_8BIT
# additionally allows anything in the range 0x80-0xFF.
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$',          re.DOTALL)
PRINTABLE_8BIT  = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
 140
 141 def is_printable_ascii(s):
 142     return PRINTABLE_ASCII.search(s) is not None
 143
 144 def unicode_to_output(s):
 145     """
 146     Encode an unicode object for representation on stdout or stderr.
 147     """
 148     precondition(isinstance(s, unicode), s)
 149
 150     try:
 151         out = s.encode(output_encoding)
 152     except (UnicodeEncodeError, UnicodeDecodeError):
 153         raise UnicodeEncodeError(output_encoding, s, 0, 0,
 154                                  "A string could not be encoded as %s for output to the terminal:\n%r" %
 155                                  (output_encoding, repr(s)))
 156
 157     if PRINTABLE_8BIT.search(out) is None:
 158         raise UnicodeEncodeError(output_encoding, s, 0, 0,
 159                                  "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
 160                                  (output_encoding, repr(s)))
 161     return out
 162
 163
 164 def _unicode_escape(m):
 165     u = m.group(0)
 166     if u == '"' or u == '$' or u == '`' or u == '\\':
 167         return u'\\' + u
 168     if len(u) == 2:
 169         codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
 170     else:
 171         codepoint = ord(u)
 172     if codepoint > 0xFFFF:
 173         return u'\\U%08x' % (codepoint,)
 174     elif codepoint > 0xFF:
 175         return u'\\u%04x' % (codepoint,)
 176     else:
 177         return u'\\x%02x' % (codepoint,)
 178
 179 def _str_escape(m):
 180     c = m.group(0)
 181     if c == '"' or c == '$' or c == '`' or c == '\\':
 182         return '\\' + c
 183     else:
 184         return '\\x%02x' % (ord(c),)
 185
# Matches any character outside the ranges 0x20-0x26, 0x28-0x7E, U+00A0-U+D7FF,
# U+E000-U+FDCF and U+FDF0-U+FFFC. In other words it finds control characters,
# the single quote (0x27), 0x7F-0x9F, surrogate code units, U+FDD0-U+FDEF and
# U+FFFD-U+FFFF. quote_output() only emits a string verbatim when this
# pattern finds nothing.
MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# if we must double-quote, then we have to escape ", $ and `, but need not escape '
# ESCAPABLE_UNICODE matches either a surrogate pair (as one unit) or a single
# character outside the safe set: space, '!', '#', 0x25-0x5B, 0x5D-0x5F,
# 0x61-0x7E and the same non-ASCII ranges as above. Within ASCII that selects
# control characters, '"', '$', '\', '`' and 0x7F.
ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
                               ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Byte-string counterpart using the same ASCII safe set; every byte from 0x7F
# upwards is matched.
ESCAPABLE_8BIT    = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
 194
 195 def quote_output(s, quotemarks=True, encoding=None):
 196     """
 197     Encode either a Unicode string or a UTF-8-encoded bytestring for representation
 198     on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
 199     always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
 200     control bytes in the output.
 201     Quoting may use either single or double quotes. Within single quotes, all
 202     characters stand for themselves, and ' will not appear. Within double quotes,
 203     Python-compatible backslash escaping is used.
 204     """
 205     precondition(isinstance(s, (str, unicode)), s)
 206
 207     if isinstance(s, str):
 208         try:
 209             s = s.decode('utf-8')
 210         except UnicodeDecodeError:
 211             return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)
 212
 213     if MUST_DOUBLE_QUOTE.search(s) is None:
 214         try:
 215             out = s.encode(encoding or output_encoding)
 216             if quotemarks or out.startswith('"'):
 217                 return "'%s'" % (out,)
 218             else:
 219                 return out
 220         except (UnicodeDecodeError, UnicodeEncodeError):
 221             pass
 222
 223     escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
 224     return '"%s"' % (escaped.encode(encoding or output_encoding, 'backslashreplace'),)
 225
 226 def quote_path(path, quotemarks=True):
 227     return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)
 228
 229
 230 def unicode_platform():
 231     """
 232     Does the current platform handle Unicode filenames natively?
 233     """
 234     return is_unicode_platform
 235
# Raised by listdir_unicode_fallback() with the offending path or filename as
# its argument.
# NOTE(review): the '%s' in the docstring is not substituted anywhere in this
# module -- presumably a caller formats it; confirm before rewording.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    pass
 242
 243 def listdir_unicode_fallback(path):
 244     """
 245     This function emulates a fallback Unicode API similar to one available
 246     under Windows or MacOS X.
 247
 248     If badly encoded filenames are encountered, an exception is raised.
 249     """
 250     precondition(isinstance(path, unicode), path)
 251
 252     try:
 253         byte_path = path.encode(filesystem_encoding)
 254     except (UnicodeEncodeError, UnicodeDecodeError):
 255         raise FilenameEncodingError(path)
 256
 257     try:
 258         return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)]
 259     except UnicodeDecodeError:
 260         raise FilenameEncodingError(fn)
 261
 262 def listdir_unicode(path):
 263     """
 264     Wrapper around listdir() which provides safe access to the convenient
 265     Unicode API even under platforms that don't provide one natively.
 266     """
 267     precondition(isinstance(path, unicode), path)
 268
 269     # On Windows and MacOS X, the Unicode API is used
 270     # On other platforms (ie. Unix systems), the byte-level API is used
 271
 272     if is_unicode_platform:
 273         return os.listdir(path)
 274     else:
 275         return listdir_unicode_fallback(path)