src/allmydata/util/encodingutil.py

   1 """
Functions used to convert inputs from whatever encoding is used by the system
to unicode and back.
   4 """
   5
   6 import sys, os, re, locale
   7 from types import NoneType
   8
   9 from allmydata.util.assertutil import precondition, _assert
  10 from twisted.python import usage
  11 from twisted.python.filepath import FilePath
  12 from allmydata.util import log
  13 from allmydata.util.fileutil import abspath_expanduser_unicode
  14
  15
  16 def canonical_encoding(encoding):
  17     if encoding is None:
  18         log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
  19         encoding = 'utf-8'
  20     encoding = encoding.lower()
  21     if encoding == "cp65001":
  22         encoding = 'utf-8'
  23     elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
  24         encoding = 'ascii'
  25
  26     return encoding
  27
  28 def check_encoding(encoding):
  29     # sometimes Python returns an encoding name that it doesn't support for conversion
  30     # fail early if this happens
  31     try:
  32         u"test".encode(encoding)
  33     except (LookupError, AttributeError):
  34         raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
  35
  36 filesystem_encoding = None
  37 io_encoding = None
  38 is_unicode_platform = False
  39 use_unicode_filepath = False
  40
  41 def _reload():
  42     global filesystem_encoding, io_encoding, is_unicode_platform, use_unicode_filepath
  43
  44     filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
  45     check_encoding(filesystem_encoding)
  46
  47     if sys.platform == 'win32':
  48         # On Windows we install UTF-8 stream wrappers for sys.stdout and
  49         # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
  50         io_encoding = 'utf-8'
  51     else:
  52         ioenc = None
  53         if hasattr(sys.stdout, 'encoding'):
  54             ioenc = sys.stdout.encoding
  55         if ioenc is None:
  56             try:
  57                 ioenc = locale.getpreferredencoding()
  58             except Exception:
  59                 pass  # work around <http://bugs.python.org/issue1443504>
  60         io_encoding = canonical_encoding(ioenc)
  61
  62     check_encoding(io_encoding)
  63
  64     is_unicode_platform = sys.platform in ["win32", "darwin"]
  65
  66     # Despite the Unicode-mode FilePath support added to Twisted in
  67     # <https://twistedmatrix.com/trac/ticket/7805>, we can't yet use
  68     # Unicode-mode FilePaths with INotify on non-Windows platforms
  69     # due to <https://twistedmatrix.com/trac/ticket/7928>.
  70     use_unicode_filepath = sys.platform == "win32"
  71
  72 _reload()
  73
  74
  75 def get_filesystem_encoding():
  76     """
  77     Returns expected encoding for local filenames.
  78     """
  79     return filesystem_encoding
  80
  81 def get_io_encoding():
  82     """
  83     Returns expected encoding for writing to stdout or stderr, and for arguments in sys.argv.
  84     """
  85     return io_encoding
  86
  87 def argv_to_unicode(s):
  88     """
  89     Decode given argv element to unicode. If this fails, raise a UsageError.
  90     """
  91     precondition(isinstance(s, str), s)
  92
  93     try:
  94         return unicode(s, io_encoding)
  95     except UnicodeDecodeError:
  96         raise usage.UsageError("Argument %s cannot be decoded as %s." %
  97                                (quote_output(s), io_encoding))
  98
  99 def argv_to_abspath(s):
 100     """
 101     Convenience function to decode an argv element to an absolute path, with ~ expanded.
 102     If this fails, raise a UsageError.
 103     """
 104     return abspath_expanduser_unicode(argv_to_unicode(s))
 105
 106 def unicode_to_argv(s, mangle=False):
 107     """
 108     Encode the given Unicode argument as a bytestring.
 109     If the argument is to be passed to a different process, then the 'mangle' argument
 110     should be true; on Windows, this uses a mangled encoding that will be reversed by
 111     code in runner.py.
 112     """
 113     precondition(isinstance(s, unicode), s)
 114
 115     if mangle and sys.platform == "win32":
 116         # This must be the same as 'mangle' in bin/tahoe-script.template.
 117         return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
 118     else:
 119         return s.encode(io_encoding)
 120
 121 def unicode_to_url(s):
 122     """
 123     Encode an unicode object used in an URL.
 124     """
 125     # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
 126
 127     # FIXME
 128     return to_str(s)
 129     #precondition(isinstance(s, unicode), s)
 130     #return s.encode('utf-8')
 131
 132 def to_str(s):
 133     if s is None or isinstance(s, str):
 134         return s
 135     return s.encode('utf-8')
 136
 137 def from_utf8_or_none(s):
 138     precondition(isinstance(s, (NoneType, str)), s)
 139     if s is None:
 140         return s
 141     return s.decode('utf-8')
 142
 143 PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$',          re.DOTALL)
 144 PRINTABLE_8BIT  = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
 145
 146 def is_printable_ascii(s):
 147     return PRINTABLE_ASCII.search(s) is not None
 148
 149 def unicode_to_output(s):
 150     """
 151     Encode an unicode object for representation on stdout or stderr.
 152     """
 153     precondition(isinstance(s, unicode), s)
 154
 155     try:
 156         out = s.encode(io_encoding)
 157     except (UnicodeEncodeError, UnicodeDecodeError):
 158         raise UnicodeEncodeError(io_encoding, s, 0, 0,
 159                                  "A string could not be encoded as %s for output to the terminal:\n%r" %
 160                                  (io_encoding, repr(s)))
 161
 162     if PRINTABLE_8BIT.search(out) is None:
 163         raise UnicodeEncodeError(io_encoding, s, 0, 0,
 164                                  "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
 165                                  (io_encoding, repr(s)))
 166     return out
 167
 168
 169 def _unicode_escape(m, quote_newlines):
 170     u = m.group(0)
 171     if u == u'"' or u == u'$' or u == u'`' or u == u'\\':
 172         return u'\\' + u
 173     elif u == u'\n' and not quote_newlines:
 174         return u
 175     if len(u) == 2:
 176         codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
 177     else:
 178         codepoint = ord(u)
 179     if codepoint > 0xFFFF:
 180         return u'\\U%08x' % (codepoint,)
 181     elif codepoint > 0xFF:
 182         return u'\\u%04x' % (codepoint,)
 183     else:
 184         return u'\\x%02x' % (codepoint,)
 185
 186 def _str_escape(m, quote_newlines):
 187     c = m.group(0)
 188     if c == '"' or c == '$' or c == '`' or c == '\\':
 189         return '\\' + c
 190     elif c == '\n' and not quote_newlines:
 191         return c
 192     else:
 193         return '\\x%02x' % (ord(c),)
 194
# Patterns used by quote_output() to decide whether a string can be emitted
# without double quotes. Each matches any single character that is NOT in:
# \x20-\x7E except ' (\x27), U+00A0-U+D7FF, U+E000-U+FDCF, U+FDF0-U+FFFC.
# The second pattern additionally lets \n through; the _NL variant treats a
# newline as a character that forces double quoting.
MUST_DOUBLE_QUOTE_NL = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
MUST_DOUBLE_QUOTE    = re.compile(ur'[^\n\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# If we must double-quote, then we have to escape ", $ and `, but need not escape '.
# ESCAPABLE_UNICODE matches either a high+low surrogate pair (as one unit) or
# any single character outside: space, !, #, \x25-\x5B, \x5D-\x5F, \x61-\x7E,
# U+00A0-U+D7FF, U+E000-U+FDCF, U+FDF0-U+FFFC. So ", $, \ and ` are matched.
ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
                               ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Bytestring counterpart: the same ASCII set is left alone, and every other
# byte (including everything >= \x80) is matched.
ESCAPABLE_8BIT    = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
 204
 205 def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
 206     """
 207     Encode either a Unicode string or a UTF-8-encoded bytestring for representation
 208     on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
 209     always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
 210     control bytes in the output. (Newlines are counted as control bytes iff
 211     quote_newlines is True.)
 212
 213     Quoting may use either single or double quotes. Within single quotes, all
 214     characters stand for themselves, and ' will not appear. Within double quotes,
 215     Python-compatible backslash escaping is used.
 216
 217     If not explicitly given, quote_newlines is True when quotemarks is True.
 218     """
 219     precondition(isinstance(s, (str, unicode)), s)
 220     if quote_newlines is None:
 221         quote_newlines = quotemarks
 222
 223     if isinstance(s, str):
 224         try:
 225             s = s.decode('utf-8')
 226         except UnicodeDecodeError:
 227             return 'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),)
 228
 229     must_double_quote = quote_newlines and MUST_DOUBLE_QUOTE_NL or MUST_DOUBLE_QUOTE
 230     if must_double_quote.search(s) is None:
 231         try:
 232             out = s.encode(encoding or io_encoding)
 233             if quotemarks or out.startswith('"'):
 234                 return "'%s'" % (out,)
 235             else:
 236                 return out
 237         except (UnicodeDecodeError, UnicodeEncodeError):
 238             pass
 239
 240     escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s)
 241     return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
 242
 243 def quote_path(path, quotemarks=True):
 244     return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True)
 245
 246 def quote_local_unicode_path(path, quotemarks=True):
 247     precondition(isinstance(path, unicode), path)
 248
 249     if sys.platform == "win32" and path.startswith(u"\\\\?\\"):
 250         path = path[4 :]
 251         if path.startswith(u"UNC\\"):
 252             path = u"\\\\" + path[4 :]
 253
 254     return quote_output(path, quotemarks=quotemarks, quote_newlines=True)
 255
 256 def quote_filepath(path, quotemarks=True):
 257     return quote_local_unicode_path(unicode_from_filepath(path), quotemarks=quotemarks)
 258
 259 def extend_filepath(fp, segments):
 260     # We cannot use FilePath.preauthChild, because
 261     # * it has the security flaw described in <https://twistedmatrix.com/trac/ticket/6527>;
 262     # * it may return a FilePath in the wrong mode.
 263
 264     for segment in segments:
 265         fp = fp.child(segment)
 266
 267     if isinstance(fp.path, unicode) and not use_unicode_filepath:
 268         return FilePath(fp.path.encode(filesystem_encoding))
 269     else:
 270         return fp
 271
 272 def to_filepath(path):
 273     precondition(isinstance(path, basestring), path=path)
 274
 275     if isinstance(path, unicode) and not use_unicode_filepath:
 276         path = path.encode(filesystem_encoding)
 277
 278     return FilePath(path)
 279
 280 def _decode(s):
 281     precondition(isinstance(s, basestring), s=s)
 282
 283     if isinstance(s, bytes):
 284         return s.decode(filesystem_encoding)
 285     else:
 286         return s
 287
 288 def unicode_from_filepath(fp):
 289     precondition(isinstance(fp, FilePath), fp=fp)
 290     return _decode(fp.path)
 291
 292 def unicode_segments_from(base_fp, ancestor_fp):
 293     precondition(isinstance(base_fp, FilePath), base_fp=base_fp)
 294     precondition(isinstance(ancestor_fp, FilePath), ancestor_fp=ancestor_fp)
 295
 296     if hasattr(FilePath, 'asTextMode'):
 297         return base_fp.asTextMode().segmentsFrom(ancestor_fp.asTextMode())
 298     else:
 299         bpt, apt = (type(base_fp.path), type(ancestor_fp.path))
 300         _assert(bpt == apt, bpt=bpt, apt=apt)
 301         return map(_decode, base_fp.segmentsFrom(ancestor_fp))
 302
 303 def unicode_platform():
 304     """
 305     Does the current platform handle Unicode filenames natively?
 306     """
 307     return is_unicode_platform
 308
 309 class FilenameEncodingError(Exception):
 310     """
 311     Filename cannot be encoded using the current encoding of your filesystem
 312     (%s). Please configure your locale correctly or rename this file.
 313     """
 314     pass
 315
 316 def listdir_unicode_fallback(path):
 317     """
 318     This function emulates a fallback Unicode API similar to one available
 319     under Windows or MacOS X.
 320
 321     If badly encoded filenames are encountered, an exception is raised.
 322     """
 323     precondition(isinstance(path, unicode), path)
 324
 325     try:
 326         byte_path = path.encode(filesystem_encoding)
 327     except (UnicodeEncodeError, UnicodeDecodeError):
 328         raise FilenameEncodingError(path)
 329
 330     try:
 331         return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)]
 332     except UnicodeDecodeError:
 333         raise FilenameEncodingError(fn)
 334
 335 def listdir_unicode(path):
 336     """
 337     Wrapper around listdir() which provides safe access to the convenient
 338     Unicode API even under platforms that don't provide one natively.
 339     """
 340     precondition(isinstance(path, unicode), path)
 341
 342     # On Windows and MacOS X, the Unicode API is used
 343     # On other platforms (ie. Unix systems), the byte-level API is used
 344
 345     if is_unicode_platform:
 346         return os.listdir(path)
 347     else:
 348         return listdir_unicode_fallback(path)
 349
 350 def listdir_filepath(fp):
 351     return listdir_unicode(unicode_from_filepath(fp))