src/allmydata/util/encodingutil.py

   1 """
   2 Functions used to convert inputs from whatever encoding used in the system to
   3 unicode and back.
   4 """
   5
   6 import sys
   7 import os
   8 import re
   9 from allmydata.util.assertutil import precondition
  10 from twisted.python import usage
  11 import locale
  12 from allmydata.util import log
  13 from allmydata.util.fileutil import abspath_expanduser_unicode
  14
  15
  16 def _canonical_encoding(encoding):
  17     if encoding is None:
  18         log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
  19         encoding = 'utf-8'
  20     encoding = encoding.lower()
  21     if encoding == "cp65001":
  22         encoding = 'utf-8'
  23     elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
  24         encoding = 'ascii'
  25
  26     # sometimes Python returns an encoding name that it doesn't support for conversion
  27     # fail early if this happens
  28     try:
  29         u"test".encode(encoding)
  30     except (LookupError, AttributeError):
  31         raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
  32
  33     return encoding
  34
  35 filesystem_encoding = None
  36 output_encoding = None
  37 argv_encoding = None
  38 is_unicode_platform = False
  39
  40 def _reload():
  41     global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform
  42
  43     filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())
  44
  45     outenc = None
  46     if hasattr(sys.stdout, 'encoding'):
  47         outenc = sys.stdout.encoding
  48     if outenc is None:
  49         try:
  50             outenc = locale.getpreferredencoding()
  51         except Exception:
  52             pass  # work around <http://bugs.python.org/issue1443504>
  53     output_encoding = _canonical_encoding(outenc)
  54
  55     if sys.platform == 'win32':
  56         # Unicode arguments are not supported on Windows yet; see #565 and #1074.
  57         argv_encoding = 'ascii'
  58     else:
  59         argv_encoding = output_encoding
  60     is_unicode_platform = sys.platform in ["win32", "darwin"]
  61
  62 _reload()
  63
  64
  65 def get_filesystem_encoding():
  66     """
  67     Returns expected encoding for local filenames.
  68     """
  69     return filesystem_encoding
  70
  71 def get_output_encoding():
  72     """
  73     Returns expected encoding for writing to stdout or stderr.
  74     """
  75     return output_encoding
  76
  77 def get_argv_encoding():
  78     """
  79     Returns expected encoding for command-line arguments.
  80     """
  81     return argv_encoding
  82
  83 def argv_to_unicode(s):
  84     """
  85     Decode given argv element to unicode. If this fails, raise a UsageError.
  86     """
  87     precondition(isinstance(s, str), s)
  88
  89     try:
  90         return unicode(s, argv_encoding)
  91     except UnicodeDecodeError:
  92         raise usage.UsageError("Argument %s cannot be decoded as %s." %
  93                                (quote_output(s), argv_encoding))
  94
  95 def argv_to_abspath(s):
  96     """
  97     Convenience function to decode an argv element to an absolute path, with ~ expanded.
  98     If this fails, raise a UsageError.
  99     """
 100     return abspath_expanduser_unicode(argv_to_unicode(s))
 101
 102 def unicode_to_argv(s, mangle=False):
 103     """
 104     Encode the given Unicode argument as a bytestring.
 105     If the argument is to be passed to a different process, then the 'mangle' argument
 106     should be true; on Windows, this uses a mangled encoding that will be reversed by
 107     code in runner.py.
 108     """
 109     precondition(isinstance(s, unicode), s)
 110
 111     if mangle and sys.platform == "win32":
 112         # This must be the same as 'mangle' in bin/tahoe-script.template.
 113         return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
 114     else:
 115         return s.encode(argv_encoding)
 116
 117 def unicode_to_url(s):
 118     """
 119     Encode an unicode object used in an URL.
 120     """
 121     # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
 122
 123     # FIXME
 124     return to_str(s)
 125     #precondition(isinstance(s, unicode), s)
 126     #return s.encode('utf-8')
 127
 128 def to_str(s):
 129     if s is None or isinstance(s, str):
 130         return s
 131     return s.encode('utf-8')
 132
 133 PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$',          re.DOTALL)
 134 PRINTABLE_8BIT  = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
 135
 136 def is_printable_ascii(s):
 137     return PRINTABLE_ASCII.search(s) is not None
 138
 139 def unicode_to_output(s):
 140     """
 141     Encode an unicode object for representation on stdout or stderr.
 142     """
 143     precondition(isinstance(s, unicode), s)
 144
 145     try:
 146         out = s.encode(output_encoding)
 147     except (UnicodeEncodeError, UnicodeDecodeError):
 148         raise UnicodeEncodeError(output_encoding, s, 0, 0,
 149                                  "A string could not be encoded as %s for output to the terminal:\n%r" %
 150                                  (output_encoding, repr(s)))
 151
 152     if PRINTABLE_8BIT.search(out) is None:
 153         raise UnicodeEncodeError(output_encoding, s, 0, 0,
 154                                  "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
 155                                  (output_encoding, repr(s)))
 156     return out
 157
 158
 159 def _unicode_escape(m):
 160     u = m.group(0)
 161     if u == '"' or u == '$' or u == '`' or u == '\\':
 162         return u'\\' + u
 163     if len(u) == 2:
 164         codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
 165     else:
 166         codepoint = ord(u)
 167     if codepoint > 0xFFFF:
 168         return u'\\U%08x' % (codepoint,)
 169     elif codepoint > 0xFF:
 170         return u'\\u%04x' % (codepoint,)
 171     else:
 172         return u'\\x%02x' % (codepoint,)
 173
 174 def _str_escape(m):
 175     c = m.group(0)
 176     if c == '"' or c == '$' or c == '`' or c == '\\':
 177         return '\\' + c
 178     else:
 179         return '\\x%02x' % (ord(c),)
 180
# Matches any character outside \x20-\x26, \x28-\x7E (note that the apostrophe,
# \x27, is excluded from the allowed set) and the ranges U+00A0-U+D7FF,
# U+E000-U+FDCF and U+FDF0-U+FFFC. quote_output() only uses single quotes when
# this pattern finds nothing in the string.
MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# if we must double-quote, then we have to escape ", $ and `, but need not escape '
# Matches either a surrogate pair (as a single unit) or any one character that
# is not in the allowed set; the allowed set leaves out ", $, \ and ` as well
# as everything below \x20 and \x7F.
ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
                               ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Byte-string counterpart: the same ASCII allowed set, with no non-ASCII ranges.
ESCAPABLE_8BIT    = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
 189
 190 def quote_output(s, quotemarks=True, encoding=None):
 191     """
 192     Encode either a Unicode string or a UTF-8-encoded bytestring for representation
 193     on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
 194     always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
 195     control bytes in the output.
 196     Quoting may use either single or double quotes. Within single quotes, all
 197     characters stand for themselves, and ' will not appear. Within double quotes,
 198     Python-compatible backslash escaping is used.
 199     """
 200     precondition(isinstance(s, (str, unicode)), s)
 201
 202     if isinstance(s, str):
 203         try:
 204             s = s.decode('utf-8')
 205         except UnicodeDecodeError:
 206             return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)
 207
 208     if MUST_DOUBLE_QUOTE.search(s) is None:
 209         try:
 210             out = s.encode(encoding or output_encoding)
 211             if quotemarks or out.startswith('"'):
 212                 return "'%s'" % (out,)
 213             else:
 214                 return out
 215         except (UnicodeDecodeError, UnicodeEncodeError):
 216             pass
 217
 218     escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
 219     return '"%s"' % (escaped.encode(encoding or output_encoding, 'backslashreplace'),)
 220
 221 def quote_path(path, quotemarks=True):
 222     return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)
 223
 224
 225 def unicode_platform():
 226     """
 227     Does the current platform handle Unicode filenames natively?
 228     """
 229     return is_unicode_platform
 230
 231 class FilenameEncodingError(Exception):
 232     """
 233     Filename cannot be encoded using the current encoding of your filesystem
 234     (%s). Please configure your locale correctly or rename this file.
 235     """
 236     pass
 237
 238 def listdir_unicode_fallback(path):
 239     """
 240     This function emulates a fallback Unicode API similar to one available
 241     under Windows or MacOS X.
 242
 243     If badly encoded filenames are encountered, an exception is raised.
 244     """
 245     precondition(isinstance(path, unicode), path)
 246
 247     try:
 248         byte_path = path.encode(filesystem_encoding)
 249     except (UnicodeEncodeError, UnicodeDecodeError):
 250         raise FilenameEncodingError(path)
 251
 252     try:
 253         return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)]
 254     except UnicodeDecodeError:
 255         raise FilenameEncodingError(fn)
 256
 257 def listdir_unicode(path):
 258     """
 259     Wrapper around listdir() which provides safe access to the convenient
 260     Unicode API even under platforms that don't provide one natively.
 261     """
 262     precondition(isinstance(path, unicode), path)
 263
 264     # On Windows and MacOS X, the Unicode API is used
 265     # On other platforms (ie. Unix systems), the byte-level API is used
 266
 267     if is_unicode_platform:
 268         return os.listdir(path)
 269     else:
 270         return listdir_unicode_fallback(path)