unicode and back.
"""
-import sys
-import re
+import sys, os, re, locale
+from types import NoneType
+
from allmydata.util.assertutil import precondition
from twisted.python import usage
-import locale
from allmydata.util import log
+from allmydata.util.fileutil import abspath_expanduser_unicode
-def _canonical_encoding(encoding):
+def canonical_encoding(encoding):
+ """
+ Normalize an encoding name.
+
+ None is replaced by 'utf-8' (after logging a warning), and the names
+ "us-ascii", "646" and "ansi_x3.4-1968" are all mapped to 'ascii'. Any
+ other name is returned unchanged. The comparison is exact (case-sensitive).
+ """
if encoding is None:
log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
encoding = 'utf-8'
elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
encoding = 'ascii'
+ return encoding
+
+def check_encoding(encoding):
# sometimes Python returns an encoding name that it doesn't support for conversion
# fail early if this happens
try:
except (LookupError, AttributeError):
raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
- return encoding
-
+# Module-level encoding state. These placeholder values are overwritten by
+# _reload(), which assigns all three names.
filesystem_encoding = None
-output_encoding = None
-argv_encoding = None
+io_encoding = None
is_unicode_platform = False
def _reload():
+ """
+ (Re)compute the module-level encoding settings.
+
+ filesystem_encoding is taken from sys.getfilesystemencoding(). On win32,
+ io_encoding is fixed to 'utf-8'; elsewhere it comes from
+ sys.stdout.encoding, falling back to locale.getpreferredencoding() when
+ that attribute is missing or None. Detected names are normalized with
+ canonical_encoding(), and both resulting encodings are validated with
+ check_encoding(). is_unicode_platform is True only for win32 and darwin.
+ """
- global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform
-
- filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())
+ global filesystem_encoding, io_encoding, is_unicode_platform
- outenc = None
- if hasattr(sys.stdout, 'encoding'):
- outenc = sys.stdout.encoding
- if outenc is None:
- try:
- outenc = locale.getpreferredencoding()
- except Exception:
- pass # work around <http://bugs.python.org/issue1443504>
- output_encoding = _canonical_encoding(outenc)
+ filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
+ check_encoding(filesystem_encoding)
if sys.platform == 'win32':
- # Unicode arguments are not supported on Windows yet; see #565 and #1074.
- argv_encoding = 'ascii'
+ # On Windows we install UTF-8 stream wrappers for sys.stdout and
+ # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
+ io_encoding = 'utf-8'
else:
- argv_encoding = output_encoding
+ ioenc = None
+ if hasattr(sys.stdout, 'encoding'):
+ ioenc = sys.stdout.encoding
+ if ioenc is None:
+ try:
+ ioenc = locale.getpreferredencoding()
+ except Exception:
+ pass # work around <http://bugs.python.org/issue1443504>
+ io_encoding = canonical_encoding(ioenc)
+
+ check_encoding(io_encoding)
+
is_unicode_platform = sys.platform in ["win32", "darwin"]
_reload()
"""
return filesystem_encoding
-def get_output_encoding():
+def get_io_encoding():
"""
- Returns expected encoding for writing to stdout or stderr.
+ Returns expected encoding for writing to stdout or stderr, and for arguments in sys.argv.
+ The value is the module-level io_encoding, which is assigned by _reload().
"""
- return output_encoding
-
-def get_argv_encoding():
- """
- Returns expected encoding for command-line arguments.
- """
- return argv_encoding
+ return io_encoding
def argv_to_unicode(s):
"""
precondition(isinstance(s, str), s)
try:
- return unicode(s, argv_encoding)
+ return unicode(s, io_encoding)
except UnicodeDecodeError:
raise usage.UsageError("Argument %s cannot be decoded as %s." %
- (quote_output(s), argv_encoding))
+ (quote_output(s), io_encoding))
+
+def argv_to_abspath(s, **kwargs):
+ """
+ Convenience function to decode an argv element to an absolute path, with ~ expanded.
+ If this fails, raise a UsageError.
+
+ The argument is first decoded with argv_to_unicode(). A decoded value
+ beginning with '-' is rejected with a UsageError whose message suggests
+ os.path.join('.', s) instead. Any keyword arguments are passed through
+ unchanged to abspath_expanduser_unicode().
+ """
+ decoded = argv_to_unicode(s)
+ if decoded.startswith(u'-'):
+ raise usage.UsageError("Path argument %s cannot start with '-'.\nUse %s if you intended to refer to a file."
+ % (quote_output(s), quote_output(os.path.join('.', s))))
+ return abspath_expanduser_unicode(decoded, **kwargs)
+
+def unicode_to_argv(s, mangle=False):
+ """
+ Encode the given Unicode argument as a bytestring.
+ If the argument is to be passed to a different process, then the 'mangle' argument
+ should be true; on Windows, this uses a mangled encoding that will be reversed by
+ code in runner.py.
+ """
+ precondition(isinstance(s, unicode), s)
+
+ if mangle and sys.platform == "win32":
+ # This must be the same as 'mangle' in bin/tahoe-script.template.
+ # Each character outside \x20-\x7F is replaced by \x7F, then its code
+ # point in lowercase hex, then ';'. The result is converted with str().
+ return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
+ else:
+ # Without mangling (or off win32), encode with the module-level io_encoding.
+ return s.encode(io_encoding)
def unicode_to_url(s):
"""
return s
return s.encode('utf-8')
-def to_argv(s):
- if isinstance(s, str):
+def from_utf8_or_none(s):
+ """
+ Decode a UTF-8 bytestring to unicode, passing None through unchanged.
+ The argument must be None or a str (checked by a precondition).
+ """
+ precondition(isinstance(s, (NoneType, str)), s)
+ if s is None:
return s
- return s.encode(argv_encoding)
+ return s.decode('utf-8')
+# Patterns matching strings that consist only of LF, CR and printable ASCII
+# (\x20-\x7E); the 8-bit variant additionally allows bytes \x80-\xFF.
-PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)
-PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)
+PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
+PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
def is_printable_ascii(s):
+ """
+ Return True iff s matches the PRINTABLE_ASCII pattern.
+ """
return PRINTABLE_ASCII.search(s) is not None
precondition(isinstance(s, unicode), s)
try:
- out = s.encode(output_encoding)
+ out = s.encode(io_encoding)
except (UnicodeEncodeError, UnicodeDecodeError):
- raise UnicodeEncodeError(output_encoding, s, 0, 0,
+ raise UnicodeEncodeError(io_encoding, s, 0, 0,
"A string could not be encoded as %s for output to the terminal:\n%r" %
- (output_encoding, repr(s)))
+ (io_encoding, repr(s)))
if PRINTABLE_8BIT.search(out) is None:
- raise UnicodeEncodeError(output_encoding, s, 0, 0,
+ raise UnicodeEncodeError(io_encoding, s, 0, 0,
"A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
- (output_encoding, repr(s)))
+ (io_encoding, repr(s)))
return out
-def quote_output(s, quotemarks=True, encoding=None):
+
+def _unicode_escape(m, quote_newlines):
+ """
+ Regex substitution callback returning the escaped form of the Unicode
+ text matched by m.
+
+ A double quote, dollar sign, backquote or backslash is prefixed with a
+ backslash. A newline is returned unchanged unless quote_newlines is true.
+ Anything else becomes a hex escape with 2, 4 or 8 digits (x, u or U form)
+ chosen by the size of its code point.
+ """
+ u = m.group(0)
+ if u == u'"' or u == u'$' or u == u'`' or u == u'\\':
+ return u'\\' + u
+ elif u == u'\n' and not quote_newlines:
+ return u
+ if len(u) == 2:
+ # Two code units: combine the high (from 0xD800) and low (from 0xDC00)
+ # surrogates into a single code point above 0xFFFF.
+ codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
+ else:
+ codepoint = ord(u)
+ if codepoint > 0xFFFF:
+ return u'\\U%08x' % (codepoint,)
+ elif codepoint > 0xFF:
+ return u'\\u%04x' % (codepoint,)
+ else:
+ return u'\\x%02x' % (codepoint,)
+
+def _str_escape(m, quote_newlines):
+ """
+ Regex substitution callback returning the escaped form of the single
+ byte matched by m.
+
+ A double quote, dollar sign, backquote or backslash is prefixed with a
+ backslash. A newline is returned unchanged unless quote_newlines is true.
+ Any other byte becomes a two-digit hex escape (x form).
+ """
+ c = m.group(0)
+ if c == '"' or c == '$' or c == '`' or c == '\\':
+ return '\\' + c
+ elif c == '\n' and not quote_newlines:
+ return c
+ else:
+ return '\\x%02x' % (ord(c),)
+
+# These match any character outside the set that is safe within single
+# quotes: printable ASCII except ' (\x27), plus \u00A0-\uD7FF, \uE000-\uFDCF
+# and \uFDF0-\uFFFC. MUST_DOUBLE_QUOTE additionally treats newline as safe.
+MUST_DOUBLE_QUOTE_NL = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
+MUST_DOUBLE_QUOTE = re.compile(ur'[^\n\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
+
+# if we must double-quote, then we have to escape ", $ and `, but need not escape '
+# A match is either a surrogate pair or one character outside the allowed
+# set; among printable ASCII the excluded characters are ", $, \ and `.
+ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs
+ ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
+ re.DOTALL)
+
+# Bytestring counterpart with the same ASCII set; since no high range is
+# allowed, every byte \x80-\xFF matches.
+ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
+
+def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
"""
Encode either a Unicode string or a UTF-8-encoded bytestring for representation
on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
- always surrounded by single quotes; otherwise, it is quoted only if necessary to
- avoid ambiguity or control bytes in the output.
+ always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
+ control bytes in the output. (Newlines are counted as control bytes iff
+ quote_newlines is True.)
+
+ Quoting may use either single or double quotes. Within single quotes, all
+ characters stand for themselves, and ' will not appear. Within double quotes,
+ Python-compatible backslash escaping is used.
+
+ If not explicitly given, quote_newlines is True when quotemarks is True.
+ (More precisely, it defaults to the value of quotemarks.)
+
+ If 'encoding' is given, it is used for the output bytes instead of the
+ module-level io_encoding.
"""
precondition(isinstance(s, (str, unicode)), s)
+ if quote_newlines is None:
+ quote_newlines = quotemarks
if isinstance(s, str):
try:
s = s.decode('utf-8')
except UnicodeDecodeError:
- return 'b' + repr(s)
-
- try:
- out = s.encode(encoding or output_encoding)
- except (UnicodeEncodeError, UnicodeDecodeError):
- return repr(s)
+ # Not valid UTF-8: return a b"..." form with unsafe bytes escaped.
+ return 'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),)
- if PRINTABLE_8BIT.search(out) is None:
- return repr(out)
+ # Choose the pattern according to whether newlines count as unsafe.
+ must_double_quote = quote_newlines and MUST_DOUBLE_QUOTE_NL or MUST_DOUBLE_QUOTE
+ if must_double_quote.search(s) is None:
+ # Nothing forces double quotes: try the bare or single-quoted form.
+ try:
+ out = s.encode(encoding or io_encoding)
+ # Output starting with '"' is quoted even when quotemarks is false,
+ # presumably so it cannot be mistaken for the double-quoted form.
+ if quotemarks or out.startswith('"'):
+ return "'%s'" % (out,)
+ else:
+ return out
+ except (UnicodeDecodeError, UnicodeEncodeError):
+ # Encoding failed: fall through to the escaped double-quoted form.
+ pass
- if quotemarks:
- return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'"
- else:
- return out
+ # Escape first, then encode; 'backslashreplace' covers any remaining
+ # characters that the target encoding cannot represent.
+ escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s)
+ return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
def quote_path(path, quotemarks=True):
+ """
+ Quote a path given as a sequence of components: each component is passed
+ through to_str(), the results are joined with "/", and the joined string
+ is quoted by quote_output() with quote_newlines=True.
+ """
- return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)
+ return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True)
+
+def quote_local_unicode_path(path, quotemarks=True):
+ """
+ Quote a local filesystem path (which must be unicode) for output, via
+ quote_output() with quote_newlines=True. On win32 a long-path prefix is
+ removed first so that the more familiar form of the path is shown.
+ """
+ precondition(isinstance(path, unicode), path)
+
+ # On win32, drop a leading \\?\ (four characters); a remaining leading
+ # UNC\ is then rewritten to \\ .
+ if sys.platform == "win32" and path.startswith(u"\\\\?\\"):
+ path = path[4 :]
+ if path.startswith(u"UNC\\"):
+ path = u"\\\\" + path[4 :]
+
+ return quote_output(path, quotemarks=quotemarks, quote_newlines=True)
def unicode_platform():
Does the current platform handle Unicode filenames natively?
"""
return is_unicode_platform
+
+class FilenameEncodingError(Exception):
+ """
+ Filename cannot be encoded using the current encoding of your filesystem
+ (%s). Please configure your locale correctly or rename this file.
+ """
+ # NOTE(review): the %s placeholder above is not filled in by this class;
+ # presumably a caller formats the docstring as a message -- verify.
+ pass
+
+def listdir_unicode_fallback(path):
+ """
+ This function emulates a fallback Unicode API similar to one available
+ under Windows or MacOS X.
+
+ If badly encoded filenames are encountered, an exception is raised.
+
+ The path (which must be unicode) is encoded with filesystem_encoding,
+ listed with os.listdir(), and each returned name is decoded back to
+ unicode. FilenameEncodingError is raised if the path cannot be encoded
+ or a returned name cannot be decoded.
+ """
+ precondition(isinstance(path, unicode), path)
+
+ try:
+ byte_path = path.encode(filesystem_encoding)
+ except (UnicodeEncodeError, UnicodeDecodeError):
+ raise FilenameEncodingError(path)
+
+ try:
+ return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)]
+ except UnicodeDecodeError:
+ # 'fn' is the list-comprehension variable; this relies on Python 2
+ # scoping, where it remains bound here to the name that failed to decode.
+ raise FilenameEncodingError(fn)
+
+def listdir_unicode(path):
+ """
+ Wrapper around listdir() which provides safe access to the convenient
+ Unicode API even under platforms that don't provide one natively.
+
+ The path must be unicode. When is_unicode_platform is true it is passed
+ directly to os.listdir(); otherwise listdir_unicode_fallback() is used.
+ """
+ precondition(isinstance(path, unicode), path)
+
+ # On Windows and MacOS X, the Unicode API is used
+ # On other platforms (i.e. Unix systems), the byte-level API is used
+
+ if is_unicode_platform:
+ return os.listdir(path)
+ else:
+ return listdir_unicode_fallback(path)