util.encodingutil: change quote_output to do less unnecessary escaping, and to use...
author david-sarah <david-sarah@jacaranda.org>
Fri, 23 Jul 2010 07:53:14 +0000 (00:53 -0700)
committer david-sarah <david-sarah@jacaranda.org>
Fri, 23 Jul 2010 07:53:14 +0000 (00:53 -0700)
src/allmydata/test/test_encodingutil.py
src/allmydata/util/encodingutil.py

index 4d2c8ed545633054d61e14988fd10f602bed209c..3810570150f9a0a13031902718530b32032c21e0 100644 (file)
@@ -57,8 +57,8 @@ import os, sys, locale
 
 from allmydata.test.common_util import ReallyEqualMixin
 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
-    unicode_to_output, unicode_platform, listdir_unicode, FilenameEncodingError, \
-    get_output_encoding, get_filesystem_encoding, _reload
+    unicode_to_output, quote_output, unicode_platform, listdir_unicode, \
+    FilenameEncodingError, get_output_encoding, get_filesystem_encoding, _reload
 from allmydata.dirnode import normalize
 
 from twisted.python import usage
@@ -286,6 +286,103 @@ class StdlibUnicode(unittest.TestCase):
             self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
 
 
+class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
+    def _check(self, inp, out, enc, optional_quotes):
+        out2 = out
+        if optional_quotes:
+            out2 = out2[1:-1]
+        self.failUnlessReallyEqual(quote_output(inp, encoding=enc), out)
+        self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False), out2)
+        if out[0:2] != 'b"':
+            if isinstance(inp, str):
+                self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc), out)
+                self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False), out2)
+            else:
+                self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc), out)
+                self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False), out2)
+
+    def _test_quote_output_all(self, enc):
+        def check(inp, out, optional_quotes=False):
+            self._check(inp, out, enc, optional_quotes)
+
+        # optional single quotes
+        check("foo",  "'foo'",  True)
+        check("\\",   "'\\'",   True)
+        check("$\"`", "'$\"`'", True)
+
+        # mandatory single quotes
+        check("\"",   "'\"'")
+
+        # double quotes
+        check("'",    "\"'\"")
+        check("\n",   "\"\\x0a\"")
+        check("\x00", "\"\\x00\"")
+
+        # invalid Unicode and astral planes
+        check(u"\uFDD0\uFDEF",       "\"\\ufdd0\\ufdef\"")
+        check(u"\uDC00\uD800",       "\"\\udc00\\ud800\"")
+        check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
+        check(u"\uD800\uDC00",       "\"\\U00010000\"")
+        check(u"\uD800\uDC01",       "\"\\U00010001\"")
+        check(u"\uD801\uDC00",       "\"\\U00010400\"")
+        check(u"\uDBFF\uDFFF",       "\"\\U0010ffff\"")
+        check(u"'\uDBFF\uDFFF",      "\"'\\U0010ffff\"")
+        check(u"\"\uDBFF\uDFFF",     "\"\\\"\\U0010ffff\"")
+
+        # invalid UTF-8
+        check("\xFF",                "b\"\\xff\"")
+        check("\x00\"$\\`\x80\xFF",  "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")
+
+    def test_quote_output_ascii(self, enc='ascii'):
+        def check(inp, out, optional_quotes=False):
+            self._check(inp, out, enc, optional_quotes)
+
+        self._test_quote_output_all(enc)
+        check(u"\u00D7",   "\"\\xd7\"")
+        check(u"'\u00D7",  "\"'\\xd7\"")
+        check(u"\"\u00D7", "\"\\\"\\xd7\"")
+        check(u"\u2621",   "\"\\u2621\"")
+        check(u"'\u2621",  "\"'\\u2621\"")
+        check(u"\"\u2621", "\"\\\"\\u2621\"")
+
+    def test_quote_output_latin1(self, enc='latin1'):
+        def check(inp, out, optional_quotes=False):
+            self._check(inp, out.encode('latin1'), enc, optional_quotes)
+
+        self._test_quote_output_all(enc)
+        check(u"\u00D7",   u"'\u00D7'", True)
+        check(u"'\u00D7",  u"\"'\u00D7\"")
+        check(u"\"\u00D7", u"'\"\u00D7'")
+        check(u"\u00D7\"", u"'\u00D7\"'", True)
+        check(u"\u2621",   u"\"\\u2621\"")
+        check(u"'\u2621",  u"\"'\\u2621\"")
+        check(u"\"\u2621", u"\"\\\"\\u2621\"")
+
+    def test_quote_output_utf8(self, enc='utf-8'):
+        def check(inp, out, optional_quotes=False):
+            self._check(inp, out.encode('utf-8'), enc, optional_quotes)
+
+        self._test_quote_output_all(enc)
+        check(u"\u2621",   u"'\u2621'", True)
+        check(u"'\u2621",  u"\"'\u2621\"")
+        check(u"\"\u2621", u"'\"\u2621'")
+        check(u"\u2621\"", u"'\u2621\"'", True)
+
+    @patch('sys.stdout')
+    def test_quote_output_mock(self, mock_stdout):
+        mock_stdout.encoding = 'ascii'
+        _reload()
+        self.test_quote_output_ascii(None)
+
+        mock_stdout.encoding = 'latin1'
+        _reload()
+        self.test_quote_output_latin1(None)
+
+        mock_stdout.encoding = 'utf-8'
+        _reload()
+        self.test_quote_output_utf8(None)
+
+
 class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
     uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
     output = 'lumi\xc3\xa8re'
index d2a65852c395ed2c6521e4306d2e2130cdfe38eb..cb891a21c99f838cb0d4f884c11661a82dbefb56 100644 (file)
@@ -120,8 +120,8 @@ def to_argv(s):
         return s
     return s.encode(argv_encoding)
 
-PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)
-PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)
+PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$',          re.DOTALL)
+PRINTABLE_8BIT  = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
 
 def is_printable_ascii(s):
     return PRINTABLE_ASCII.search(s) is not None
@@ -145,12 +145,47 @@ def unicode_to_output(s):
                                  (output_encoding, repr(s)))
     return out
 
+
+def _unicode_escape(m):
+    u = m.group(0)
+    if u == '"' or u == '$' or u == '`' or u == '\\':
+        return u'\\' + u
+    if len(u) == 2:
+        codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
+    else:
+        codepoint = ord(u)
+    if codepoint > 0xFFFF:
+        return u'\\U%08x' % (codepoint,)
+    elif codepoint > 0xFF:
+        return u'\\u%04x' % (codepoint,)
+    else:
+        return u'\\x%02x' % (codepoint,)
+
+def _str_escape(m):
+    c = m.group(0)
+    if c == '"' or c == '$' or c == '`' or c == '\\':
+        return '\\' + c
+    else:
+        return '\\x%02x' % (ord(c),)
+
+MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
+
+# if we must double-quote, then we have to escape ", $, ` and \, but need not escape '
+ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
+                               ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
+                               re.DOTALL)
+
+ESCAPABLE_8BIT    = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
+
 def quote_output(s, quotemarks=True, encoding=None):
     """
     Encode either a Unicode string or a UTF-8-encoded bytestring for representation
     on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
-    always surrounded by single quotes; otherwise, it is quoted only if necessary to
-    avoid ambiguity or control bytes in the output.
+    always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
+    control bytes in the output.
+    Quoting may use either single or double quotes. Within single quotes, all
+    characters stand for themselves, and ' will not appear. Within double quotes,
+    Python-compatible backslash escaping is used.
     """
     precondition(isinstance(s, (str, unicode)), s)
 
@@ -158,20 +193,20 @@ def quote_output(s, quotemarks=True, encoding=None):
         try:
             s = s.decode('utf-8')
         except UnicodeDecodeError:
-            return 'b' + repr(s)
-
-    try:
-        out = s.encode(encoding or output_encoding)
-    except (UnicodeEncodeError, UnicodeDecodeError):
-        return repr(s)
+            return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)
 
-    if PRINTABLE_8BIT.search(out) is None:
-        return repr(out)
-
-    if quotemarks:
-        return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'"
-    else:
-        return out
+    if MUST_DOUBLE_QUOTE.search(s) is None:
+        try:
+            out = s.encode(encoding or output_encoding)
+            if quotemarks or out.startswith('"'):
+                return "'%s'" % (out,)
+            else:
+                return out
+        except (UnicodeDecodeError, UnicodeEncodeError):
+            pass
+
+    escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
+    return '"%s"' % (escaped.encode(encoding or output_encoding, 'backslashreplace'),)
 
 def quote_path(path, quotemarks=True):
     return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)