From 07c26ff805e978d160efa64bfa40b917b915cff9 Mon Sep 17 00:00:00 2001 From: David-Sarah Hopwood Date: Fri, 4 Jan 2013 05:31:07 +0000 Subject: [PATCH] Only quote newline characters where necessary. fixes #1484 Signed-off-by: David-Sarah Hopwood --- src/allmydata/test/test_encodingutil.py | 39 +++++++++++++++---------- src/allmydata/util/encodingutil.py | 32 +++++++++++++------- 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/src/allmydata/test/test_encodingutil.py b/src/allmydata/test/test_encodingutil.py index 6c4b9965..abd3d8cb 100644 --- a/src/allmydata/test/test_encodingutil.py +++ b/src/allmydata/test/test_encodingutil.py @@ -295,36 +295,37 @@ class QuoteOutput(ReallyEqualMixin, unittest.TestCase): def tearDown(self): _reload() - def _check(self, inp, out, enc, optional_quotes): + def _check(self, inp, out, enc, optional_quotes, quote_newlines): out2 = out if optional_quotes: out2 = out2[1:-1] - self.failUnlessReallyEqual(quote_output(inp, encoding=enc), out) - self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False), out2) + self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quote_newlines=quote_newlines), out) + self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) if out[0:2] == 'b"': pass elif isinstance(inp, str): - self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc), out) - self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False), out2) + self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out) + self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) else: - self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc), out) - self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False), out2) + self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quote_newlines=quote_newlines), out) + self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) def _test_quote_output_all(self, enc): - def check(inp, out, optional_quotes=False): - self._check(inp, out, enc, optional_quotes) + def check(inp, out, optional_quotes=False, quote_newlines=None): + self._check(inp, out, enc, optional_quotes, quote_newlines) # optional single quotes check("foo", "'foo'", True) check("\\", "'\\'", True) check("$\"`", "'$\"`'", True) + check("\n", "'\n'", True, quote_newlines=False) # mandatory single quotes check("\"", "'\"'") # double quotes check("'", "\"'\"") - check("\n", "\"\\x0a\"") + check("\n", "\"\\x0a\"", quote_newlines=True) check("\x00", "\"\\x00\"") # invalid Unicode and astral planes @@ -343,8 +344,8 @@ class QuoteOutput(ReallyEqualMixin, unittest.TestCase): check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"") def test_quote_output_ascii(self, enc='ascii'): - def check(inp, out, optional_quotes=False): - self._check(inp, out, enc, optional_quotes) + def check(inp, out, optional_quotes=False, quote_newlines=None): + self._check(inp, out, enc, optional_quotes, quote_newlines) self._test_quote_output_all(enc) check(u"\u00D7", "\"\\xd7\"") @@ -353,10 +354,12 @@ class QuoteOutput(ReallyEqualMixin, unittest.TestCase): check(u"\u2621", "\"\\u2621\"") check(u"'\u2621", "\"'\\u2621\"") check(u"\"\u2621", "\"\\\"\\u2621\"") + check(u"\n", "'\n'", True, quote_newlines=False) + check(u"\n", "\"\\x0a\"", quote_newlines=True) def test_quote_output_latin1(self, enc='latin1'): - def check(inp, out, optional_quotes=False): - self._check(inp, out.encode('latin1'), enc, optional_quotes) + def check(inp, out, optional_quotes=False, quote_newlines=None): + self._check(inp, out.encode('latin1'), enc, optional_quotes, quote_newlines) self._test_quote_output_all(enc) check(u"\u00D7", u"'\u00D7'", True) @@ -366,16 +369,20 @@ class QuoteOutput(ReallyEqualMixin, unittest.TestCase): check(u"\u2621", u"\"\\u2621\"") check(u"'\u2621", u"\"'\\u2621\"") check(u"\"\u2621", u"\"\\\"\\u2621\"") + check(u"\n", u"'\n'", True, quote_newlines=False) + check(u"\n", u"\"\\x0a\"", quote_newlines=True) def test_quote_output_utf8(self, enc='utf-8'): - def check(inp, out, optional_quotes=False): - self._check(inp, out.encode('utf-8'), enc, optional_quotes) + def check(inp, out, optional_quotes=False, quote_newlines=None): + self._check(inp, out.encode('utf-8'), enc, optional_quotes, quote_newlines) self._test_quote_output_all(enc) check(u"\u2621", u"'\u2621'", True) check(u"'\u2621", u"\"'\u2621\"") check(u"\"\u2621", u"'\"\u2621'") check(u"\u2621\"", u"'\u2621\"'", True) + check(u"\n", u"'\n'", True, quote_newlines=False) + check(u"\n", u"\"\\x0a\"", quote_newlines=True) def test_quote_output_default(self): encodingutil.io_encoding = 'ascii' diff --git a/src/allmydata/util/encodingutil.py b/src/allmydata/util/encodingutil.py index 2f3bfeca..3ceb1a91 100644 --- a/src/allmydata/util/encodingutil.py +++ b/src/allmydata/util/encodingutil.py @@ -153,10 +153,12 @@ def unicode_to_output(s): return out -def _unicode_escape(m): +def _unicode_escape(m, quote_newlines): u = m.group(0) - if u == '"' or u == '$' or u == '`' or u == '\\': + if u == u'"' or u == u'$' or u == u'`' or u == u'\\': return u'\\' + u + elif u == u'\n' and not quote_newlines: + return u if len(u) == 2: codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000 else: @@ -168,14 +170,17 @@ def _unicode_escape(m): else: return u'\\x%02x' % (codepoint,) -def _str_escape(m): +def _str_escape(m, quote_newlines): c = m.group(0) if c == '"' or c == '$' or c == '`' or c == '\\': return '\\' + c + elif c == '\n' and not quote_newlines: + return c else: return '\\x%02x' % (ord(c),) -MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) +MUST_DOUBLE_QUOTE_NL = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) +MUST_DOUBLE_QUOTE = re.compile(ur'[^\n\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) # if we must double-quote, then we have to escape ", $ and `, but need not escape ' ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs @@ -184,25 +189,32 @@ ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid su ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL) -def quote_output(s, quotemarks=True, encoding=None): +def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None): """ Encode either a Unicode string or a UTF-8-encoded bytestring for representation on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or - control bytes in the output. + control bytes in the output. (Newlines are counted as control bytes iff + quote_newlines is True.) + Quoting may use either single or double quotes. Within single quotes, all characters stand for themselves, and ' will not appear. Within double quotes, Python-compatible backslash escaping is used. + + If not explicitly given, quote_newlines is True when quotemarks is True. """ precondition(isinstance(s, (str, unicode)), s) + if quote_newlines is None: + quote_newlines = quotemarks if isinstance(s, str): try: s = s.decode('utf-8') except UnicodeDecodeError: - return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),) + return 'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),) - if MUST_DOUBLE_QUOTE.search(s) is None: + must_double_quote = quote_newlines and MUST_DOUBLE_QUOTE_NL or MUST_DOUBLE_QUOTE + if must_double_quote.search(s) is None: try: out = s.encode(encoding or io_encoding) if quotemarks or out.startswith('"'): @@ -212,11 +224,11 @@ def quote_output(s, quotemarks=True, encoding=None): except (UnicodeDecodeError, UnicodeEncodeError): pass - escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s) + escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s) return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),) def quote_path(path, quotemarks=True): - return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks) + return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True) def unicode_platform(): -- 2.45.2