Only quote newline characters where necessary. fixes #1484

[tahoe-lafs/tahoe-lafs.git] / src / allmydata / util / encodingutil.py
diff --git a/src/allmydata/util/encodingutil.py b/src/allmydata/util/encodingutil.py

index 5079184a9158dc79f471540e48328c3c47dc4058..3ceb1a919c7e72dda2d2d820c0b50dd16f8c65a6 100644 (file)
--- a/src/allmydata/util/encodingutil.py
+++ b/src/allmydata/util/encodingutil.py
@@ -13,7 +13,7 @@ from allmydata.util import log
  from allmydata.util.fileutil import abspath_expanduser_unicode
  
  
-def _canonical_encoding(encoding):
+def canonical_encoding(encoding):
      if encoding is None:
          log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
          encoding = 'utf-8'
@@ -23,6 +23,9 @@ def _canonical_encoding(encoding):
      elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
          encoding = 'ascii'
  
+    return encoding
+
+def check_encoding(encoding):
      # sometimes Python returns an encoding name that it doesn't support for conversion
      # fail early if this happens
      try:
@@ -30,33 +33,33 @@ def _canonical_encoding(encoding):
      except (LookupError, AttributeError):
          raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
  
-    return encoding
-
  filesystem_encoding = None
-output_encoding = None
-argv_encoding = None
+io_encoding = None
  is_unicode_platform = False
  
  def _reload():
-    global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform
-
-    filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())
+    global filesystem_encoding, io_encoding, is_unicode_platform
  
-    outenc = None
-    if hasattr(sys.stdout, 'encoding'):
-        outenc = sys.stdout.encoding
-    if outenc is None:
-        try:
-            outenc = locale.getpreferredencoding()
-        except Exception:
-            pass  # work around <http://bugs.python.org/issue1443504>
-    output_encoding = _canonical_encoding(outenc)
+    filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
+    check_encoding(filesystem_encoding)
  
      if sys.platform == 'win32':
-        # Unicode arguments are not supported on Windows yet; see #565 and #1074.
-        argv_encoding = 'ascii'
+        # On Windows we install UTF-8 stream wrappers for sys.stdout and
+        # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
+        io_encoding = 'utf-8'
      else:
-        argv_encoding = output_encoding
+        ioenc = None
+        if hasattr(sys.stdout, 'encoding'):
+            ioenc = sys.stdout.encoding
+        if ioenc is None:
+            try:
+                ioenc = locale.getpreferredencoding()
+            except Exception:
+                pass  # work around <http://bugs.python.org/issue1443504>
+        io_encoding = canonical_encoding(ioenc)
+
+    check_encoding(io_encoding)
+
      is_unicode_platform = sys.platform in ["win32", "darwin"]
  
  _reload()
@@ -68,17 +71,11 @@ def get_filesystem_encoding():
      """
      return filesystem_encoding
  
-def get_output_encoding():
-    """
-    Returns expected encoding for writing to stdout or stderr.
-    """
-    return output_encoding
-
-def get_argv_encoding():
+def get_io_encoding():
      """
-    Returns expected encoding for command-line arguments.
+    Returns expected encoding for writing to stdout or stderr, and for arguments in sys.argv.
      """
-    return argv_encoding
+    return io_encoding
  
  def argv_to_unicode(s):
      """
@@ -87,10 +84,10 @@ def argv_to_unicode(s):
      precondition(isinstance(s, str), s)
  
      try:
-        return unicode(s, argv_encoding)
+        return unicode(s, io_encoding)
      except UnicodeDecodeError:
          raise usage.UsageError("Argument %s cannot be decoded as %s." %
-                               (quote_output(s), argv_encoding))
+                               (quote_output(s), io_encoding))
  
  def argv_to_abspath(s):
      """
@@ -112,7 +109,7 @@ def unicode_to_argv(s, mangle=False):
          # This must be the same as 'mangle' in bin/tahoe-script.template.
          return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
      else:
-        return s.encode(argv_encoding)
+        return s.encode(io_encoding)
  
  def unicode_to_url(s):
      """
@@ -143,23 +140,25 @@ def unicode_to_output(s):
      precondition(isinstance(s, unicode), s)
  
      try:
-        out = s.encode(output_encoding)
+        out = s.encode(io_encoding)
      except (UnicodeEncodeError, UnicodeDecodeError):
-        raise UnicodeEncodeError(output_encoding, s, 0, 0,
+        raise UnicodeEncodeError(io_encoding, s, 0, 0,
                                   "A string could not be encoded as %s for output to the terminal:\n%r" %
-                                 (output_encoding, repr(s)))
+                                 (io_encoding, repr(s)))
  
      if PRINTABLE_8BIT.search(out) is None:
-        raise UnicodeEncodeError(output_encoding, s, 0, 0,
+        raise UnicodeEncodeError(io_encoding, s, 0, 0,
                                   "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
-                                 (output_encoding, repr(s)))
+                                 (io_encoding, repr(s)))
      return out
  
  
-def _unicode_escape(m):
+def _unicode_escape(m, quote_newlines):
      u = m.group(0)
-    if u == '"' or u == '$' or u == '`' or u == '\\':
+    if u == u'"' or u == u'$' or u == u'`' or u == u'\\':
          return u'\\' + u
+    elif u == u'\n' and not quote_newlines:
+        return u
      if len(u) == 2:
          codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
      else:
@@ -171,14 +170,17 @@ def _unicode_escape(m):
      else:
          return u'\\x%02x' % (codepoint,)
  
-def _str_escape(m):
+def _str_escape(m, quote_newlines):
      c = m.group(0)
      if c == '"' or c == '$' or c == '`' or c == '\\':
          return '\\' + c
+    elif c == '\n' and not quote_newlines:
+        return c
      else:
          return '\\x%02x' % (ord(c),)
  
-MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
+MUST_DOUBLE_QUOTE_NL = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
+MUST_DOUBLE_QUOTE    = re.compile(ur'[^\n\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
  
  # if we must double-quote, then we have to escape ", $ and `, but need not escape '
  ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
@@ -187,27 +189,34 @@ ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid su
  
  ESCAPABLE_8BIT    = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
  
-def quote_output(s, quotemarks=True, encoding=None):
+def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
      """
      Encode either a Unicode string or a UTF-8-encoded bytestring for representation
      on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
      always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
-    control bytes in the output.
+    control bytes in the output. (Newlines are counted as control bytes iff
+    quote_newlines is True.)
+
      Quoting may use either single or double quotes. Within single quotes, all
      characters stand for themselves, and ' will not appear. Within double quotes,
      Python-compatible backslash escaping is used.
+
+    If not explicitly given, quote_newlines is True when quotemarks is True.
      """
      precondition(isinstance(s, (str, unicode)), s)
+    if quote_newlines is None:
+        quote_newlines = quotemarks
  
      if isinstance(s, str):
          try:
              s = s.decode('utf-8')
          except UnicodeDecodeError:
-            return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)
+            return 'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),)
  
-    if MUST_DOUBLE_QUOTE.search(s) is None:
+    must_double_quote = quote_newlines and MUST_DOUBLE_QUOTE_NL or MUST_DOUBLE_QUOTE
+    if must_double_quote.search(s) is None:
          try:
-            out = s.encode(encoding or output_encoding)
+            out = s.encode(encoding or io_encoding)
              if quotemarks or out.startswith('"'):
                  return "'%s'" % (out,)
              else:
@@ -215,11 +224,11 @@ def quote_output(s, quotemarks=True, encoding=None):
          except (UnicodeDecodeError, UnicodeEncodeError):
              pass
  
-    escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
-    return '"%s"' % (escaped.encode(encoding or output_encoding, 'backslashreplace'),)
+    escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s)
+    return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
  
  def quote_path(path, quotemarks=True):
-    return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)
+    return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True)
  
  
  def unicode_platform():