2 Functions used to convert inputs from whatever encoding used in the system to
8 from allmydata.util.assertutil import precondition
9 from twisted.python import usage
11 from allmydata.util import log
14 def _canonical_encoding(encoding):
16 log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
18 encoding = encoding.lower()
19 if encoding == "cp65001":
21 elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
24 # sometimes Python returns an encoding name that it doesn't support for conversion
25 # fail early if this happens
27 u"test".encode(encoding)
28 except (LookupError, AttributeError):
29 raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
# Module-level encoding state. These are placeholders; the platform-detection
# code that follows declares all four names global and assigns the real values.
filesystem_encoding = None
output_encoding = None
argv_encoding = None
is_unicode_platform = False
39 global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform
41 filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())
44 if hasattr(sys.stdout, 'encoding'):
45 outenc = sys.stdout.encoding
48 outenc = locale.getpreferredencoding()
50 pass # work around <http://bugs.python.org/issue1443504>
51 output_encoding = _canonical_encoding(outenc)
53 if sys.platform == 'win32':
54 # Unicode arguments are not supported on Windows yet; see #565 and #1074.
55 argv_encoding = 'ascii'
57 argv_encoding = output_encoding
58 is_unicode_platform = sys.platform in ["win32", "darwin"]
def get_filesystem_encoding():
    """
    Returns expected encoding for local filenames.

    This is the current value of the module-level filesystem_encoding.
    """
    return filesystem_encoding
def get_output_encoding():
    """
    Returns expected encoding for writing to stdout or stderr.

    This is the current value of the module-level output_encoding.
    """
    return output_encoding
def get_argv_encoding():
    """
    Returns expected encoding for command-line arguments.

    This is the current value of the module-level argv_encoding.
    """
    return argv_encoding
def argv_to_unicode(s):
    """
    Decode given argv element to unicode. If this fails, raise a UsageError.

    s must be a byte string (str); it is decoded using argv_encoding.
    """
    precondition(isinstance(s, str), s)

    try:
        return unicode(s, argv_encoding)
    except UnicodeDecodeError:
        # Report the problem as a command-line usage error rather than a
        # raw decoding traceback.
        raise usage.UsageError("Argument %s cannot be decoded as %s." %
                               (quote_output(s), argv_encoding))
93 def unicode_to_url(s):
95 Encode an unicode object used in an URL.
97 # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
101 #precondition(isinstance(s, unicode), s)
102 #return s.encode('utf-8')
105 if s is None or isinstance(s, str):
107 return s.encode('utf-8')
110 if isinstance(s, str):
112 return s.encode(argv_encoding)
# Matches strings consisting only of space through '~', newline and carriage
# return (the empty string also matches).
PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)
# Like PRINTABLE_ASCII, but also accepts \x80-\xFF and excludes the single
# quote: the character class jumps from '&' straight to '('.
PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)
def is_printable_ascii(s):
    """
    Return True if s matches PRINTABLE_ASCII in its entirety, False otherwise.
    """
    return PRINTABLE_ASCII.search(s) is not None
def unicode_to_output(s):
    """
    Encode an unicode object for representation on stdout or stderr.

    s must be a unicode object. Returns s encoded with output_encoding.
    Raises UnicodeEncodeError if s cannot be encoded, or if the encoded
    result is rejected by PRINTABLE_8BIT.
    """
    precondition(isinstance(s, unicode), s)

    try:
        out = s.encode(output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Re-raise with a message that names the target encoding and the
        # offending string.
        raise UnicodeEncodeError(output_encoding, s, 0, 0,
                                 "A string could not be encoded as %s for output to the terminal:\n%r" %
                                 (output_encoding, repr(s)))

    # Encoding succeeded; still refuse output that PRINTABLE_8BIT rejects.
    if PRINTABLE_8BIT.search(out) is None:
        raise UnicodeEncodeError(output_encoding, s, 0, 0,
                                 "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
                                 (output_encoding, repr(s)))
    return out
def quote_output(s, quotemarks=True, encoding=None):
    """
    Encode either a Unicode string or a UTF-8-encoded bytestring for representation
    on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
    always surrounded by single quotes; otherwise, it is quoted only if necessary to
    avoid ambiguity or control bytes in the output.

    'encoding' overrides output_encoding when given.
    """
    precondition(isinstance(s, (str, unicode)), s)

    if isinstance(s, str):
        try:
            s = s.decode('utf-8')
        except UnicodeDecodeError:
            # NOTE(review): this fallback statement was missing from the
            # source as received; a 'b'-prefixed repr of the raw bytes is
            # assumed -- confirm against the upstream file.
            return 'b' + repr(s)

    try:
        out = s.encode(encoding or output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        # NOTE(review): fallback statement missing from the source as
        # received; repr() of the unicode string is assumed -- confirm.
        return repr(s)

    if PRINTABLE_8BIT.search(out) is None:
        # NOTE(review): fallback statement missing from the source as
        # received; repr() of the encoded bytes is assumed -- confirm.
        return repr(out)

    if quotemarks:
        # Escape backslashes first, then single quotes. The quote replacement
        # was previously written as "\'", which is the same string as "'" and
        # so replaced nothing.
        return "'" + out.replace("\\", "\\\\").replace("'", "\\'") + "'"
    else:
        return out
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments.

    Each segment is passed through to_str, the results are joined with "/",
    and the joined string is handed to quote_output with the same
    'quotemarks' setting.
    """
    return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)
def unicode_platform():
    """
    Does the current platform handle Unicode filenames natively?

    Returns the module-level is_unicode_platform flag.
    """
    return is_unicode_platform