2 Functions used to convert inputs from whatever encoding used in the system to
9 from allmydata.util.assertutil import precondition
10 from twisted.python import usage
12 from allmydata.util import log
13 from allmydata.util.fileutil import abspath_expanduser_unicode
16 def _canonical_encoding(encoding):
18 log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
20 encoding = encoding.lower()
21 if encoding == "cp65001":
23 elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
26 # sometimes Python returns an encoding name that it doesn't support for conversion
27 # fail early if this happens
29 u"test".encode(encoding)
30 except (LookupError, AttributeError):
31 raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
# Module-level encoding state. Both encoding names start out unset and the
# platform flag starts out False; they are rebound further down in this module
# through a ``global`` declaration.
filesystem_encoding = output_encoding = None
is_unicode_platform = False
41 global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform
43 filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())
46 if hasattr(sys.stdout, 'encoding'):
47 outenc = sys.stdout.encoding
50 outenc = locale.getpreferredencoding()
52 pass # work around <http://bugs.python.org/issue1443504>
53 output_encoding = _canonical_encoding(outenc)
55 if sys.platform == 'win32':
56 # Unicode arguments are not supported on Windows yet; see #565 and #1074.
57 argv_encoding = 'ascii'
59 argv_encoding = output_encoding
60 is_unicode_platform = sys.platform in ["win32", "darwin"]
def get_filesystem_encoding():
    """
    Return the encoding expected for local filenames.

    This is simply the current value of the module-level
    ``filesystem_encoding`` variable.
    """
    return filesystem_encoding
def get_output_encoding():
    """
    Return the encoding expected when writing to stdout or stderr.

    This is simply the current value of the module-level
    ``output_encoding`` variable.
    """
    return output_encoding
def get_argv_encoding():
    """
    Return the encoding expected for command-line arguments.

    This is simply the current value of the module-level ``argv_encoding``
    variable.
    """
    return argv_encoding
def argv_to_unicode(s):
    """
    Decode a single argv element (a byte string) into a unicode object.

    The bytes are interpreted using the module-level ``argv_encoding``.
    Raises twisted's ``usage.UsageError`` if they cannot be decoded.
    """
    precondition(isinstance(s, str), s)

    try:
        decoded = unicode(s, argv_encoding)
    except UnicodeDecodeError:
        # Report the problem as a command-line usage error; the offending
        # argument is passed through quote_output() for display.
        raise usage.UsageError("Argument %s cannot be decoded as %s." %
                               (quote_output(s), argv_encoding))
    return decoded
def argv_to_abspath(s):
    """
    Decode an argv element and turn it into an absolute path, with ~ expanded.

    Decoding is delegated to argv_to_unicode(), so a UsageError is raised if
    the argument cannot be decoded.
    """
    decoded = argv_to_unicode(s)
    return abspath_expanduser_unicode(decoded)
102 def unicode_to_argv(s, mangle=False):
104 Encode the given Unicode argument as a bytestring.
105 If the argument is to be passed to a different process, then the 'mangle' argument
106 should be true; on Windows, this uses a mangled encoding that will be reversed by
109 precondition(isinstance(s, unicode), s)
111 if mangle and sys.platform == "win32":
112 # This must be the same as 'mangle' in bin/tahoe-script.template.
113 return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
115 return s.encode(argv_encoding)
117 def unicode_to_url(s):
119 Encode an unicode object used in an URL.
121 # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
125 #precondition(isinstance(s, unicode), s)
126 #return s.encode('utf-8')
129 if s is None or isinstance(s, str):
131 return s.encode('utf-8')
# Whole-string patterns: newline, carriage return and printable ASCII only ...
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
# ... or the same set plus anything in the range \x80-\xFF.
PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)


def is_printable_ascii(s):
    """
    Return True if s consists solely of newlines, carriage returns and
    printable ASCII characters (\\x20-\\x7E); the empty string qualifies.
    """
    # The pattern is anchored with '^', so match() is equivalent to search().
    return PRINTABLE_ASCII.match(s) is not None
def unicode_to_output(s):
    """
    Encode a unicode object for representation on stdout or stderr.

    The text is encoded with the module-level ``output_encoding``. A
    UnicodeEncodeError is raised if the text cannot be encoded, or if the
    encoded bytes contain anything other than newline, carriage return,
    printable ASCII, or bytes in the range 0x80-0xFF.
    """
    precondition(isinstance(s, unicode), s)

    try:
        encoded = s.encode(output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        reason = ("A string could not be encoded as %s for output to the terminal:\n%r" %
                  (output_encoding, repr(s)))
        raise UnicodeEncodeError(output_encoding, s, 0, 0, reason)

    # Refuse to emit bytes that PRINTABLE_8BIT does not allow.
    if PRINTABLE_8BIT.search(encoded) is None:
        reason = ("A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
                  (output_encoding, repr(s)))
        raise UnicodeEncodeError(output_encoding, s, 0, 0, reason)

    return encoded
159 def _unicode_escape(m):
161 if u == '"' or u == '$' or u == '`' or u == '\\':
164 codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
167 if codepoint > 0xFFFF:
168 return u'\\U%08x' % (codepoint,)
169 elif codepoint > 0xFF:
170 return u'\\u%04x' % (codepoint,)
172 return u'\\x%02x' % (codepoint,)
176 if c == '"' or c == '$' or c == '`' or c == '\\':
179 return '\\x%02x' % (ord(c),)
181 MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
183 # if we must double-quote, then we have to escape ", $ and `, but need not escape '
184 ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs
185 ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
188 ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
def quote_output(s, quotemarks=True, encoding=None):
    """
    Encode either a Unicode string or a UTF-8-encoded bytestring for
    representation on stdout or stderr, tolerating errors.

    If 'quotemarks' is True the result is always quoted; otherwise it is
    quoted only if necessary to avoid ambiguity or control bytes in the
    output. 'encoding' overrides the module-level output encoding when given.

    Quoting may use either single or double quotes. Within single quotes, all
    characters stand for themselves, and ' will not appear. Within double
    quotes, Python-compatible backslash escaping is used. A bytestring that is
    not valid UTF-8 is rendered in the escaped form b"...".
    """
    precondition(isinstance(s, (str, unicode)), s)

    if isinstance(s, str):
        try:
            s = s.decode('utf-8')
        except UnicodeDecodeError:
            # Not UTF-8: show the raw bytes, escaped, inside b"...".
            return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)

    target_encoding = encoding or output_encoding

    if MUST_DOUBLE_QUOTE.search(s) is None:
        # Nothing in s forces double-quoting, so try the literal form first.
        try:
            plain = s.encode(target_encoding)
        except (UnicodeDecodeError, UnicodeEncodeError):
            pass  # not representable as-is; fall through to the escaped form
        else:
            if quotemarks or plain.startswith('"'):
                return "'%s'" % (plain,)
            return plain

    # if we must double-quote, then we have to escape ", $ and `
    escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
    return '"%s"' % (escaped.encode(target_encoding, 'backslashreplace'),)
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments.

    Each segment is passed through to_str(), the results are joined with "/",
    and the joined string is handed to quote_output() with the same
    'quotemarks' setting.
    """
    joined = "/".join([to_str(segment) for segment in path])
    return quote_output(joined, quotemarks=quotemarks)
def unicode_platform():
    """
    Does the current platform handle Unicode filenames natively?

    Returns the current value of the module-level ``is_unicode_platform`` flag.
    """
    return is_unicode_platform
# Raised by the directory-listing helpers in this module when a filename
# cannot be converted to or from the filesystem encoding. The docstring text
# is kept verbatim, including its %s placeholder.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    pass
def listdir_unicode_fallback(path):
    """
    List a directory through the byte-level API and return unicode names.

    This function emulates a fallback Unicode API similar to one available
    under Windows or MacOS X: the unicode 'path' is encoded with the
    module-level ``filesystem_encoding``, listed with os.listdir(), and every
    returned name is decoded with the same encoding.

    Raises FilenameEncodingError (carrying the offending name) if 'path'
    cannot be encoded or if a badly encoded filename is encountered.
    """
    precondition(isinstance(path, unicode), path)

    try:
        byte_path = path.encode(filesystem_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise FilenameEncodingError(path)

    # Decode entry by entry so that the failing name is always available to
    # the error handler, without relying on a list-comprehension variable
    # leaking into the enclosing scope.
    decoded_names = []
    for fn in os.listdir(byte_path):
        try:
            decoded_names.append(unicode(fn, filesystem_encoding))
        except UnicodeDecodeError:
            raise FilenameEncodingError(fn)
    return decoded_names
def listdir_unicode(path):
    """
    Wrapper around listdir() which provides safe access to the convenient
    Unicode API even under platforms that don't provide one natively.

    'path' must be a unicode object.
    """
    precondition(isinstance(path, unicode), path)

    # On other platforms (ie. Unix systems), the byte-level API is used
    if not is_unicode_platform:
        return listdir_unicode_fallback(path)

    # On Windows and MacOS X, the Unicode API is used
    return os.listdir(path)