2 Functions used to convert inputs from whatever encoding used in the system to
6 import sys, os, re, locale
7 from types import NoneType
9 from allmydata.util.assertutil import precondition, _assert
10 from twisted.python import usage
11 from twisted.python.filepath import FilePath
12 from allmydata.util import log
13 from allmydata.util.fileutil import abspath_expanduser_unicode
def canonical_encoding(encoding):
    """
    Normalise an encoding name reported by the platform.

    A missing name (None) is logged and treated as UTF-8. Otherwise the name
    is lower-cased; the Windows code page "cp65001" is mapped to 'utf-8', and
    the aliases "us-ascii", "646" and "ansi_x3.4-1968" are mapped to 'ascii'.
    Any other name is returned lower-cased and otherwise unchanged.
    """
    if encoding is None:
        log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
        return 'utf-8'

    name = encoding.lower()
    if name == "cp65001":
        return 'utf-8'
    if name in ("us-ascii", "646", "ansi_x3.4-1968"):
        return 'ascii'
    return name
def check_encoding(encoding):
    """
    Verify that the named encoding can actually be used for conversion.

    Raises AssertionError if encoding a short test string with it fails with
    a LookupError or AttributeError; returns None otherwise.
    """
    # sometimes Python returns an encoding name that it doesn't support for conversion
    # fail early if this happens
    try:
        u"test".encode(encoding)
    except (LookupError, AttributeError):
        message = "The character encoding '%s' is not supported for conversion." % (encoding,)
        raise AssertionError(message)
# Module-level encoding state. These are placeholders until the initialiser
# below has run.
filesystem_encoding = None
io_encoding = None
is_unicode_platform = False
use_unicode_filepath = False

# NOTE(review): the 'def' line of this initialiser was not visible in the
# reviewed text; the name '_reload' is an assumption -- confirm against callers.
def _reload():
    """
    Detect the filesystem and I/O encodings and the platform's Unicode
    capabilities, storing the results in the module-level globals.

    Raises AssertionError (via check_encoding) if a detected encoding cannot
    be used for conversion.
    """
    global filesystem_encoding, io_encoding, is_unicode_platform, use_unicode_filepath

    filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
    check_encoding(filesystem_encoding)

    if sys.platform == 'win32':
        # On Windows we install UTF-8 stream wrappers for sys.stdout and
        # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
        io_encoding = 'utf-8'
    else:
        detected = None
        if hasattr(sys.stdout, 'encoding'):
            detected = sys.stdout.encoding
        if detected is None:
            try:
                detected = locale.getpreferredencoding()
            except Exception:
                pass  # work around <http://bugs.python.org/issue1443504>
        io_encoding = canonical_encoding(detected)

    check_encoding(io_encoding)

    is_unicode_platform = sys.platform in ["win32", "darwin"]

    # Despite the Unicode-mode FilePath support added to Twisted in
    # <https://twistedmatrix.com/trac/ticket/7805>, we can't yet use
    # Unicode-mode FilePaths with INotify on non-Windows platforms
    # due to <https://twistedmatrix.com/trac/ticket/7928>.
    use_unicode_filepath = sys.platform == "win32"
def get_filesystem_encoding():
    """
    Return the encoding expected for local filenames (the module-level
    'filesystem_encoding' value).
    """
    return filesystem_encoding
def get_io_encoding():
    """
    Return the encoding expected for writing to stdout or stderr, and for
    arguments in sys.argv (the module-level 'io_encoding' value).
    """
    return io_encoding
def argv_to_unicode(s):
    """
    Decode the given argv element (a byte string) to unicode using the
    I/O encoding.

    Raises usage.UsageError if the bytes cannot be decoded.
    """
    precondition(isinstance(s, str), s)

    try:
        return unicode(s, io_encoding)
    except UnicodeDecodeError:
        raise usage.UsageError("Argument %s cannot be decoded as %s." %
                               (quote_output(s), io_encoding))
    # A check on an undefined name 'local_dir' used to follow here. It could
    # never run (the try/except above always returns or raises) and has been
    # removed; argv_to_abspath is where leading '-' is rejected for paths.
def argv_to_abspath(s, **kwargs):
    """
    Decode an argv element and turn it into an absolute path with ~ expanded.

    Extra keyword arguments are passed through to abspath_expanduser_unicode.
    Raises usage.UsageError if the argument cannot be decoded, or if the
    decoded text starts with '-' (which looks like an option, not a path).
    """
    text = argv_to_unicode(s)
    if not text.startswith(u'-'):
        return abspath_expanduser_unicode(text, **kwargs)

    raise usage.UsageError("Path argument %s cannot start with '-'.\nUse %s if you intended to refer to a file."
                           % (quote_output(s), quote_output(os.path.join('.', s))))
112 def unicode_to_argv(s, mangle=False):
114 Encode the given Unicode argument as a bytestring.
115 If the argument is to be passed to a different process, then the 'mangle' argument
116 should be true; on Windows, this uses a mangled encoding that will be reversed by
119 precondition(isinstance(s, unicode), s)
121 if mangle and sys.platform == "win32":
122 # This must be the same as 'mangle' in bin/tahoe-script.template.
123 return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
125 return s.encode(io_encoding)
def unicode_to_url(s):
    """
    Encode a unicode object used in a URL.

    None and byte strings are returned unchanged; unicode is encoded as UTF-8.
    """
    # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.

    # FIXME: the stricter behaviour below is disabled; for now any input
    # accepted by to_str is tolerated.
    #precondition(isinstance(s, unicode), s)
    #return s.encode('utf-8')
    return to_str(s)

# NOTE(review): the 'def' line of this helper was not visible in the reviewed
# text; the name is taken from its use in quote_path -- confirm.
def to_str(s):
    """
    Return None or a byte string unchanged; encode anything else as UTF-8.
    """
    if s is not None and not isinstance(s, str):
        return s.encode('utf-8')
    return s
def from_utf8_or_none(s):
    """
    Decode a UTF-8 byte string to unicode, passing None through unchanged.
    """
    precondition(isinstance(s, (NoneType, str)), s)
    return None if s is None else s.decode('utf-8')
# Whole-string patterns: LF, CR and printable ASCII only / the same plus any
# byte in \x80-\xFF.
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)

def is_printable_ascii(s):
    """
    Return True iff s consists solely of LF, CR and printable ASCII
    characters (the empty string qualifies).
    """
    return bool(PRINTABLE_ASCII.search(s))
def unicode_to_output(s):
    """
    Encode a unicode object for representation on stdout or stderr.

    Returns the string encoded with the I/O encoding. Raises
    UnicodeEncodeError if it cannot be encoded, or if the encoded form
    contains bytes other than LF, CR, printable ASCII and \\x80-\\xFF.
    """
    precondition(isinstance(s, unicode), s)

    try:
        encoded = s.encode(io_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        reason = ("A string could not be encoded as %s for output to the terminal:\n%r" %
                  (io_encoding, repr(s)))
        raise UnicodeEncodeError(io_encoding, s, 0, 0, reason)

    if PRINTABLE_8BIT.search(encoded) is not None:
        return encoded

    reason = ("A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
              (io_encoding, repr(s)))
    raise UnicodeEncodeError(io_encoding, s, 0, 0, reason)
175 def _unicode_escape(m, quote_newlines):
177 if u == u'"' or u == u'$' or u == u'`' or u == u'\\':
179 elif u == u'\n' and not quote_newlines:
182 codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
185 if codepoint > 0xFFFF:
186 return u'\\U%08x' % (codepoint,)
187 elif codepoint > 0xFF:
188 return u'\\u%04x' % (codepoint,)
190 return u'\\x%02x' % (codepoint,)
192 def _str_escape(m, quote_newlines):
194 if c == '"' or c == '$' or c == '`' or c == '\\':
196 elif c == '\n' and not quote_newlines:
199 return '\\x%02x' % (ord(c),)
# A string containing any character matched by these patterns cannot be shown
# verbatim (bare or single-quoted) and must be double-quoted with escapes.
# The allowed set is printable ASCII except ' (\x27) plus the listed BMP
# ranges; the _NL variant additionally treats newline as needing quoting.
MUST_DOUBLE_QUOTE_NL = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
MUST_DOUBLE_QUOTE = re.compile(ur'[^\n\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# if we must double-quote, then we have to escape ", $ and `, but need not escape '
# (backslash, control characters and DEL are also outside the allowed set, so
# they match and get escaped too).
ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
                               ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Byte-string counterpart: everything outside the safe ASCII set is escapable.
ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
    """
    Encode either a Unicode string or a UTF-8-encoded bytestring for
    representation on stdout or stderr, tolerating errors.

    If 'quotemarks' is True the result is always quoted; otherwise it is
    quoted only when needed to avoid ambiguity or control bytes in the output.
    (Newlines are counted as control bytes iff quote_newlines is True.)

    Quoting may use either single or double quotes. Within single quotes, all
    characters stand for themselves, and ' will not appear. Within double
    quotes, Python-compatible backslash escaping is used. A bytestring that
    is not valid UTF-8 is rendered as b"..." with escapes.

    If not explicitly given, quote_newlines is True when quotemarks is True.
    'encoding' overrides the I/O encoding used for the result.
    """
    precondition(isinstance(s, (str, unicode)), s)
    if quote_newlines is None:
        quote_newlines = quotemarks

    def escape_byte(m):
        return _str_escape(m, quote_newlines)

    def escape_char(m):
        return _unicode_escape(m, quote_newlines)

    if isinstance(s, str):
        try:
            s = s.decode('utf-8')
        except UnicodeDecodeError:
            return 'b"%s"' % (ESCAPABLE_8BIT.sub(escape_byte, s),)

    needs_double_quotes = MUST_DOUBLE_QUOTE_NL if quote_newlines else MUST_DOUBLE_QUOTE
    if needs_double_quotes.search(s) is None:
        try:
            out = s.encode(encoding or io_encoding)
        except (UnicodeDecodeError, UnicodeEncodeError):
            # not representable verbatim; fall through to the escaped form
            pass
        else:
            if quotemarks or out.startswith('"'):
                return "'%s'" % (out,)
            return out

    escaped = ESCAPABLE_UNICODE.sub(escape_char, s)
    return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments: the segments are converted
    with to_str, joined with '/', and passed to quote_output with newlines
    always quoted.
    """
    joined = "/".join([to_str(segment) for segment in path])
    return quote_output(joined, quotemarks=quotemarks, quote_newlines=True)
def quote_local_unicode_path(path, quotemarks=True):
    """
    Quote a local unicode path for display.

    On Windows a leading '\\\\?\\' prefix is stripped first, and a following
    'UNC\\' is turned back into a leading double backslash. Newlines are
    always quoted.
    """
    precondition(isinstance(path, unicode), path)

    long_prefix = u"\\\\?\\"
    unc_marker = u"UNC\\"
    if sys.platform == "win32" and path.startswith(long_prefix):
        path = path[4 :]
        if path.startswith(unc_marker):
            path = u"\\\\" + path[4 :]

    return quote_output(path, quotemarks=quotemarks, quote_newlines=True)
def quote_filepath(path, quotemarks=True):
    """
    Quote a FilePath for display, via its unicode form.
    """
    as_unicode = unicode_from_filepath(path)
    return quote_local_unicode_path(as_unicode, quotemarks=quotemarks)
def extend_filepath(fp, segments):
    """
    Return fp extended by each of the given child segments in turn.

    If the resulting path is unicode but Unicode-mode FilePaths are not in
    use, a new FilePath over the filesystem-encoded bytes is returned.
    """
    # We cannot use FilePath.preauthChild, because
    # * it has the security flaw described in <https://twistedmatrix.com/trac/ticket/6527>;
    # * it may return a FilePath in the wrong mode.

    result = fp
    for name in segments:
        result = result.child(name)

    if use_unicode_filepath or not isinstance(result.path, unicode):
        return result
    return FilePath(result.path.encode(filesystem_encoding))
def to_filepath(path):
    """
    Build a FilePath from a byte or unicode string, encoding unicode with the
    filesystem encoding when Unicode-mode FilePaths are not in use.
    """
    precondition(isinstance(path, basestring), path=path)

    must_encode = isinstance(path, unicode) and not use_unicode_filepath
    native = path.encode(filesystem_encoding) if must_encode else path
    return FilePath(native)
# NOTE(review): the 'def' line of this helper was not visible in the reviewed
# text; name and arity are taken from its call sites (_decode(fp.path)).
def _decode(s):
    """
    Return s as unicode: byte strings are decoded with the filesystem
    encoding, anything else is returned unchanged.
    """
    precondition(isinstance(s, basestring), s=s)

    if not isinstance(s, bytes):
        return s
    return s.decode(filesystem_encoding)
def unicode_from_filepath(fp):
    """
    Return the path of the given FilePath as unicode.
    """
    precondition(isinstance(fp, FilePath), fp=fp)
    raw_path = fp.path
    return _decode(raw_path)
def unicode_segments_from(base_fp, ancestor_fp):
    """
    Return the path segments leading from ancestor_fp to base_fp, as unicode.

    Uses FilePath.asTextMode when the installed FilePath provides it;
    otherwise both paths must be of the same string type, and the segments
    are decoded individually.
    """
    precondition(isinstance(base_fp, FilePath), base_fp=base_fp)
    precondition(isinstance(ancestor_fp, FilePath), ancestor_fp=ancestor_fp)

    if hasattr(FilePath, 'asTextMode'):
        text_base = base_fp.asTextMode()
        return text_base.segmentsFrom(ancestor_fp.asTextMode())

    bpt, apt = (type(base_fp.path), type(ancestor_fp.path))
    _assert(bpt == apt, bpt=bpt, apt=apt)
    return [_decode(segment) for segment in base_fp.segmentsFrom(ancestor_fp)]
def unicode_platform():
    """
    Report whether the current platform handles Unicode filenames natively
    (the module-level 'is_unicode_platform' value).
    """
    return is_unicode_platform
# Raised by listdir_unicode_fallback with the offending path or filename as
# its argument. The '%s' in the docstring below is a literal placeholder;
# nothing in this class interpolates it.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    pass
def listdir_unicode_fallback(path):
    """
    This function emulates a fallback Unicode API similar to one available
    under Windows or MacOS X.

    The directory is listed through the byte-level API and every entry is
    decoded with the filesystem encoding. If the path cannot be encoded, or
    a badly encoded filename is encountered, FilenameEncodingError is raised
    with the offending value.
    """
    precondition(isinstance(path, unicode), path)

    try:
        byte_path = path.encode(filesystem_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise FilenameEncodingError(path)

    decoded = []
    try:
        for fn in os.listdir(byte_path):
            decoded.append(unicode(fn, filesystem_encoding))
    except UnicodeDecodeError:
        # fn is the entry whose decoding just failed
        raise FilenameEncodingError(fn)
    return decoded
def listdir_unicode(path):
    """
    Wrapper around listdir() which provides safe access to the convenient
    Unicode API even under platforms that don't provide one natively.
    """
    precondition(isinstance(path, unicode), path)

    # On Windows and MacOS X, the Unicode API is used
    # On other platforms (ie. Unix systems), the byte-level API is used

    if not is_unicode_platform:
        return listdir_unicode_fallback(path)
    return os.listdir(path)
def listdir_filepath(fp):
    """
    List the directory named by the given FilePath, returning unicode names.
    """
    directory = unicode_from_filepath(fp)
    return listdir_unicode(directory)