2 Functions used to convert inputs from whatever encoding used in the system to
9 from allmydata.util.assertutil import precondition
10 from twisted.python import usage
12 from allmydata.util import log
13 from allmydata.util.fileutil import abspath_expanduser_unicode
# canonical_encoding(encoding): normalise an encoding name.
# Visible behaviour: the name is lower-cased, then compared against "cp65001"
# and against the ASCII aliases "us-ascii", "646" and "ansi_x3.4-1968".
# NOTE(review): source lines 17, 19, 22 and 24-27 are elided from this
# listing, so the condition guarding the UTF-8 fallback warning, the value
# assigned in each branch and the return statement are not visible here --
# confirm against the full file before relying on this summary.
16 def canonical_encoding(encoding):
18 log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
# Comparisons below are done on the lower-cased name.
20 encoding = encoding.lower()
21 if encoding == "cp65001":
23 elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
def check_encoding(encoding):
    """
    Verify that 'encoding' can actually be used for conversion.

    Sometimes Python returns an encoding name that it doesn't support for
    conversion; fail early if this happens.

    Returns None on success. Raises AssertionError if encoding a short test
    string fails with LookupError or AttributeError.
    """
    try:
        u"test".encode(encoding)
    except (LookupError, AttributeError):
        raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
# Module-level state, initialised to placeholder values; the 'global'
# statement further down rebinds these names when the encodings are computed.
# NOTE(review): source line 37 is elided from this listing; 'io_encoding' is
# named in that 'global' statement, so it is presumably initialised there --
# confirm against the full file.
36 filesystem_encoding = None
38 is_unicode_platform = False
# Body of the routine that computes the module-level encoding state.
# NOTE(review): the enclosing 'def' line (presumably source line 40) and
# source lines 42, 45, 49-51, 54-55, 57, 60 and 62 are elided from this
# listing, so the branch structure is incomplete -- in particular how 'ioenc'
# is chosen on win32 and which exception the 'pass' on line 58 swallows
# cannot be seen here.
41 global filesystem_encoding, io_encoding, is_unicode_platform
# Filesystem encoding: canonicalised name from sys.getfilesystemencoding(),
# checked immediately so that an unusable codec fails early.
43 filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
44 check_encoding(filesystem_encoding)
46 if sys.platform == 'win32':
47 # On Windows we install UTF-8 stream wrappers for sys.stdout and
48 # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
# Candidate I/O encodings visible here: sys.stdout.encoding when that
# attribute exists, and locale.getpreferredencoding().
52 if hasattr(sys.stdout, 'encoding'):
53 ioenc = sys.stdout.encoding
56 ioenc = locale.getpreferredencoding()
58 pass # work around <http://bugs.python.org/issue1443504>
# The chosen name goes through the same canonicalise-then-check steps.
59 io_encoding = canonical_encoding(ioenc)
61 check_encoding(io_encoding)
# Platforms treated as handling Unicode filenames natively.
63 is_unicode_platform = sys.platform in ["win32", "darwin"]
def get_filesystem_encoding():
    """
    Returns expected encoding for local filenames.

    This is the current value of the module-level 'filesystem_encoding'.
    """
    return filesystem_encoding
# get_io_encoding(): accessor for the encoding described in the docstring
# text below.
# NOTE(review): the docstring delimiters and the function's remaining lines
# (source lines 75 and 77-79) are elided from this listing, so the return
# statement is not visible; presumably it returns the module-level
# 'io_encoding' -- confirm against the full file.
74 def get_io_encoding():
76 Returns expected encoding for writing to stdout or stderr, and for arguments in sys.argv.
# argv_to_unicode(s): decode one command-line argument to unicode using
# 'io_encoding'.
#   s -- must be a byte string ('str'); enforced by precondition().
# Returns unicode(s, io_encoding).
# Raises usage.UsageError when decoding fails with UnicodeDecodeError; the
# message shows the argument via quote_output() and names the encoding.
# NOTE(review): source lines 81, 83, 85 and 86 are elided from this listing
# (docstring delimiters and the 'try:' that the 'except' below requires).
80 def argv_to_unicode(s):
82 Decode given argv element to unicode. If this fails, raise a UsageError.
84 precondition(isinstance(s, str), s)
87 return unicode(s, io_encoding)
88 except UnicodeDecodeError:
# Convert the low-level decode error into a command-line usage error.
89 raise usage.UsageError("Argument %s cannot be decoded as %s." %
90 (quote_output(s), io_encoding))
def argv_to_abspath(s):
    """
    Convenience function to decode an argv element to an absolute path, with ~ expanded.
    If this fails, raise a UsageError.

    The element is decoded with argv_to_unicode() and the result is passed
    to abspath_expanduser_unicode().
    """
    return abspath_expanduser_unicode(argv_to_unicode(s))
# unicode_to_argv(s, mangle=False): encode a unicode argument as a byte string.
#   s      -- must be unicode; enforced by precondition().
#   mangle -- when true and sys.platform is "win32", every character outside
#             \x20-\x7F is replaced by '\x7F', its code point in lowercase
#             hex, and ';', and the result is converted with str().
# In every other case the result is s.encode(io_encoding).
# NOTE(review): source lines 100, 104-105, 107 and 111 are elided from this
# listing (docstring delimiters, the end of the docstring sentence, and
# presumably an 'else:' or blank lines).
99 def unicode_to_argv(s, mangle=False):
101 Encode the given Unicode argument as a bytestring.
102 If the argument is to be passed to a different process, then the 'mangle' argument
103 should be true; on Windows, this uses a mangled encoding that will be reversed by
106 precondition(isinstance(s, unicode), s)
108 if mangle and sys.platform == "win32":
109 # This must be the same as 'mangle' in bin/tahoe-script.template.
110 return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
112 return s.encode(io_encoding)
# unicode_to_url(s): encode a unicode object as UTF-8 for use in a URL.
# NOTE(review): source lines 115, 117, 119-121, 124-125 and 127 are elided
# from this listing.  In particular the body of the 'if' below (what happens
# for None or an already-encoded 'str') and any 'else:' are not visible --
# presumably 's' is returned unchanged; confirm against the full file.
114 def unicode_to_url(s):
116 Encode an unicode object used in an URL.
118 # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
122 #precondition(isinstance(s, unicode), s)
123 #return s.encode('utf-8')
# None and byte strings are special-cased; the visible return handles the
# remaining inputs by encoding them as UTF-8.
126 if s is None or isinstance(s, str):
128 return s.encode('utf-8')
# Matches strings made up only of newline, carriage return and printable
# ASCII (0x20-0x7E); the empty string also matches.
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
# As above, but additionally allowing the high range 0x80-0xFF.
PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)

def is_printable_ascii(s):
    """
    Return True if 's' consists only of newline, carriage return and
    printable ASCII characters (the empty string qualifies), False otherwise.
    """
    return PRINTABLE_ASCII.search(s) is not None
# unicode_to_output(s): encode a unicode object with 'io_encoding' for
# stdout or stderr.
#   s -- must be unicode; enforced by precondition().
# Raises UnicodeEncodeError (a) when encoding fails, and (b) when the encoded
# bytes contain anything PRINTABLE_8BIT does not accept.
# NOTE(review): source lines 137, 139, 141-142, 148 and 153-155 are elided
# from this listing; the 'try:' and the final return (presumably 'out') are
# not visible here.
136 def unicode_to_output(s):
138 Encode an unicode object for representation on stdout or stderr.
140 precondition(isinstance(s, unicode), s)
143 out = s.encode(io_encoding)
144 except (UnicodeEncodeError, UnicodeDecodeError):
# Either codec error is re-raised as UnicodeEncodeError with a message that
# names the target encoding.
# NOTE(review): the message formats repr(s) with %r, so repr is applied twice.
145 raise UnicodeEncodeError(io_encoding, s, 0, 0,
146 "A string could not be encoded as %s for output to the terminal:\n%r" %
147 (io_encoding, repr(s)))
# Reject output containing bytes outside \n, \r, 0x20-0x7E and 0x80-0xFF.
149 if PRINTABLE_8BIT.search(out) is None:
150 raise UnicodeEncodeError(io_encoding, s, 0, 0,
151 "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
152 (io_encoding, repr(s)))
# _unicode_escape(m): replacement callback (passed to ESCAPABLE_UNICODE.sub
# by quote_output) that turns a regex match into a backslash escape.
# NOTE(review): source lines 157, 159-160, 162-163 and 168 are elided from
# this listing, so how 'u' is obtained from the match, what is returned for
# the characters ", $, ` and \, and how 'codepoint' is computed for a single
# code unit are not visible here.
156 def _unicode_escape(m):
158 if u == '"' or u == '$' or u == '`' or u == '\\':
# Combine a UTF-16 surrogate pair (u[0] high, u[1] low) into one code point.
161 codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
# Escape form depends on magnitude: \UXXXXXXXX above 0xFFFF, \uXXXX above
# 0xFF, and \xXX in the remaining visible return.
164 if codepoint > 0xFFFF:
165 return u'\\U%08x' % (codepoint,)
166 elif codepoint > 0xFF:
167 return u'\\u%04x' % (codepoint,)
169 return u'\\x%02x' % (codepoint,)
# Fragment of a byte-string escaping helper whose 'def' line is elided from
# this listing (presumably '_str_escape', the callback quote_output passes to
# ESCAPABLE_8BIT.sub -- confirm).  Source lines 170-172 and 174-175 are also
# missing, so the origin of 'c' and the result for the four characters
# tested below are not visible.
173 if c == '"' or c == '$' or c == '`' or c == '\\':
# Visible result: a two-digit \xXX escape of the character's ordinal.
176 return '\\x%02x' % (ord(c),)
# Pattern consulted by quote_output(): when it finds no match the
# single-quoted form is attempted; a match sends the string to the
# double-quoted, escaped form.
178 MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
180 # if we must double-quote, then we have to escape ", $ and `, but need not escape '
# Matches what has to be escaped inside double quotes in unicode text; a
# valid surrogate pair is matched as one unit by the first alternative.
# NOTE(review): source line 183, which closes this re.compile( call, is
# elided from this listing.
181 ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs
182 ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
# Byte-string counterpart, applied by quote_output() to input that is not
# valid UTF-8.
185 ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
# quote_output(s, quotemarks=True, encoding=None): render a string safely for
# stdout/stderr (contract in the docstring text below).
#   s          -- 'str' or 'unicode'; enforced by precondition().
#   quotemarks -- see the docstring text.
#   encoding   -- output encoding; 'io_encoding' is used when this is falsy.
# NOTE(review): source lines 188, 196, 198, 200, 204, 206, 210-211 and
# 213-214 are elided from this listing: the docstring delimiters, both 'try:'
# lines, what is returned when no quote marks are needed, and the body of the
# final 'except' are not visible here.
187 def quote_output(s, quotemarks=True, encoding=None):
189 Encode either a Unicode string or a UTF-8-encoded bytestring for representation
190 on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
191 always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
192 control bytes in the output.
193 Quoting may use either single or double quotes. Within single quotes, all
194 characters stand for themselves, and ' will not appear. Within double quotes,
195 Python-compatible backslash escaping is used.
197 precondition(isinstance(s, (str, unicode)), s)
# Byte strings are interpreted as UTF-8; undecodable input is returned as a
# b"..." literal with ESCAPABLE_8BIT matches escaped via _str_escape.
199 if isinstance(s, str):
201 s = s.decode('utf-8')
202 except UnicodeDecodeError:
203 return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)
# Nothing forces double quotes: try the plain encoded form in single quotes.
205 if MUST_DOUBLE_QUOTE.search(s) is None:
207 out = s.encode(encoding or io_encoding)
208 if quotemarks or out.startswith('"'):
209 return "'%s'" % (out,)
212 except (UnicodeDecodeError, UnicodeEncodeError):
# Double-quoted form: escape via _unicode_escape, then encode with
# 'backslashreplace' so characters the codec lacks are escaped, not raised.
215 escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
216 return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),)
def quote_path(path, quotemarks=True):
    """
    Quote a path, given as a sequence of components, for output.

    Each component is converted with to_str(), the results are joined with
    "/", and the joined string is passed to quote_output() together with
    'quotemarks'.
    """
    return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)
def unicode_platform():
    """
    Does the current platform handle Unicode filenames natively?

    Returns the module-level 'is_unicode_platform' flag.
    """
    return is_unicode_platform
# Raised by listdir_unicode_fallback() when a path cannot be encoded with, or
# a directory entry cannot be decoded with, 'filesystem_encoding'.
# NOTE(review): source lines 229 and 232-234 are elided from this listing
# (docstring delimiters and whatever follows the docstring).  The '(%s)' in
# the docstring text is a literal placeholder; nothing visible here fills it.
228 class FilenameEncodingError(Exception):
230 Filename cannot be encoded using the current encoding of your filesystem
231 (%s). Please configure your locale correctly or rename this file.
# listdir_unicode_fallback(path): list a directory through the byte-string
# API and return the entries as unicode.
#   path -- must be unicode; enforced by precondition().
# Returns a list of entry names decoded with 'filesystem_encoding'.
# Raises FilenameEncodingError if 'path' cannot be encoded, or an entry
# cannot be decoded, with 'filesystem_encoding'.
# NOTE(review): source lines 236, 239, 241, 243-244 and 248-249 are elided
# from this listing, including the two 'try:' lines the 'except' clauses need.
235 def listdir_unicode_fallback(path):
237 This function emulates a fallback Unicode API similar to one available
238 under Windows or MacOS X.
240 If badly encoded filenames are encountered, an exception is raised.
242 precondition(isinstance(path, unicode), path)
# The byte-level os.listdir() needs the path in the filesystem encoding.
245 byte_path = path.encode(filesystem_encoding)
246 except (UnicodeEncodeError, UnicodeDecodeError):
247 raise FilenameEncodingError(path)
250 return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)]
251 except UnicodeDecodeError:
# NOTE(review): 'fn' is the list-comprehension variable; referencing it here
# relies on Python 2 leaking comprehension variables into the enclosing scope.
252 raise FilenameEncodingError(fn)
# listdir_unicode(path): list a directory given a unicode path.
#   path -- must be unicode; enforced by precondition().
# When 'is_unicode_platform' is true, os.listdir() is called with the unicode
# path directly; otherwise listdir_unicode_fallback() does the encoding and
# decoding.
# NOTE(review): source lines 255, 258, 260, 263 and 266 are elided from this
# listing (docstring delimiters and presumably blank lines / an 'else:').
254 def listdir_unicode(path):
256 Wrapper around listdir() which provides safe access to the convenient
257 Unicode API even under platforms that don't provide one natively.
259 precondition(isinstance(path, unicode), path)
261 # On Windows and MacOS X, the Unicode API is used
262 # On other platforms (ie. Unix systems), the byte-level API is used
264 if is_unicode_platform:
265 return os.listdir(path)
267 return listdir_unicode_fallback(path)