2 Functions used to convert inputs from whatever encoding used in the system to
9 from allmydata.util.assertutil import precondition
10 from twisted.python import usage
12 from allmydata.util import log
13 from allmydata.util.fileutil import abspath_expanduser_unicode
16 def canonical_encoding(encoding):
18 log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
20 encoding = encoding.lower()
21 if encoding == "cp65001":
23 elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
def check_encoding(encoding):
    """
    Verify that Python can really encode text with 'encoding'.

    Sometimes Python returns an encoding name that it doesn't support for
    conversion; probing with a short string here lets us fail early if this
    happens.

    Returns None on success. Raises AssertionError (naming the encoding) if
    the probe raises LookupError or AttributeError.
    """
    try:
        u"test".encode(encoding)
    except (LookupError, AttributeError):
        raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
# Module-level encoding state. These are only placeholders: the values
# actually used are assigned later in this module through a 'global'
# declaration, and are read by the accessor functions below.
filesystem_encoding = None
output_encoding = None
is_unicode_platform = False
42 global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform
44 filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
45 check_encoding(filesystem_encoding)
47 if sys.platform == 'win32':
48 # On Windows we install UTF-8 stream wrappers for sys.stdout and
49 # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
50 output_encoding = 'utf-8'
53 if hasattr(sys.stdout, 'encoding'):
54 outenc = sys.stdout.encoding
57 outenc = locale.getpreferredencoding()
59 pass # work around <http://bugs.python.org/issue1443504>
60 output_encoding = canonical_encoding(outenc)
62 check_encoding(output_encoding)
63 argv_encoding = output_encoding
65 is_unicode_platform = sys.platform in ["win32", "darwin"]
def get_filesystem_encoding():
    """
    Returns expected encoding for local filenames.

    This is the current value of the module-level 'filesystem_encoding'.
    """
    return filesystem_encoding
def get_output_encoding():
    """
    Returns expected encoding for writing to stdout or stderr.

    This is the current value of the module-level 'output_encoding'.
    """
    return output_encoding
def get_argv_encoding():
    """
    Returns expected encoding for command-line arguments.
    """
    # NOTE(review): the return statement was not present in the extracted
    # source; it is restored by analogy with get_filesystem_encoding() and
    # get_output_encoding(), using the module-level 'argv_encoding' -- confirm.
    return argv_encoding
def argv_to_unicode(s):
    """
    Decode given argv element to unicode. If this fails, raise a UsageError.

    's' must be a bytestring (str); it is decoded using the module-level
    'argv_encoding'. The UsageError message shows the argument via
    quote_output() together with the encoding that was tried.
    """
    precondition(isinstance(s, str), s)

    try:
        return unicode(s, argv_encoding)
    except UnicodeDecodeError:
        raise usage.UsageError("Argument %s cannot be decoded as %s." %
                               (quote_output(s), argv_encoding))
def argv_to_abspath(s):
    """
    Convenience function to decode an argv element to an absolute path, with ~ expanded.
    If this fails, raise a UsageError.
    """
    # Decode first (this is the step that raises UsageError), then hand the
    # unicode result to abspath_expanduser_unicode.
    decoded = argv_to_unicode(s)
    return abspath_expanduser_unicode(decoded)
def unicode_to_argv(s, mangle=False):
    """
    Encode the given Unicode argument as a bytestring.

    If the argument is to be passed to a different process, then the 'mangle'
    argument should be true; on Windows, this uses a mangled encoding in which
    every character outside the range 0x20-0x7F is replaced by DEL (0x7F),
    the character's code point in lowercase hex, and ';'. The receiving side
    is expected to reverse this.

    In every other case the argument is encoded with the module-level
    'argv_encoding'.
    """
    precondition(isinstance(s, unicode), s)

    if mangle and sys.platform == "win32":
        # This must be the same as 'mangle' in bin/tahoe-script.template.
        # The pattern is spelled as a u'' literal with doubled backslashes:
        # it is the same pattern text as the ur'' spelling, but u'' is also
        # accepted by Python 3 parsers, where the ur prefix is a syntax error.
        return str(re.sub(u'[^\\x20-\\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))

    return s.encode(argv_encoding)
122 def unicode_to_url(s):
124 Encode an unicode object used in an URL.
126 # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
130 #precondition(isinstance(s, unicode), s)
131 #return s.encode('utf-8')
134 if s is None or isinstance(s, str):
136 return s.encode('utf-8')
# Whole-string patterns. PRINTABLE_ASCII accepts only newline, carriage
# return and the printable ASCII range 0x20-0x7E; PRINTABLE_8BIT additionally
# accepts 0x80-0xFF. Both accept the empty string.
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)

def is_printable_ascii(s):
    """
    Return True if 's' matches PRINTABLE_ASCII, i.e. it contains nothing but
    printable ASCII characters, newlines and carriage returns; otherwise
    return False.
    """
    return PRINTABLE_ASCII.search(s) is not None
def unicode_to_output(s):
    """
    Encode an unicode object for representation on stdout or stderr.

    's' must be a unicode object; it is encoded with the module-level
    'output_encoding'.

    Raises UnicodeEncodeError if the string cannot be encoded, or if the
    encoded bytes do not match PRINTABLE_8BIT (i.e. they contain bytes that
    are unsafe to send to a terminal).
    """
    precondition(isinstance(s, unicode), s)

    try:
        out = s.encode(output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise UnicodeEncodeError(output_encoding, s, 0, 0,
                                 "A string could not be encoded as %s for output to the terminal:\n%r" %
                                 (output_encoding, repr(s)))

    if PRINTABLE_8BIT.search(out) is None:
        raise UnicodeEncodeError(output_encoding, s, 0, 0,
                                 "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
                                 (output_encoding, repr(s)))

    # NOTE(review): the final return was not present in the extracted source;
    # returning the encoded bytes is what the docstring describes -- confirm.
    return out
164 def _unicode_escape(m):
166 if u == '"' or u == '$' or u == '`' or u == '\\':
169 codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
172 if codepoint > 0xFFFF:
173 return u'\\U%08x' % (codepoint,)
174 elif codepoint > 0xFF:
175 return u'\\u%04x' % (codepoint,)
177 return u'\\x%02x' % (codepoint,)
181 if c == '"' or c == '$' or c == '`' or c == '\\':
184 return '\\x%02x' % (ord(c),)
# The unicode patterns below are written as u'' literals with doubled
# backslashes in front of the \x escapes. That yields the same pattern text
# as the ur'' spelling (\u escapes are expanded by the literal, \x escapes
# are left for the regex engine) and is also accepted by Python 3 parsers,
# where the ur prefix is a syntax error.

# Matches any character that prevents a string from being shown as-is inside
# single quotes: anything outside 0x20-0x7E (except that ' itself, 0x27, does
# match) and outside the ranges U+00A0-U+D7FF, U+E000-U+FDCF, U+FDF0-U+FFFC.
MUST_DOUBLE_QUOTE = re.compile(u'[^\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# if we must double-quote, then we have to escape ", $ and `, but need not escape '
# NOTE(review): the closing argument of this call was not present in the
# extracted source; re.DOTALL is restored by analogy with the two sibling
# patterns (it has no effect on a pattern without '.') -- confirm.
ESCAPABLE_UNICODE = re.compile(u'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
                               u'[^ !#\\x25-\\x5B\\x5D-\\x5F\\x61-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Byte-string counterpart of ESCAPABLE_UNICODE: everything outside the safe
# printable-ASCII subset is escapable.
ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
195 def quote_output(s, quotemarks=True, encoding=None):
197 Encode either a Unicode string or a UTF-8-encoded bytestring for representation
198 on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
199 always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
200 control bytes in the output.
201 Quoting may use either single or double quotes. Within single quotes, all
202 characters stand for themselves, and ' will not appear. Within double quotes,
203 Python-compatible backslash escaping is used.
205 precondition(isinstance(s, (str, unicode)), s)
207 if isinstance(s, str):
209 s = s.decode('utf-8')
210 except UnicodeDecodeError:
211 return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)
213 if MUST_DOUBLE_QUOTE.search(s) is None:
215 out = s.encode(encoding or output_encoding)
216 if quotemarks or out.startswith('"'):
217 return "'%s'" % (out,)
220 except (UnicodeDecodeError, UnicodeEncodeError):
223 escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
224 return '"%s"' % (escaped.encode(encoding or output_encoding, 'backslashreplace'),)
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments.

    Each segment is passed through to_str, the results are joined with "/",
    and the joined string is handed to quote_output together with the given
    'quotemarks' setting.
    """
    joined = "/".join(map(to_str, path))
    return quote_output(joined, quotemarks=quotemarks)
def unicode_platform():
    """
    Does the current platform handle Unicode filenames natively?

    Returns the current value of the module-level 'is_unicode_platform' flag.
    """
    return is_unicode_platform
# Raised by listdir_unicode_fallback() with the offending path or filename as
# its argument.
# NOTE(review): the docstring contains a '%s' placeholder, so it may be used
# as a message template elsewhere; its text is kept unchanged -- confirm
# against callers before rewording it.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
def listdir_unicode_fallback(path):
    """
    This function emulates a fallback Unicode API similar to one available
    under Windows or MacOS X.

    'path' must be a unicode object. It is encoded with the module-level
    'filesystem_encoding', listed with the byte-level os.listdir(), and every
    returned name is decoded back to unicode with the same encoding.

    If badly encoded filenames are encountered, an exception is raised:
    FilenameEncodingError carrying the path (if the path itself cannot be
    encoded) or the first filename that cannot be decoded.
    """
    precondition(isinstance(path, unicode), path)

    try:
        byte_path = path.encode(filesystem_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise FilenameEncodingError(path)

    # Decode in an explicit loop so that the name reported in the error is the
    # one being decoded, rather than relying on a list-comprehension variable
    # still being visible inside the except handler (that only works on
    # Python 2).
    names = []
    for fn in os.listdir(byte_path):
        try:
            names.append(unicode(fn, filesystem_encoding))
        except UnicodeDecodeError:
            raise FilenameEncodingError(fn)
    return names
def listdir_unicode(path):
    """
    Wrapper around listdir() which provides safe access to the convenient
    Unicode API even under platforms that don't provide one natively.

    'path' must be a unicode object. The result is whatever os.listdir()
    returns for a unicode path when 'is_unicode_platform' is set, and the
    result of listdir_unicode_fallback() otherwise.
    """
    precondition(isinstance(path, unicode), path)

    # On Windows and MacOS X, the Unicode API is used
    # On other platforms (ie. Unix systems), the byte-level API is used
    if not is_unicode_platform:
        return listdir_unicode_fallback(path)
    return os.listdir(path)