2 Functions used to convert inputs from whatever encoding used in the system to
9 from allmydata.util.assertutil import precondition
10 from twisted.python import usage
12 from allmydata.util import log
15 def _canonical_encoding(encoding):
17 log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
19 encoding = encoding.lower()
20 if encoding == "cp65001":
22 elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
25 # sometimes Python returns an encoding name that it doesn't support for conversion
26 # fail early if this happens
28 u"test".encode(encoding)
29 except (LookupError, AttributeError):
30 raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
34 filesystem_encoding = None
35 output_encoding = None
37 is_unicode_platform = False
40 global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform
42 filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())
45 if hasattr(sys.stdout, 'encoding'):
46 outenc = sys.stdout.encoding
49 outenc = locale.getpreferredencoding()
51 pass # work around <http://bugs.python.org/issue1443504>
52 output_encoding = _canonical_encoding(outenc)
54 if sys.platform == 'win32':
55 # Unicode arguments are not supported on Windows yet; see #565 and #1074.
56 argv_encoding = 'ascii'
58 argv_encoding = output_encoding
59 is_unicode_platform = sys.platform in ["win32", "darwin"]
def get_filesystem_encoding():
    """
    Return the encoding name held in the module-level 'filesystem_encoding'
    variable, i.e. the encoding expected for local filenames.
    """
    return filesystem_encoding
def get_output_encoding():
    """
    Return the encoding name held in the module-level 'output_encoding'
    variable, i.e. the encoding expected when writing to stdout or stderr.
    """
    return output_encoding
76 def get_argv_encoding():
78 Returns expected encoding for command-line arguments.
def argv_to_unicode(s):
    """
    Convert one command-line argument from a byte string to unicode.

    The argument is decoded with the module-level 'argv_encoding'. If the
    bytes are not valid in that encoding, a usage.UsageError naming the
    (quoted) argument and the encoding is raised.
    """
    precondition(isinstance(s, str), s)

    try:
        decoded = unicode(s, argv_encoding)
    except UnicodeDecodeError:
        message = "Argument %s cannot be decoded as %s." % (quote_output(s), argv_encoding)
        raise usage.UsageError(message)
    return decoded
94 def unicode_to_url(s):
96 Encode an unicode object used in an URL.
98 # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
102 #precondition(isinstance(s, unicode), s)
103 #return s.encode('utf-8')
106 if s is None or isinstance(s, str):
108 return s.encode('utf-8')
111 if isinstance(s, str):
113 return s.encode(argv_encoding)
# Matches strings made up solely of the characters ' ' through '~' plus
# newline and carriage return (the empty string also matches).
PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)
# Like PRINTABLE_ASCII, except that the single-quote character is rejected
# (the ranges ' '-'&' and '('-'~' skip over it) and the values \x80-\xFF are
# accepted as well.
PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)
def is_printable_ascii(s):
    """
    Tell whether 's' consists only of characters accepted by the
    PRINTABLE_ASCII pattern.
    """
    found = PRINTABLE_ASCII.search(s)
    return found is not None
121 def unicode_to_output(s):
123 Encode an unicode object for representation on stdout or stderr.
125 precondition(isinstance(s, unicode), s)
128 out = s.encode(output_encoding)
129 except (UnicodeEncodeError, UnicodeDecodeError):
130 raise UnicodeEncodeError(output_encoding, s, 0, 0,
131 "A string could not be encoded as %s for output to the terminal:\n%r" %
132 (output_encoding, repr(s)))
134 if PRINTABLE_8BIT.search(out) is None:
135 raise UnicodeEncodeError(output_encoding, s, 0, 0,
136 "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
137 (output_encoding, repr(s)))
140 def quote_output(s, quotemarks=True, encoding=None):
142 Encode either a Unicode string or a UTF-8-encoded bytestring for representation
143 on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
144 always surrounded by single quotes; otherwise, it is quoted only if necessary to
145 avoid ambiguity or control bytes in the output.
147 precondition(isinstance(s, (str, unicode)), s)
149 if isinstance(s, str):
151 s = s.decode('utf-8')
152 except UnicodeDecodeError:
156 out = s.encode(encoding or output_encoding)
157 except (UnicodeEncodeError, UnicodeDecodeError):
160 if PRINTABLE_8BIT.search(out) is None:
164 return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'"
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments.

    Each segment is passed through to_str(), the results are joined with "/",
    and the joined string is handed to quote_output() together with
    'quotemarks'.
    """
    segments = [to_str(segment) for segment in path]
    joined = "/".join(segments)
    return quote_output(joined, quotemarks=quotemarks)
def unicode_platform():
    """
    Report whether the current platform handles Unicode filenames natively,
    as recorded in the module-level 'is_unicode_platform' flag.
    """
    return is_unicode_platform
# Raised by listdir_unicode_fallback() when a path cannot be encoded with, or
# a directory entry cannot be decoded from, the filesystem encoding.
# NOTE(review): the '%s' placeholder in the docstring is not filled in by this
# class; the docstring text is kept verbatim in case it is read at runtime --
# confirm against callers.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
def listdir_unicode_fallback(path):
    """
    List a directory through the byte-level API and return unicode names.

    This function emulates a fallback Unicode API similar to the one available
    under Windows or MacOS X: 'path' is encoded with the filesystem encoding,
    the directory is listed, and every entry is decoded back to unicode.

    Raises FilenameEncodingError, carrying the offending path or entry, if
    'path' cannot be encoded or a badly encoded filename is encountered.
    """
    precondition(isinstance(path, unicode), path)

    try:
        byte_path = path.encode(filesystem_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise FilenameEncodingError(path)

    # Decode in an explicit loop so that the entry being processed is always
    # bound when the handler runs, rather than relying on a list
    # comprehension's loop variable leaking into the function scope.
    decoded_names = []
    for fn in os.listdir(byte_path):
        try:
            decoded_names.append(unicode(fn, filesystem_encoding))
        except UnicodeDecodeError:
            raise FilenameEncodingError(fn)
    return decoded_names
def listdir_unicode(path):
    """
    List a directory given as a unicode path, using whichever listdir()
    flavour suits the platform.

    When 'is_unicode_platform' is set (Windows and MacOS X), os.listdir() is
    called directly with the unicode path. Otherwise (i.e. Unix systems) the
    byte-level emulation in listdir_unicode_fallback() is used.
    """
    precondition(isinstance(path, unicode), path)

    if not is_unicode_platform:
        # No native Unicode API: go through the byte-level fallback.
        return listdir_unicode_fallback(path)
    return os.listdir(path)