2 Functions used to convert inputs from whatever encoding used in the system to
9 from allmydata.util.assertutil import precondition
10 from twisted.python import usage
12 from allmydata.util import log
13 from allmydata.util.fileutil import abspath_expanduser_unicode
16 def _canonical_encoding(encoding):
18 log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
20 encoding = encoding.lower()
21 if encoding == "cp65001":
23 elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
26 # sometimes Python returns an encoding name that it doesn't support for conversion
27 # fail early if this happens
29 u"test".encode(encoding)
30 except (LookupError, AttributeError):
31 raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
# Module-level encoding settings. They start out unset here and are assigned
# by the initialisation code that follows (which declares all four as global).
filesystem_encoding = None
output_encoding = None
# Initialised alongside its siblings so that readers of argv_encoding never
# hit an undefined module global.
argv_encoding = None
is_unicode_platform = False
41 global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform
43 filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())
46 if hasattr(sys.stdout, 'encoding'):
47 outenc = sys.stdout.encoding
50 outenc = locale.getpreferredencoding()
52 pass # work around <http://bugs.python.org/issue1443504>
53 output_encoding = _canonical_encoding(outenc)
55 if sys.platform == 'win32':
56 # Unicode arguments are not supported on Windows yet; see #565 and #1074.
57 argv_encoding = 'ascii'
59 argv_encoding = output_encoding
60 is_unicode_platform = sys.platform in ["win32", "darwin"]
def get_filesystem_encoding():
    """
    Return the encoding expected for local filenames (the module-level
    filesystem_encoding value).
    """
    return filesystem_encoding
def get_output_encoding():
    """
    Return the encoding expected when writing to stdout or stderr (the
    module-level output_encoding value).
    """
    return output_encoding
def get_argv_encoding():
    """
    Return the encoding expected for command-line arguments (the module-level
    argv_encoding value).
    """
    return argv_encoding
def argv_to_unicode(s):
    """
    Decode one argv element (a byte string) to unicode using argv_encoding.

    Raises usage.UsageError if the bytes cannot be decoded in that encoding.
    """
    precondition(isinstance(s, str), s)

    try:
        decoded = unicode(s, argv_encoding)
    except UnicodeDecodeError:
        message = "Argument %s cannot be decoded as %s." % (quote_output(s), argv_encoding)
        raise usage.UsageError(message)
    return decoded
def argv_to_abspath(s):
    """
    Decode an argv element and turn it into an absolute path with ~ expanded.

    The element is first decoded by argv_to_unicode (which raises a UsageError
    on failure) and the result is passed to abspath_expanduser_unicode.
    """
    decoded = argv_to_unicode(s)
    return abspath_expanduser_unicode(decoded)
102 def unicode_to_url(s):
104 Encode an unicode object used in an URL.
106 # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
110 #precondition(isinstance(s, unicode), s)
111 #return s.encode('utf-8')
114 if s is None or isinstance(s, str):
116 return s.encode('utf-8')
119 if isinstance(s, str):
121 return s.encode(argv_encoding)
# Matches strings made up solely of the characters ' ' through '~', plus
# newline and carriage return (the empty string matches too).
PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)

# Same idea, but additionally admits 0x80-0xFF and leaves out the single
# quote: the ranges ' '-'&' and '('-'~' skip "'".
PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)
def is_printable_ascii(s):
    """
    Return True if the whole of s is accepted by the PRINTABLE_ASCII pattern,
    False otherwise.
    """
    matched = PRINTABLE_ASCII.search(s)
    return matched is not None
def unicode_to_output(s):
    """
    Encode a unicode object for representation on stdout or stderr.

    Returns s encoded with output_encoding. Raises UnicodeEncodeError if s
    cannot be encoded, or if the encoded bytes are rejected by the
    PRINTABLE_8BIT pattern.
    """
    precondition(isinstance(s, unicode), s)

    try:
        out = s.encode(output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        # %r applied to repr(s) quotes the value twice; kept so the message
        # text stays exactly as it was.
        raise UnicodeEncodeError(output_encoding, s, 0, 0,
                                 "A string could not be encoded as %s for output to the terminal:\n%r" %
                                 (output_encoding, repr(s)))

    if PRINTABLE_8BIT.search(out) is None:
        raise UnicodeEncodeError(output_encoding, s, 0, 0,
                                 "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
                                 (output_encoding, repr(s)))

    # Hand the validated bytes back to the caller.
    return out
148 def quote_output(s, quotemarks=True, encoding=None):
150 Encode either a Unicode string or a UTF-8-encoded bytestring for representation
151 on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
152 always surrounded by single quotes; otherwise, it is quoted only if necessary to
153 avoid ambiguity or control bytes in the output.
155 precondition(isinstance(s, (str, unicode)), s)
157 if isinstance(s, str):
159 s = s.decode('utf-8')
160 except UnicodeDecodeError:
164 out = s.encode(encoding or output_encoding)
165 except (UnicodeEncodeError, UnicodeDecodeError):
168 if PRINTABLE_8BIT.search(out) is None:
172 return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'"
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments.

    Each segment is passed through to_str, the results are joined with "/",
    and the joined string is handed to quote_output with the same quotemarks
    setting.
    """
    joined = "/".join([to_str(segment) for segment in path])
    return quote_output(joined, quotemarks=quotemarks)
def unicode_platform():
    """
    Report whether the current platform handles Unicode filenames natively
    (the module-level is_unicode_platform flag).
    """
    return is_unicode_platform
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    # NOTE(review): nothing visible in this file fills in the "(%s)" above;
    # presumably it is a placeholder for the filesystem encoding name -- confirm
    # against callers before rewording the docstring.
    pass
def listdir_unicode_fallback(path):
    """
    List a directory through the byte-level API and return unicode names,
    emulating the Unicode listdir available under Windows or MacOS X.

    path must be a unicode object. Raises FilenameEncodingError if path cannot
    be encoded with filesystem_encoding, or if any entry returned by
    os.listdir cannot be decoded with it (the offending entry is the
    exception argument).
    """
    precondition(isinstance(path, unicode), path)

    try:
        byte_path = path.encode(filesystem_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise FilenameEncodingError(path)

    # An explicit loop keeps `fn` bound in the handler without relying on a
    # list-comprehension variable leaking into the enclosing scope.
    names = []
    for fn in os.listdir(byte_path):
        try:
            names.append(unicode(fn, filesystem_encoding))
        except UnicodeDecodeError:
            raise FilenameEncodingError(fn)
    return names
def listdir_unicode(path):
    """
    List a directory given a unicode path, providing the convenient Unicode
    API even on platforms that do not offer one natively.

    path must be a unicode object.
    """
    precondition(isinstance(path, unicode), path)

    if not is_unicode_platform:
        # On other platforms (ie. Unix systems), the byte-level API is used
        return listdir_unicode_fallback(path)

    # On Windows and MacOS X, the Unicode API is used
    return os.listdir(path)