2 Functions used to convert inputs from whatever encoding is used in the system to
10 from allmydata.util.assertutil import precondition
11 from twisted.python import usage
15 def _canonical_encoding(encoding):
18 encoding = encoding.lower()
19 if encoding == "cp65001":
21 elif encoding == "us-ascii" or encoding == "646":
24 # sometimes Python returns an encoding name that it doesn't support for conversion
25 # fail early if this happens
27 u"test".encode(encoding)
28 except (LookupError, AttributeError):
29 raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
# Module-level encoding state. These start out as placeholders; the module's
# initialization code declares them global and assigns the real values
# (canonical encoding names, and whether sys.platform is "win32" or "darwin").
filesystem_encoding = None
output_encoding = None
# True when the platform is treated as handling Unicode filenames natively.
is_unicode_platform = False
39 global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform
41 filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())
42 output_encoding = _canonical_encoding(sys.stdout.encoding or locale.getpreferredencoding())
43 if sys.platform == 'win32':
44 # Unicode arguments are not supported on Windows yet; see #565 and #1074.
45 argv_encoding = 'ascii'
47 argv_encoding = output_encoding
48 is_unicode_platform = sys.platform in ["win32", "darwin"]
def get_filesystem_encoding():
    """
    Return the encoding name this module expects local filenames to use.

    This is the current value of the module-level 'filesystem_encoding'.
    """
    return filesystem_encoding
def get_output_encoding():
    """
    Return the encoding name this module expects to use when writing to
    stdout or stderr.

    This is the current value of the module-level 'output_encoding'.
    """
    return output_encoding
def get_argv_encoding():
    """
    Return the encoding name this module expects command-line arguments to
    use.

    This is the current value of the module-level 'argv_encoding'.
    """
    return argv_encoding
def argv_to_unicode(s):
    """
    Convert one command-line argument (a byte string) into a unicode object
    using the module's argv encoding.

    Raises usage.UsageError if the bytes are not valid in that encoding.
    """
    precondition(isinstance(s, str), s)

    try:
        decoded = unicode(s, argv_encoding)
    except UnicodeDecodeError:
        # Report the offending argument in a form that is safe to print.
        message = "Argument %s cannot be decoded as %s." % (quote_output(s), argv_encoding)
        raise usage.UsageError(message)
    return decoded
83 def unicode_to_url(s):
85 Encode an unicode object used in an URL.
87 # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.
91 #precondition(isinstance(s, unicode), s)
92 #return s.encode('utf-8')
95 if s is None or isinstance(s, str):
97 return s.encode('utf-8')
100 if isinstance(s, str):
102 return s.encode(argv_encoding)
# Matches strings made up solely of printable ASCII (space through '~') plus
# newline and carriage return. The empty string matches.
PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)
# Like PRINTABLE_ASCII, except that the single quote (0x27) is excluded and
# the characters \x80-\xFF are additionally allowed.
PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)

def is_printable_ascii(s):
    """
    Return True if 's' consists only of printable ASCII characters, newlines
    and carriage returns (the empty string qualifies); otherwise return False.
    """
    # The pattern is anchored at both ends, so matching from the start of the
    # string is the same test as searching for it.
    return PRINTABLE_ASCII.match(s) is not None
def unicode_to_output(s):
    """
    Encode a unicode object with the module's output encoding so it can be
    written to stdout or stderr, and return the encoded byte string.

    Raises UnicodeEncodeError if 's' cannot be represented in the output
    encoding, or if the encoded result contains bytes that PRINTABLE_8BIT
    does not accept.
    """
    precondition(isinstance(s, unicode), s)

    try:
        out = s.encode(output_encoding)
    except UnicodeEncodeError:
        reason = ("A string could not be encoded as %s for output to the terminal:\n%r" %
                  (output_encoding, repr(s)))
        raise UnicodeEncodeError(output_encoding, s, 0, 0, reason)

    # Refuse output that encoded successfully but is not safe to print.
    if PRINTABLE_8BIT.search(out) is None:
        reason = ("A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
                  (output_encoding, repr(s)))
        raise UnicodeEncodeError(output_encoding, s, 0, 0, reason)

    return out
129 def quote_output(s, quotemarks=True):
131 Encode either a Unicode string or a UTF-8-encoded bytestring for representation
132 on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
133 always surrounded by single quotes; otherwise, it is quoted only if necessary to
134 avoid ambiguity or control bytes in the output.
136 precondition(isinstance(s, (str, unicode)), s)
138 if isinstance(s, str):
140 s = s.decode('utf-8')
141 except UnicodeDecodeError:
145 out = s.encode(output_encoding)
146 except UnicodeEncodeError:
149 if PRINTABLE_8BIT.search(out) is None:
153 return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'"
def quote_path(path, quotemarks=True):
    """
    Join the segments of 'path' with "/" (after passing each through to_str)
    and return the result of quote_output() on the joined string.

    'quotemarks' is forwarded unchanged to quote_output().
    """
    segments = [to_str(segment) for segment in path]
    joined = "/".join(segments)
    return quote_output(joined, quotemarks=quotemarks)
def unicode_platform():
    """
    Report whether the current platform is treated as handling Unicode
    filenames natively.

    This is the current value of the module-level 'is_unicode_platform'.
    """
    return is_unicode_platform
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    # Raised by the filename helpers in this module with the offending path or
    # filename as the only argument.
    # NOTE(review): the docstring keeps its literal "%s" placeholder; nothing
    # visible here fills it in, so confirm whether any caller formats __doc__.
def listdir_unicode_fallback(path):
    """
    List a directory through the byte-string API and return the entries as
    unicode objects, for platforms without a native Unicode filename API
    (unlike Windows or MacOS X).

    'path' must be a unicode object. It is encoded with the module's
    filesystem encoding before calling os.listdir(), and every returned name
    is decoded with the same encoding.

    Raises FilenameEncodingError (carrying the offending value) if 'path'
    cannot be encoded or if any directory entry cannot be decoded.
    """
    precondition(isinstance(path, unicode), path)

    try:
        byte_path = path.encode(filesystem_encoding)
    except UnicodeEncodeError:
        raise FilenameEncodingError(path)

    # Decode one name at a time so the handler knows exactly which entry
    # failed, instead of relying on a list-comprehension variable leaking
    # into the enclosing scope (which only happens on Python 2).
    names = []
    for fn in os.listdir(byte_path):
        try:
            names.append(unicode(fn, filesystem_encoding))
        except UnicodeDecodeError:
            raise FilenameEncodingError(fn)
    return names
def listdir_unicode(path):
    """
    List the directory 'path' (a unicode object) and return its entries as
    NFC-normalized unicode strings, whether or not the platform offers a
    native Unicode filename API.
    """
    precondition(isinstance(path, unicode), path)

    # Unix-like systems go through the byte-level API via the fallback;
    # Windows and MacOS X can hand the unicode path straight to os.listdir().
    if not is_unicode_platform:
        entries = listdir_unicode_fallback(path)
    else:
        entries = os.listdir(path)

    # Normalize every name to NFC so that different OSes do not produce
    # non-equal unicode strings for the same filename representation.
    normalized = []
    for entry in entries:
        normalized.append(unicodedata.normalize('NFC', entry))
    return normalized
def open_unicode(path, mode):
    """
    Open the file named by the unicode object 'path' with the given 'mode'
    and return the file object, after expanding a leading "~".

    On platforms flagged as Unicode-native the unicode path is used directly;
    elsewhere it is first encoded with the module's filesystem encoding.

    Raises FilenameEncodingError if the path cannot be encoded.
    """
    precondition(isinstance(path, unicode), path)

    if is_unicode_platform:
        return open(os.path.expanduser(path), mode)

    try:
        encoded_path = path.encode(filesystem_encoding)
        return open(os.path.expanduser(encoded_path), mode)
    except UnicodeEncodeError:
        raise FilenameEncodingError(path)
def abspath_expanduser_unicode(path):
    """
    Return the absolute form of the unicode object 'path' with a leading "~"
    expanded, as a unicode object.

    On platforms flagged as Unicode-native the unicode path is processed
    directly; elsewhere it is encoded with the module's filesystem encoding,
    processed as a byte string, and decoded again.

    Raises FilenameEncodingError if that encoding or decoding fails.
    """
    precondition(isinstance(path, unicode), path)

    if is_unicode_platform:
        expanded = os.path.expanduser(path)
        return os.path.abspath(expanded)

    try:
        pathstr = path.encode(filesystem_encoding)
        absolute = os.path.abspath(os.path.expanduser(pathstr))
        return absolute.decode(filesystem_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise FilenameEncodingError(path)