2 lumiere_nfc = u"lumi\u00E8re"
3 Artonwall_nfc = u"\u00C4rtonwall.mp3"
4 Artonwall_nfd = u"A\u0308rtonwall.mp3"
12 # The following main helps to generate a test class for other operating
15 if __name__ == "__main__":
21 if len(sys.argv) != 2:
22 print "Usage: %s lumi<e-grave>re" % sys.argv[0]
25 if sys.platform == "win32":
27 from allmydata.windows.fixups import initialize
29 print "set PYTHONPATH to the src directory"
34 print "class MyWeirdOS(EncodingUtil, unittest.TestCase):"
35 print " uname = '%s'" % ' '.join(platform.uname())
36 print " argv = %s" % repr(sys.argv[1])
37 print " platform = '%s'" % sys.platform
38 print " filesystem_encoding = '%s'" % sys.getfilesystemencoding()
39 print " io_encoding = '%s'" % sys.stdout.encoding
41 tmpdir = tempfile.mkdtemp()
42 for fname in TEST_FILENAMES:
43 open(os.path.join(tmpdir, fname), 'w').close()
45 # Use Unicode API under Windows or MacOS X
46 if sys.platform in ('win32', 'darwin'):
47 dirlist = os.listdir(unicode(tmpdir))
49 dirlist = os.listdir(tmpdir)
51 print " dirlist = %s" % repr(dirlist)
53 print " # Oops, I cannot write filenames containing non-ascii characters"
59 from twisted.trial import unittest
60 from mock import patch
61 import os, sys, locale
63 from allmydata.test.common_util import ReallyEqualMixin
64 from allmydata.util import encodingutil, fileutil
65 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
66 unicode_to_output, quote_output, quote_path, quote_local_unicode_path, \
67 unicode_platform, listdir_unicode, FilenameEncodingError, get_io_encoding, \
68 get_filesystem_encoding, to_str, from_utf8_or_none, _reload
69 from allmydata.dirnode import normalize
71 from twisted.python import usage
73 class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
76 def test_get_io_encoding(self, mock_stdout):
77 mock_stdout.encoding = 'UTF-8'
79 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
81 mock_stdout.encoding = 'cp65001'
83 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
85 mock_stdout.encoding = 'koi8-r'
86 expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
88 self.failUnlessReallyEqual(get_io_encoding(), expected)
90 mock_stdout.encoding = 'nonexistent_encoding'
91 if sys.platform == "win32":
93 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
95 self.failUnlessRaises(AssertionError, _reload)
97 @patch('locale.getpreferredencoding')
98 def test_get_io_encoding_not_from_stdout(self, mock_locale_getpreferredencoding):
99 locale # hush pyflakes
100 mock_locale_getpreferredencoding.return_value = 'koi8-r'
104 old_stdout = sys.stdout
105 sys.stdout = DummyStdout()
107 expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
109 self.failUnlessReallyEqual(get_io_encoding(), expected)
111 sys.stdout.encoding = None
113 self.failUnlessReallyEqual(get_io_encoding(), expected)
115 mock_locale_getpreferredencoding.return_value = None
117 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
119 sys.stdout = old_stdout
121 def test_argv_to_unicode(self):
122 encodingutil.io_encoding = 'utf-8'
123 self.failUnlessRaises(usage.UsageError,
125 lumiere_nfc.encode('latin1'))
def test_unicode_to_output(self):
    # With the I/O encoding forced to 'koi8-r', unicode_to_output() is
    # expected to raise UnicodeEncodeError for lumiere_nfc.
    #
    # encodingutil.io_encoding is module-level state shared by every test
    # in the process, so the previous value is put back afterwards (the
    # same save/restore pattern test_no_unicode_normalization applies to
    # sys.platform). Without this, the 'koi8-r' setting leaks into
    # whichever test happens to run next.
    saved_io_encoding = encodingutil.io_encoding
    encodingutil.io_encoding = 'koi8-r'
    try:
        self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)
    finally:
        encodingutil.io_encoding = saved_io_encoding
132 def test_no_unicode_normalization(self, mock):
133 # Pretend to run on a Unicode platform.
134 # We normalized to NFC in 1.7beta, but we now don't.
135 orig_platform = sys.platform
137 sys.platform = 'darwin'
138 mock.return_value = [Artonwall_nfd]
140 self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd])
142 sys.platform = orig_platform
144 # The following tests apply only to platforms that don't store filenames as
145 # Unicode entities on the filesystem.
146 class EncodingUtilNonUnicodePlatform(unittest.TestCase):
148 # Mock sys.platform because unicode_platform() uses it
149 self.original_platform = sys.platform
150 sys.platform = 'linux'
153 sys.platform = self.original_platform
156 @patch('sys.getfilesystemencoding')
158 def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
159 # What happens if latin1-encoded filenames are encountered on an UTF-8
161 mock_listdir.return_value = [
162 lumiere_nfc.encode('utf-8'),
163 lumiere_nfc.encode('latin1')]
165 mock_getfilesystemencoding.return_value = 'utf-8'
167 self.failUnlessRaises(FilenameEncodingError,
171 # We're trying to list a directory whose name cannot be represented in
172 # the filesystem encoding. This should fail.
173 mock_getfilesystemencoding.return_value = 'ascii'
175 self.failUnlessRaises(FilenameEncodingError,
180 class EncodingUtil(ReallyEqualMixin):
182 self.original_platform = sys.platform
183 sys.platform = self.platform
186 sys.platform = self.original_platform
190 def test_argv_to_unicode(self, mock):
191 if 'argv' not in dir(self):
194 mock.encoding = self.io_encoding
198 self.failUnlessReallyEqual(argv_to_unicode(argv), argu)
200 def test_unicode_to_url(self):
201 self.failUnless(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")
204 def test_unicode_to_output(self, mock):
205 if 'argv' not in dir(self):
208 mock.encoding = self.io_encoding
210 self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)
212 def test_unicode_platform(self):
222 self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])
224 @patch('sys.getfilesystemencoding')
226 def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
227 if 'dirlist' not in dir(self):
231 u"test".encode(self.filesystem_encoding)
232 except (LookupError, AttributeError):
233 raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
234 "that we are testing for the benefit of a different platform."
235 % (self.filesystem_encoding,))
237 mock_listdir.return_value = self.dirlist
238 mock_getfilesystemencoding.return_value = self.filesystem_encoding
241 filenames = listdir_unicode(u'/dummy')
243 self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
247 class StdlibUnicode(unittest.TestCase):
248 """This mainly tests that some of the stdlib functions support Unicode paths, but also that
249 listdir_unicode works for valid filenames."""
251 def skip_if_cannot_represent_filename(self, u):
252 enc = get_filesystem_encoding()
253 if not unicode_platform():
256 except UnicodeEncodeError:
257 raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
259 def test_mkdir_open_exists_abspath_listdir_expanduser(self):
260 self.skip_if_cannot_represent_filename(lumiere_nfc)
263 os.mkdir(lumiere_nfc)
264 except EnvironmentError, e:
265 raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
266 "does not support Unicode, even though the platform does." % (e,))
268 fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
269 open(fn, 'wb').close()
270 self.failUnless(os.path.exists(fn))
271 self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn)))
272 filenames = listdir_unicode(lumiere_nfc)
274 # We only require that the listing includes a filename that is canonically equivalent
275 # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
276 self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
278 expanded = fileutil.expanduser(u"~/" + lumiere_nfc)
279 self.failIfIn(u"~", expanded)
280 self.failUnless(expanded.endswith(lumiere_nfc), expanded)
282 def test_open_unrepresentable(self):
283 if unicode_platform():
284 raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")
286 enc = get_filesystem_encoding()
290 raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
291 except UnicodeEncodeError:
292 self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
295 class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
299 def _check(self, inp, out, enc, optional_quotes, quote_newlines):
303 self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quote_newlines=quote_newlines), out)
304 self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
307 elif isinstance(inp, str):
308 self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out)
309 self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
311 self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quote_newlines=quote_newlines), out)
312 self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
314 def _test_quote_output_all(self, enc):
315 def check(inp, out, optional_quotes=False, quote_newlines=None):
316 self._check(inp, out, enc, optional_quotes, quote_newlines)
318 # optional single quotes
319 check("foo", "'foo'", True)
320 check("\\", "'\\'", True)
321 check("$\"`", "'$\"`'", True)
322 check("\n", "'\n'", True, quote_newlines=False)
324 # mandatory single quotes
329 check("\n", "\"\\x0a\"", quote_newlines=True)
330 check("\x00", "\"\\x00\"")
332 # invalid Unicode and astral planes
333 check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"")
334 check(u"\uDC00\uD800", "\"\\udc00\\ud800\"")
335 check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
336 check(u"\uD800\uDC00", "\"\\U00010000\"")
337 check(u"\uD800\uDC01", "\"\\U00010001\"")
338 check(u"\uD801\uDC00", "\"\\U00010400\"")
339 check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"")
340 check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"")
341 check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"")
344 check("\xFF", "b\"\\xff\"")
345 check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")
def test_quote_output_ascii(self, enc='ascii'):
    # Run the encoding-independent checks first, then the cases whose
    # expected output is specific to an ASCII output encoding.
    # 'enc' is handed straight to self._check (test_quote_output_default
    # passes None here).
    self._test_quote_output_all(enc)

    # Each row: (input, expected output, optional_quotes, quote_newlines)
    ascii_rows = [
        (u"\u00D7", "\"\\xd7\"", False, None),
        (u"'\u00D7", "\"'\\xd7\"", False, None),
        (u"\"\u00D7", "\"\\\"\\xd7\"", False, None),
        (u"\u2621", "\"\\u2621\"", False, None),
        (u"'\u2621", "\"'\\u2621\"", False, None),
        (u"\"\u2621", "\"\\\"\\u2621\"", False, None),
        (u"\n", "'\n'", True, False),
        (u"\n", "\"\\x0a\"", False, True),
    ]
    for inp, out, optional_quotes, quote_newlines in ascii_rows:
        self._check(inp, out, enc, optional_quotes, quote_newlines)
def test_quote_output_latin1(self, enc='latin1'):
    # Run the encoding-independent checks first, then the cases whose
    # expected output is specific to a Latin-1 output encoding.
    # Expected values are written as unicode and encoded to latin1 just
    # before being handed to self._check.
    self._test_quote_output_all(enc)

    # Each row: (input, expected output, optional_quotes, quote_newlines)
    latin1_rows = [
        (u"\u00D7", u"'\u00D7'", True, None),
        (u"'\u00D7", u"\"'\u00D7\"", False, None),
        (u"\"\u00D7", u"'\"\u00D7'", False, None),
        (u"\u00D7\"", u"'\u00D7\"'", True, None),
        (u"\u2621", u"\"\\u2621\"", False, None),
        (u"'\u2621", u"\"'\\u2621\"", False, None),
        (u"\"\u2621", u"\"\\\"\\u2621\"", False, None),
        (u"\n", u"'\n'", True, False),
        (u"\n", u"\"\\x0a\"", False, True),
    ]
    for inp, out, optional_quotes, quote_newlines in latin1_rows:
        self._check(inp, out.encode('latin1'), enc, optional_quotes, quote_newlines)
def test_quote_output_utf8(self, enc='utf-8'):
    # Run the encoding-independent checks first, then the cases whose
    # expected output is specific to a UTF-8 output encoding.
    # Expected values are written as unicode and encoded to utf-8 just
    # before being handed to self._check.
    self._test_quote_output_all(enc)

    # Each row: (input, expected output, optional_quotes, quote_newlines)
    utf8_rows = [
        (u"\u2621", u"'\u2621'", True, None),
        (u"'\u2621", u"\"'\u2621\"", False, None),
        (u"\"\u2621", u"'\"\u2621'", False, None),
        (u"\u2621\"", u"'\u2621\"'", True, None),
        (u"\n", u"'\n'", True, False),
        (u"\n", u"\"\\x0a\"", False, True),
    ]
    for inp, out, optional_quotes, quote_newlines in utf8_rows:
        self._check(inp, out.encode('utf-8'), enc, optional_quotes, quote_newlines)
def test_quote_output_default(self):
    # Re-run each per-encoding suite with enc=None, after setting
    # encodingutil.io_encoding to the encoding that suite expects.
    default_runs = [
        ('ascii', self.test_quote_output_ascii),
        ('latin1', self.test_quote_output_latin1),
        ('utf-8', self.test_quote_output_utf8),
    ]
    for io_enc, run_suite in default_runs:
        encodingutil.io_encoding = io_enc
        run_suite(None)
class QuotePaths(ReallyEqualMixin, unittest.TestCase):
    """Checks the output of quote_path and quote_local_unicode_path."""

    def test_quote_path(self):
        # quote_path: (path segments, keyword arguments, expected result)
        path_rows = [
            ((u'foo', u'bar'), {}, "'foo/bar'"),
            ((u'foo', u'bar'), {'quotemarks': True}, "'foo/bar'"),
            ((u'foo', u'bar'), {'quotemarks': False}, "foo/bar"),
            ((u'foo', u'\nbar'), {}, '"foo/\\x0abar"'),
            ((u'foo', u'\nbar'), {'quotemarks': True}, '"foo/\\x0abar"'),
            ((u'foo', u'\nbar'), {'quotemarks': False}, '"foo/\\x0abar"'),
        ]
        for segments, kwargs, expected in path_rows:
            # A fresh list per call, as each original call site used its
            # own list literal.
            self.failUnlessReallyEqual(quote_path(list(segments), **kwargs), expected)

        # quote_local_unicode_path:
        # (path, keyword arguments, expected on win32, expected elsewhere)
        local_rows = [
            (u"\\\\?\\C:\\foo", {},
             "'C:\\foo'", "'\\\\?\\C:\\foo'"),
            (u"\\\\?\\C:\\foo", {'quotemarks': True},
             "'C:\\foo'", "'\\\\?\\C:\\foo'"),
            (u"\\\\?\\C:\\foo", {'quotemarks': False},
             "C:\\foo", "\\\\?\\C:\\foo"),
            (u"\\\\?\\UNC\\foo\\bar", {},
             "'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"),
            (u"\\\\?\\UNC\\foo\\bar", {'quotemarks': True},
             "'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"),
            (u"\\\\?\\UNC\\foo\\bar", {'quotemarks': False},
             "\\\\foo\\bar", "\\\\?\\UNC\\foo\\bar"),
        ]
        for local_path, kwargs, on_win32, elsewhere in local_rows:
            actual = quote_local_unicode_path(local_path, **kwargs)
            # The expectation depends on the platform the test runs on.
            if sys.platform == "win32":
                expected = on_win32
            else:
                expected = elsewhere
            self.failUnlessReallyEqual(actual, expected)
425 class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
426 uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
427 argv = 'lumi\xc3\xa8re'
429 filesystem_encoding = 'UTF-8'
430 io_encoding = 'UTF-8'
431 dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
433 class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
434 uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
437 filesystem_encoding = 'ISO-8859-1'
438 io_encoding = 'ISO-8859-1'
439 dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
441 class Windows(EncodingUtil, unittest.TestCase):
442 uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
443 argv = 'lumi\xc3\xa8re'
445 filesystem_encoding = 'mbcs'
446 io_encoding = 'utf-8'
447 dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
449 class MacOSXLeopard(EncodingUtil, unittest.TestCase):
450 uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
451 output = 'lumi\xc3\xa8re'
453 filesystem_encoding = 'utf-8'
454 io_encoding = 'UTF-8'
455 dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
457 class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
458 uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
460 filesystem_encoding = 'utf-8'
461 io_encoding = 'US-ASCII'
462 dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
464 class OpenBSD(EncodingUtil, unittest.TestCase):
465 uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
466 platform = 'openbsd4'
467 filesystem_encoding = '646'
469 # Oops, I cannot write filenames containing non-ascii characters
class TestToFromStr(ReallyEqualMixin, unittest.TestCase):
    """Checks the results of to_str and from_utf8_or_none."""

    def test_to_str(self):
        # Each row: (argument, expected result of to_str(argument))
        rows = [
            ("foo", "foo"),
            ("lumi\xc3\xa8re", "lumi\xc3\xa8re"),
            # passes through invalid UTF-8 -- is this what we want?
            ("\xFF", "\xFF"),
            (u"lumi\u00E8re", "lumi\xc3\xa8re"),
            (None, None),
        ]
        for argument, expected in rows:
            self.failUnlessReallyEqual(to_str(argument), expected)

    def test_from_utf8_or_none(self):
        decode = from_utf8_or_none

        # A unicode argument is expected to trip an assertion.
        self.failUnlessRaises(AssertionError, decode, u"foo")

        # Each row: (argument, expected result)
        rows = [
            ("lumi\xc3\xa8re", u"lumi\u00E8re"),
            (None, None),
        ]
        for argument, expected in rows:
            self.failUnlessReallyEqual(decode(argument), expected)

        # Bytes that are not valid UTF-8 are expected to raise.
        self.failUnlessRaises(UnicodeDecodeError, decode, "\xFF")