# Shared fixtures for the tests in this module (comments kept ASCII-only).

# "lumi<e-grave>re": the e-grave is the single precomposed code point U+00E8.
lumiere_nfc = u"lumi\u00E8re"

# Filename starting with a precomposed A-with-diaeresis (U+00C4), i.e. NFC form.
Artonwall_nfc = u"\u00C4rtonwall.mp3"

# The same filename in decomposed (NFD) form: a plain "A" followed by the
# combining diaeresis U+0308.
Artonwall_nfd = u"A\u0308rtonwall.mp3"
12 # The following main helps to generate a test class for other operating
15 if __name__ == "__main__":
21 if len(sys.argv) != 2:
22 print "Usage: %s lumi<e-grave>re" % sys.argv[0]
25 if sys.platform == "win32":
27 from allmydata.windows.fixups import initialize
29 print "set PYTHONPATH to the src directory"
34 print "class MyWeirdOS(EncodingUtil, unittest.TestCase):"
35 print " uname = '%s'" % ' '.join(platform.uname())
36 print " argv = %s" % repr(sys.argv[1])
37 print " platform = '%s'" % sys.platform
38 print " filesystem_encoding = '%s'" % sys.getfilesystemencoding()
39 print " io_encoding = '%s'" % sys.stdout.encoding
41 tmpdir = tempfile.mkdtemp()
42 for fname in TEST_FILENAMES:
43 open(os.path.join(tmpdir, fname), 'w').close()
45 # Use Unicode API under Windows or MacOS X
46 if sys.platform in ('win32', 'darwin'):
47 dirlist = os.listdir(unicode(tmpdir))
49 dirlist = os.listdir(tmpdir)
51 print " dirlist = %s" % repr(dirlist)
53 print " # Oops, I cannot write filenames containing non-ascii characters"
60 import os, sys, locale
62 from twisted.trial import unittest
64 from allmydata.test.common_util import ReallyEqualMixin
65 from allmydata.util import encodingutil, fileutil
66 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
67 unicode_to_output, quote_output, quote_path, quote_local_unicode_path, \
68 unicode_platform, listdir_unicode, FilenameEncodingError, get_io_encoding, \
69 get_filesystem_encoding, to_str, from_utf8_or_none, _reload
70 from allmydata.dirnode import normalize
72 from twisted.python import usage
75 class MockStdout(object):
78 class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
79 def test_get_io_encoding(self):
80 mock_stdout = MockStdout()
81 self.patch(sys, 'stdout', mock_stdout)
83 mock_stdout.encoding = 'UTF-8'
85 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
87 mock_stdout.encoding = 'cp65001'
89 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
91 mock_stdout.encoding = 'koi8-r'
92 expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
94 self.failUnlessReallyEqual(get_io_encoding(), expected)
96 mock_stdout.encoding = 'nonexistent_encoding'
97 if sys.platform == "win32":
99 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
101 self.failUnlessRaises(AssertionError, _reload)
103 def test_get_io_encoding_not_from_stdout(self):
104 preferredencoding = 'koi8-r'
105 def call_locale_getpreferredencoding():
106 return preferredencoding
107 self.patch(locale, 'getpreferredencoding', call_locale_getpreferredencoding)
108 mock_stdout = MockStdout()
109 self.patch(sys, 'stdout', mock_stdout)
111 expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
113 self.failUnlessReallyEqual(get_io_encoding(), expected)
115 mock_stdout.encoding = None
117 self.failUnlessReallyEqual(get_io_encoding(), expected)
119 preferredencoding = None
121 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
123 def test_argv_to_unicode(self):
124 encodingutil.io_encoding = 'utf-8'
125 self.failUnlessRaises(usage.UsageError,
127 lumiere_nfc.encode('latin1'))
def test_unicode_to_output(self):
    """unicode_to_output() must raise when the text cannot be encoded.

    The I/O encoding is forced to koi8-r; the test asserts that encoding
    lumiere_nfc (which contains U+00E8) for output then raises
    UnicodeEncodeError.
    """
    # Use self.patch() instead of a bare assignment so the module-level
    # io_encoding is restored when this test finishes and cannot leak
    # into tests that run afterwards.
    self.patch(encodingutil, 'io_encoding', 'koi8-r')
    self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)
133 def test_no_unicode_normalization(self):
134 # Pretend to run on a Unicode platform.
135 # listdir_unicode normalized to NFC in 1.7beta, but now doesn't.
137 def call_os_listdir(path):
138 return [Artonwall_nfd]
139 self.patch(os, 'listdir', call_os_listdir)
140 self.patch(sys, 'platform', 'darwin')
143 self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd])
146 # The following tests apply only to platforms that don't store filenames as
147 # Unicode entities on the filesystem.
148 class EncodingUtilNonUnicodePlatform(unittest.TestCase):
150 # Mock sys.platform because unicode_platform() uses it
151 self.original_platform = sys.platform
152 sys.platform = 'linux'
155 sys.platform = self.original_platform
158 def test_listdir_unicode(self):
159 # What happens if latin1-encoded filenames are encountered on an UTF-8
161 def call_os_listdir(path):
163 lumiere_nfc.encode('utf-8'),
164 lumiere_nfc.encode('latin1')
166 self.patch(os, 'listdir', call_os_listdir)
168 sys_filesystemencoding = 'utf-8'
169 def call_sys_getfilesystemencoding():
170 return sys_filesystemencoding
171 self.patch(sys, 'getfilesystemencoding', call_sys_getfilesystemencoding)
174 self.failUnlessRaises(FilenameEncodingError,
178 # We're trying to list a directory whose name cannot be represented in
179 # the filesystem encoding. This should fail.
180 sys_filesystemencoding = 'ascii'
182 self.failUnlessRaises(FilenameEncodingError,
187 class EncodingUtil(ReallyEqualMixin):
189 self.original_platform = sys.platform
190 sys.platform = self.platform
193 sys.platform = self.original_platform
196 def test_argv_to_unicode(self):
197 if 'argv' not in dir(self):
200 mock_stdout = MockStdout()
201 mock_stdout.encoding = self.io_encoding
202 self.patch(sys, 'stdout', mock_stdout)
207 self.failUnlessReallyEqual(argv_to_unicode(argv), argu)
def test_unicode_to_url(self):
    """unicode_to_url() must return the UTF-8 encoded byte string."""
    # Compare explicitly. failUnless(value, expected) would treat the
    # expected string as the failure *message* and pass for any truthy
    # return value, so it would never detect a wrong encoding.
    self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")
212 def test_unicode_to_output(self):
213 if 'argv' not in dir(self):
216 mock_stdout = MockStdout()
217 mock_stdout.encoding = self.io_encoding
218 self.patch(sys, 'stdout', mock_stdout)
221 self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)
223 def test_unicode_platform(self):
233 self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])
235 def test_listdir_unicode(self):
236 if 'dirlist' not in dir(self):
240 u"test".encode(self.filesystem_encoding)
241 except (LookupError, AttributeError):
242 raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
243 "that we are testing for the benefit of a different platform."
244 % (self.filesystem_encoding,))
246 def call_os_listdir(path):
248 self.patch(os, 'listdir', call_os_listdir)
250 def call_sys_getfilesystemencoding():
251 return self.filesystem_encoding
252 self.patch(sys, 'getfilesystemencoding', call_sys_getfilesystemencoding)
255 filenames = listdir_unicode(u'/dummy')
257 self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
261 class StdlibUnicode(unittest.TestCase):
262 """This mainly tests that some of the stdlib functions support Unicode paths, but also that
263 listdir_unicode works for valid filenames."""
265 def skip_if_cannot_represent_filename(self, u):
266 enc = get_filesystem_encoding()
267 if not unicode_platform():
270 except UnicodeEncodeError:
271 raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
273 def test_mkdir_open_exists_abspath_listdir_expanduser(self):
274 self.skip_if_cannot_represent_filename(lumiere_nfc)
277 os.mkdir(lumiere_nfc)
278 except EnvironmentError, e:
279 raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
280 "does not support Unicode, even though the platform does." % (e,))
282 fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
283 open(fn, 'wb').close()
284 self.failUnless(os.path.exists(fn))
285 self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn)))
286 filenames = listdir_unicode(lumiere_nfc)
288 # We only require that the listing includes a filename that is canonically equivalent
289 # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
290 self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
292 expanded = fileutil.expanduser(u"~/" + lumiere_nfc)
293 self.failIfIn(u"~", expanded)
294 self.failUnless(expanded.endswith(lumiere_nfc), expanded)
296 def test_open_unrepresentable(self):
297 if unicode_platform():
298 raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")
300 enc = get_filesystem_encoding()
304 raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
305 except UnicodeEncodeError:
306 self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
309 class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
313 def _check(self, inp, out, enc, optional_quotes, quote_newlines):
317 self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quote_newlines=quote_newlines), out)
318 self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
321 elif isinstance(inp, str):
322 self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out)
323 self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
325 self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quote_newlines=quote_newlines), out)
326 self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
328 def _test_quote_output_all(self, enc):
329 def check(inp, out, optional_quotes=False, quote_newlines=None):
330 self._check(inp, out, enc, optional_quotes, quote_newlines)
332 # optional single quotes
333 check("foo", "'foo'", True)
334 check("\\", "'\\'", True)
335 check("$\"`", "'$\"`'", True)
336 check("\n", "'\n'", True, quote_newlines=False)
338 # mandatory single quotes
343 check("\n", "\"\\x0a\"", quote_newlines=True)
344 check("\x00", "\"\\x00\"")
346 # invalid Unicode and astral planes
347 check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"")
348 check(u"\uDC00\uD800", "\"\\udc00\\ud800\"")
349 check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
350 check(u"\uD800\uDC00", "\"\\U00010000\"")
351 check(u"\uD800\uDC01", "\"\\U00010001\"")
352 check(u"\uD801\uDC00", "\"\\U00010400\"")
353 check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"")
354 check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"")
355 check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"")
358 check("\xFF", "b\"\\xff\"")
359 check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")
def test_quote_output_ascii(self, enc='ascii'):
    """Check quote_output() results for an ASCII target encoding.

    The encoding-independent cases run first, followed by the cases whose
    expected output is specific to ASCII. `enc` is forwarded unchanged to
    the checks; test_quote_output_default calls this with enc=None.
    """
    self._test_quote_output_all(enc)

    # Each row: (input, expected output, optional_quotes, quote_newlines)
    ascii_cases = (
        (u"\u00D7", "\"\\xd7\"", False, None),
        (u"'\u00D7", "\"'\\xd7\"", False, None),
        (u"\"\u00D7", "\"\\\"\\xd7\"", False, None),
        (u"\u2621", "\"\\u2621\"", False, None),
        (u"'\u2621", "\"'\\u2621\"", False, None),
        (u"\"\u2621", "\"\\\"\\u2621\"", False, None),
        (u"\n", "'\n'", True, False),
        (u"\n", "\"\\x0a\"", False, True),
    )
    for inp, out, optional_quotes, quote_newlines in ascii_cases:
        self._check(inp, out, enc, optional_quotes, quote_newlines)
def test_quote_output_latin1(self, enc='latin1'):
    """Check quote_output() results for a Latin-1 target encoding.

    The encoding-independent cases run first, followed by the Latin-1
    specific ones. Expected values are written as unicode strings and are
    encoded to Latin-1 just before each comparison. `enc` is forwarded
    unchanged; test_quote_output_default calls this with enc=None.
    """
    self._test_quote_output_all(enc)

    # Each row: (input, expected output as unicode, optional_quotes, quote_newlines)
    latin1_cases = (
        (u"\u00D7", u"'\u00D7'", True, None),
        (u"'\u00D7", u"\"'\u00D7\"", False, None),
        (u"\"\u00D7", u"'\"\u00D7'", False, None),
        (u"\u00D7\"", u"'\u00D7\"'", True, None),
        (u"\u2621", u"\"\\u2621\"", False, None),
        (u"'\u2621", u"\"'\\u2621\"", False, None),
        (u"\"\u2621", u"\"\\\"\\u2621\"", False, None),
        (u"\n", u"'\n'", True, False),
        (u"\n", u"\"\\x0a\"", False, True),
    )
    for inp, out, optional_quotes, quote_newlines in latin1_cases:
        expected_bytes = out.encode('latin1')
        self._check(inp, expected_bytes, enc, optional_quotes, quote_newlines)
def test_quote_output_utf8(self, enc='utf-8'):
    """Check quote_output() results for a UTF-8 target encoding.

    The encoding-independent cases run first, followed by the UTF-8
    specific ones. Expected values are written as unicode strings and are
    encoded to UTF-8 just before each comparison. `enc` is forwarded
    unchanged; test_quote_output_default calls this with enc=None.
    """
    self._test_quote_output_all(enc)

    # Each row: (input, expected output as unicode, optional_quotes, quote_newlines)
    utf8_cases = (
        (u"\u2621", u"'\u2621'", True, None),
        (u"'\u2621", u"\"'\u2621\"", False, None),
        (u"\"\u2621", u"'\"\u2621'", False, None),
        (u"\u2621\"", u"'\u2621\"'", True, None),
        (u"\n", u"'\n'", True, False),
        (u"\n", u"\"\\x0a\"", False, True),
    )
    for inp, out, optional_quotes, quote_newlines in utf8_cases:
        expected_bytes = out.encode('utf-8')
        self._check(inp, expected_bytes, enc, optional_quotes, quote_newlines)
def test_quote_output_default(self):
    """Re-run the per-encoding cases with no explicit encoding.

    For each encoding, encodingutil.io_encoding is set to it and the
    matching test method is invoked with enc=None, so quote_output() is
    called with encoding=None (presumably falling back to io_encoding --
    verify against encodingutil).
    """
    runs = (
        ('ascii', self.test_quote_output_ascii),
        ('latin1', self.test_quote_output_latin1),
        ('utf-8', self.test_quote_output_utf8),
    )
    for io_enc, run_cases in runs:
        encodingutil.io_encoding = io_enc
        run_cases(None)
class QuotePaths(ReallyEqualMixin, unittest.TestCase):
    """Tests for quote_path() and quote_local_unicode_path()."""

    def test_quote_path(self):
        """Check quoting of segment lists and of local Windows-style paths.

        Each function is called with quotemarks omitted, True and False.
        For quote_local_unicode_path() the expected value depends on
        whether sys.platform is "win32".
        """
        # Each row: (path segments, keyword arguments, expected result)
        segment_cases = [
            ([u'foo', u'bar'], {}, "'foo/bar'"),
            ([u'foo', u'bar'], {'quotemarks': True}, "'foo/bar'"),
            ([u'foo', u'bar'], {'quotemarks': False}, "foo/bar"),
            ([u'foo', u'\nbar'], {}, '"foo/\\x0abar"'),
            ([u'foo', u'\nbar'], {'quotemarks': True}, '"foo/\\x0abar"'),
            ([u'foo', u'\nbar'], {'quotemarks': False}, '"foo/\\x0abar"'),
        ]
        for segments, kwargs, expected in segment_cases:
            self.failUnlessReallyEqual(quote_path(segments, **kwargs), expected)

        def win32_other(win32, other):
            # Select the expectation that applies to the running platform.
            if sys.platform == "win32":
                return win32
            return other

        # Each row: (local path, keyword arguments, expected result)
        local_cases = [
            (u"\\\\?\\C:\\foo", {},
             win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'")),
            (u"\\\\?\\C:\\foo", {'quotemarks': True},
             win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'")),
            (u"\\\\?\\C:\\foo", {'quotemarks': False},
             win32_other("C:\\foo", "\\\\?\\C:\\foo")),
            (u"\\\\?\\UNC\\foo\\bar", {},
             win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'")),
            (u"\\\\?\\UNC\\foo\\bar", {'quotemarks': True},
             win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'")),
            (u"\\\\?\\UNC\\foo\\bar", {'quotemarks': False},
             win32_other("\\\\foo\\bar", "\\\\?\\UNC\\foo\\bar")),
        ]
        for local_path, kwargs, expected in local_cases:
            self.failUnlessReallyEqual(quote_local_unicode_path(local_path, **kwargs), expected)
439 class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
440 uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
441 argv = 'lumi\xc3\xa8re'
443 filesystem_encoding = 'UTF-8'
444 io_encoding = 'UTF-8'
445 dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
447 class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
448 uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
451 filesystem_encoding = 'ISO-8859-1'
452 io_encoding = 'ISO-8859-1'
453 dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
455 class Windows(EncodingUtil, unittest.TestCase):
456 uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
457 argv = 'lumi\xc3\xa8re'
459 filesystem_encoding = 'mbcs'
460 io_encoding = 'utf-8'
461 dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
463 class MacOSXLeopard(EncodingUtil, unittest.TestCase):
464 uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
465 output = 'lumi\xc3\xa8re'
467 filesystem_encoding = 'utf-8'
468 io_encoding = 'UTF-8'
469 dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
471 class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
472 uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
474 filesystem_encoding = 'utf-8'
475 io_encoding = 'US-ASCII'
476 dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
478 class OpenBSD(EncodingUtil, unittest.TestCase):
479 uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
480 platform = 'openbsd4'
481 filesystem_encoding = '646'
483 # Oops, I cannot write filenames containing non-ascii characters
class TestToFromStr(ReallyEqualMixin, unittest.TestCase):
    """Tests for the to_str() and from_utf8_or_none() helpers."""

    def test_to_str(self):
        """to_str() returns byte strings and None unchanged and UTF-8 encodes unicode."""
        # Each row: (argument, expected result)
        expectations = [
            ("foo", "foo"),
            ("lumi\xc3\xa8re", "lumi\xc3\xa8re"),
            # passes through invalid UTF-8 -- is this what we want?
            ("\xFF", "\xFF"),
            (u"lumi\u00E8re", "lumi\xc3\xa8re"),
            (None, None),
        ]
        for given, wanted in expectations:
            self.failUnlessReallyEqual(to_str(given), wanted)

    def test_from_utf8_or_none(self):
        """from_utf8_or_none() decodes UTF-8 byte strings and passes None through.

        A unicode argument trips an assertion, and bytes that are not valid
        UTF-8 raise UnicodeDecodeError.
        """
        self.failUnlessRaises(AssertionError, from_utf8_or_none, u"foo")

        # Each row: (argument, expected result)
        expectations = [
            ("lumi\xc3\xa8re", u"lumi\u00E8re"),
            (None, None),
        ]
        for given, wanted in expectations:
            self.failUnlessReallyEqual(from_utf8_or_none(given), wanted)

        self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, "\xFF")