# "lumiere" spelled with a precomposed e-grave (U+00E8), i.e. in NFC form.
lumiere_nfc = u"lumi\u00E8re"
# Filename starting with a precomposed A-with-diaeresis (U+00C4): NFC form.
Artonwall_nfc = u"\u00C4rtonwall.mp3"
# The same filename decomposed into "A" + combining diaeresis (U+0308): NFD form.
Artonwall_nfd = u"A\u0308rtonwall.mp3"
12 # The following main helps to generate a test class for other operating
15 if __name__ == "__main__":
21 if len(sys.argv) != 2:
22 print "Usage: %s lumi<e-grave>re" % sys.argv[0]
25 if sys.platform == "win32":
27 from allmydata.windows.fixups import initialize
29 print "set PYTHONPATH to the src directory"
34 print "class MyWeirdOS(EncodingUtil, unittest.TestCase):"
35 print " uname = '%s'" % ' '.join(platform.uname())
36 print " argv = %s" % repr(sys.argv[1])
37 print " platform = '%s'" % sys.platform
38 print " filesystem_encoding = '%s'" % sys.getfilesystemencoding()
39 print " io_encoding = '%s'" % sys.stdout.encoding
41 tmpdir = tempfile.mkdtemp()
42 for fname in TEST_FILENAMES:
43 open(os.path.join(tmpdir, fname), 'w').close()
45 # Use Unicode API under Windows or MacOS X
46 if sys.platform in ('win32', 'darwin'):
47 dirlist = os.listdir(unicode(tmpdir))
49 dirlist = os.listdir(tmpdir)
51 print " dirlist = %s" % repr(dirlist)
53 print " # Oops, I cannot write filenames containing non-ascii characters"
60 import os, sys, locale
62 from twisted.trial import unittest
64 from twisted.python.filepath import FilePath
66 from allmydata.test.common_util import ReallyEqualMixin
67 from allmydata.util import encodingutil, fileutil
68 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
69 unicode_to_output, quote_output, quote_path, quote_local_unicode_path, \
70 quote_filepath, unicode_platform, listdir_unicode, FilenameEncodingError, \
71 get_io_encoding, get_filesystem_encoding, to_str, from_utf8_or_none, _reload, \
72 to_filepath, extend_filepath, unicode_from_filepath, unicode_segments_from
73 from allmydata.dirnode import normalize
75 from twisted.python import usage
78 class MockStdout(object):
81 class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
82 def test_get_io_encoding(self):
83 mock_stdout = MockStdout()
84 self.patch(sys, 'stdout', mock_stdout)
86 mock_stdout.encoding = 'UTF-8'
88 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
90 mock_stdout.encoding = 'cp65001'
92 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
94 mock_stdout.encoding = 'koi8-r'
95 expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
97 self.failUnlessReallyEqual(get_io_encoding(), expected)
99 mock_stdout.encoding = 'nonexistent_encoding'
100 if sys.platform == "win32":
102 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
104 self.failUnlessRaises(AssertionError, _reload)
106 def test_get_io_encoding_not_from_stdout(self):
107 preferredencoding = 'koi8-r'
108 def call_locale_getpreferredencoding():
109 return preferredencoding
110 self.patch(locale, 'getpreferredencoding', call_locale_getpreferredencoding)
111 mock_stdout = MockStdout()
112 self.patch(sys, 'stdout', mock_stdout)
114 expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
116 self.failUnlessReallyEqual(get_io_encoding(), expected)
118 mock_stdout.encoding = None
120 self.failUnlessReallyEqual(get_io_encoding(), expected)
122 preferredencoding = None
124 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
126 def test_argv_to_unicode(self):
127 encodingutil.io_encoding = 'utf-8'
128 self.failUnlessRaises(usage.UsageError,
130 lumiere_nfc.encode('latin1'))
def test_unicode_to_output(self):
    # unicode_to_output must raise UnicodeEncodeError for text the output
    # encoding cannot represent: lumiere_nfc contains U+00E8 (e-grave),
    # which KOI8-R has no code for.
    #
    # The encoding is installed with self.patch (the same mechanism
    # test_quote_output_default uses) instead of a bare assignment, so the
    # module-level io_encoding is restored when the test ends and cannot
    # leak into tests that run afterwards.
    self.patch(encodingutil, 'io_encoding', 'koi8-r')
    self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)
136 def test_no_unicode_normalization(self):
137 # Pretend to run on a Unicode platform.
138 # listdir_unicode normalized to NFC in 1.7beta, but now doesn't.
140 def call_os_listdir(path):
141 return [Artonwall_nfd]
142 self.patch(os, 'listdir', call_os_listdir)
143 self.patch(sys, 'platform', 'darwin')
146 self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd])
149 # The following tests apply only to platforms that don't store filenames as
150 # Unicode entities on the filesystem.
151 class EncodingUtilNonUnicodePlatform(unittest.TestCase):
153 # Mock sys.platform because unicode_platform() uses it
154 self.original_platform = sys.platform
155 sys.platform = 'linux'
158 sys.platform = self.original_platform
161 def test_listdir_unicode(self):
162 # What happens if latin1-encoded filenames are encountered on an UTF-8
164 def call_os_listdir(path):
166 lumiere_nfc.encode('utf-8'),
167 lumiere_nfc.encode('latin1')
169 self.patch(os, 'listdir', call_os_listdir)
171 sys_filesystemencoding = 'utf-8'
172 def call_sys_getfilesystemencoding():
173 return sys_filesystemencoding
174 self.patch(sys, 'getfilesystemencoding', call_sys_getfilesystemencoding)
177 self.failUnlessRaises(FilenameEncodingError,
181 # We're trying to list a directory whose name cannot be represented in
182 # the filesystem encoding. This should fail.
183 sys_filesystemencoding = 'ascii'
185 self.failUnlessRaises(FilenameEncodingError,
190 class EncodingUtil(ReallyEqualMixin):
192 self.original_platform = sys.platform
193 sys.platform = self.platform
196 sys.platform = self.original_platform
199 def test_argv_to_unicode(self):
200 if 'argv' not in dir(self):
203 mock_stdout = MockStdout()
204 mock_stdout.encoding = self.io_encoding
205 self.patch(sys, 'stdout', mock_stdout)
210 self.failUnlessReallyEqual(argv_to_unicode(argv), argu)
def test_unicode_to_url(self):
    # The expected value is the UTF-8 encoding of lumiere_nfc
    # (U+00E8 -> 0xC3 0xA8).
    #
    # This has to be an equality assertion: failUnless(value, expected)
    # would treat `expected` as the failure *message* and pass for any
    # truthy return value, so it would verify nothing about the result.
    self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")
215 def test_unicode_to_output(self):
216 if 'argv' not in dir(self):
219 mock_stdout = MockStdout()
220 mock_stdout.encoding = self.io_encoding
221 self.patch(sys, 'stdout', mock_stdout)
224 self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)
226 def test_unicode_platform(self):
236 self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])
238 def test_listdir_unicode(self):
239 if 'dirlist' not in dir(self):
243 u"test".encode(self.filesystem_encoding)
244 except (LookupError, AttributeError):
245 raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
246 "that we are testing for the benefit of a different platform."
247 % (self.filesystem_encoding,))
249 def call_os_listdir(path):
251 self.patch(os, 'listdir', call_os_listdir)
253 def call_sys_getfilesystemencoding():
254 return self.filesystem_encoding
255 self.patch(sys, 'getfilesystemencoding', call_sys_getfilesystemencoding)
258 filenames = listdir_unicode(u'/dummy')
260 self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
264 class StdlibUnicode(unittest.TestCase):
265 """This mainly tests that some of the stdlib functions support Unicode paths, but also that
266 listdir_unicode works for valid filenames."""
268 def skip_if_cannot_represent_filename(self, u):
269 enc = get_filesystem_encoding()
270 if not unicode_platform():
273 except UnicodeEncodeError:
274 raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
276 def test_mkdir_open_exists_abspath_listdir_expanduser(self):
277 self.skip_if_cannot_represent_filename(lumiere_nfc)
280 os.mkdir(lumiere_nfc)
281 except EnvironmentError, e:
282 raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
283 "does not support Unicode, even though the platform does." % (e,))
285 fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
286 open(fn, 'wb').close()
287 self.failUnless(os.path.exists(fn))
288 self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn)))
289 filenames = listdir_unicode(lumiere_nfc)
291 # We only require that the listing includes a filename that is canonically equivalent
292 # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
293 self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
295 expanded = fileutil.expanduser(u"~/" + lumiere_nfc)
296 self.failIfIn(u"~", expanded)
297 self.failUnless(expanded.endswith(lumiere_nfc), expanded)
299 def test_open_unrepresentable(self):
300 if unicode_platform():
301 raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")
303 enc = get_filesystem_encoding()
307 raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
308 except UnicodeEncodeError:
309 self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
312 class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
316 def _check(self, inp, out, enc, optional_quotes, quote_newlines):
320 self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quote_newlines=quote_newlines), out)
321 self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
324 elif isinstance(inp, str):
325 self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out)
326 self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
328 self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quote_newlines=quote_newlines), out)
329 self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
331 def _test_quote_output_all(self, enc):
332 def check(inp, out, optional_quotes=False, quote_newlines=None):
333 self._check(inp, out, enc, optional_quotes, quote_newlines)
335 # optional single quotes
336 check("foo", "'foo'", True)
337 check("\\", "'\\'", True)
338 check("$\"`", "'$\"`'", True)
339 check("\n", "'\n'", True, quote_newlines=False)
341 # mandatory single quotes
346 check("\n", "\"\\x0a\"", quote_newlines=True)
347 check("\x00", "\"\\x00\"")
349 # invalid Unicode and astral planes
350 check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"")
351 check(u"\uDC00\uD800", "\"\\udc00\\ud800\"")
352 check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
353 check(u"\uD800\uDC00", "\"\\U00010000\"")
354 check(u"\uD800\uDC01", "\"\\U00010001\"")
355 check(u"\uD801\uDC00", "\"\\U00010400\"")
356 check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"")
357 check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"")
358 check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"")
361 check("\xFF", "b\"\\xff\"")
362 check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")
def test_quote_output_ascii(self, enc='ascii'):
    # Checks quote_output for an ASCII output encoding: every non-ASCII
    # character is expected back as an escape sequence inside double
    # quotes. `enc` is a parameter so that test_quote_output_default can
    # re-run these checks with enc=None.
    def expect(text, quoted, optional_quotes=False, quote_newlines=None):
        self._check(text, quoted, enc, optional_quotes, quote_newlines)

    # Cases shared by all the per-encoding tests.
    self._test_quote_output_all(enc)

    # Non-ASCII input, alone and next to each kind of quote character.
    escaped_cases = [
        (u"\u00D7", "\"\\xd7\""),
        (u"'\u00D7", "\"'\\xd7\""),
        (u"\"\u00D7", "\"\\\"\\xd7\""),
        (u"\u2621", "\"\\u2621\""),
        (u"'\u2621", "\"'\\u2621\""),
        (u"\"\u2621", "\"\\\"\\u2621\""),
    ]
    for text, quoted in escaped_cases:
        expect(text, quoted)

    # A newline is passed through or escaped depending on quote_newlines.
    expect(u"\n", "'\n'", True, quote_newlines=False)
    expect(u"\n", "\"\\x0a\"", quote_newlines=True)
def test_quote_output_latin1(self, enc='latin1'):
    # Checks quote_output for a Latin-1 output encoding. Expected values
    # are written as unicode and encoded to Latin-1 before comparison.
    # `enc` is a parameter so that test_quote_output_default can re-run
    # these checks with enc=None.
    def expect(text, quoted, optional_quotes=False, quote_newlines=None):
        expected_bytes = quoted.encode('latin1')
        self._check(text, expected_bytes, enc, optional_quotes, quote_newlines)

    # Cases shared by all the per-encoding tests.
    self._test_quote_output_all(enc)

    # (input, expected output, optional_quotes): U+00D7 is representable in
    # Latin-1 and is expected verbatim; U+2621 is not and is expected escaped.
    cases = [
        (u"\u00D7", u"'\u00D7'", True),
        (u"'\u00D7", u"\"'\u00D7\"", False),
        (u"\"\u00D7", u"'\"\u00D7'", False),
        (u"\u00D7\"", u"'\u00D7\"'", True),
        (u"\u2621", u"\"\\u2621\"", False),
        (u"'\u2621", u"\"'\\u2621\"", False),
        (u"\"\u2621", u"\"\\\"\\u2621\"", False),
    ]
    for text, quoted, optional in cases:
        expect(text, quoted, optional)

    # A newline is passed through or escaped depending on quote_newlines.
    expect(u"\n", u"'\n'", True, quote_newlines=False)
    expect(u"\n", u"\"\\x0a\"", quote_newlines=True)
def test_quote_output_utf8(self, enc='utf-8'):
    # Checks quote_output for a UTF-8 output encoding. Expected values are
    # written as unicode and encoded to UTF-8 before comparison. `enc` is a
    # parameter so that test_quote_output_default can re-run these checks
    # with enc=None.
    def expect(text, quoted, optional_quotes=False, quote_newlines=None):
        expected_bytes = quoted.encode('utf-8')
        self._check(text, expected_bytes, enc, optional_quotes, quote_newlines)

    # Cases shared by all the per-encoding tests.
    self._test_quote_output_all(enc)

    # (input, expected output, optional_quotes): U+2621 is expected verbatim
    # in the output, with the quoting style chosen around it.
    cases = [
        (u"\u2621", u"'\u2621'", True),
        (u"'\u2621", u"\"'\u2621\"", False),
        (u"\"\u2621", u"'\"\u2621'", False),
        (u"\u2621\"", u"'\u2621\"'", True),
    ]
    for text, quoted, optional in cases:
        expect(text, quoted, optional)

    # A newline is passed through or escaped depending on quote_newlines.
    expect(u"\n", u"'\n'", True, quote_newlines=False)
    expect(u"\n", u"\"\\x0a\"", quote_newlines=True)
def test_quote_output_default(self):
    # Re-run each per-encoding test with enc=None after pointing
    # encodingutil.io_encoding at the matching encoding (presumably
    # quote_output falls back to io_encoding when no encoding is given --
    # confirm against encodingutil).
    reruns = (
        ('ascii', self.test_quote_output_ascii),
        ('latin1', self.test_quote_output_latin1),
        ('utf-8', self.test_quote_output_utf8),
    )
    for io_enc, rerun in reruns:
        self.patch(encodingutil, 'io_encoding', io_enc)
        rerun(None)
def win32_other(win32, other):
    """Pick the platform-appropriate value of a pair.

    Returns `win32` when sys.platform is "win32", and `other` on every
    other platform.
    """
    if sys.platform == "win32":
        return win32
    return other
419 class QuotePaths(ReallyEqualMixin, unittest.TestCase):
def test_quote_path(self):
    # Exercises quote_path and quote_local_unicode_path with quotemarks
    # left at its default, set to True, and set to False.
    eq = self.failUnlessReallyEqual
    variants = ({}, {'quotemarks': True}, {'quotemarks': False})

    # Plain segments: joined with "/", single-quoted unless quotemarks=False.
    plain_expected = ("'foo/bar'", "'foo/bar'", "foo/bar")
    for kwargs, expected in zip(variants, plain_expected):
        eq(quote_path([u'foo', u'bar'], **kwargs), expected)

    # A newline in a segment is expected escaped and double-quoted in all
    # three variants, even with quotemarks=False.
    for kwargs in variants:
        eq(quote_path([u'foo', u'\nbar'], **kwargs), '"foo/\\x0abar"')

    # Long-path ("\\?\") prefixed drive path: the expectations drop the
    # prefix on win32 and keep the text unchanged elsewhere.
    long_drive = u"\\\\?\\C:\\foo"
    eq(quote_local_unicode_path(long_drive),
       win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'"))
    eq(quote_local_unicode_path(long_drive, quotemarks=True),
       win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'"))
    eq(quote_local_unicode_path(long_drive, quotemarks=False),
       win32_other("C:\\foo", "\\\\?\\C:\\foo"))

    # Long-path UNC form: expected as a plain "\\host\share" path on win32.
    long_unc = u"\\\\?\\UNC\\foo\\bar"
    eq(quote_local_unicode_path(long_unc),
       win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"))
    eq(quote_local_unicode_path(long_unc, quotemarks=True),
       win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"))
    eq(quote_local_unicode_path(long_unc, quotemarks=False),
       win32_other("\\\\foo\\bar", "\\\\?\\UNC\\foo\\bar"))
441 def test_quote_filepath(self):
442 foo_bar_fp = FilePath(win32_other(u'C:\\foo\\bar', u'/foo/bar'))
443 self.failUnlessReallyEqual(quote_filepath(foo_bar_fp),
444 win32_other("'C:\\foo\\bar'", "'/foo/bar'"))
445 self.failUnlessReallyEqual(quote_filepath(foo_bar_fp, quotemarks=True),
446 win32_other("'C:\\foo\\bar'", "'/foo/bar'"))
447 self.failUnlessReallyEqual(quote_filepath(foo_bar_fp, quotemarks=False),
448 win32_other("C:\\foo\\bar", "/foo/bar"))
450 if sys.platform == "win32":
451 foo_longfp = FilePath(u'\\\\?\\C:\\foo')
452 self.failUnlessReallyEqual(quote_filepath(foo_longfp),
454 self.failUnlessReallyEqual(quote_filepath(foo_longfp, quotemarks=True),
456 self.failUnlessReallyEqual(quote_filepath(foo_longfp, quotemarks=False),
460 class FilePaths(ReallyEqualMixin, unittest.TestCase):
def test_to_filepath(self):
    # to_filepath is expected to give the same FilePath whether or not the
    # input path ends with a separator.
    foo_u = win32_other(u'C:\\foo', u'/foo')
    inputs = (foo_u, foo_u + os.path.sep)

    # Convert both inputs first, then check each result.
    converted = [to_filepath(path_u) for path_u in inputs]
    for fp in converted:
        self.failUnlessReallyEqual(fp, FilePath(foo_u))
        if not encodingutil.use_unicode_filepath:
            continue
        # Only compared when encodingutil reports unicode FilePaths in use.
        self.failUnlessReallyEqual(fp.path, foo_u)

    if sys.platform != "win32":
        return

    # Windows only: a long-path ("\\?\") input with a trailing backslash.
    long_u = u'\\\\?\\C:\\foo'
    longfp = to_filepath(long_u + u'\\')
    self.failUnlessReallyEqual(longfp, FilePath(long_u))
    self.failUnlessReallyEqual(longfp.path, long_u)
def test_extend_filepath(self):
    # extend_filepath is expected to give the same result whether the base
    # FilePath was built from a bytes path or a unicode path.
    expected_u = win32_other(u'C:\\foo\\bar\\baz', u'/foo/bar/baz')
    bases = (
        FilePath(win32_other(b'C:\\foo', b'/foo')),
        FilePath(win32_other(u'C:\\foo', u'/foo')),
    )

    for base_fp in bases:
        extended = extend_filepath(base_fp, [u'bar', u'baz'])
        self.failUnlessReallyEqual(extended, FilePath(expected_u))
        if not encodingutil.use_unicode_filepath:
            continue
        # Only compared when encodingutil reports unicode FilePaths in use.
        self.failUnlessReallyEqual(extended.path, expected_u)
def test_unicode_from_filepath(self):
    # unicode_from_filepath is expected to return the same unicode path for
    # a FilePath built from bytes and for one built from unicode.
    expected_u = win32_other(u'C:\\foo', u'/foo')
    sources = (
        FilePath(win32_other(b'C:\\foo', b'/foo')),
        FilePath(win32_other(u'C:\\foo', u'/foo')),
    )

    for source_fp in sources:
        self.failUnlessReallyEqual(unicode_from_filepath(source_fp), expected_u)
497 def test_unicode_segments_from(self):
498 foo_bfp = FilePath(win32_other(b'C:\\foo', b'/foo'))
499 foo_ufp = FilePath(win32_other(u'C:\\foo', u'/foo'))
500 foo_bar_baz_bfp = FilePath(win32_other(b'C:\\foo\\bar\\baz', b'/foo/bar/baz'))
501 foo_bar_baz_ufp = FilePath(win32_other(u'C:\\foo\\bar\\baz', u'/foo/bar/baz'))
503 for foo_fp in (foo_bfp, foo_ufp):
504 for foo_bar_baz_fp in (foo_bar_baz_bfp, foo_bar_baz_ufp):
505 self.failUnlessReallyEqual(unicode_segments_from(foo_bar_baz_fp, foo_fp),
509 class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
510 uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
511 argv = 'lumi\xc3\xa8re'
513 filesystem_encoding = 'UTF-8'
514 io_encoding = 'UTF-8'
515 dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
517 class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
518 uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
521 filesystem_encoding = 'ISO-8859-1'
522 io_encoding = 'ISO-8859-1'
523 dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
525 class Windows(EncodingUtil, unittest.TestCase):
526 uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
527 argv = 'lumi\xc3\xa8re'
529 filesystem_encoding = 'mbcs'
530 io_encoding = 'utf-8'
531 dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
533 class MacOSXLeopard(EncodingUtil, unittest.TestCase):
534 uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
535 output = 'lumi\xc3\xa8re'
537 filesystem_encoding = 'utf-8'
538 io_encoding = 'UTF-8'
539 dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
541 class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
542 uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
544 filesystem_encoding = 'utf-8'
545 io_encoding = 'US-ASCII'
546 dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
548 class OpenBSD(EncodingUtil, unittest.TestCase):
549 uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
550 platform = 'openbsd4'
551 filesystem_encoding = '646'
553 # Oops, I cannot write filenames containing non-ascii characters
class TestToFromStr(ReallyEqualMixin, unittest.TestCase):
    """Tests for the to_str and from_utf8_or_none conversion helpers."""

    def test_to_str(self):
        # (input, expected) pairs: byte strings are expected unchanged,
        # unicode is expected UTF-8 encoded, and None stays None.
        cases = [
            ("foo", "foo"),
            ("lumi\xc3\xa8re", "lumi\xc3\xa8re"),
            # passes through invalid UTF-8 -- is this what we want?
            ("\xFF", "\xFF"),
            (u"lumi\u00E8re", "lumi\xc3\xa8re"),
            (None, None),
        ]
        for value, expected in cases:
            self.failUnlessReallyEqual(to_str(value), expected)

    def test_from_utf8_or_none(self):
        check_raises = self.failUnlessRaises

        # Unicode input is rejected with an AssertionError.
        check_raises(AssertionError, from_utf8_or_none, u"foo")

        # Valid UTF-8 bytes are decoded; None is passed through.
        for raw, decoded in (("lumi\xc3\xa8re", u"lumi\u00E8re"), (None, None)):
            self.failUnlessReallyEqual(from_utf8_or_none(raw), decoded)

        # Bytes that are not valid UTF-8 raise UnicodeDecodeError.
        check_raises(UnicodeDecodeError, from_utf8_or_none, "\xFF")