# Shared test strings.  "lumiere" with an e-grave, written with the
# precomposed code point U+00E8 (i.e. the NFC spelling).
2 lumiere_nfc = u"lumi\u00E8re"
# One file name in two canonically equivalent spellings: the NFC form uses the
# precomposed U+00C4, the NFD form uses "A" followed by the combining
# diaeresis U+0308.
3 Artonwall_nfc = u"\u00C4rtonwall.mp3"
4 Artonwall_nfd = u"A\u0308rtonwall.mp3"
12 # The following main helps to generate a test class for other operating
# Script mode: prints the source text of a `MyWeirdOS` test class describing
# the current machine (uname, repr of the first argument, platform name,
# filesystem and stdout encodings, and a listing of freshly created files).
15 if __name__ == "__main__":
# Exactly one command-line argument is required; otherwise show the usage text.
21 if len(sys.argv) != 2:
22 print "Usage: %s lumi<e-grave>re" % sys.argv[0]
25 if sys.platform == "win32":
# Windows pulls in the project's fixups module.  The hint below is presumably
# printed when that import fails (the handler itself is not shown here).
27 from allmydata.windows.fixups import initialize
29 print "set PYTHONPATH to the src directory"
# Emit the class header followed by one attribute line per observed value.
34 print "class MyWeirdOS(EncodingUtil, unittest.TestCase):"
35 print " uname = '%s'" % ' '.join(platform.uname())
36 print " argv = %s" % repr(sys.argv[1])
37 print " platform = '%s'" % sys.platform
38 print " filesystem_encoding = '%s'" % sys.getfilesystemencoding()
39 print " io_encoding = '%s'" % sys.stdout.encoding
# Create an empty file for every name in TEST_FILENAMES inside a scratch
# directory so that this platform's own listing of them can be recorded.
41 tmpdir = tempfile.mkdtemp()
42 for fname in TEST_FILENAMES:
43 open(os.path.join(tmpdir, fname), 'w').close()
45 # Use Unicode API under Windows or MacOS X
46 if sys.platform in ('win32', 'darwin'):
# Passing a unicode path makes os.listdir() return unicode names (Python 2).
47 dirlist = os.listdir(unicode(tmpdir))
49 dirlist = os.listdir(tmpdir)
51 print " dirlist = %s" % repr(dirlist)
# Emitted in place of a dirlist, presumably when the non-ASCII file names
# could not be created (the branch that selects it is not shown here).
53 print " # Oops, I cannot write filenames containing non-ascii characters"
59 from twisted.trial import unittest
60 from mock import patch
61 import os, sys, locale
63 from allmydata.test.common_util import ReallyEqualMixin
64 from allmydata.util import encodingutil
65 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
66 unicode_to_output, quote_output, unicode_platform, listdir_unicode, \
67 FilenameEncodingError, get_io_encoding, get_filesystem_encoding, _reload
68 from allmydata.dirnode import normalize
70 from twisted.python import usage
# Tests for how encodingutil determines the I/O encoding, plus a few error
# paths of its conversion helpers.
72 class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
# `mock_stdout` stands in for sys.stdout (presumably injected by a @patch
# decorator that is not shown here).  Each step changes its `encoding` and
# checks what get_io_encoding() reports afterwards.
75 def test_get_io_encoding(self, mock_stdout):
76 mock_stdout.encoding = 'UTF-8'
# The name is expected back in lower case.
78 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
# cp65001 (the Windows UTF-8 code page) is expected to be reported as 'utf-8'.
80 mock_stdout.encoding = 'cp65001'
82 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
84 mock_stdout.encoding = 'koi8-r'
# and/or idiom: 'utf-8' on win32, otherwise 'koi8-r'.
85 expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
87 self.failUnlessReallyEqual(get_io_encoding(), expected)
# An unknown codec name: win32 still expects 'utf-8'; the AssertionError check
# below presumably belongs to the non-win32 branch (its `else` is not shown).
89 mock_stdout.encoding = 'nonexistent_encoding'
90 if sys.platform == "win32":
92 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
94 self.failUnlessRaises(AssertionError, _reload)
# Checks the values expected when the encoding does not come from sys.stdout:
# the patched locale.getpreferredencoding() result, or 'utf-8' on win32.
96 @patch('locale.getpreferredencoding')
97 def test_get_io_encoding_not_from_stdout(self, mock_locale_getpreferredencoding):
98 locale # hush pyflakes
99 mock_locale_getpreferredencoding.return_value = 'koi8-r'
# Swap in a stand-in stdout (DummyStdout is defined outside this excerpt and
# presumably has no `encoding` attribute); the real one is restored below.
103 old_stdout = sys.stdout
104 sys.stdout = DummyStdout()
106 expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
108 self.failUnlessReallyEqual(get_io_encoding(), expected)
# An explicit encoding of None is expected to give the same result.
110 sys.stdout.encoding = None
112 self.failUnlessReallyEqual(get_io_encoding(), expected)
# With no locale preference either, 'utf-8' is the expected fallback.
114 mock_locale_getpreferredencoding.return_value = None
116 self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
118 sys.stdout = old_stdout
# latin1-encoded bytes under a utf-8 I/O encoding must raise usage.UsageError
# (the callable argument is on a line not shown; presumably argv_to_unicode).
120 def test_argv_to_unicode(self):
121 encodingutil.io_encoding = 'utf-8'
122 self.failUnlessRaises(usage.UsageError,
124 lumiere_nfc.encode('latin1'))
# Converting lumiere_nfc for a koi8-r output encoding is expected to raise
# UnicodeEncodeError.
126 def test_unicode_to_output(self):
127 encodingutil.io_encoding = 'koi8-r'
128 self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)
# `mock` supplies the directory listing (presumably a patched os.listdir; the
# decorator is not shown).  The NFD name must come back unchanged.
131 def test_no_unicode_normalization(self, mock):
132 # Pretend to run on a Unicode platform.
133 # We normalized to NFC in 1.7beta, but we now don't.
134 orig_platform = sys.platform
136 sys.platform = 'darwin'
137 mock.return_value = [Artonwall_nfd]
139 self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd])
141 sys.platform = orig_platform
143 # The following tests apply only to platforms that don't store filenames as
144 # Unicode entities on the filesystem.
# Error-path tests for listdir_unicode() with sys.platform forced to 'linux'.
145 class EncodingUtilNonUnicodePlatform(unittest.TestCase):
# Fixture code: override sys.platform for the test and put the real value back
# afterwards (the setUp/tearDown headers are not shown here).
147 # Mock sys.platform because unicode_platform() uses it
148 self.original_platform = sys.platform
149 sys.platform = 'linux'
152 sys.platform = self.original_platform
# Both the directory listing and the reported filesystem encoding are mocks.
155 @patch('sys.getfilesystemencoding')
157 def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
158 # What happens if latin1-encoded filenames are encountered on an UTF-8
# The fake listing holds one UTF-8 encoded name and one latin1 encoded name.
160 mock_listdir.return_value = [
161 lumiere_nfc.encode('utf-8'),
162 lumiere_nfc.encode('latin1')]
164 mock_getfilesystemencoding.return_value = 'utf-8'
# Expected outcome: FilenameEncodingError (the rest of the call is not shown).
166 self.failUnlessRaises(FilenameEncodingError,
170 # We're trying to list a directory whose name cannot be represented in
171 # the filesystem encoding. This should fail.
172 mock_getfilesystemencoding.return_value = 'ascii'
174 self.failUnlessRaises(FilenameEncodingError,
# Mixin with the platform-independent test bodies.  The tests read data from
# attributes of the concrete class (here: platform, io_encoding and argv).
179 class EncodingUtil(ReallyEqualMixin):
# Fixture code: pretend to run on the subclass's platform, then restore the
# real sys.platform (the setUp/tearDown headers are not shown here).
181 self.original_platform = sys.platform
182 sys.platform = self.platform
185 sys.platform = self.original_platform
# `mock` receives the subclass's io_encoding (presumably it replaces
# sys.stdout through a decorator that is not shown).
189 def test_argv_to_unicode(self, mock):
# Applies only to subclasses that define `argv` (the action taken otherwise is
# on a line not shown; presumably a skip).
190 if 'argv' not in dir(self):
193 mock.encoding = self.io_encoding
# `argv` and `argu` are assigned on lines not shown here.
197 self.failUnlessReallyEqual(argv_to_unicode(argv), argu)
def test_unicode_to_url(self):
    # unicode_to_url() must return the UTF-8 encoded byte string.
    #
    # This has to be an equality assertion: failUnless(value, msg) treats its
    # second argument as the failure message, so passing the expected string
    # there would let the test succeed for any truthy return value.
    self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")
# `mock` receives the subclass's io_encoding before the conversion is checked.
203 def test_unicode_to_output(self, mock):
204 if 'argv' not in dir(self):
207 mock.encoding = self.io_encoding
# The subclass's `argv` value doubles as the expected output string.
209 self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)
211 def test_unicode_platform(self):
# `matrix` (built on lines not shown here) is indexed by platform name and
# yields the expected unicode_platform() result.
221 self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])
# The filesystem encoding and the directory listing are both mocked with the
# subclass's recorded values.
223 @patch('sys.getfilesystemencoding')
225 def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
226 if 'dirlist' not in dir(self):
# Probe whether this interpreter can use the recorded codec at all; if it
# cannot, the test is skipped rather than failed.
230 u"test".encode(self.filesystem_encoding)
231 except (LookupError, AttributeError):
232 raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
233 "that we are testing for the benefit of a different platform."
234 % (self.filesystem_encoding,))
236 mock_listdir.return_value = self.dirlist
237 mock_getfilesystemencoding.return_value = self.filesystem_encoding
240 filenames = listdir_unicode(u'/dummy')
# Names pass through normalize() before the set comparison (presumably
# Unicode normalization, so equivalent spellings compare equal -- confirm
# against allmydata.dirnode).
242 self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
246 class StdlibUnicode(unittest.TestCase):
247 """This mainly tests that some of the stdlib functions support Unicode paths, but also that
248 listdir_unicode works for valid filenames."""
# Helper: raises SkipTest when, on a platform that is not a Unicode platform,
# an encode attempt fails with UnicodeEncodeError (the attempt itself is on
# lines not shown here; presumably `u` encoded with `enc`).
250 def skip_if_cannot_represent_filename(self, u):
251 enc = get_filesystem_encoding()
252 if not unicode_platform():
255 except UnicodeEncodeError:
256 raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
# Exercises os.mkdir, open, os.path.exists, listdir_unicode and
# os.path.expanduser with a non-ASCII name.
258 def test_mkdir_open_exists_abspath_listdir_expanduser(self):
259 self.skip_if_cannot_represent_filename(lumiere_nfc)
# A failing mkdir is reported as a skip, not as a test failure.
262 os.mkdir(lumiere_nfc)
263 except EnvironmentError, e:
264 raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
265 "does not support Unicode, even though the platform does." % (e,))
267 fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
268 open(fn, 'wb').close()
269 self.failUnless(os.path.exists(fn))
# The same file must also be found through a path built from the unicode cwd.
270 self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn)))
271 filenames = listdir_unicode(lumiere_nfc)
273 # We only require that the listing includes a filename that is canonically equivalent
274 # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
275 self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
# "~" must be expanded away while the non-ASCII tail is preserved.
277 expanded = os.path.expanduser("~/" + lumiere_nfc)
278 self.failIfIn("~", expanded)
279 self.failUnless(expanded.endswith(lumiere_nfc), expanded)
# Skipped on Unicode platforms.  Elsewhere, open() on a name that cannot be
# encoded is expected to raise UnicodeEncodeError.
281 def test_open_unrepresentable(self):
282 if unicode_platform():
283 raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")
285 enc = get_filesystem_encoding()
# `fn` is assigned on a line not shown.  This skip is presumably reached when
# the candidate name turned out to be encodable after all.
289 raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
290 except UnicodeEncodeError:
291 self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
# Tests for quote_output() under several output encodings.
294 class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
# Shared assertion helper.  Checks quote_output(inp) against `out`, and the
# quotemarks=False variant against `out2` (which is computed on lines not
# shown here).
298 def _check(self, inp, out, enc, optional_quotes, quote_newlines):
302 self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quote_newlines=quote_newlines), out)
303 self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
# Byte-string input is checked a second time after conversion to unicode.
306 elif isinstance(inp, str):
307 self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out)
308 self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
# Same checks with the input encoded as UTF-8 (presumably the branch for
# unicode input; its header is not shown here).
310 self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quote_newlines=quote_newlines), out)
311 self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
# Cases run with whatever `enc` the caller passes; the per-encoding tests in
# this class call it first.
313 def _test_quote_output_all(self, enc):
# Local shorthand that fixes `enc` for every case below.
314 def check(inp, out, optional_quotes=False, quote_newlines=None):
315 self._check(inp, out, enc, optional_quotes, quote_newlines)
317 # optional single quotes
318 check("foo", "'foo'", True)
319 check("\\", "'\\'", True)
320 check("$\"`", "'$\"`'", True)
321 check("\n", "'\n'", True, quote_newlines=False)
323 # mandatory single quotes
328 check("\n", "\"\\x0a\"", quote_newlines=True)
329 check("\x00", "\"\\x00\"")
331 # invalid Unicode and astral planes
332 check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"")
# Unpaired or mis-ordered surrogates are expected as \uXXXX escapes, while a
# valid high+low pair is expected as a single \UXXXXXXXX escape.
333 check(u"\uDC00\uD800", "\"\\udc00\\ud800\"")
334 check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
335 check(u"\uD800\uDC00", "\"\\U00010000\"")
336 check(u"\uD800\uDC01", "\"\\U00010001\"")
337 check(u"\uD801\uDC00", "\"\\U00010400\"")
338 check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"")
339 check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"")
340 check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"")
# Byte strings with high bytes (not valid UTF-8) are expected in b"..." form
# with \xNN escapes.
343 check("\xFF", "b\"\\xff\"")
344 check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")
def test_quote_output_ascii(self, enc='ascii'):
    # Begin with the expectations shared by every encoding.
    self._test_quote_output_all(enc)

    # ASCII-specific expectations, one row per call to the shared helper:
    # (input, expected output, optional_quotes, quote_newlines).
    # With an ASCII target every non-ASCII character is expected escaped.
    cases = (
        (u"\u00D7", "\"\\xd7\"", False, None),
        (u"'\u00D7", "\"'\\xd7\"", False, None),
        (u"\"\u00D7", "\"\\\"\\xd7\"", False, None),
        (u"\u2621", "\"\\u2621\"", False, None),
        (u"'\u2621", "\"'\\u2621\"", False, None),
        (u"\"\u2621", "\"\\\"\\u2621\"", False, None),
        (u"\n", "'\n'", True, False),
        (u"\n", "\"\\x0a\"", False, True),
    )
    for text, expected, optional_quotes, quote_newlines in cases:
        self._check(text, expected, enc, optional_quotes, quote_newlines)
def test_quote_output_latin1(self, enc='latin1'):
    # Begin with the expectations shared by every encoding.
    self._test_quote_output_all(enc)

    # Latin-1 specific expectations, written as unicode and encoded to
    # latin1 just before each comparison:
    # (input, expected output, optional_quotes, quote_newlines).
    # U+00D7 is representable in latin1 and expected verbatim; U+2621 is
    # not and is expected escaped.
    cases = (
        (u"\u00D7", u"'\u00D7'", True, None),
        (u"'\u00D7", u"\"'\u00D7\"", False, None),
        (u"\"\u00D7", u"'\"\u00D7'", False, None),
        (u"\u00D7\"", u"'\u00D7\"'", True, None),
        (u"\u2621", u"\"\\u2621\"", False, None),
        (u"'\u2621", u"\"'\\u2621\"", False, None),
        (u"\"\u2621", u"\"\\\"\\u2621\"", False, None),
        (u"\n", u"'\n'", True, False),
        (u"\n", u"\"\\x0a\"", False, True),
    )
    for text, expected, optional_quotes, quote_newlines in cases:
        self._check(text, expected.encode('latin1'), enc,
                    optional_quotes, quote_newlines)
def test_quote_output_utf8(self, enc='utf-8'):
    # Begin with the expectations shared by every encoding.
    self._test_quote_output_all(enc)

    # UTF-8 specific expectations, written as unicode and encoded to UTF-8
    # just before each comparison:
    # (input, expected output, optional_quotes, quote_newlines).
    # U+2621 is representable here, so it is expected verbatim.
    cases = (
        (u"\u2621", u"'\u2621'", True, None),
        (u"'\u2621", u"\"'\u2621\"", False, None),
        (u"\"\u2621", u"'\"\u2621'", False, None),
        (u"\u2621\"", u"'\u2621\"'", True, None),
        (u"\n", u"'\n'", True, False),
        (u"\n", u"\"\\x0a\"", False, True),
    )
    for text, expected, optional_quotes, quote_newlines in cases:
        self._check(text, expected.encode('utf-8'), enc,
                    optional_quotes, quote_newlines)
def test_quote_output_default(self):
    # Re-run each per-encoding test with enc=None while the module-level
    # encodingutil.io_encoding is set to that encoding (presumably
    # quote_output falls back on it when no encoding is given).
    runs = (
        ('ascii', self.test_quote_output_ascii),
        ('latin1', self.test_quote_output_latin1),
        ('utf-8', self.test_quote_output_utf8),
    )
    for default_encoding, run_cases in runs:
        encodingutil.io_encoding = default_encoding
        run_cases(None)
# Recorded platform data (Ubuntu, UTF-8 for both filesystem and terminal) run
# through the EncodingUtil mixin tests.
398 class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
399 uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
# "lumiere" with e-grave as UTF-8 bytes.
400 argv = 'lumi\xc3\xa8re'
402 filesystem_encoding = 'UTF-8'
403 io_encoding = 'UTF-8'
# Byte-string listing; the non-ASCII name is UTF-8 encoded.
404 dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
# Recorded platform data (Ubuntu, ISO-8859-1 for both filesystem and terminal)
# run through the EncodingUtil mixin tests.
406 class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
407 uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
410 filesystem_encoding = 'ISO-8859-1'
411 io_encoding = 'ISO-8859-1'
# Byte-string listing; the non-ASCII name is the single latin1 byte 0xC4.
412 dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
# Recorded platform data from a Windows XP machine, run through the
# EncodingUtil mixin tests.
414 class Windows(EncodingUtil, unittest.TestCase):
415 uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
# "lumiere" with e-grave as UTF-8 bytes, matching io_encoding below.
416 argv = 'lumi\xc3\xa8re'
418 filesystem_encoding = 'mbcs'
419 io_encoding = 'utf-8'
# Listing recorded as unicode strings rather than bytes.
420 dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
# Recorded platform data from a Mac OS X (Darwin 9.8.0) machine with a UTF-8
# terminal, run through the EncodingUtil mixin tests.
422 class MacOSXLeopard(EncodingUtil, unittest.TestCase):
423 uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
# NOTE(review): the visible mixin tests look for an attribute named `argv`;
# no visible code reads `output` -- confirm which name is intended.
424 output = 'lumi\xc3\xa8re'
426 filesystem_encoding = 'utf-8'
427 io_encoding = 'UTF-8'
# Unicode listing; the non-ASCII name is in decomposed form (A + U+0308).
428 dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
# Same Mac OS X machine data as recorded with an ASCII-only terminal
# (io_encoding 'US-ASCII'), run through the EncodingUtil mixin tests.
430 class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
431 uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
433 filesystem_encoding = 'utf-8'
434 io_encoding = 'US-ASCII'
# Unicode listing; the non-ASCII name is in decomposed form (A + U+0308).
435 dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
# Recorded platform data from an OpenBSD 4.1 machine, run through the
# EncodingUtil mixin tests.
437 class OpenBSD(EncodingUtil, unittest.TestCase):
438 uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
439 platform = 'openbsd4'
# '646' is a codec alias for ASCII in Python.
440 filesystem_encoding = '646'
# No dirlist is visible for this platform; the line below matches the note the
# generator script prints in place of one.
442 # Oops, I cannot write filenames containing non-ascii characters